diff --git a/Makefile b/Makefile
index 9711dc1a4..de5dc185b 100644
--- a/Makefile
+++ b/Makefile
@@ -17,8 +17,8 @@ ROCM_SRCS := $(wildcard rocm/*.cuh)
 
 ifeq ($(UNAME_S),Darwin)
 METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
-CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_metal.o
-CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
+CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_metal.o
+CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o
 else
 CFLAGS += -D_GNU_SOURCE -fno-finite-math-only
 CUDA_HOME ?= /usr/local/cuda
@@ -28,8 +28,8 @@ ifneq ($(strip $(CUDA_ARCH)),)
 NVCC_ARCH_FLAGS := -arch=$(CUDA_ARCH)
 endif
 NVCCFLAGS ?= -O3 -g -lineinfo --use_fast_math $(NVCC_ARCH_FLAGS) -Xcompiler $(NATIVE_CPU_FLAG) -Xcompiler -pthread
-CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_cuda.o
-CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o
+CORE_OBJS = ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_cuda.o
+CPU_CORE_OBJS = ds4_cpu.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o
 CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$(CUDA_HOME)/lib64 -lcudart -lcublas
 HIPCC ?= $(shell command -v hipcc 2>/dev/null || echo /opt/rocm/bin/hipcc)
 ROCM_ARCH ?= gfx1151
@@ -106,7 +106,7 @@ cuda:
 
 strix-halo:
 	$(MAKE) -B ds4 ds4-server ds4-bench ds4-eval ds4-agent \
-		CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_rocm.o" \
+		CORE_OBJS="ds4.o ds4_distributed.o ds4_ssd.o ds4_dspark_runtime.o ds4_rocm.o" \
 		CFLAGS="$(CFLAGS) -DDS4_ROCM_BUILD" \
 		DS4_LINK="$(HIPCC) $(ROCM_CFLAGS)" \
 		DS4_LINK_LIBS="$(ROCM_LDLIBS)"
@@ -139,11 +139,14 @@ cuda-regression: tests/cuda_long_context_smoke
 	./tests/cuda_long_context_smoke
 endif
 
-ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
+ds4.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h
 	$(CC) $(CFLAGS) -c -o $@ ds4.c
 
 ds4_ssd.o: ds4_ssd.c ds4_ssd.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_ssd.c
+
+ds4_dspark_runtime.o: ds4_dspark_runtime.c ds4_dspark_runtime.h ds4.h
+	$(CC) $(CFLAGS) -c -o $@ ds4_dspark_runtime.c
 
 ds4_cli.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ ds4_cli.c
@@ -187,7 +190,7 @@ rax.o: rax.c rax.h rax_malloc.h
 linenoise.o: linenoise.c linenoise.h
 	$(CC) $(CFLAGS) -c -o $@ linenoise.c
 
-ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_gpu.h
+ds4_cpu.o: ds4.c ds4.h ds4_ssd.h ds4_distributed.h ds4_dspark_runtime.h ds4_gpu.h
 	$(CC) $(CFLAGS) -DDS4_NO_GPU -c -o $@ ds4.c
 
 ds4_cli_cpu.o: ds4_cli.c ds4.h ds4_ssd.h ds4_distributed.h ds4_help.h linenoise.h
diff --git a/README.md b/README.md
index 785695284..5508ea8df 100644
--- a/README.md
+++ b/README.md
@@ -133,11 +133,37 @@ weights. Flash GGUF generation is supported by the local tools. PRO GGUF
 production currently still depends on the external `llama.cpp`-based workflow;
 native tooling can be added later.
 
-`./download_model.sh mtp` fetches the optional speculative decoding support
-GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and q4-imatrix,
-but must be enabled explicitly with `--mtp`. The current MTP/speculative
-decoding path is still experimental: it is correctness-gated and currently
-provides at most a slight speedup, not a meaningful generation-speed win.
+`./download_model.sh mtp` fetches the optional legacy speculative decoding
+support GGUF for Flash. It can be used with q2-imatrix, q2-q4-imatrix, and
+q4-imatrix, but must be enabled explicitly with `--mtp`. Legacy one-step MTP is
+correctness-gated and experimental: it currently provides at most a slight
+speedup, not a meaningful generation-speed win. Official DeepSeek-V4-Flash
+DSpark/DeepSpec Markov draft shards can be converted with
+`gguf-tools/deepseek4-quantize --dspark-only`. Passing the converted DSpark GGUF
+with `--mtp DSpark.gguf` enables an experimental Metal block speculative decode
+path: draft blocks are target-verified before commit, but acceptance and speed
+depend on the base/draft quantization and prompt. DSpark GGUFs are additional
+draft-model weights, so higher draft precision trades directly against
+long-context headroom. CPU builds do not run MTP, and CUDA/ROCm currently load
+DSpark GGUFs without enabling the DSpark runtime.
+
+For DeepSpec training experiments, `ds4 --dspark-target-cache-dataset FILE
+--dspark-target-cache-out DIR --dspark-target-cache-target-model HF_OR_PATH`
+consumes the same rendered prompt dataset format used by imatrix collection and
+writes a DeepSpec-compatible target cache (`manifest.json`, `samples.idx`, and
+shard data) containing prompt token ids, attention/loss masks, target-layer
+hidden states, and last hidden states. Use
+`--dspark-target-cache-chat-template NAME` to stamp the cache manifest with the
+DeepSpec training template identity.
+Validate the cache contract with
+`python3 gguf-tools/deepspec/ds4_deepspec.py DIR --target-model HF_OR_PATH`
+before handing it to a DeepSpec checkout. The same helper can emit the DS4-side
+non-Markov DeepSpec config scaffold with
+`python3 gguf-tools/deepspec/ds4_deepspec.py --emit-nonseq-config dspark_v4_nonseq.py --target-cache DIR`.
+This target-cache export path remains useful for DSpark/DeepSpec training
+experiments; the built-in Metal runtime uses already converted official DSpark
+Markov draft GGUFs and should still be benchmarked with `DS4_MTP_TIMING=1` on
+the exact base/draft quant pair before treating it as a throughput win.
 
 Then build:
 
@@ -689,10 +715,12 @@ conversation. Useful commands are `/help`, `/think`, `/think-max`, `/nothink`,
 and returns to `ds4>`.
 
 The CLI defaults to thinking mode. Use `/nothink` or `--nothink` for direct
-answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional MTP speculative
-path; it is useful only for greedy decoding, currently uses a confidence gate
-(`--mtp-margin`) to avoid slow partial accepts, and should be treated as an
-experimental slight-speedup path.
+answers. `--mtp MTP.gguf --mtp-draft 2` enables the optional legacy one-step
+MTP speculative path. Passing a converted official DSpark/DeepSpec Markov GGUF
+with `--mtp DSpark.gguf` opts into the experimental Metal block-draft runtime,
+which verifies proposed blocks against the target model before committing them.
+It is correctness-gated, not a guaranteed speedup; measure acceptance and wall
+time for the exact quantized base/draft pair.
 
 ## Server
 
diff --git a/download_model.sh b/download_model.sh
index 51d368a58..b9f410232 100755
--- a/download_model.sh
+++ b/download_model.sh
@@ -65,9 +65,9 @@ Targets:
        Downloads both PRO Q4 split files into the download directory. About
        838 GB total. This target does not update ./ds4flash.gguf.
 
-  mtp  Optional speculative decoding component, about 3.5 GB on disk.
-       It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but must be
-       enabled explicitly with --mtp when running ds4 or ds4-server.
+  mtp  Optional legacy one-step speculative decoding component, about 3.5 GB on
+       disk. It is useful with q2-imatrix, q2-q4-imatrix, and q4-imatrix, but
+       must be enabled explicitly with --mtp when running ds4 or ds4-server.
 
 Options:
   --token TOKEN  Hugging Face token. Otherwise HF_TOKEN or the local HF token
@@ -259,9 +259,10 @@ fi
 
 if [ "$MODEL" = "mtp" ]; then
     echo
-    echo "MTP is an optional component for q2-imatrix, q2-q4-imatrix, and q4-imatrix."
+    echo "MTP is an optional legacy one-step component for q2-imatrix, q2-q4-imatrix, and q4-imatrix."
     echo "Enable it explicitly, for example:"
     echo "  ./ds4 --mtp $OUT_DIR/$MTP_FILE --mtp-draft 2"
+    echo "DeepSpec/DSpark GGUFs are recognized separately by the loader; block drafting with them is experimental and currently Metal-only."
 elif [ "$MODEL" = "pro-q4-layers00-30" ] || [ "$MODEL" = "pro-q4-layers31-output" ] || [ "$MODEL" = "pro-q4-split" ]; then
     echo
     echo "Downloaded PRO Q4 distributed split file(s). Use them with --layers,"
diff --git a/ds4.c b/ds4.c
index 640511eb0..ec206a0c9 100644
--- a/ds4.c
+++ b/ds4.c
@@ -15,6 +15,7 @@
  */
 
 #include <errno.h>
+#include <dirent.h>
 #include <fcntl.h>
 #include <float.h>
 #include <inttypes.h>
@@ -38,6 +39,11 @@
 
 #include "ds4.h"
 #include "ds4_distributed.h"
+#include "ds4_dspark_runtime.h"
+
+#ifndef DS4_GIT_COMMIT
+#define DS4_GIT_COMMIT "unknown"
+#endif
 
 #ifndef DS4_NO_GPU
 #include "ds4_gpu.h"
@@ -322,6 +328,7 @@ static uint32_t g_ds4_compress_ratios[DS4_MAX_LAYER] = {0};
 #define DS4_COMPRESS_ROPE_FREQ_BASE   (g_ds4_shape.compress_rope_freq_base)
 #define DS4_ROPE_ORIG_CTX             (g_ds4_shape.rope_orig_ctx)
 
+enum { DS4_DSPARK_MAX_BLOCK_SIZE = 16 };
 static int g_ds4_lock_fd = -1;
 
 #if defined(__GNUC__) || defined(__clang__)
@@ -607,6 +614,9 @@ typedef struct {
 } ds4_str;
 
 typedef ds4_tokens token_vec;
+static void token_vec_push(token_vec *tv, int token);
+static void token_vec_free(token_vec *tv);
+
 
 typedef struct {
     const uint8_t *base;
@@ -1594,6 +1604,7 @@ enum {
     DS4_TENSOR_Q4_K     = 12,
     DS4_TENSOR_IQ2_XXS  = 16,
     DS4_TENSOR_I32      = 26,
+    DS4_TENSOR_BF16     = 30,
 };
 
 typedef struct {
@@ -1617,6 +1628,7 @@ typedef struct {
     int fd;
     const uint8_t *map;
     uint64_t size;
+    char *path;
 
     uint32_t version;
     uint64_t n_kv;
@@ -1824,6 +1836,7 @@ static void model_close(ds4_model *m) {
     if (!m) return;
     free(m->kv);
     free(m->tensors);
+    free(m->path);
     if (m->map) munmap((void *)m->map, (size_t)m->size);
     if (m->fd >= 0) close(m->fd);
     memset(m, 0, sizeof(*m));
@@ -1973,6 +1986,7 @@ static void model_open(ds4_model *m, const char *path, bool metal_mapping,
     m->fd = fd;
     m->map = map;
     m->size = (uint64_t)st.st_size;
+    m->path = ds4_strdup(path);
 
     ds4_cursor c = cursor_at(m, 0);
     uint32_t magic;
@@ -2437,6 +2451,14 @@ static inline uint16_t f32_to_f16(float f) {
 #endif
 }
 
+static inline uint16_t f32_to_bf16(float f) { /* F32 -> BF16 bit pattern, round to nearest even */
+    uint32_t bits;
+    memcpy(&bits, &f, sizeof(bits));
+    /* NaN skips rounding (the add below could carry it into Inf or -0); 0x0040 keeps the mantissa non-zero. */
+    if ((bits & 0x7fffffffu) > 0x7f800000u) return (uint16_t)((bits >> 16) | 0x0040u);
+    bits += 0x7fffu + ((bits >> 16) & 1u); /* 0x7fff plus the kept LSB rounds ties to even */
+    return (uint16_t)(bits >> 16);
+}
+
 static void f16_round_inplace_cpu(float *x, uint32_t n) {
     for (uint32_t i = 0; i < n; i++) x[i] = f16_to_f32(f32_to_f16(x[i]));
 }
@@ -3061,16 +3083,26 @@ typedef struct {
     ds4_layer_weights layer[DS4_MAX_LAYER];
 } ds4_weights;
 
+enum { DS4_DSPARK_MTP_LAYERS = 3 };
+
 typedef struct {
-    ds4_tensor *e_proj;
-    ds4_tensor *h_proj;
-    ds4_tensor *enorm;
-    ds4_tensor *hnorm;
-    ds4_tensor *norm;
-    ds4_tensor *hc_head_base;
-    ds4_tensor *hc_head_fn;
-    ds4_tensor *hc_head_scale;
-    ds4_layer_weights block;
+    ds4_mtp_draft_kind kind;              /* set by the legacy or DSpark bind routine */
+    ds4_dspark_config  dspark;            /* filled from GGUF metadata on the DSpark bind path */
+    ds4_tensor        *e_proj;            /* legacy only (mtp.0.*) */
+    ds4_tensor        *h_proj;            /* legacy only */
+    ds4_tensor        *enorm;             /* legacy only */
+    ds4_tensor        *hnorm;             /* legacy only */
+    ds4_tensor        *norm;              /* mtp.0.norm for legacy, mtp.2.norm for DSpark */
+    ds4_tensor        *hc_head_base;      /* head tensors: mtp.0.* for legacy, mtp.2.* for DSpark */
+    ds4_tensor        *hc_head_fn;
+    ds4_tensor        *hc_head_scale;
+    ds4_tensor        *main_proj;         /* DSpark only */
+    ds4_tensor        *main_norm;         /* DSpark only */
+    ds4_tensor        *markov_w1;         /* bound only when kind == DS4_MTP_DRAFT_DSPARK */
+    ds4_tensor        *markov_w2;         /* bound only when kind == DS4_MTP_DRAFT_DSPARK */
+    ds4_tensor        *confidence_proj;   /* bound only when kind == DS4_MTP_DRAFT_DSPARK */
+    ds4_layer_weights  block;             /* legacy layer, bound from stage 0 */
+    ds4_layer_weights  stage[DS4_DSPARK_MTP_LAYERS]; /* DSpark layers, one per stage */
 } ds4_mtp_weights;
 
 /* =========================================================================
@@ -3202,6 +3234,29 @@ static void tensor_expect_plain_layout(
     tensor_expect_layout(t, t->type, ndim, d0, d1, d2);
 }
 
+static bool tensor_type_is_plain_or_bf16(uint32_t type) { /* true for F16, F32, or BF16 */
+    if (type == DS4_TENSOR_BF16) return true;
+    return type == DS4_TENSOR_F32 || type == DS4_TENSOR_F16;
+}
+
+static void tensor_expect_plain_or_bf16_layout(
+        const ds4_tensor *t,
+        uint32_t          ndim,
+        uint64_t          d0,
+        uint64_t          d1,
+        uint64_t          d2) {
+    /* Plain float storage only; the shape check then passes t's own type as the expected type. */
+    if (t == NULL) ds4_die("internal error: missing tensor while validating layout");
+    if (tensor_type_is_plain_or_bf16(t->type)) {
+        tensor_expect_layout(t, t->type, ndim, d0, d1, d2);
+        return;
+    }
+    fprintf(stderr,
+            "ds4: tensor %.*s has type %s, expected F16, F32, or BF16\n",
+            (int)t->name.len, t->name.ptr, tensor_type_name(t->type));
+    exit(1);
+}
+
 static bool tensor_type_is_f16_or_q8_0(uint32_t type) {
     return type == DS4_TENSOR_F16 || type == DS4_TENSOR_Q8_0;
 }
@@ -3639,21 +3694,106 @@ static void weights_validate_layout(
     }
 }
 
-static void mtp_weights_validate_layout(const ds4_mtp_weights *w) {
+
+void ds4_dspark_config_init_defaults(ds4_dspark_config *cfg) {
+    /* Overwrite *cfg with the built-in DSpark defaults.
+     * A NULL cfg is ignored. */
+    if (cfg == NULL) return;
+    memset(cfg, 0, sizeof(*cfg));
+    for (uint32_t i = 0; i < 3; i++) cfg->target_layer_ids[i] = 40u + i; /* 40, 41, 42 */
+    cfg->markov_rank = 256;
+    cfg->noise_token_id = 128799u;
+    cfg->block_size = 5;
+    cfg->n_mtp_layers = 3;
+}
+
+const char *ds4_mtp_draft_kind_name(ds4_mtp_draft_kind kind) {
+    /* Human-readable label for a draft kind.
+     * Any value without its own label maps to "none". */
+    if (kind == DS4_MTP_DRAFT_LEGACY) return "legacy-mtp";
+    if (kind == DS4_MTP_DRAFT_DSPARK) return "dspark";
+    if (kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) return "dspark-nonseq";
+    return "none";
+}
+
+ds4_mtp_draft_kind ds4_mtp_draft_kind_guess_ex(bool has_e_proj,
+                                                bool has_main_proj,
+                                                bool has_markov_w1,
+                                                bool markov_rank_set,
+                                                uint32_t markov_rank) {
+    const bool nonseq = has_main_proj && markov_rank_set && markov_rank == 0; /* explicit rank 0 */
+    if (has_main_proj && has_markov_w1) return DS4_MTP_DRAFT_DSPARK; /* Markov head takes precedence */
+    if (nonseq) return DS4_MTP_DRAFT_DSPARK_NONSEQ;
+    return has_e_proj ? DS4_MTP_DRAFT_LEGACY : DS4_MTP_DRAFT_NONE;
+}
+
+ds4_mtp_draft_kind ds4_mtp_draft_kind_guess(bool has_e_proj, bool has_main_proj, bool has_markov_w1) {
+    return ds4_mtp_draft_kind_guess_ex(has_e_proj, has_main_proj, has_markov_w1, /* markov_rank_set */ false, /* markov_rank */ 0);
+}
+
+static void dspark_config_apply_metadata(ds4_dspark_config *cfg, const ds4_model *m) {
+    uint32_t val = 0; /* scratch for each deepseek4.dspark.* lookup */
+    ds4_dspark_config_init_defaults(cfg); /* keys absent from the GGUF keep these defaults */
+    if (model_get_u32(m, "deepseek4.dspark.n_mtp_layers", &val)) {
+        if (val != DS4_DSPARK_MTP_LAYERS) {
+            fprintf(stderr, "ds4: DSpark draft expects %u stages, GGUF has n_mtp_layers=%u\n",
+                    DS4_DSPARK_MTP_LAYERS, val);
+            exit(1);
+        }
+        cfg->n_mtp_layers = val;
+    }
+    if (model_get_u32(m, "deepseek4.dspark.block_size", &val) && val != 0) cfg->block_size = val;
+    if (model_get_u32(m, "deepseek4.dspark.noise_token_id", &val)) cfg->noise_token_id = val;
+    if (model_get_u32(m, "deepseek4.dspark.markov_rank", &val)) cfg->markov_rank = val;
+    char key[64];
+    for (uint32_t idx = 0; idx < 3; idx++) {
+        snprintf(key, sizeof(key), "deepseek4.dspark.target_layer_ids.%u", idx);
+        if (model_get_u32(m, key, &val)) cfg->target_layer_ids[idx] = val;
+    }
+}
+
+static ds4_mtp_draft_kind mtp_model_detect_kind(const ds4_model *m) { /* classify a draft GGUF */
+    uint32_t rank = 0;
+    const bool rank_set = model_get_u32(m, "deepseek4.dspark.markov_rank", &rank);
+    /* Marker tensors: legacy e_proj, DSpark main_proj, and the stage-2 Markov head. */
+    return ds4_mtp_draft_kind_guess_ex(
+        model_find_tensor(m, "mtp.0.e_proj.weight") != NULL,
+        model_find_tensor(m, "mtp.0.main_proj.weight") != NULL,
+        model_find_tensor(m, "mtp.2.markov_head.markov_w1.weight") != NULL, rank_set, rank);
+}
+
+static void mtp_weights_bind_mtp_layer(ds4_layer_weights *l, const ds4_model *m, uint32_t stage) { /* Bind the per-layer tensors named "mtp.<stage>.*" into *l. */
+    l->hc_attn_fn      = required_tensorf(m, "mtp.%u.hc_attn_fn.weight", stage);
+    l->hc_attn_scale   = required_tensorf(m, "mtp.%u.hc_attn_scale.weight", stage);
+    l->hc_attn_base    = required_tensorf(m, "mtp.%u.hc_attn_base.weight", stage);
+    l->attn_norm       = required_tensorf(m, "mtp.%u.attn_norm.weight", stage);
+    l->attn_q_a        = required_tensorf(m, "mtp.%u.attn_q_a.weight", stage);
+    l->attn_q_a_norm   = required_tensorf(m, "mtp.%u.attn_q_a_norm.weight", stage);
+    l->attn_q_b        = required_tensorf(m, "mtp.%u.attn_q_b.weight", stage);
+    l->attn_kv         = required_tensorf(m, "mtp.%u.attn_kv.weight", stage);
+    l->attn_kv_a_norm  = required_tensorf(m, "mtp.%u.attn_kv_a_norm.weight", stage);
+    l->attn_sinks      = required_tensorf(m, "mtp.%u.attn_sinks.weight", stage);
+    l->attn_output_a   = required_tensorf(m, "mtp.%u.attn_output_a.weight", stage);
+    l->attn_output_b   = required_tensorf(m, "mtp.%u.attn_output_b.weight", stage);
+    l->hc_ffn_fn       = required_tensorf(m, "mtp.%u.hc_ffn_fn.weight", stage);
+    l->hc_ffn_scale    = required_tensorf(m, "mtp.%u.hc_ffn_scale.weight", stage);
+    l->hc_ffn_base     = required_tensorf(m, "mtp.%u.hc_ffn_base.weight", stage);
+    l->ffn_norm        = required_tensorf(m, "mtp.%u.ffn_norm.weight", stage);
+    l->ffn_gate_inp    = required_tensorf(m, "mtp.%u.ffn_gate_inp.weight", stage);
+    l->ffn_exp_probs_b = tensor_by_namef(m, "mtp.%u.exp_probs_b.bias", stage); /* the one lookup not made via required_tensorf; presumably NULL when absent - confirm */
+    l->ffn_gate_exps   = required_tensorf(m, "mtp.%u.ffn_gate_exps.weight", stage);
+    l->ffn_up_exps     = required_tensorf(m, "mtp.%u.ffn_up_exps.weight", stage);
+    l->ffn_down_exps   = required_tensorf(m, "mtp.%u.ffn_down_exps.weight", stage);
+    l->ffn_gate_shexp  = required_tensorf(m, "mtp.%u.ffn_gate_shexp.weight", stage);
+    l->ffn_up_shexp    = required_tensorf(m, "mtp.%u.ffn_up_shexp.weight", stage);
+    l->ffn_down_shexp  = required_tensorf(m, "mtp.%u.ffn_down_shexp.weight", stage);
+}
+
+static void mtp_layer_validate_layout(const ds4_layer_weights *l, bool require_exp_probs_b) {
     const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC;
     const uint64_t hc_mix_dim = 2u * DS4_N_HC + (uint64_t)DS4_N_HC * DS4_N_HC;
     const uint64_t q_dim = (uint64_t)DS4_N_HEAD * DS4_N_HEAD_DIM;
     const uint64_t out_low_dim = (uint64_t)DS4_N_OUT_GROUP * DS4_N_LORA_O;
-    const ds4_layer_weights *l = &w->block;
-
-    tensor_expect_layout(w->hc_head_base,  DS4_TENSOR_F32,  1, DS4_N_HC, 0, 0);
-    tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0);
-    tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32,  1, 1, 0, 0);
-    tensor_expect_layout(w->e_proj,        DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0);
-    tensor_expect_layout(w->h_proj,        DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0);
-    tensor_expect_layout(w->enorm,         DS4_TENSOR_F32,  1, DS4_N_EMBD, 0, 0);
-    tensor_expect_layout(w->hnorm,         DS4_TENSOR_F32,  1, DS4_N_EMBD, 0, 0);
-    tensor_expect_layout(w->norm,          DS4_TENSOR_F32,  1, DS4_N_EMBD, 0, 0);
 
     tensor_expect_plain_layout(l->hc_attn_fn, 2, hc_dim, hc_mix_dim, 0);
     tensor_expect_layout(l->hc_attn_scale,  DS4_TENSOR_F32,  1, 3, 0, 0);
@@ -3667,13 +3807,16 @@ static void mtp_weights_validate_layout(const ds4_mtp_weights *w) {
     tensor_expect_layout(l->attn_sinks,     DS4_TENSOR_F32,  1, DS4_N_HEAD, 0, 0);
     tensor_expect_layout(l->attn_output_a,  DS4_TENSOR_Q8_0, 2, DS4_N_HEAD_DIM * (DS4_N_HEAD / DS4_N_OUT_GROUP), out_low_dim, 0);
     tensor_expect_layout(l->attn_output_b,  DS4_TENSOR_Q8_0, 2, out_low_dim, DS4_N_EMBD, 0);
-
     tensor_expect_plain_layout(l->hc_ffn_fn, 2, hc_dim, hc_mix_dim, 0);
     tensor_expect_layout(l->hc_ffn_scale,   DS4_TENSOR_F32,  1, 3, 0, 0);
     tensor_expect_layout(l->hc_ffn_base,    DS4_TENSOR_F32,  1, hc_mix_dim, 0, 0);
     tensor_expect_layout(l->ffn_norm,       DS4_TENSOR_F32,  1, DS4_N_EMBD, 0, 0);
     tensor_expect_plain_layout(l->ffn_gate_inp, 2, DS4_N_EMBD, DS4_N_EXPERT, 0);
-    tensor_expect_layout(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0);
+    if (require_exp_probs_b) {
+        tensor_expect_layout(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0);
+    } else {
+        tensor_expect_optional(l->ffn_exp_probs_b, DS4_TENSOR_F32, 1, DS4_N_EXPERT, 0, 0);
+    }
     tensor_expect_routed_expert(l->ffn_gate_exps, 3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT);
     tensor_expect_routed_expert(l->ffn_up_exps,   3, DS4_N_EMBD, DS4_N_FF_EXP, DS4_N_EXPERT);
     tensor_expect_routed_expert(l->ffn_down_exps, 3, DS4_N_FF_EXP, DS4_N_EMBD, DS4_N_EXPERT);
@@ -3685,6 +3828,93 @@ static void mtp_weights_validate_layout(const ds4_mtp_weights *w) {
     tensor_expect_layout(l->ffn_down_shexp, DS4_TENSOR_Q8_0, 2, DS4_N_FF_EXP, DS4_N_EMBD, 0);
 }
 
+static void mtp_weights_validate_legacy_layout(const ds4_mtp_weights *w) { /* Check type and shape of every tensor bound for a legacy draft. */
+    const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC;
+
+    tensor_expect_layout(w->hc_head_base,  DS4_TENSOR_F32,  1, DS4_N_HC, 0, 0);
+    tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0);
+    tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32,  1, 1, 0, 0);
+    tensor_expect_layout(w->e_proj,        DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0);
+    tensor_expect_layout(w->h_proj,        DS4_TENSOR_Q8_0, 2, DS4_N_EMBD, DS4_N_EMBD, 0);
+    tensor_expect_layout(w->enorm,         DS4_TENSOR_F32,  1, DS4_N_EMBD, 0, 0);
+    tensor_expect_layout(w->hnorm,         DS4_TENSOR_F32,  1, DS4_N_EMBD, 0, 0);
+    tensor_expect_layout(w->norm,          DS4_TENSOR_F32,  1, DS4_N_EMBD, 0, 0);
+    mtp_layer_validate_layout(&w->block, true); /* true: exp_probs_b goes through tensor_expect_layout, not tensor_expect_optional */
+}
+
+static void mtp_weights_validate_dspark_layout(const ds4_mtp_weights *w) { /* Check the tensors and config bound for a DSpark draft (Markov or nonseq). */
+    const uint64_t hc_dim = (uint64_t)DS4_N_EMBD * DS4_N_HC;
+    const uint64_t main_in = 3u * DS4_N_EMBD; /* first dimension expected of main_proj */
+    const bool has_markov_head = w->kind == DS4_MTP_DRAFT_DSPARK; /* false for the nonseq kind */
+    if (w->dspark.block_size == 0 || w->dspark.block_size > 16) { /* 16 equals DS4_DSPARK_MAX_BLOCK_SIZE */
+        ds4_die("DSpark block_size must be in 1..16");
+    }
+
+    tensor_expect_layout(w->main_proj, DS4_TENSOR_Q8_0, 2, main_in, DS4_N_EMBD, 0);
+    tensor_expect_layout(w->main_norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0);
+    for (uint32_t s = 0; s < w->dspark.n_mtp_layers; s++) {
+        mtp_layer_validate_layout(&w->stage[s], false); /* false: exp_probs_b is checked as optional */
+    }
+    tensor_expect_layout(w->norm, DS4_TENSOR_F32, 1, DS4_N_EMBD, 0, 0);
+    tensor_expect_layout(w->hc_head_base,  DS4_TENSOR_F32,  1, DS4_N_HC, 0, 0);
+    tensor_expect_plain_layout(w->hc_head_fn, 2, hc_dim, DS4_N_HC, 0);
+    tensor_expect_layout(w->hc_head_scale, DS4_TENSOR_F32,  1, 1, 0, 0);
+    if (has_markov_head) {
+        const uint64_t conf_in = DS4_N_EMBD + (uint64_t)w->dspark.markov_rank;
+        if (w->dspark.markov_rank == 0) ds4_die("official DSpark Markov head has zero markov rank");
+        tensor_expect_plain_or_bf16_layout(w->markov_w1, 2, w->dspark.markov_rank, DS4_N_VOCAB, 0);
+        tensor_expect_plain_or_bf16_layout(w->markov_w2, 2, w->dspark.markov_rank, DS4_N_VOCAB, 0);
+        if (!w->confidence_proj) ds4_die("internal error: missing DSpark confidence projection");
+        if (w->confidence_proj->ndim == 1) { /* accepted as 1D [conf_in] or 2D [conf_in, 1] */
+            tensor_expect_plain_or_bf16_layout(w->confidence_proj, 1, conf_in, 0, 0);
+        } else {
+            tensor_expect_plain_or_bf16_layout(w->confidence_proj, 2, conf_in, 1, 0);
+        }
+    } else if (w->dspark.markov_rank != 0) { /* no Markov head bound: the configured rank must be 0 */
+        ds4_die("nonseq DSpark draft must declare deepseek4.dspark.markov_rank=0");
+    }
+}
+
+static void mtp_weights_bind_legacy(ds4_mtp_weights *w, const ds4_model *m) { /* Bind and validate the legacy draft tensors, all under "mtp.0.". */
+    w->kind = DS4_MTP_DRAFT_LEGACY;
+    w->hc_head_base  = required_tensor(m, "mtp.0.hc_head_base.weight");
+    w->hc_head_fn    = required_tensor(m, "mtp.0.hc_head_fn.weight");
+    w->hc_head_scale = required_tensor(m, "mtp.0.hc_head_scale.weight");
+    w->e_proj        = required_tensor(m, "mtp.0.e_proj.weight");
+    w->h_proj        = required_tensor(m, "mtp.0.h_proj.weight");
+    w->enorm         = required_tensor(m, "mtp.0.enorm.weight");
+    w->hnorm         = required_tensor(m, "mtp.0.hnorm.weight");
+    w->norm          = required_tensor(m, "mtp.0.norm.weight");
+    mtp_weights_bind_mtp_layer(&w->block, m, 0); /* the single legacy layer is stage 0 */
+    mtp_weights_validate_legacy_layout(w);
+}
+
+static void mtp_weights_bind_dspark(ds4_mtp_weights *w, const ds4_model *m) { /* Bind and validate a DSpark draft: config, per-stage layers, head, optional Markov head. */
+    w->kind = mtp_model_detect_kind(m); /* re-detected here: DSPARK or DSPARK_NONSEQ when reached via mtp_weights_bind */
+    dspark_config_apply_metadata(&w->dspark, m);
+    if (w->dspark.n_mtp_layers != DS4_DSPARK_MTP_LAYERS) { /* same invariant dspark_config_apply_metadata enforces on the stored value */
+        fprintf(stderr, "ds4: DSpark draft expects %u stages, GGUF has n_mtp_layers=%u\n",
+                DS4_DSPARK_MTP_LAYERS, w->dspark.n_mtp_layers);
+        exit(1);
+    }
+    w->main_proj = required_tensor(m, "mtp.0.main_proj.weight");
+    w->main_norm = required_tensor(m, "mtp.0.main_norm.weight");
+    for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) {
+        mtp_weights_bind_mtp_layer(&w->stage[s], m, s);
+    }
+    w->norm = required_tensor(m, "mtp.2.norm.weight"); /* final norm and head tensors come from stage 2 */
+    w->hc_head_base  = required_tensor(m, "mtp.2.hc_head_base.weight");
+    w->hc_head_fn    = required_tensor(m, "mtp.2.hc_head_fn.weight");
+    w->hc_head_scale = required_tensor(m, "mtp.2.hc_head_scale.weight");
+    if (w->kind == DS4_MTP_DRAFT_DSPARK) { /* the nonseq kind leaves these three pointers NULL */
+        w->markov_w1 = required_tensor(m, "mtp.2.markov_head.markov_w1.weight");
+        w->markov_w2 = required_tensor(m, "mtp.2.markov_head.markov_w2.weight");
+        w->confidence_proj = required_tensor(m, "mtp.2.confidence_head.proj.weight");
+    }
+    mtp_weights_validate_dspark_layout(w);
+}
+
+
 static bool ds4_shape_matches_metadata(
         const ds4_shape *s,
         uint32_t n_layer,
@@ -4433,45 +4663,34 @@ static DS4_MAYBE_UNUSED bool weights_model_map_output_spans(
     return model_map_span_vec_finish(spans);
 }
 
-static void mtp_weights_bind(ds4_mtp_weights *w, const ds4_model *m) {
-    memset(w, 0, sizeof(*w));
 
-    w->hc_head_base  = required_tensor(m, "mtp.0.hc_head_base.weight");
-    w->hc_head_fn    = required_tensor(m, "mtp.0.hc_head_fn.weight");
-    w->hc_head_scale = required_tensor(m, "mtp.0.hc_head_scale.weight");
-    w->e_proj        = required_tensor(m, "mtp.0.e_proj.weight");
-    w->h_proj        = required_tensor(m, "mtp.0.h_proj.weight");
-    w->enorm         = required_tensor(m, "mtp.0.enorm.weight");
-    w->hnorm         = required_tensor(m, "mtp.0.hnorm.weight");
-    w->norm          = required_tensor(m, "mtp.0.norm.weight");
+bool ds4_mtp_speculative_draft_ready(ds4_mtp_draft_kind kind) { /* legacy MTP and Markov DSpark only */
+    return !(kind != DS4_MTP_DRAFT_LEGACY && kind != DS4_MTP_DRAFT_DSPARK);
+}
 
-    ds4_layer_weights *l = &w->block;
-    l->hc_attn_fn      = required_tensor(m, "mtp.0.hc_attn_fn.weight");
-    l->hc_attn_scale   = required_tensor(m, "mtp.0.hc_attn_scale.weight");
-    l->hc_attn_base    = required_tensor(m, "mtp.0.hc_attn_base.weight");
-    l->attn_norm       = required_tensor(m, "mtp.0.attn_norm.weight");
-    l->attn_q_a        = required_tensor(m, "mtp.0.attn_q_a.weight");
-    l->attn_q_a_norm   = required_tensor(m, "mtp.0.attn_q_a_norm.weight");
-    l->attn_q_b        = required_tensor(m, "mtp.0.attn_q_b.weight");
-    l->attn_kv         = required_tensor(m, "mtp.0.attn_kv.weight");
-    l->attn_kv_a_norm  = required_tensor(m, "mtp.0.attn_kv_a_norm.weight");
-    l->attn_sinks      = required_tensor(m, "mtp.0.attn_sinks.weight");
-    l->attn_output_a   = required_tensor(m, "mtp.0.attn_output_a.weight");
-    l->attn_output_b   = required_tensor(m, "mtp.0.attn_output_b.weight");
-    l->hc_ffn_fn       = required_tensor(m, "mtp.0.hc_ffn_fn.weight");
-    l->hc_ffn_scale    = required_tensor(m, "mtp.0.hc_ffn_scale.weight");
-    l->hc_ffn_base     = required_tensor(m, "mtp.0.hc_ffn_base.weight");
-    l->ffn_norm        = required_tensor(m, "mtp.0.ffn_norm.weight");
-    l->ffn_gate_inp    = required_tensor(m, "mtp.0.ffn_gate_inp.weight");
-    l->ffn_exp_probs_b = required_tensor(m, "mtp.0.exp_probs_b.bias");
-    l->ffn_gate_exps   = required_tensor(m, "mtp.0.ffn_gate_exps.weight");
-    l->ffn_up_exps     = required_tensor(m, "mtp.0.ffn_up_exps.weight");
-    l->ffn_down_exps   = required_tensor(m, "mtp.0.ffn_down_exps.weight");
-    l->ffn_gate_shexp  = required_tensor(m, "mtp.0.ffn_gate_shexp.weight");
-    l->ffn_up_shexp    = required_tensor(m, "mtp.0.ffn_up_shexp.weight");
-    l->ffn_down_shexp  = required_tensor(m, "mtp.0.ffn_down_shexp.weight");
-
-    mtp_weights_validate_layout(w);
+bool ds4_mtp_draft_runtime_supported(ds4_backend backend, ds4_mtp_draft_kind kind) {
+    /* The CPU backend never drafts, and the kind itself must be draft-ready. */
+    if (backend == DS4_BACKEND_CPU || !ds4_mtp_speculative_draft_ready(kind)) return false;
+    /* Kinds outside the DSpark family are accepted on every remaining backend;
+     * DSpark-family kinds are accepted on Metal only. */
+    if (kind != DS4_MTP_DRAFT_DSPARK && kind != DS4_MTP_DRAFT_DSPARK_NONSEQ) return true;
+    return backend == DS4_BACKEND_METAL;
+}
+
+static void mtp_weights_bind(ds4_mtp_weights *w, const ds4_model *m) { /* bind whichever draft layout the GGUF carries */
+    memset(w, 0, sizeof(*w));
+    switch (mtp_model_detect_kind(m)) {
+    case DS4_MTP_DRAFT_DSPARK:
+    case DS4_MTP_DRAFT_DSPARK_NONSEQ:
+        mtp_weights_bind_dspark(w, m);
+        return;
+    case DS4_MTP_DRAFT_LEGACY:
+        mtp_weights_bind_legacy(w, m);
+        return;
+    default: break; /* unrecognized draft: report and exit below */
+    }
+    fprintf(stderr, "ds4: unsupported draft GGUF: need legacy mtp.0.e_proj, official DSpark mtp.0.main_proj + mtp.2.markov_head, or nonseq DSpark mtp.0.main_proj + deepseek4.dspark.markov_rank=0\n");
+    exit(1);
 }
 
 static void weights_free(ds4_weights *w) {
@@ -4592,6 +4811,115 @@ static void matvec_f16_serial(float *out, const ds4_model *m, const ds4_tensor *
     }
 }
 
+static inline float tensor_plain_value(const ds4_model *m, const ds4_tensor *w, uint64_t idx) {
+    /* Read element idx of a plain float tensor, widening F16/BF16 storage to F32. */
+    const void *base = tensor_data(m, w);
+    switch (w->type) {
+    case DS4_TENSOR_F32:
+        return ((const float *)base)[idx];
+    case DS4_TENSOR_F16:
+        return f16_to_f32(((const uint16_t *)base)[idx]);
+    case DS4_TENSOR_BF16:
+        return ds4_dspark_bf16_to_f32(((const uint16_t *)base)[idx]);
+    default:
+        break;
+    }
+    /* Any other storage type is reported through ds4_die; the 0.0f return follows it. */
+    ds4_die("expected an F16, F32, or BF16 tensor");
+    return 0.0f;
+}
+
+static void tensor_plain_row_to_f32(float *out,
+                                    const ds4_model *m,
+                                    const ds4_tensor *w,
+                                    uint64_t row) {
+    /* Expand one row (dim[0] elements) of a 2D plain tensor into out[]. */
+    if (w->ndim != 2) ds4_die("expected a 2D plain tensor");
+    const uint64_t width = w->dim[0], base = row * width;
+    for (uint64_t col = 0; col != width; col++) *out++ = tensor_plain_value(m, w, base + col);
+}
+
+typedef struct { /* shared arguments handed to dspark_markov_bias_worker */
+    float *logits;        /* updated in place with += per vocab row */
+    const void *weights;  /* raw data of markov_w2, read as `type` */
+    const float *latent;  /* rank floats expanded from the markov_w1 row of the previous token */
+    uint64_t rank;        /* elements per weights row and in latent */
+    uint32_t type;        /* DS4_TENSOR_F32, DS4_TENSOR_F16, or DS4_TENSOR_BF16 */
+} dspark_markov_bias_ctx;
+
+static void dspark_markov_bias_worker(void *vctx, uint64_t row0, uint64_t row1) {
+    /* logits[v] += dot(weights row v, latent) for v in [row0, row1); one loop per storage type. */
+    dspark_markov_bias_ctx *ctx = vctx;
+    const uint64_t rank = ctx->rank;
+    switch (ctx->type) {
+    case DS4_TENSOR_F32: {
+        const float *base = ctx->weights;
+        for (uint64_t v = row0; v < row1; v++) {
+            const float *row = base + v * rank;
+            float acc = 0.0f;
+            for (uint64_t k = 0; k < rank; k++) acc += row[k] * ctx->latent[k];
+            ctx->logits[v] += acc;
+        }
+        return;
+    }
+    case DS4_TENSOR_F16: {
+        const uint16_t *base = ctx->weights;
+        for (uint64_t v = row0; v < row1; v++) {
+            const uint16_t *row = base + v * rank;
+            float acc = 0.0f;
+            for (uint64_t k = 0; k < rank; k++) acc += f16_to_f32(row[k]) * ctx->latent[k];
+            ctx->logits[v] += acc;
+        }
+        return;
+    }
+    case DS4_TENSOR_BF16: {
+        const uint16_t *base = ctx->weights;
+        for (uint64_t v = row0; v < row1; v++) {
+            const uint16_t *row = base + v * rank;
+            float acc = 0.0f;
+            for (uint64_t k = 0; k < rank; k++) acc += ds4_dspark_bf16_to_f32(row[k]) * ctx->latent[k];
+            ctx->logits[v] += acc;
+        }
+        return;
+    }
+    default: break;
+    }
+    ds4_die("expected an F16, F32, or BF16 tensor");
+}
+
+static void dspark_apply_markov_bias(float *logits,
+                                     const ds4_model *m,
+                                     const ds4_mtp_weights *mtp,
+                                     int prev_token) {
+    /* No-op unless every input is present and prev_token is a valid vocabulary id. */
+    if (logits == NULL || m == NULL || mtp == NULL) return;
+    const ds4_tensor *w1 = mtp->markov_w1;
+    const ds4_tensor *w2 = mtp->markov_w2;
+    if (w1 == NULL || w2 == NULL) return;
+    if (prev_token < 0 || prev_token >= (int)DS4_N_VOCAB) return;
+
+    const uint64_t rank = mtp->dspark.markov_rank;
+    if (rank == 0) return;
+    const bool w1_ok = w1->ndim == 2 && w1->dim[0] == rank && w1->dim[1] == DS4_N_VOCAB;
+    const bool w2_ok = w2->ndim == 2 && w2->dim[0] == rank && w2->dim[1] == DS4_N_VOCAB;
+    if (!(w1_ok && w2_ok)) ds4_die("invalid DSpark Markov tensor layout");
+
+    /* The markov_w1 row selected by prev_token is staged in a fixed-size stack buffer. */
+    float latent[512];
+    if (rank > sizeof(latent) / sizeof(latent[0])) ds4_die("DSpark Markov rank exceeds local buffer");
+    tensor_plain_row_to_f32(latent, m, w1, (uint64_t)prev_token);
+
+    /* The worker adds markov_w2 row products into logits; row ranges presumably come from ds4_parallel_for_min_rows. */
+    dspark_markov_bias_ctx ctx = {
+        .logits = logits,
+        .weights = tensor_data(m, w2),
+        .latent = latent,
+        .rank = rank,
+        .type = w2->type,
+    };
+    ds4_parallel_for_min_rows(DS4_N_VOCAB, dspark_markov_bias_worker, &ctx, 1024);
+}
+
 typedef struct {
     float *out;
     const uint8_t *data;
@@ -8320,6 +8648,7 @@ typedef struct {
     uint32_t head_dim;
 } ds4_kv_cache;
 
+
 static uint32_t ds4_default_raw_cap(uint32_t ctx_size) {
     uint32_t raw_cap = DS4_N_SWA;
     if (raw_cap > ctx_size) raw_cap = ctx_size;
@@ -10421,6 +10750,18 @@ typedef struct {
     ds4_gpu_tensor *mtp_next_hc;
     ds4_gpu_tensor *mtp_raw_cache;
     uint32_t mtp_n_raw;
+
+    /* Optional DSpark block-draft state.  The target decoder captures mean-HC
+     * hidden rows at the configured target layers, then the drafter consumes
+     * that 3-row feature to propose a block of candidate tokens. */
+    ds4_gpu_tensor *dspark_main_hidden;
+    ds4_gpu_tensor *dspark_main_x;
+    ds4_gpu_tensor *dspark_verify_hidden;
+    ds4_gpu_tensor *dspark_verify_main_x;
+    ds4_gpu_tensor *dspark_mean_weights;
+    ds4_gpu_tensor *dspark_kv_cache[DS4_DSPARK_MTP_LAYERS];
+    uint32_t dspark_target_layer_ids[DS4_DSPARK_MTP_LAYERS];
+    uint32_t dspark_n_real;
     uint32_t prefill_cap;
     uint32_t raw_window;
 
@@ -10491,6 +10832,7 @@ typedef struct {
     bool ssd_streaming_cold;
     bool streaming_static_decode_map_current;
     bool mtp_enabled;
+    bool dspark_enabled;
     float *cpu_router_norm;
 } ds4_gpu_graph;
 
@@ -10530,7 +10872,6 @@ static void graph_power_note_decode_token(ds4_gpu_graph *g, double elapsed_sec)
         graph_power_update_avg(g->decode_token_avg_sec, elapsed_sec);
     graph_power_sleep(g->decode_token_avg_sec, g->power_percent);
 }
-
 /* Release every Metal tensor owned by the whole-model graph runtime. */
 static void metal_graph_free(ds4_gpu_graph *g) {
     ds4_gpu_tensor_free(g->directional_steering_dirs);
@@ -10575,6 +10916,14 @@ static void metal_graph_free(ds4_gpu_graph *g) {
     ds4_gpu_tensor_free(g->batch_next_hc);
     ds4_gpu_tensor_free(g->batch_cur_hc);
     ds4_gpu_tensor_free(g->prefill_tokens);
+    for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) {
+        ds4_gpu_tensor_free(g->dspark_kv_cache[s]);
+    }
+    ds4_gpu_tensor_free(g->dspark_mean_weights);
+    ds4_gpu_tensor_free(g->dspark_main_x);
+    ds4_gpu_tensor_free(g->dspark_verify_main_x);
+    ds4_gpu_tensor_free(g->dspark_verify_hidden);
+    ds4_gpu_tensor_free(g->dspark_main_hidden);
     ds4_gpu_tensor_free(g->logits);
     ds4_gpu_tensor_free(g->mtp_raw_cache);
     ds4_gpu_tensor_free(g->mtp_next_hc);
@@ -10956,14 +11305,23 @@ static bool metal_graph_ensure_batch_ffn_out(ds4_gpu_graph *g) {
  * weights are not copied here; tensors reference the mapped GGUF. */
 static bool metal_graph_alloc_raw_cap(
         ds4_gpu_graph *g,
-        const ds4_weights     *weights,
+        const ds4_weights       *weights,
         const ds4_layer_weights *layer,
-        uint32_t                raw_cap,
-        uint32_t                ctx_size,
-        uint32_t                prefill_cap,
-        bool                    enable_mtp) {
+        const ds4_mtp_weights   *mtp_weights,
+        uint32_t                 raw_cap,
+        uint32_t                 ctx_size,
+        uint32_t                 prefill_cap,
+        bool                     enable_mtp) {
     memset(g, 0, sizeof(*g));
     g->mtp_enabled = enable_mtp;
+    const bool enable_dspark =
+        enable_mtp && mtp_weights && mtp_weights->kind == DS4_MTP_DRAFT_DSPARK;
+    g->dspark_enabled = enable_dspark;
+    if (enable_dspark) {
+        for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) {
+            g->dspark_target_layer_ids[s] = mtp_weights->dspark.target_layer_ids[s];
+        }
+    }
     if (raw_cap == 0) raw_cap = 1;
     if (ctx_size == 0) ctx_size = raw_cap;
     if (prefill_cap == 0) prefill_cap = 1;
@@ -11169,6 +11527,30 @@ static bool metal_graph_alloc_raw_cap(
         g->spec_logits = ds4_gpu_tensor_alloc((uint64_t)16 * DS4_N_VOCAB * sizeof(float));
         g->mtp_n_raw = 0;
     }
+    if (enable_dspark) {
+        g->dspark_main_hidden = ds4_gpu_tensor_alloc(
+                (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float));
+        g->dspark_main_x = ds4_gpu_tensor_alloc((uint64_t)DS4_N_EMBD * sizeof(float));
+        g->dspark_verify_hidden = ds4_gpu_tensor_alloc(
+                (uint64_t)DS4_DSPARK_MAX_BLOCK_SIZE *
+                DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float));
+        g->dspark_verify_main_x = ds4_gpu_tensor_alloc(
+                (uint64_t)DS4_DSPARK_MAX_BLOCK_SIZE * DS4_N_EMBD * sizeof(float));
+        g->dspark_mean_weights = ds4_gpu_tensor_alloc((uint64_t)DS4_N_HC * sizeof(float));
+        for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) {
+            g->dspark_kv_cache[s] = metal_graph_alloc_kv_cache_tensor(
+                    managed_kv_cache,
+                    (uint64_t)(DS4_N_SWA + mtp_weights->dspark.block_size) *
+                    DS4_N_HEAD_DIM * sizeof(float));
+        }
+        if (g->dspark_mean_weights) {
+            state_init_ok = state_init_ok &&
+                metal_tensor_fill_f32(g->dspark_mean_weights,
+                                      1.0f / (float)DS4_N_HC,
+                                      DS4_N_HC);
+        }
+        g->dspark_n_real = 0;
+    }
 
     g->prefill_tokens = ds4_gpu_tensor_alloc(pc * sizeof(int32_t));
     g->batch_cur_hc = ds4_gpu_tensor_alloc(pc * hc_dim * sizeof(float));
@@ -11265,6 +11647,12 @@ static bool metal_graph_alloc_raw_cap(
                       g->mtp_eproj_hc && g->mtp_hnorm_hc && g->mtp_hproj_hc &&
                       g->mtp_input_hc && g->mtp_state_hc && g->mtp_next_hc &&
                       g->mtp_raw_cache && g->spec_logits)) &&
+                    (!enable_dspark ||
+                     (g->dspark_main_hidden && g->dspark_main_x &&
+                      g->dspark_verify_hidden && g->dspark_verify_main_x &&
+                      g->dspark_mean_weights &&
+                      g->dspark_kv_cache[0] && g->dspark_kv_cache[1] &&
+                      g->dspark_kv_cache[2])) &&
                     g->prefill_tokens &&
                     g->batch_cur_hc && g->batch_next_hc && g->batch_flat_hc &&
                     g->batch_hc_mix && g->batch_hc_split &&
@@ -11292,7 +11680,8 @@ static bool metal_graph_alloc(
         ds4_gpu_graph *g,
         const ds4_weights     *weights,
         const ds4_layer_weights *layer) {
-    return metal_graph_alloc_raw_cap(g, weights, layer, DS4_N_SWA, DS4_N_SWA, 1, false);
+    return metal_graph_alloc_raw_cap(g, weights, layer, NULL,
+                                     DS4_N_SWA, DS4_N_SWA, 1, false);
 }
 
 static bool metal_graph_install_model_spans(
@@ -16312,6 +16701,79 @@ static bool metal_graph_encode_output_head_mtp(
     return ok;
 }
 
+/* Batched MTP output head: turns the n_tokens rows of g->batch_cur_hc into
+ * n_tokens rows of vocab_dim logits at the start of g->spec_logits.  The HC
+ * head and output-norm weights come from mtp_model/mtp; the final vocabulary
+ * projection uses base_weights->output from base_model.
+ * Scratch space comes from views over the g->batch_* buffers.
+ *
+ * Returns false when an argument or a dereferenced weight tensor is NULL, when
+ * n_tokens is 0 or above g->prefill_cap, or when a view or GPU call fails.
+ * NOTE(review): n_tokens * vocab_dim is never compared with the size of
+ * g->spec_logits here; confirm ds4_gpu_tensor_view rejects oversized views. */
+static DS4_MAYBE_UNUSED bool metal_graph_encode_output_head_mtp_batch(
+        ds4_gpu_graph       *g,
+        const ds4_model       *base_model,
+        const ds4_weights     *base_weights,
+        const ds4_model       *mtp_model,
+        const ds4_mtp_weights *mtp,
+        uint32_t               n_tokens,
+        uint64_t               vocab_dim) {
+    if (!g || !base_model || !base_weights || !mtp_model || !mtp ||
+        !base_weights->output || !mtp->hc_head_scale || !mtp->hc_head_base ||
+        !mtp->norm || !g->spec_logits ||
+        n_tokens == 0 || n_tokens > g->prefill_cap) {
+        return false;
+    }
+
+    const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD;
+    /* Scratch views sized for exactly n_tokens rows of each batch buffer. */
+    ds4_gpu_tensor *output_pre = ds4_gpu_tensor_view(
+            g->batch_hc_mix, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float));
+    ds4_gpu_tensor *output_weights = ds4_gpu_tensor_view(
+            g->batch_hc_split, 0, (uint64_t)n_tokens * DS4_N_HC * sizeof(float));
+    ds4_gpu_tensor *output_embd = ds4_gpu_tensor_view(
+            g->batch_ffn_cur, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float));
+    ds4_gpu_tensor *output_norm = ds4_gpu_tensor_view(
+            g->batch_ffn_norm, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float));
+    ds4_gpu_tensor *logits = ds4_gpu_tensor_view(
+            g->spec_logits, 0, (uint64_t)n_tokens * vocab_dim * sizeof(float));
+    bool ok = output_pre && output_weights && output_embd && output_norm && logits;
+
+    /* HC head: every weight in this group is read from the MTP model. */
+    if (ok) ok = ds4_gpu_rms_norm_plain_rows_tensor(
+            g->batch_flat_hc, g->batch_cur_hc, (uint32_t)hc_dim, n_tokens,
+            DS4_RMS_EPS) != 0;
+    if (ok) ok = metal_graph_matmul_plain_tensor(
+            output_pre, mtp_model, mtp->hc_head_fn, hc_dim, DS4_N_HC,
+            g->batch_flat_hc, n_tokens);
+    if (ok) ok = ds4_gpu_output_hc_weights_tensor(
+            output_weights, output_pre, mtp_model->map, mtp_model->size,
+            mtp->hc_head_scale->abs_offset, mtp->hc_head_base->abs_offset,
+            DS4_N_HC, DS4_HC_EPS) != 0;
+    if (ok) ok = ds4_gpu_hc_weighted_sum_tensor(
+            output_embd, g->batch_cur_hc, output_weights,
+            DS4_N_EMBD, DS4_N_HC) != 0;
+
+    /* Output norm from the MTP model, then the base model's vocabulary
+     * projection writes the logits rows. */
+    if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(
+            output_norm, output_embd, mtp_model->map, mtp_model->size,
+            mtp->norm->abs_offset, DS4_N_EMBD, n_tokens, DS4_RMS_EPS) != 0;
+    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(
+            logits, base_model->map, base_model->size,
+            base_weights->output->abs_offset, DS4_N_EMBD, vocab_dim,
+            output_norm, n_tokens) != 0;
+
+    /* The views are released on success and failure alike. */
+    ds4_gpu_tensor_free(logits);
+    ds4_gpu_tensor_free(output_norm);
+    ds4_gpu_tensor_free(output_embd);
+    ds4_gpu_tensor_free(output_weights);
+    ds4_gpu_tensor_free(output_pre);
+    return ok;
+}
+
 /* =========================================================================
  * Metal Diagnostic Comparisons.
  * =========================================================================
@@ -16941,6 +17403,67 @@ static uint32_t metal_graph_token_split_after_layers(void) {
     return split_after_layers;
 }
 
+/* Decode-path DSpark capture: after layer il, write the mean-weighted HC sum of
+ * g->cur_hc into every dspark_main_hidden slot whose target layer id is il.
+ * No-op (true) when DSpark is disabled; false on a missing buffer or GPU error. */
+static bool metal_graph_capture_dspark_main_hidden(ds4_gpu_graph *g, uint32_t il) {
+    if (!g || !g->dspark_enabled) return true;
+    if (!g->cur_hc || !g->dspark_main_hidden || !g->dspark_mean_weights) return false;
+
+    const uint64_t stage_bytes = (uint64_t)DS4_N_EMBD * sizeof(float);
+    for (uint32_t s = 0; s < DS4_DSPARK_MTP_LAYERS; s++) {
+        if (g->dspark_target_layer_ids[s] != il) continue;
+        ds4_gpu_tensor *dst = ds4_gpu_tensor_view(
+                g->dspark_main_hidden, (uint64_t)s * stage_bytes, stage_bytes);
+        const bool ok = dst &&
+            ds4_gpu_hc_weighted_sum_tensor(dst, g->cur_hc, g->dspark_mean_weights,
+                                           DS4_N_EMBD, DS4_N_HC) != 0;
+        ds4_gpu_tensor_free(dst);
+        /* Keep scanning: like the batch variant, fill every matching slot. */
+        if (!ok) return false;
+    }
+    return true;
+}
+
+/* Batch form of the DSpark hidden capture: for every stage whose target layer
+ * is il, store the mean-weighted HC sum of each of the n_tokens rows of
+ * g->batch_cur_hc into g->dspark_verify_hidden at [row][stage].  Returns true
+ * when DSpark is disabled; false on a bad n_tokens, a missing buffer, or a
+ * failed view or GPU call. */
+static bool metal_graph_capture_dspark_batch_main_hidden(ds4_gpu_graph *g,
+                                                         uint32_t il,
+                                                         uint32_t n_tokens) {
+    if (!g || !g->dspark_enabled) return true;
+    if (n_tokens == 0 || n_tokens > DS4_DSPARK_MAX_BLOCK_SIZE) return false;
+    if (!g->batch_cur_hc || !g->dspark_verify_hidden || !g->dspark_mean_weights) {
+        return false;
+    }
+
+    /* Byte sizes of one HC input row, one captured slot, and one output row. */
+    const uint64_t in_row_bytes = (uint64_t)DS4_N_HC * DS4_N_EMBD * sizeof(float);
+    const uint64_t slot_bytes = (uint64_t)DS4_N_EMBD * sizeof(float);
+    const uint64_t out_row_bytes = (uint64_t)DS4_DSPARK_MTP_LAYERS * slot_bytes;
+
+    for (uint32_t stage = 0; stage < DS4_DSPARK_MTP_LAYERS; stage++) {
+        if (g->dspark_target_layer_ids[stage] != il) continue;
+        for (uint32_t t = 0; t < n_tokens; t++) {
+            ds4_gpu_tensor *hc_row = ds4_gpu_tensor_view(
+                    g->batch_cur_hc, (uint64_t)t * in_row_bytes, in_row_bytes);
+            ds4_gpu_tensor *slot = ds4_gpu_tensor_view(
+                    g->dspark_verify_hidden,
+                    (uint64_t)t * out_row_bytes + (uint64_t)stage * slot_bytes,
+                    slot_bytes);
+            bool captured = hc_row && slot;
+            if (captured) captured = ds4_gpu_hc_weighted_sum_tensor(
+                    slot, hc_row, g->dspark_mean_weights, DS4_N_EMBD, DS4_N_HC) != 0;
+            ds4_gpu_tensor_free(slot);
+            ds4_gpu_tensor_free(hc_row);
+            if (!captured) return false;
+        }
+    }
+    return true;
+}
+
 /* Encode a full single-token decode step on Metal.  This is the generation
  * hot path: update caches, run all layers, then produce logits. */
 static bool metal_graph_encode_token_raw_swa(
@@ -16990,6 +17513,7 @@ static bool metal_graph_encode_token_raw_swa(
         ds4_gpu_tensor *tmp = g->cur_hc;
         g->cur_hc = g->after_ffn_hc;
         g->after_ffn_hc = tmp;
+        if (ok) ok = metal_graph_capture_dspark_main_hidden(g, il);
         if (ok && allow_split_flush && split_after_layers != 0 && il + 1u == split_after_layers) {
             ok = ds4_gpu_flush_commands() != 0;
         }
@@ -19283,45 +19807,650 @@ static bool metal_graph_encode_layer_batch(
     return ok;
 }
 
-static bool metal_graph_eval_token_raw_swa_streaming(
-        ds4_gpu_graph *g,
-        const ds4_model       *model,
-        const ds4_weights     *weights,
-        int                    token,
-        uint32_t               pos,
-        float                 *logits) {
-    if (g->raw_cap == 0) {
-        fprintf(stderr, "ds4: Metal graph raw KV cache is not allocated\n");
+/* DSpark drafter input stage.  Projects the captured target hidden rows in
+ * g->dspark_main_hidden through mtp->main_proj and mtp->main_norm into
+ * g->dspark_main_x, then builds a draft block of block_size tokens
+ * (anchor_token followed by noise_token_id fillers) and uploads it through
+ * the prompt token/embedding helpers into g->prefill_tokens and
+ * g->batch_cur_hc.  Returns false on a bad argument or any GPU/upload failure. */
+static bool metal_graph_dspark_input_stage(
+        ds4_gpu_graph          *g,
+        const ds4_model        *target_model,
+        const ds4_weights      *target_weights,
+        const ds4_model        *dspark_model,
+        const ds4_mtp_weights  *mtp,
+        int                     anchor_token,
+        uint32_t                block_size) {
+    if (!g || !target_model || !target_weights || !dspark_model || !mtp ||
+        !mtp->main_proj || !mtp->main_norm || !g->prefill_tokens ||
+        !g->dspark_main_hidden || !g->dspark_main_x || !g->batch_cur_hc ||
+        block_size == 0 || block_size > g->prefill_cap) {
         return false;
     }
 
-    const bool profile = getenv("DS4_METAL_GRAPH_TOKEN_PROFILE") != NULL;
-    const bool throttle = graph_power_throttle_enabled(g);
-    const double t0 = (profile || throttle) ? now_sec() : 0.0;
-    const uint32_t raw_row = pos % g->raw_cap;
-    const uint32_t n_raw = metal_graph_raw_span_for_batch(g, pos, 1);
+    bool ok = ds4_gpu_begin_commands() != 0;
+    /* main_proj consumes all DS4_DSPARK_MTP_LAYERS captured rows at once. */
+    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(
+            g->dspark_main_x, dspark_model->map, dspark_model->size,
+            mtp->main_proj->abs_offset,
+            (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD, (uint64_t)DS4_N_EMBD,
+            g->dspark_main_hidden, 1) != 0;
+    if (ok) ok = ds4_gpu_rms_norm_weight_tensor(
+            g->dspark_main_x, g->dspark_main_x, dspark_model->map,
+            dspark_model->size, mtp->main_norm->abs_offset,
+            DS4_N_EMBD, DS4_RMS_EPS) != 0;
+    if (ds4_gpu_end_commands() == 0) ok = false;
+    if (!ok) return false;
 
-    const bool static_decode_map = metal_graph_stream_decode_static_map_enabled();
-    const bool static_map_state_cache =
-        static_decode_map && metal_graph_stream_decode_static_map_state_cache_enabled();
-    const bool batch_static_decode =
-        static_decode_map && metal_graph_stream_decode_layer_batch_enabled(g);
-    bool ok = true;
-    if (static_decode_map) {
-        if (!static_map_state_cache || !g->streaming_static_decode_map_current) {
-            ok = metal_graph_stream_map_decode_static_all(model, weights);
-            if (ok) g->streaming_static_decode_map_current = static_map_state_cache;
-        }
-    } else {
-        g->streaming_static_decode_map_current = false;
-        ok = metal_graph_stream_map_token(model, weights);
-    }
-    if (ok && !static_decode_map && DS4_N_LAYER > 0) {
-        metal_graph_stream_readahead_layer_decode(model, weights, 0);
+    /* Draft block: the anchor token followed by block_size - 1 noise tokens. */
+    token_vec draft_ids = {0};
+    token_vec_push(&draft_ids, anchor_token);
+    for (uint32_t i = 1; i < block_size; i++) {
+        token_vec_push(&draft_ids, (int)mtp->dspark.noise_token_id);
     }
-    if (ok) ok = ds4_gpu_begin_commands() != 0;
-    if (ok) {
-        ok = ds4_gpu_embed_token_hc_tensor(g->cur_hc,
+
+    ok = metal_graph_upload_prompt_tokens(g->prefill_tokens, &draft_ids, 0u, block_size);
+    if (ok) ok = metal_graph_upload_prompt_embeddings_hc(
+            g->batch_cur_hc, g->prefill_tokens, target_model, target_weights,
+            &draft_ids, 0u, block_size);
+    /* draft_ids is only needed for the uploads; release it on every path. */
+    token_vec_free(&draft_ids);
+    return ok;
+}
+
+static bool metal_graph_dspark_encode_attention(
+        ds4_gpu_graph           *g,
+        const ds4_model         *dspark_model,
+        const ds4_layer_weights *layer,
+        uint32_t                 stage,
+        uint32_t                 start_pos,
+        uint32_t                 n_tokens) {
+    if (!g || !dspark_model || !layer || stage >= DS4_DSPARK_MTP_LAYERS ||
+        n_tokens == 0 || n_tokens > g->prefill_cap ||
+        !g->dspark_kv_cache[stage] || !g->batch_cur_hc || !g->dspark_main_x) {
+        return false;
+    }
+    /* One drafter attention stage: g->batch_cur_hc in, g->batch_after_attn_hc out. */
+    const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD;
+    const uint64_t mix_hc = 2ull * DS4_N_HC + (uint64_t)DS4_N_HC * DS4_N_HC;
+    const uint64_t q_rank = layer->attn_q_a->dim[1];
+    const uint64_t q_dim = (uint64_t)DS4_N_HEAD * DS4_N_HEAD_DIM;
+    const uint32_t n_groups = DS4_N_OUT_GROUP;
+    const uint32_t group_heads = DS4_N_HEAD / n_groups;
+    const uint32_t group_dim = DS4_N_HEAD_DIM * group_heads;
+    const uint32_t rank = DS4_N_LORA_O;
+    const uint32_t raw_cap = DS4_N_SWA + n_tokens; /* NOTE(review): cache is allocated with DS4_N_SWA + block_size rows; assumes n_tokens <= block_size */
+    uint32_t n_real = g->dspark_n_real;
+    if (n_real + 1u + n_tokens > raw_cap) n_real = raw_cap - 1u - n_tokens;
+    /* n_real is clamped above so that n_real + 1 + n_tokens <= raw_cap. */
+    ds4_gpu_tensor *hc_mix_view = ds4_gpu_tensor_view(
+            g->batch_hc_mix, 0, (uint64_t)n_tokens * mix_hc * sizeof(float));
+    ds4_gpu_tensor *hc_split_view = ds4_gpu_tensor_view(
+            g->batch_hc_split, 0, (uint64_t)n_tokens * mix_hc * sizeof(float));
+    ds4_gpu_tensor *attn_cur_view = ds4_gpu_tensor_view(
+            g->batch_attn_cur, 0, (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float));
+    ds4_gpu_tensor *after_attn_hc_view = ds4_gpu_tensor_view(
+            g->batch_after_attn_hc, 0, (uint64_t)n_tokens * hc_dim * sizeof(float));
+    bool ok = hc_mix_view && hc_split_view && attn_cur_view && after_attn_hc_view;
+    /* HC rows: plain RMS norm, hc_attn_fn mix, split/weighted sum, then attn_norm. */
+    if (ok) ok = ds4_gpu_rms_norm_plain_rows_tensor(g->batch_flat_hc,
+                                                     g->batch_cur_hc,
+                                                     (uint32_t)hc_dim,
+                                                     n_tokens,
+                                                     DS4_RMS_EPS) != 0;
+    if (ok) ok = metal_graph_matmul_plain_tensor(hc_mix_view,
+                                                  dspark_model,
+                                                  layer->hc_attn_fn,
+                                                  hc_dim,
+                                                  mix_hc,
+                                                  g->batch_flat_hc,
+                                                  n_tokens);
+    if (ok) ok = ds4_gpu_hc_split_weighted_sum_tensor(attn_cur_view,
+                                                       hc_split_view,
+                                                       hc_mix_view,
+                                                       g->batch_cur_hc,
+                                                       dspark_model->map,
+                                                       dspark_model->size,
+                                                       layer->hc_attn_scale->abs_offset,
+                                                       layer->hc_attn_base->abs_offset,
+                                                       DS4_N_EMBD,
+                                                       DS4_N_HC,
+                                                       DS4_N_HC_SINKHORN_ITER,
+                                                       DS4_HC_EPS) != 0;
+    if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_attn_norm,
+                                                     g->batch_attn_cur,
+                                                     dspark_model->map,
+                                                     dspark_model->size,
+                                                     layer->attn_norm->abs_offset,
+                                                     DS4_N_EMBD,
+                                                     n_tokens,
+                                                     DS4_RMS_EPS) != 0;
+    /* Query: attn_q_a, attn_q_a_norm, attn_q_b, head RMS norm, RoPE with start_pos + 1. */
+    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_qr,
+                                             dspark_model->map,
+                                             dspark_model->size,
+                                             layer->attn_q_a->abs_offset,
+                                             DS4_N_EMBD,
+                                             q_rank,
+                                             g->batch_attn_norm,
+                                             n_tokens) != 0;
+    if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_qr_norm,
+                                                     g->batch_qr,
+                                                     dspark_model->map,
+                                                     dspark_model->size,
+                                                     layer->attn_q_a_norm->abs_offset,
+                                                     (uint32_t)q_rank,
+                                                     n_tokens,
+                                                     DS4_RMS_EPS) != 0;
+    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_q,
+                                             dspark_model->map,
+                                             dspark_model->size,
+                                             layer->attn_q_b->abs_offset,
+                                             q_rank,
+                                             q_dim,
+                                             g->batch_qr_norm,
+                                             n_tokens) != 0;
+    if (ok) ok = ds4_gpu_head_rms_norm_tensor(g->batch_q,
+                                               n_tokens,
+                                               DS4_N_HEAD,
+                                               DS4_N_HEAD_DIM,
+                                               DS4_RMS_EPS) != 0;
+    if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_q,
+                                           n_tokens,
+                                           DS4_N_HEAD,
+                                           DS4_N_HEAD_DIM,
+                                           DS4_N_ROT,
+                                           start_pos + 1u,
+                                           0u,
+                                           false,
+                                           DS4_ROPE_FREQ_BASE,
+                                           1.0f,
+                                           0.0f,
+                                           1.0f,
+                                           DS4_ROPE_YARN_BETA_FAST,
+                                           DS4_ROPE_YARN_BETA_SLOW) != 0;
+    /* KV of the n_tokens draft rows; stored with n_real + 1 (presumably the first cache row). */
+    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw,
+                                             dspark_model->map,
+                                             dspark_model->size,
+                                             layer->attn_kv->abs_offset,
+                                             DS4_N_EMBD,
+                                             DS4_N_HEAD_DIM,
+                                             g->batch_attn_norm,
+                                             n_tokens) != 0;
+    if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv,
+                                                     g->batch_kv_raw,
+                                                     dspark_model->map,
+                                                     dspark_model->size,
+                                                     layer->attn_kv_a_norm->abs_offset,
+                                                     DS4_N_HEAD_DIM,
+                                                     n_tokens,
+                                                     DS4_RMS_EPS) != 0;
+    if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv,
+                                           n_tokens,
+                                           DS4_N_HEAD_KV,
+                                           DS4_N_HEAD_DIM,
+                                           DS4_N_ROT,
+                                           start_pos + 1u,
+                                           0u,
+                                           false,
+                                           DS4_ROPE_FREQ_BASE,
+                                           1.0f,
+                                           0.0f,
+                                           1.0f,
+                                           DS4_ROPE_YARN_BETA_FAST,
+                                           DS4_ROPE_YARN_BETA_SLOW) != 0;
+    if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(g->batch_kv,
+                                                       n_tokens,
+                                                       DS4_N_HEAD_DIM,
+                                                       DS4_N_ROT) != 0;
+    if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor(g->dspark_kv_cache[stage],
+                                                    g->batch_kv,
+                                                    raw_cap,
+                                                    n_real + 1u,
+                                                    n_tokens,
+                                                    DS4_N_HEAD_DIM) != 0;
+    /* KV of the single g->dspark_main_x row (RoPE with start_pos); stored with n_real. */
+    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw,
+                                             dspark_model->map,
+                                             dspark_model->size,
+                                             layer->attn_kv->abs_offset,
+                                             DS4_N_EMBD,
+                                             DS4_N_HEAD_DIM,
+                                             g->dspark_main_x,
+                                             1) != 0;
+    if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv,
+                                                     g->batch_kv_raw,
+                                                     dspark_model->map,
+                                                     dspark_model->size,
+                                                     layer->attn_kv_a_norm->abs_offset,
+                                                     DS4_N_HEAD_DIM,
+                                                     1,
+                                                     DS4_RMS_EPS) != 0;
+    if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv,
+                                           1,
+                                           DS4_N_HEAD_KV,
+                                           DS4_N_HEAD_DIM,
+                                           DS4_N_ROT,
+                                           start_pos,
+                                           0u,
+                                           false,
+                                           DS4_ROPE_FREQ_BASE,
+                                           1.0f,
+                                           0.0f,
+                                           1.0f,
+                                           DS4_ROPE_YARN_BETA_FAST,
+                                           DS4_ROPE_YARN_BETA_SLOW) != 0;
+    if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(g->batch_kv,
+                                                       1,
+                                                       DS4_N_HEAD_DIM,
+                                                       DS4_N_ROT) != 0;
+    if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor(g->dspark_kv_cache[stage],
+                                                    g->batch_kv,
+                                                    raw_cap,
+                                                    n_real,
+                                                    1,
+                                                    DS4_N_HEAD_DIM) != 0;
+    /* Non-causal attention, passing n_real + 1 + n_tokens (presumably the cached row count). */
+    if (ok) ok = ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor(
+            g->batch_heads,
+            dspark_model->map,
+            dspark_model->size,
+            layer->attn_sinks->abs_offset,
+            g->batch_q,
+            g->dspark_kv_cache[stage],
+            n_tokens,
+            n_real + 1u + n_tokens,
+            raw_cap,
+            0u,
+            DS4_N_HEAD,
+            DS4_N_HEAD_DIM) != 0;
+    if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_heads,
+                                           n_tokens,
+                                           DS4_N_HEAD,
+                                           DS4_N_HEAD_DIM,
+                                           DS4_N_ROT,
+                                           start_pos + 1u,
+                                           0u,
+                                           true, /* NOTE(review): false in the other RoPE calls here; presumably the inverse rotation - confirm */
+                                           DS4_ROPE_FREQ_BASE,
+                                           1.0f,
+                                           0.0f,
+                                           1.0f,
+                                           DS4_ROPE_YARN_BETA_FAST,
+                                           DS4_ROPE_YARN_BETA_SLOW) != 0;
+    if (ok) ok = ds4_gpu_attention_output_q8_batch_tensor(g->batch_attn_out,
+                                                           g->batch_attn_low,
+                                                           g->batch_group_tmp,
+                                                           g->batch_low_tmp,
+                                                           dspark_model->map,
+                                                           dspark_model->size,
+                                                           layer->attn_output_a->abs_offset,
+                                                           layer->attn_output_b->abs_offset,
+                                                           group_dim,
+                                                           rank,
+                                                           n_groups,
+                                                           DS4_N_EMBD,
+                                                           g->batch_heads,
+                                                           n_tokens) != 0;
+    if (ok) ok = ds4_gpu_hc_expand_split_tensor(after_attn_hc_view,
+                                                 g->batch_attn_out,
+                                                 g->batch_cur_hc,
+                                                 hc_split_view,
+                                                 DS4_N_EMBD,
+                                                 DS4_N_HC) != 0;
+    /* The per-call views are released whether or not encoding succeeded. */
+    ds4_gpu_tensor_free(after_attn_hc_view);
+    ds4_gpu_tensor_free(attn_cur_view);
+    ds4_gpu_tensor_free(hc_split_view);
+    ds4_gpu_tensor_free(hc_mix_view);
+    return ok;
+}
+
+/* Refresh DSpark draft K/V rows from target-model hidden states.
+ *
+ * Projects n_tokens rows of main_hidden into main_x (main_proj matmul followed
+ * by the main_norm RMS norm), then for every DSpark stage derives K/V rows from
+ * main_x and stores them in g->dspark_kv_cache[stage] starting at row0. pos0 is
+ * forwarded to the rope kernel. With keep_last_hidden (and a non-NULL
+ * g->dspark_main_hidden) the last row of main_hidden is copied into it.
+ * Returns true on success; n_tokens == 0 is a successful no-op. */
+static bool metal_graph_dspark_refresh_main_rows(
+        ds4_gpu_graph          *g,
+        const ds4_model        *dspark_model,
+        const ds4_mtp_weights  *mtp,
+        ds4_gpu_tensor         *main_hidden,
+        ds4_gpu_tensor         *main_x,
+        uint32_t                pos0,
+        uint32_t                row0,
+        uint32_t                n_tokens,
+        bool                    keep_last_hidden) {
+    if (n_tokens == 0) return true;
+    if (!g || !g->dspark_enabled || !dspark_model || !mtp || !main_hidden ||
+        !main_x || !g->batch_kv_raw || !g->batch_kv ||
+        n_tokens > DS4_DSPARK_MAX_BLOCK_SIZE) {
+        return false;
+    }
+    /* Bounds check in 64 bits so a large row0 cannot wrap around. */
+    if ((uint64_t)row0 + n_tokens > (uint64_t)DS4_N_SWA + mtp->dspark.block_size) {
+        return false;
+    }
+
+    /* Remember whether the batch opened so it is only ended when it did. */
+    const bool commands_open = ds4_gpu_begin_commands() != 0;
+    bool ok = commands_open;
+    if (ok) ok = ds4_gpu_matmul_q8_0_tensor(
+            main_x, dspark_model->map, dspark_model->size,
+            mtp->main_proj->abs_offset, 3ull * DS4_N_EMBD, (uint64_t)DS4_N_EMBD,
+            main_hidden, n_tokens) != 0;
+    if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(
+            main_x, main_x, dspark_model->map, dspark_model->size,
+            mtp->main_norm->abs_offset, DS4_N_EMBD, n_tokens, DS4_RMS_EPS) != 0;
+
+    for (uint32_t stage = 0; ok && stage < mtp->dspark.n_mtp_layers; stage++) {
+        const ds4_layer_weights *layer = &mtp->stage[stage];
+        ok = ds4_gpu_matmul_q8_0_tensor(g->batch_kv_raw,
+                                        dspark_model->map,
+                                        dspark_model->size,
+                                        layer->attn_kv->abs_offset,
+                                        DS4_N_EMBD,
+                                        DS4_N_HEAD_DIM,
+                                        main_x,
+                                        n_tokens) != 0;
+        if (ok) ok = ds4_gpu_rms_norm_weight_rows_tensor(g->batch_kv,
+                                                         g->batch_kv_raw,
+                                                         dspark_model->map,
+                                                         dspark_model->size,
+                                                         layer->attn_kv_a_norm->abs_offset,
+                                                         DS4_N_HEAD_DIM,
+                                                         n_tokens,
+                                                         DS4_RMS_EPS) != 0;
+        if (ok) ok = ds4_gpu_rope_tail_tensor(g->batch_kv,
+                                              n_tokens,
+                                              DS4_N_HEAD_KV,
+                                              DS4_N_HEAD_DIM,
+                                              DS4_N_ROT,
+                                              pos0,
+                                              0u,
+                                              false,
+                                              DS4_ROPE_FREQ_BASE,
+                                              1.0f,
+                                              0.0f,
+                                              1.0f,
+                                              DS4_ROPE_YARN_BETA_FAST,
+                                              DS4_ROPE_YARN_BETA_SLOW) != 0;
+        if (ok) ok = ds4_gpu_dsv4_fp8_kv_quantize_tensor(
+                g->batch_kv, n_tokens, DS4_N_HEAD_DIM, DS4_N_ROT) != 0;
+        if (ok) ok = ds4_gpu_store_raw_kv_batch_tensor(
+                g->dspark_kv_cache[stage],
+                g->batch_kv,
+                DS4_N_SWA + mtp->dspark.block_size,
+                row0, n_tokens, DS4_N_HEAD_DIM) != 0;
+    }
+
+    if (ok && keep_last_hidden && g->dspark_main_hidden) {
+        const uint64_t stage_bytes = (uint64_t)DS4_N_EMBD * sizeof(float);
+        const uint64_t hidden_row_bytes =
+            (uint64_t)DS4_DSPARK_MTP_LAYERS * DS4_N_EMBD * sizeof(float);
+        const uint64_t src_row = (uint64_t)(n_tokens - 1u) * hidden_row_bytes;
+        for (uint32_t s = 0; ok && s < DS4_DSPARK_MTP_LAYERS; s++) {
+            ok = ds4_gpu_tensor_copy(g->dspark_main_hidden,
+                                     (uint64_t)s * stage_bytes,
+                                     main_hidden,
+                                     src_row + (uint64_t)s * stage_bytes,
+                                     stage_bytes) != 0;
+        }
+    }
+
+    if (commands_open && ds4_gpu_end_commands() == 0) ok = false;
+    if (!ok) (void)ds4_gpu_synchronize();
+    return ok;
+}
+
+/* Refresh DSpark K/V rows from the verify-pass buffers (g->dspark_verify_hidden
+ * and g->dspark_verify_main_x) with keep_last_hidden enabled. */
+static bool metal_graph_dspark_refresh_verified_rows(
+        ds4_gpu_graph          *g,
+        const ds4_model        *dspark_model,
+        const ds4_mtp_weights  *mtp,
+        uint32_t                row0,
+        uint32_t                pos0,
+        uint32_t                n_tokens) {
+    /* A NULL graph yields NULL buffers; validation is left to the callee. */
+    ds4_gpu_tensor *hidden = g ? g->dspark_verify_hidden : NULL;
+    ds4_gpu_tensor *main_x = g ? g->dspark_verify_main_x : NULL;
+    return metal_graph_dspark_refresh_main_rows(
+            g, dspark_model, mtp, hidden, main_x,
+            pos0, row0, n_tokens,
+            /* keep_last_hidden */ true);
+}
+
+/* Refresh the DSpark K/V row for a single position from g->dspark_main_hidden,
+ * using g->dspark_main_x as the projection output buffer. */
+static bool metal_graph_dspark_refresh_current_row(
+        ds4_gpu_graph          *g,
+        const ds4_model        *dspark_model,
+        const ds4_mtp_weights  *mtp,
+        uint32_t                row,
+        uint32_t                pos) {
+    ds4_gpu_tensor *hidden = NULL;
+    ds4_gpu_tensor *main_x = NULL;
+    if (g) { hidden = g->dspark_main_hidden; main_x = g->dspark_main_x; }
+    /* Exactly one row; keep_last_hidden is off for this path. */
+    return metal_graph_dspark_refresh_main_rows(
+            g, dspark_model, mtp, hidden, main_x,
+            pos, row, 1, false);
+}
+/* Encode the DSpark output head for n_tokens rows.
+ *
+ * Reads g->batch_cur_hc, computes HC head weights from the drafter's hc_head
+ * tensors, runs the HC weighted sum and the drafter's final norm, then projects
+ * through the target model's output matrix into g->spec_logits. Only encodes
+ * work; this function neither begins nor ends a command batch. */
+static bool metal_graph_encode_output_head_dspark_batch(
+        ds4_gpu_graph          *g,
+        const ds4_model        *target_model,
+        const ds4_weights      *target_weights,
+        const ds4_model        *dspark_model,
+        const ds4_mtp_weights  *mtp,
+        uint32_t                n_tokens) {
+    if (!g || !target_model || !target_weights || !dspark_model || !mtp ||
+        n_tokens == 0 || n_tokens > g->prefill_cap || !g->spec_logits) {
+        return false;
+    }
+
+    const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD;
+    /* Byte sizes of the n_tokens-row prefixes viewed below. */
+    const uint64_t hc_bytes    = (uint64_t)n_tokens * DS4_N_HC * sizeof(float);
+    const uint64_t embd_bytes  = (uint64_t)n_tokens * DS4_N_EMBD * sizeof(float);
+    const uint64_t vocab_bytes = (uint64_t)n_tokens * DS4_N_VOCAB * sizeof(float);
+
+    ds4_gpu_tensor *hc_pre     = ds4_gpu_tensor_view(g->batch_hc_mix, 0, hc_bytes);
+    ds4_gpu_tensor *hc_weights = ds4_gpu_tensor_view(g->batch_hc_split, 0, hc_bytes);
+    ds4_gpu_tensor *embd       = ds4_gpu_tensor_view(g->batch_ffn_cur, 0, embd_bytes);
+    ds4_gpu_tensor *normed     = ds4_gpu_tensor_view(g->batch_ffn_norm, 0, embd_bytes);
+    ds4_gpu_tensor *logits     = ds4_gpu_tensor_view(g->spec_logits, 0, vocab_bytes);
+    bool ok = hc_pre != NULL && hc_weights != NULL && embd != NULL &&
+              normed != NULL && logits != NULL;
+
+    if (ok) {
+        ok = ds4_gpu_rms_norm_plain_rows_tensor(
+                g->batch_flat_hc, g->batch_cur_hc,
+                (uint32_t)hc_dim, n_tokens, DS4_RMS_EPS) != 0;
+    }
+    if (ok) {
+        ok = metal_graph_matmul_plain_tensor(
+                hc_pre, dspark_model, mtp->hc_head_fn,
+                hc_dim, DS4_N_HC, g->batch_flat_hc, n_tokens);
+    }
+    if (ok) {
+        ok = ds4_gpu_output_hc_weights_tensor(
+                hc_weights, hc_pre,
+                dspark_model->map, dspark_model->size,
+                mtp->hc_head_scale->abs_offset, mtp->hc_head_base->abs_offset,
+                DS4_N_HC, DS4_HC_EPS) != 0;
+    }
+    if (ok) {
+        ok = ds4_gpu_hc_weighted_sum_tensor(
+                embd, g->batch_cur_hc, hc_weights, DS4_N_EMBD, DS4_N_HC) != 0;
+    }
+    if (ok) {
+        ok = ds4_gpu_rms_norm_weight_rows_tensor(
+                normed, embd, dspark_model->map, dspark_model->size,
+                mtp->norm->abs_offset, DS4_N_EMBD, n_tokens, DS4_RMS_EPS) != 0;
+    }
+    if (ok) {
+        /* Logits use the target model's output matrix, not the drafter's. */
+        ok = ds4_gpu_matmul_q8_0_tensor(
+                logits, target_model->map, target_model->size,
+                target_weights->output->abs_offset,
+                DS4_N_EMBD, DS4_N_VOCAB, normed, n_tokens) != 0;
+    }
+
+    /* Release the views on every path; one that failed to create is NULL here. */
+    ds4_gpu_tensor_free(logits);
+    ds4_gpu_tensor_free(normed);
+    ds4_gpu_tensor_free(embd);
+    ds4_gpu_tensor_free(hc_weights);
+    ds4_gpu_tensor_free(hc_pre);
+    return ok;
+}
+
+/* Draft one block of speculative tokens with the DSpark drafter.
+ *
+ * Encodes the input stage, each DSpark stage (attention + FFN) and the output
+ * head, then reads the logits back row by row, applies the Markov bias keyed
+ * on the previous token (anchor_token for row 0) and drafts the argmax.
+ *
+ * Outputs: drafts/draft_n (tokens and count); optional base_real_out
+ * (g->dspark_n_real used for this block), last_logits (biased final row) and
+ * all_draft_logits (every biased row). Returns true with *draft_n == 0 when
+ * the clamped block size is 0 or greater than 16. */
+static bool metal_graph_eval_dspark_draft_block(
+        ds4_gpu_graph          *g,
+        const ds4_model        *target_model,
+        const ds4_weights      *target_weights,
+        const ds4_model        *dspark_model,
+        const ds4_mtp_weights  *mtp,
+        int                     anchor_token,
+        uint32_t                pos,
+        uint32_t                max_tokens,
+        int                    *drafts,
+        int                    *draft_n,
+        uint32_t               *base_real_out,
+        float                  *last_logits,
+        float                  *all_draft_logits) {
+    if (draft_n) *draft_n = 0;
+    if (base_real_out) *base_real_out = 0;
+    if (!g || !target_model || !target_weights || !dspark_model || !mtp ||
+        !drafts || !draft_n || mtp->kind != DS4_MTP_DRAFT_DSPARK) {
+        return false;
+    }
+
+    /* Clamp the block to the caller's budget and the graph's batch capacity. */
+    uint32_t n_block = mtp->dspark.block_size;
+    if (n_block > max_tokens) n_block = max_tokens;
+    if (n_block > g->prefill_cap) n_block = g->prefill_cap;
+    if (n_block == 0 || n_block > 16) return true;
+    if (g->dspark_n_real >= DS4_N_SWA) g->dspark_n_real = 0;
+    if (base_real_out) *base_real_out = g->dspark_n_real;
+
+    bool commands_open = false;
+    bool ok = metal_graph_dspark_input_stage(
+            g, target_model, target_weights, dspark_model, mtp,
+            anchor_token, n_block);
+    if (ok) {
+        commands_open = ds4_gpu_begin_commands() != 0;
+        ok = commands_open;
+    }
+    for (uint32_t stage = 0; ok && stage < mtp->dspark.n_mtp_layers; stage++) {
+        const ds4_layer_weights *layer = &mtp->stage[stage];
+        ok = metal_graph_dspark_encode_attention(
+                g, dspark_model, layer, stage, pos, n_block);
+        if (!ok) break;
+        ok = metal_graph_encode_layer_ffn_batch(
+                g, dspark_model, layer, stage, pos + 1u, n_block);
+        if (!ok) break;
+        ds4_gpu_tensor *prev_cur = g->batch_cur_hc;
+        g->batch_cur_hc = g->batch_next_hc;
+        g->batch_next_hc = prev_cur;
+    }
+    if (ok) {
+        ok = metal_graph_encode_output_head_dspark_batch(
+                g, target_model, target_weights, dspark_model, mtp, n_block);
+    }
+    if (commands_open && ds4_gpu_end_commands() == 0) ok = false;
+    if (!ok) {
+        (void)ds4_gpu_synchronize();
+        return false;
+    }
+
+    /* Host-side pass: one vocab-sized row at a time through a scratch buffer. */
+    const uint64_t row_bytes = (uint64_t)DS4_N_VOCAB * sizeof(float);
+    float *row_logits = xmalloc((size_t)row_bytes);
+    int prev = anchor_token;
+    for (uint32_t i = 0; i < n_block; i++) {
+        if (ds4_gpu_tensor_read(g->spec_logits, (uint64_t)i * row_bytes,
+                                row_logits, row_bytes) == 0) {
+            ok = false;
+            break;
+        }
+        dspark_apply_markov_bias(row_logits, dspark_model, mtp, prev);
+        drafts[i] = sample_argmax(row_logits, DS4_N_VOCAB);
+        prev = drafts[i];
+        if (all_draft_logits) {
+            memcpy(all_draft_logits + (uint64_t)i * DS4_N_VOCAB,
+                   row_logits, (size_t)row_bytes);
+        }
+        if (last_logits && i + 1u == n_block) {
+            memcpy(last_logits, row_logits, (size_t)row_bytes);
+        }
+    }
+    free(row_logits);
+    if (!ok) return false;
+    *draft_n = (int)n_block;
+    return true;
+}
+
+static bool metal_graph_eval_token_raw_swa_streaming(
+        ds4_gpu_graph *g,
+        const ds4_model       *model,
+        const ds4_weights     *weights,
+        int                    token,
+        uint32_t               pos,
+        float                 *logits) {
+    if (g->raw_cap == 0) {
+        fprintf(stderr, "ds4: Metal graph raw KV cache is not allocated\n");
+        return false;
+    }
+
+    const bool profile = getenv("DS4_METAL_GRAPH_TOKEN_PROFILE") != NULL;
+    const bool throttle = graph_power_throttle_enabled(g);
+    const double t0 = (profile || throttle) ? now_sec() : 0.0;
+    const uint32_t raw_row = pos % g->raw_cap;
+    const uint32_t n_raw = metal_graph_raw_span_for_batch(g, pos, 1);
+
+    const bool static_decode_map = metal_graph_stream_decode_static_map_enabled();
+    const bool static_map_state_cache =
+        static_decode_map && metal_graph_stream_decode_static_map_state_cache_enabled();
+    const bool batch_static_decode =
+        static_decode_map && metal_graph_stream_decode_layer_batch_enabled(g);
+    bool ok = true;
+    if (static_decode_map) {
+        if (!static_map_state_cache || !g->streaming_static_decode_map_current) {
+            ok = metal_graph_stream_map_decode_static_all(model, weights);
+            if (ok) g->streaming_static_decode_map_current = static_map_state_cache;
+        }
+    } else {
+        g->streaming_static_decode_map_current = false;
+        ok = metal_graph_stream_map_token(model, weights);
+    }
+    if (ok && !static_decode_map && DS4_N_LAYER > 0) {
+        metal_graph_stream_readahead_layer_decode(model, weights, 0);
+    }
+    if (ok) ok = ds4_gpu_begin_commands() != 0;
+    if (ok) {
+        ok = ds4_gpu_embed_token_hc_tensor(g->cur_hc,
                                            model->map,
                                            model->size,
                                            weights->token_embd->abs_offset,
@@ -19457,7 +20586,8 @@ static bool metal_graph_eval_token_raw_swa_streaming(
     return ok;
 }
 
-/* Execute one Metal decode token and read back logits. */
+/* Execute one Metal decode token and optionally capture the target hidden states
+ * that DSpark uses as the draft model's cross-token input. */
 static bool metal_graph_eval_token_raw_swa(
         ds4_gpu_graph *g,
         const ds4_model       *model,
@@ -19474,7 +20604,8 @@ static bool metal_graph_eval_token_raw_swa(
     const double t0 = (profile || throttle) ? now_sec() : 0.0;
 
     bool ok = ds4_gpu_begin_commands() != 0;
-    if (ok) ok = metal_graph_encode_token_raw_swa(g, model, weights, token, pos, logits != NULL, true);
+    if (ok) ok = metal_graph_encode_token_raw_swa(g, model, weights, token, pos,
+                                                  logits != NULL, true);
     const double t_encoded = (profile || throttle) ? now_sec() : 0.0;
     if (ok) ok = ds4_gpu_end_commands() != 0;
     const double t_done = (profile || throttle) ? now_sec() : 0.0;
@@ -19502,6 +20633,8 @@ static bool metal_graph_eval_token_raw_swa(
     return ok;
 }
 
+/* Execute one Metal decode token and read back logits. */
+
 static bool metal_graph_streaming_decode_prefill_wide_default(
         const ds4_weights *weights) {
     return DS4_MODEL_VARIANT == DS4_VARIANT_FLASH &&
@@ -20272,6 +21405,7 @@ static bool metal_graph_reset_prefill_state(ds4_gpu_graph *g) {
     memset(g->layer_n_comp, 0, sizeof(g->layer_n_comp));
     memset(g->layer_n_index_comp, 0, sizeof(g->layer_n_index_comp));
     g->mtp_n_raw = 0;
+    g->dspark_n_real = 0;
     for (uint32_t il = 0; il < DS4_N_LAYER; il++) {
         const uint32_t ratio = ds4_layer_compress_ratio(il);
         if (ratio == 0) continue;
@@ -21150,6 +22284,7 @@ static bool metal_graph_verify_suffix_tops(
                                             il,
                                             start,
                                             n_tokens);
+        if (ok) ok = metal_graph_capture_dspark_batch_main_hidden(g, il, n_tokens);
     }
     if (ok) ok = ds4_gpu_end_commands() != 0;
     else (void)ds4_gpu_synchronize();
@@ -21510,7 +22645,8 @@ static int metal_graph_prompt_logits_test(
 
     ds4_gpu_graph g;
     bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0],
-                                        raw_cap, (uint32_t)ctx_size, (uint32_t)n_test, false);
+                                        NULL, raw_cap, (uint32_t)ctx_size,
+                                        (uint32_t)n_test, false);
     if (!ok) {
         metal_graph_free(&g);
         fprintf(stderr, "ds4: failed to initialize Metal graph prompt test runtime\n");
@@ -22602,6 +23738,174 @@ static float sample_rng_f32(uint64_t *state) {
     return (float)((x >> 40) & 0xffffffu) / 16777216.0f;
 }
 
+/* =========================================================================
+ * B2 Rejection Sampling for DSpark Speculative Decoding.
+ * =========================================================================
+ *
+ * Implements Chen et al. (2023) / Leviathan et al. (2023) rejection sampling.
+ * At temp=0: pure argmax matching (token-identical to non-speculative decode).
+ * At temp>0: samples follow the target model's distribution, provided each
+ * draft token was itself drawn from the temperature-scaled draft distribution.
+ * Softmax normalization is done in log space for stability on the 129K vocab.
+ * Activated via DS4_SPEC_TEMP env var; greedy path is the unchanged default.
+ */
+
+/* Numerically stable log-softmax:
+ * log_probs[i] = logits[i] - (max + log(sum_j exp(logits[j] - max))).
+ * Shifting by the maximum keeps every exponent <= 0 on the 129K vocab. */
+static void b2_log_softmax(const float *logits, uint32_t vocab, float *log_probs) {
+    /* Pass 1: running maximum of the logits. */
+    float peak = DS4_NEG_INF;
+    for (uint32_t i = 0; i < vocab; i++) peak = logits[i] > peak ? logits[i] : peak;
+
+    /* Pass 2: accumulate exp of the shifted logits. */
+    float total = 0.0f;
+    for (uint32_t i = 0; i < vocab; i++) total += expf(logits[i] - peak);
+
+    /* Pass 3: subtract the log-normalizer from each logit. */
+    const float log_z = peak + logf(total);
+    for (uint32_t i = 0; i < vocab; i++) log_probs[i] = logits[i] - log_z;
+}
+
+/* Inverse-CDF sample from a log-probability vector (last index if the sum falls short). */
+static int b2_sample_from_log_probs(const float *log_probs, uint32_t vocab,
+                                     uint64_t *rng) {
+    const float draw = sample_rng_f32(rng);
+    float acc = 0.0f;
+    for (uint32_t idx = 0; idx < vocab; idx++) {
+        acc += expf(log_probs[idx]);
+        if (draw <= acc) return (int)idx;
+    }
+    return (int)(vocab - 1);
+}
+
+/* Sample from the residual distribution max(0, target_prob - draft_prob).
+ * Both inputs are log-probability vectors of length vocab. When the residual
+ * mass is below 1e-10 the token is drawn from the target distribution instead. */
+static int b2_sample_residual(const float *log_target, const float *log_draft,
+                               uint32_t vocab, uint64_t *rng) {
+    /* The residual is built in probability space. One float per vocab entry
+     * (129280 * 4 = ~504 KB) is too large for the stack, so heap allocate. */
+    float *resid = xmalloc((size_t)vocab * sizeof(float));
+    float mass = 0.0f;
+    for (uint32_t i = 0; i < vocab; i++) {
+        const float diff = expf(log_target[i]) - expf(log_draft[i]);
+        resid[i] = diff > 0.0f ? diff : 0.0f;
+        mass += resid[i];
+    }
+
+    int picked;
+    if (mass < 1e-10f) {
+        /* Residual is effectively zero: fall back to the target distribution. */
+        picked = b2_sample_from_log_probs(log_target, vocab, rng);
+    } else {
+        /* CDF inversion over the unnormalized residual: scale the uniform
+         * draw by the total mass rather than normalizing every entry. */
+        const float threshold = sample_rng_f32(rng) * mass;
+        float running = 0.0f;
+        uint32_t i = 0;
+        picked = (int)(vocab - 1);
+        while (i < vocab) {
+            running += resid[i];
+            if (running >= threshold) {
+                picked = (int)i;
+                break;
+            }
+            i++;
+        }
+    }
+
+    free(resid);
+    return picked;
+}
+
+typedef struct {
+    int   n_accepted;          /* number of valid entries in accepted_tokens */
+    int   accepted_tokens[16]; /* accepted draft prefix, in draft order */
+    int   correction_token;    /* replacement token; meaningful only if has_correction */
+    bool  has_correction;      /* true when a draft token was rejected */
+} b2_result;
+
+/* B2 rejection sampling for DSpark speculative decode.
+ *
+ * draft_tokens:  [n_draft] token ids proposed by the drafter
+ * draft_logits:  [n_draft * vocab] raw logits from the drafter (post-markov-bias)
+ * target_logits: [n_draft * vocab] raw logits from the target model (batch verify)
+ * vocab:         vocabulary size (DS4_N_VOCAB)
+ * n_draft:       number of draft tokens (1..16; otherwise an empty result)
+ * temperature:   sampling temperature (<=0 falls back to argmax matching)
+ * rng:           pointer to xorshift64* state (mutated)
+ * Stops at the first rejection. With temp > 0, a draft id outside [0, vocab)
+ * is rejected and its correction is drawn from the target distribution. */
+static b2_result b2_rejection_sample(
+    const int   *draft_tokens,
+    const float *draft_logits,
+    const float *target_logits,
+    uint32_t     vocab,
+    int          n_draft,
+    float        temperature,
+    uint64_t    *rng)
+{
+    b2_result result;
+    memset(&result, 0, sizeof(result));
+    if (n_draft <= 0 || n_draft > 16) return result;
+
+    /* Greedy path: pure argmax matching (temp <= 0). */
+    if (temperature <= 0.0f) {
+        for (int i = 0; i < n_draft; i++) {
+            const float *t_logits = target_logits + (uint64_t)i * vocab;
+            const int targ = sample_argmax(t_logits, vocab);
+            if (targ == draft_tokens[i]) {
+                result.accepted_tokens[result.n_accepted++] = draft_tokens[i];
+            } else {
+                result.correction_token = targ;
+                result.has_correction = true;
+                break;
+            }
+        }
+        return result;
+    }
+
+    /* Stochastic path: rejection sampling (temp > 0). */
+    const float inv_temp = 1.0f / temperature;
+    float *log_draft  = xmalloc((size_t)vocab * sizeof(float));
+    float *log_target = xmalloc((size_t)vocab * sizeof(float));
+    float *scaled     = xmalloc((size_t)vocab * sizeof(float));
+    for (int i = 0; i < n_draft; i++) {
+        const float *d_logits = draft_logits  + (uint64_t)i * vocab;
+        const float *t_logits = target_logits + (uint64_t)i * vocab;
+        /* Apply temperature scaling before log-softmax. */
+        for (uint32_t v = 0; v < vocab; v++) scaled[v] = d_logits[v] * inv_temp;
+        b2_log_softmax(scaled, vocab, log_draft);
+        for (uint32_t v = 0; v < vocab; v++) scaled[v] = t_logits[v] * inv_temp;
+        b2_log_softmax(scaled, vocab, log_target);
+        const int token = draft_tokens[i];
+        if (token < 0 || (uint32_t)token >= vocab) {
+            /* Not a valid vocabulary id: never index with it; resample from target. */
+            result.correction_token = b2_sample_from_log_probs(log_target, vocab, rng);
+            result.has_correction = true;
+            break;
+        }
+        /* Accept with probability min(1, target_prob / draft_prob).
+         * In log space: accept if log(u) < min(0, log_ratio). */
+        const float log_ratio = log_target[token] - log_draft[token];
+        const float u = sample_rng_f32(rng);
+        if (logf(u + 1e-30f) < fminf(0.0f, log_ratio)) {
+            result.accepted_tokens[result.n_accepted++] = token;
+        } else {
+            /* Reject: sample correction from residual(target - draft). */
+            result.correction_token = b2_sample_residual(log_target, log_draft, vocab, rng);
+            result.has_correction = true;
+            break;
+        }
+    }
+
+    free(scaled);
+    free(log_target);
+    free(log_draft);
+    return result;
+}
+
 typedef struct {
     int id;
     float logit;
@@ -22956,7 +24260,8 @@ static int generate_metal_graph_raw_swa(
     }
     ds4_gpu_graph g;
     bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0],
-                                        raw_cap, (uint32_t)ctx_size, prefill_cap, false);
+                                        NULL, raw_cap, (uint32_t)ctx_size,
+                                        prefill_cap, false);
     if (!ok) {
         fprintf(stderr, "ds4: failed to allocate GPU graph runtime\n");
         return 1;
@@ -23268,6 +24573,13 @@ struct ds4_session {
     float *logits;
     float *mtp_logits;
     int mtp_draft_token;
+    int dspark_draft_tokens[16];
+    int dspark_draft_count;
+    uint32_t dspark_draft_base_real;
+    float *dspark_b2_draft_logits;   /* [block_size * DS4_N_VOCAB] post-markov-bias logits for B2 */
+    uint64_t dspark_b2_rng;          /* xorshift64* state for B2 rejection sampling (persisted across calls) */
+    int dspark_prev_accepted;        /* previous cycle accepted count (for adaptive block size) */
+    int dspark_prev_drafted;         /* previous cycle drafted count */
     uint64_t mtp_probe_total;
     uint64_t mtp_probe_hit;
     ds4_session_progress_fn progress;
@@ -24040,12 +25352,18 @@ bool ds4_engine_has_output_head(ds4_engine *e) {
     return e && weights_have_output_head(&e->weights);
 }
 
+ds4_mtp_draft_kind ds4_engine_mtp_draft_kind(ds4_engine *e) {
+    return (!e || !e->mtp_ready) ? DS4_MTP_DRAFT_NONE : e->mtp_weights.kind; /* NONE unless mtp_ready */
+}
+
 bool ds4_engine_has_mtp(ds4_engine *e) {
     return e && e->backend != DS4_BACKEND_CPU &&
            e->distributed.role == DS4_DISTRIBUTED_NONE &&
-           e->mtp_ready;
+           e->mtp_ready &&
+           ds4_mtp_draft_runtime_supported(e->backend, e->mtp_weights.kind);
 }
 
+
 int ds4_engine_mtp_draft_tokens(ds4_engine *e) {
     return ds4_engine_has_mtp(e) ? e->mtp_draft_tokens : 0;
 }
@@ -24986,13 +26304,381 @@ static char *imatrix_trim_block(char *p, char *end) {
     *end = '\0';
     return p;
 }
-#endif
 
-int ds4_engine_collect_imatrix(ds4_engine *e,
-                               const char *dataset_path,
-                               const char *output_path,
-                               int ctx_size,
-                               int max_prompts,
+static bool dspark_target_cache_join_path(char *dst, size_t dst_size, const char *dir, const char *name) {
+    /* Format "dir/name" into dst; false on a missing argument, an snprintf error, or truncation. */
+    const bool have_args = dst && dst_size != 0 && dir && name;
+    const int written = have_args ? snprintf(dst, dst_size, "%s/%s", dir, name) : -1;
+    return written > 0 && (size_t)written < dst_size;
+}
+
+static bool dspark_target_cache_output_dir_prepare(const char *path) {
+    /* Succeeds only if path is an existing empty directory or can be created as a new one. */
+    struct stat info;
+    if (stat(path, &info) != 0) {
+        /* Only a missing path may be created; any other stat failure is reported. */
+        if (errno != ENOENT) {
+            fprintf(stderr, "ds4: failed to stat DSpark target cache output dir %s: %s\n",
+                    path, strerror(errno));
+            return false;
+        }
+        if (mkdir(path, 0777) != 0) {
+            fprintf(stderr, "ds4: failed to create DSpark target cache output dir %s: %s\n",
+                    path, strerror(errno));
+            return false;
+        }
+        return true;
+    }
+    if (!S_ISDIR(info.st_mode)) {
+        fprintf(stderr, "ds4: DSpark target cache output path is not a directory: %s\n", path);
+        return false;
+    }
+    DIR *handle = opendir(path);
+    if (!handle) {
+        fprintf(stderr, "ds4: failed to inspect DSpark target cache output dir %s: %s\n",
+                path, strerror(errno));
+        return false;
+    }
+    /* Scan for any entry other than "." and ".."; stop reading at the first one found. */
+    bool has_entries = false;
+    struct dirent *entry = NULL;
+    while (!has_entries && (entry = readdir(handle)) != NULL) {
+        has_entries = strcmp(entry->d_name, ".") != 0 && strcmp(entry->d_name, "..") != 0;
+    }
+    closedir(handle);
+    if (has_entries) {
+        fprintf(stderr, "ds4: DSpark target cache output dir is not empty: %s\n", path);
+        return false;
+    }
+    return true;
+}
+
+static bool dspark_target_cache_file_pos(FILE *fp, uint64_t *out) {
+    /* Report the stream's current byte offset through *out; false on a NULL argument or ftello failure. */
+    const off_t offset = (fp && out) ? ftello(fp) : (off_t)-1;
+    const bool valid = offset >= 0;
+    if (valid) *out = (uint64_t)offset;
+    return valid;
+}
+
+static bool dspark_target_cache_write_all(FILE *fp, const void *ptr, size_t bytes, const char *what) {
+    /* Write exactly `bytes` bytes from ptr; a zero-length request succeeds without touching fp. */
+    if (bytes == 0 || fwrite(ptr, 1, bytes, fp) == bytes) return true;
+    /* Short write: name the payload in the diagnostic, defaulting to "payload". */
+    const char *label = what ? what : "payload";
+    fprintf(stderr, "ds4: failed to write DSpark target cache %s: %s\n",
+            label, strerror(errno));
+    return false;
+}
+
+static void dspark_target_cache_store_le32(uint8_t *p, uint32_t v) {
+    /* Serialize v into p[0..3], least-significant byte first. */
+    for (uint32_t i = 0; i < 4; i++) {
+        p[i] = (uint8_t)((v >> (8u * i)) & 0xffu);
+    }
+}
+
+static void dspark_target_cache_store_le64(uint8_t *p, uint64_t v) {
+    for (uint8_t *q = p; q != p + 8; q++, v >>= 8) *q = (uint8_t)(v & 0xffu); /* p[0..7], low byte first */
+}
+
+static bool dspark_target_cache_write_index_record(FILE *fp,
+                                                   uint64_t sample_id,
+                                                   uint32_t shard_id,
+                                                   uint32_t seq_len,
+                                                   uint64_t input_ids_offset,
+                                                   uint64_t attention_mask_offset,
+                                                   uint64_t loss_mask_offset,
+                                                   uint64_t target_hidden_states_offset,
+                                                   uint64_t target_last_hidden_states_offset) {
+    /* 56-byte little-endian record: u64 sample_id, u32 shard_id, u32 seq_len, then five u64 offsets. */
+    const uint64_t offsets[5] = {input_ids_offset, attention_mask_offset, loss_mask_offset,
+                                 target_hidden_states_offset, target_last_hidden_states_offset};
+    uint8_t rec[56];
+    dspark_target_cache_store_le64(rec + 0, sample_id);
+    dspark_target_cache_store_le32(rec + 8, shard_id);
+    dspark_target_cache_store_le32(rec + 12, seq_len);
+    /* The five offsets occupy bytes 16..55 in parameter order. */
+    for (uint32_t i = 0; i < 5; i++) dspark_target_cache_store_le64(rec + 16 + 8u * i, offsets[i]);
+    return dspark_target_cache_write_all(fp, rec, sizeof(rec), "samples.idx record");
+}
+
+static bool dspark_target_cache_write_json_string(FILE *fp, const char *s) {
+    /* Emit s (NULL is treated as "") as a double-quoted JSON string literal. */
+    const unsigned char *cur = (const unsigned char *)(s ? s : "");
+    if (fputc('"', fp) == EOF) return false;
+    for (; *cur; cur++) {
+        const unsigned char c = *cur;
+        bool wrote;
+        if (c == '\\' || c == '"') {
+            /* Backslash and quote are escaped by prefixing a backslash. */
+            wrote = fprintf(fp, "\\%c", c) >= 0;
+        } else if (c == '\n') {
+            wrote = fputs("\\n", fp) != EOF;
+        } else if (c == '\r') {
+            wrote = fputs("\\r", fp) != EOF;
+        } else if (c == '\t') {
+            wrote = fputs("\\t", fp) != EOF;
+        } else if (c < 0x20) {
+            /* Remaining control bytes use the \u00XX form. */
+            wrote = fprintf(fp, "\\u%04x", (unsigned)c) >= 0;
+        } else {
+            /* Everything else, including bytes >= 0x80, is copied through unchanged. */
+            wrote = fputc((int)c, fp) != EOF;
+        }
+        /* Stop at the first stream error. */
+        if (!wrote) return false;
+    }
+    return fputc('"', fp) != EOF;
+}
+
+static const char *dspark_target_cache_quant_family(const ds4_weights *weights) {
+    /* Classify the routed-expert quantization from layer 0's gate/up/down expert tensor types. */
+    if (!weights || DS4_N_LAYER == 0) return "unknown";
+    const ds4_layer_weights *l0 = &weights->layer[0];
+    if (!(l0->ffn_gate_exps && l0->ffn_up_exps && l0->ffn_down_exps)) return "unknown";
+    const bool all_q4_k = l0->ffn_gate_exps->type == DS4_TENSOR_Q4_K &&
+                          l0->ffn_up_exps->type == DS4_TENSOR_Q4_K &&
+                          l0->ffn_down_exps->type == DS4_TENSOR_Q4_K;
+    if (all_q4_k) return "q4_k_routed_experts";
+    const bool iq2_gate_up_q2_down = l0->ffn_gate_exps->type == DS4_TENSOR_IQ2_XXS &&
+                                     l0->ffn_up_exps->type == DS4_TENSOR_IQ2_XXS &&
+                                     l0->ffn_down_exps->type == DS4_TENSOR_Q2_K;
+    if (iq2_gate_up_q2_down) return "iq2_xxs_gate_up_q2_k_down_routed_experts";
+    /* Any other combination of expert tensor types. */
+    return "mixed_routed_experts";
+}
+
+static bool dspark_target_cache_write_tensor_type_counts(FILE *fp, const ds4_model *model) {
+    /* Write a JSON object mapping tensor type name -> number of model tensors of that type.
+     * Type ids outside the counts table pool under "unknown"; a NULL model yields {}. False on any write error. */
+    uint64_t counts[32] = {0};
+    uint64_t unknown = 0;
+    const uint32_t n_types = (uint32_t)(sizeof(counts) / sizeof(counts[0]));
+    if (model) {
+        for (uint64_t i = 0; i < model->n_tensors; i++) {
+            const uint32_t type = model->tensors[i].type;
+            if (type < n_types) counts[type]++;
+            else unknown++;
+        }
+    }
+    if (fprintf(fp, "{") < 0) return false;
+    bool first = true;
+    for (uint32_t type = 0; type < n_types; type++) {
+        if (!counts[type]) continue;
+        if (!first && fprintf(fp, ", ") < 0) return false;
+        first = false;
+        /* Emit the key through the JSON string escaper, like every other manifest string. */
+        if (!dspark_target_cache_write_json_string(fp, tensor_type_name(type)) ||
+            fprintf(fp, ": %llu", (unsigned long long)counts[type]) < 0) {
+            return false;
+        }
+    }
+    if (unknown) {
+        if (!first && fprintf(fp, ", ") < 0) return false;
+        if (fprintf(fp, "\"unknown\": %llu", (unsigned long long)unknown) < 0) return false;
+    }
+    return fprintf(fp, "}") >= 0;
+}
+
+static bool dspark_target_cache_write_manifest(const char *output_dir,
+                                               const char *dataset_path,
+                                               const char *target_model_name_or_path,
+                                               const char *chat_template,
+                                               const ds4_model *model,
+                                               const ds4_weights *weights,
+                                               const ds4_dspark_config *cfg,
+                                               uint64_t num_samples,
+                                               uint64_t num_tokens) {
+    /* Write <output_dir>/manifest.json describing the exported cache: sources, shapes, dtypes and shards.
+     * Returns false if the path does not fit, the file cannot be created, or any write or the close fails. */
+    char path[PATH_MAX];
+    if (!dspark_target_cache_join_path(path, sizeof(path), output_dir, "manifest.json")) {
+        fprintf(stderr, "ds4: DSpark target cache manifest path is too long\n");
+        return false;
+    }
+    FILE *fp = fopen(path, "wb");
+    if (!fp) {
+        fprintf(stderr, "ds4: failed to create DSpark target cache manifest %s: %s\n",
+                path, strerror(errno));
+        return false;
+    }
+    const char *source_gguf_path = (model && model->path && model->path[0]) ? model->path : DS4_MODEL_SHAPE_NAME;
+    const char *template_name = (chat_template && chat_template[0]) ?
+                                chat_template : "ds4_tokenize_rendered_chat"; /* default when no template name is given */
+    bool ok = true;
+    ok = ok && fprintf(fp, "{\n") >= 0;
+    ok = ok && fprintf(fp, "  \"version\": 2,\n") >= 0;
+    ok = ok && fprintf(fp, "  \"format\": \"deepspec-target-cache\",\n") >= 0;
+    ok = ok && fprintf(fp, "  \"producer\": \"ds4\",\n") >= 0;
+    ok = ok && fprintf(fp, "  \"producer_commit\": ") >= 0;
+    ok = ok && dspark_target_cache_write_json_string(fp, DS4_GIT_COMMIT);
+    ok = ok && fprintf(fp, ",\n  \"source_dataset_path\": ") >= 0;
+    ok = ok && dspark_target_cache_write_json_string(fp, dataset_path);
+    ok = ok && fprintf(fp, ",\n  \"source_gguf_path\": ") >= 0;
+    ok = ok && dspark_target_cache_write_json_string(fp, source_gguf_path);
+    ok = ok && fprintf(fp, ",\n  \"target_model_name_or_path\": ") >= 0;
+    ok = ok && dspark_target_cache_write_json_string(fp, target_model_name_or_path);
+    ok = ok && fprintf(fp, ",\n  \"model_shape\": ") >= 0;
+    ok = ok && dspark_target_cache_write_json_string(fp, DS4_MODEL_SHAPE_NAME);
+    ok = ok && fprintf(fp, ",\n  \"quantization_family\": ") >= 0;
+    ok = ok && dspark_target_cache_write_json_string(fp, dspark_target_cache_quant_family(weights));
+    ok = ok && fprintf(fp, ",\n  \"num_samples\": %llu,\n", (unsigned long long)num_samples) >= 0;
+    ok = ok && fprintf(fp, "  \"num_tokens\": %llu,\n", (unsigned long long)num_tokens) >= 0;
+    ok = ok && fprintf(fp, "  \"num_shards\": %u,\n", num_samples ? 1u : 0u) >= 0;
+    ok = ok && fprintf(fp, "  \"target_layer_ids\": [") >= 0;
+    for (uint32_t i = 0; ok && i < cfg->n_mtp_layers && i < 3; i++) /* list only the ids in use, so the array length matches target_hidden_layers */
+        ok = fprintf(fp, "%s%u", i ? ", " : "", cfg->target_layer_ids[i]) >= 0;
+    ok = ok && fprintf(fp, "],\n") >= 0;
+    ok = ok && fprintf(fp, "  \"hidden_size\": %u,\n", DS4_N_EMBD) >= 0;
+    ok = ok && fprintf(fp, "  \"target_hidden_size\": %u,\n", DS4_N_EMBD) >= 0;
+    ok = ok && fprintf(fp, "  \"target_hidden_layers\": %u,\n", cfg->n_mtp_layers) >= 0;
+    ok = ok && fprintf(fp, "  \"hidden_dtype\": \"bfloat16\",\n") >= 0;
+    ok = ok && fprintf(fp, "  \"token_dtype\": \"int32\",\n") >= 0;
+    ok = ok && fprintf(fp, "  \"mask_dtype\": \"uint8\",\n") >= 0;
+    ok = ok && fprintf(fp, "  \"index_record_size\": 56,\n") >= 0;
+    ok = ok && fprintf(fp, "  \"input_convention\": {\n") >= 0;
+    ok = ok && fprintf(fp, "    \"tokenization\": \"ds4_tokenize_rendered_chat\",\n") >= 0;
+    ok = ok && fprintf(fp, "    \"chat_template\": ") >= 0;
+    ok = ok && dspark_target_cache_write_json_string(fp, template_name);
+    ok = ok && fprintf(fp, ",\n    \"sample_split_marker\": \"===== DS4_IMATRIX_PROMPT\",\n") >= 0;
+    ok = ok && fprintf(fp, "    \"loss_mask\": \"1 for every exported prompt token\"\n") >= 0;
+    ok = ok && fprintf(fp, "  },\n") >= 0;
+    ok = ok && fprintf(fp, "  \"hidden_convention\": {\n") >= 0;
+    ok = ok && fprintf(fp, "    \"target_hidden_states\": \"bfloat16 mean over DS4 HC heads after each target layer; row-major [seq_len, target_hidden_layers, hidden_size]\",\n") >= 0;
+    ok = ok && fprintf(fp, "    \"target_last_hidden_states\": \"bfloat16 output-HC projection plus final RMSNorm; row-major [seq_len, hidden_size]\"\n") >= 0;
+    ok = ok && fprintf(fp, "  },\n") >= 0;
+    ok = ok && fprintf(fp, "  \"gguf_tensor_type_counts\": ") >= 0;
+    ok = ok && dspark_target_cache_write_tensor_type_counts(fp, model);
+    ok = ok && fprintf(fp, ",\n  \"shards\": [") >= 0;
+    if (num_samples) {
+        ok = ok && fprintf(fp, "\n    {\n      \"file_name\": \"shard-00000.bin\",\n      \"shard_id\": 0\n    }\n  ") >= 0;
+    }
+    ok = ok && fprintf(fp, "]\n}\n") >= 0;
+    if (fclose(fp) != 0) ok = false; /* a failed close also fails the manifest */
+    if (!ok) fprintf(stderr, "ds4: failed to write DSpark target cache manifest %s\n", path);
+    return ok;
+}
+
+static uint32_t dspark_target_cache_layer_slot(const ds4_dspark_config *cfg, uint32_t layer_id) {
+    const uint32_t limit = cfg->n_mtp_layers < 3 ? cfg->n_mtp_layers : 3; /* never looks past the first three ids */
+    for (uint32_t slot = 0; slot < limit; slot++)
+        if (layer_id == cfg->target_layer_ids[slot]) return slot;
+    return UINT32_MAX; /* layer_id is not a target layer */
+}
+
+static void dspark_target_cache_hc_mean_bf16(uint16_t *out,
+                                             const float *hc_rows,
+                                             uint32_t rows,
+                                             uint32_t slot,
+                                             uint32_t n_slots) {
+    /* Per row: average the DS4_N_HC vectors element-wise and store the bf16 result at out[row][slot][*]. */
+    const uint64_t row_stride = (uint64_t)DS4_N_HC * DS4_N_EMBD;
+    const float scale = 1.0f / (float)DS4_N_HC;
+    for (uint32_t r = 0; r < rows; r++) {
+        const float *src = hc_rows + (uint64_t)r * row_stride;
+        uint16_t *dst = out + ((uint64_t)r * n_slots + slot) * DS4_N_EMBD;
+        for (uint32_t col = 0; col < DS4_N_EMBD; col++) {
+            float acc = 0.0f;
+            /* Accumulate in ascending head order so the float sum stays bit-for-bit the same. */
+            for (uint32_t head = 0; head < DS4_N_HC; head++) acc += src[(uint64_t)head * DS4_N_EMBD + col];
+            dst[col] = f32_to_bf16(acc * scale);
+        }
+    }
+}
+
+static void dspark_target_cache_last_hidden_bf16(uint16_t *out,
+                                                 const ds4_model *model,
+                                                 const ds4_weights *weights,
+                                                 const float *hc_rows,
+                                                 uint32_t rows) { /* writes rows * DS4_N_EMBD bf16 values to out, one row per input HC row */
+    const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; /* floats per input row */
+    float *embd = xmalloc((size_t)DS4_N_EMBD * sizeof(embd[0])); /* scratch row, reused for every input row */
+    float *norm = xmalloc((size_t)DS4_N_EMBD * sizeof(norm[0])); /* scratch row for the normalized result */
+    const float *norm_weight = tensor_data(model, weights->output_norm);
+    for (uint32_t row = 0; row < rows; row++) {
+        const float *hc = hc_rows + (uint64_t)row * hc_dim;
+        output_hc_head_one(embd, model, weights, hc); /* NOTE(review): presumably projects the HC row down to one DS4_N_EMBD vector - confirm */
+        rms_norm_weight(norm, embd, norm_weight, DS4_N_EMBD, DS4_RMS_EPS); /* normalized with the output_norm weights */
+        uint16_t *dst = out + (uint64_t)row * DS4_N_EMBD;
+        for (uint32_t d = 0; d < DS4_N_EMBD; d++) dst[d] = f32_to_bf16(norm[d]);
+    }
+    free(norm);
+    free(embd);
+}
+
+static bool dspark_target_cache_encode_chunk(ds4_gpu_graph *g,
+                                             const ds4_model *model,
+                                             const ds4_weights *weights,
+                                             const ds4_dspark_config *cfg,
+                                             const token_vec *prompt,
+                                             uint32_t pos0,
+                                             uint32_t n_tokens,
+                                             float *hc_rows,
+                                             uint16_t *target_chunk,
+                                             uint16_t *last_chunk) { /* returns false on any upload, encode, or readback failure */
+    const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD; /* floats per token row in batch_cur_hc readbacks */
+    bool ok = metal_graph_upload_prompt_tokens(g->prefill_tokens, prompt, pos0, n_tokens);
+    if (ok) ok = metal_graph_upload_prompt_embeddings_hc(g->batch_cur_hc,
+                                                         g->prefill_tokens,
+                                                         model,
+                                                         weights,
+                                                         prompt,
+                                                         pos0,
+                                                         n_tokens);
+    for (uint32_t il = 0; ok && il < DS4_N_LAYER; il++) { /* one begin/end command pair per layer */
+        ok = ds4_gpu_begin_commands() != 0;
+        if (ok) {
+            ok = metal_graph_encode_layer_batch(g,
+                                                model,
+                                                &weights->layer[il],
+                                                il,
+                                                pos0,
+                                                n_tokens);
+        }
+        if (ok) ok = ds4_gpu_end_commands() != 0;
+        if (!ok) {
+            fprintf(stderr, "ds4: DSpark target cache layer %u encode failed\n", il);
+            return false;
+        }
+        const uint32_t slot = dspark_target_cache_layer_slot(cfg, il); /* UINT32_MAX: layer il is not a target layer */
+        if (slot != UINT32_MAX) {
+            if (ds4_gpu_tensor_read(g->batch_cur_hc, /* read batch_cur_hc back right after layer il was submitted */
+                                    0,
+                                    hc_rows,
+                                    (uint64_t)n_tokens * hc_dim * sizeof(float)) == 0) {
+                fprintf(stderr, "ds4: failed to read DSpark target layer %u hidden states\n", il);
+                return false;
+            }
+            dspark_target_cache_hc_mean_bf16(target_chunk,
+                                             hc_rows,
+                                             n_tokens,
+                                             slot,
+                                             cfg->n_mtp_layers);
+        }
+    }
+    if (ok && ds4_gpu_tensor_read(g->batch_cur_hc, /* after the last layer, batch_cur_hc is read again for the final hidden states */
+                                  0,
+                                  hc_rows,
+                                  (uint64_t)n_tokens * hc_dim * sizeof(float)) == 0) {
+        fprintf(stderr, "ds4: failed to read DSpark target final hidden states\n");
+        ok = false;
+    }
+    if (ok) {
+        dspark_target_cache_last_hidden_bf16(last_chunk,
+                                            model,
+                                            weights,
+                                            hc_rows,
+                                            n_tokens);
+    }
+    return ok;
+}
+#endif
+
+int ds4_engine_collect_imatrix(ds4_engine *e,
+                               const char *dataset_path,
+                               const char *output_path,
+                               int ctx_size,
+                               int max_prompts,
                                int max_tokens) {
 #ifdef DS4_NO_GPU
     (void)e;
@@ -25023,7 +26709,8 @@ int ds4_engine_collect_imatrix(ds4_engine *e,
 
     ds4_gpu_graph g;
     bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0],
-                                        raw_cap, (uint32_t)ctx_size, prefill_cap, false);
+                                        NULL, raw_cap, (uint32_t)ctx_size,
+                                        prefill_cap, false);
     if (!ok) {
         fprintf(stderr, "ds4: failed to allocate imatrix Metal graph runtime\n");
         free(dataset);
@@ -25140,6 +26827,315 @@ int ds4_engine_collect_imatrix(ds4_engine *e,
 #endif
 }
 
+int ds4_engine_collect_dspark_target_cache(ds4_engine *e,
+                                           const char *dataset_path,
+                                           const char *output_dir,
+                                           const char *target_model_name_or_path,
+                                           const char *chat_template,
+                                           int ctx_size,
+                                           int max_prompts,
+                                           int max_tokens) { /* returns 0 on success, 1 on any failure */
+#ifdef DS4_NO_GPU
+    (void)e;
+    (void)dataset_path;
+    (void)output_dir;
+    (void)target_model_name_or_path;
+    (void)chat_template;
+    (void)ctx_size;
+    (void)max_prompts;
+    (void)max_tokens;
+    fprintf(stderr, "ds4: DSpark target cache export requires a graph backend build\n");
+    return 1;
+#else
+    if (!e || !dataset_path || !output_dir) return 1;
+    if (!target_model_name_or_path || !target_model_name_or_path[0]) {
+        fprintf(stderr,
+                "ds4: DSpark target cache export requires --dspark-target-cache-target-model\n");
+        return 1;
+    }
+    if (e->backend != DS4_BACKEND_METAL || !e->metal_ready) {
+        fprintf(stderr, "ds4: DSpark target cache export currently requires --metal\n");
+        return 1;
+    }
+    if (e->ssd_streaming) {
+        fprintf(stderr, "ds4: DSpark target cache export requires non-streaming Metal weights\n");
+        return 1;
+    }
+    if (ctx_size <= 0) ctx_size = 32768; /* non-positive ctx_size falls back to 32768 */
+
+    ds4_dspark_config cfg;
+    ds4_dspark_config_init_defaults(&cfg);
+    if (cfg.n_mtp_layers == 0 || cfg.n_mtp_layers > 3) { /* between 1 and 3 target layers are supported */
+        fprintf(stderr, "ds4: unsupported DSpark target layer count %u\n", cfg.n_mtp_layers);
+        return 1;
+    }
+    for (uint32_t i = 0; i < cfg.n_mtp_layers; i++) {
+        if (cfg.target_layer_ids[i] >= DS4_N_LAYER) {
+            fprintf(stderr,
+                    "ds4: DSpark target layer %u is outside the loaded %u-layer model\n",
+                    cfg.target_layer_ids[i],
+                    DS4_N_LAYER);
+            return 1;
+        }
+        for (uint32_t j = i + 1; j < cfg.n_mtp_layers; j++) { /* reject duplicate target layer ids */
+            if (cfg.target_layer_ids[i] == cfg.target_layer_ids[j]) {
+                fprintf(stderr, "ds4: duplicate DSpark target layer %u\n", cfg.target_layer_ids[i]);
+                return 1;
+            }
+        }
+    }
+
+    char *dataset = NULL;
+    size_t dataset_len = 0;
+    if (!imatrix_read_text_file(dataset_path, &dataset, &dataset_len)) return 1;
+    if (!dspark_target_cache_output_dir_prepare(output_dir)) { /* output dir must be new or empty */
+        free(dataset);
+        return 1;
+    }
+
+    char shard_path[PATH_MAX];
+    char index_path[PATH_MAX];
+    if (!dspark_target_cache_join_path(shard_path, sizeof(shard_path), output_dir, "shard-00000.bin") ||
+        !dspark_target_cache_join_path(index_path, sizeof(index_path), output_dir, "samples.idx")) {
+        fprintf(stderr, "ds4: DSpark target cache output path is too long\n");
+        free(dataset);
+        return 1;
+    }
+
+    FILE *shard = fopen(shard_path, "wb");
+    if (!shard) {
+        fprintf(stderr, "ds4: failed to create DSpark target cache shard %s: %s\n",
+                shard_path, strerror(errno));
+        free(dataset);
+        return 1;
+    }
+    FILE *index = fopen(index_path, "wb");
+    if (!index) {
+        fprintf(stderr, "ds4: failed to create DSpark target cache index %s: %s\n",
+                index_path, strerror(errno));
+        fclose(shard);
+        free(dataset);
+        return 1;
+    }
+
+    const ds4_model *model = &e->model;
+    const ds4_weights *weights = &e->weights;
+    const uint32_t prefill_cap =
+        metal_graph_prefill_cap_for_prompt(ctx_size, e->prefill_chunk);
+    const uint32_t raw_cap = metal_graph_raw_cap_for_context(ctx_size, prefill_cap);
+
+    ds4_gpu_graph g;
+    bool ok = metal_graph_alloc_raw_cap(&g, weights, &weights->layer[0],
+                                        NULL, raw_cap, (uint32_t)ctx_size,
+                                        prefill_cap, false);
+    if (!ok) {
+        fprintf(stderr, "ds4: failed to allocate DSpark target cache Metal graph runtime\n");
+        fclose(index);
+        fclose(shard);
+        free(dataset);
+        return 1;
+    }
+    g.quality = e->quality;
+    g.ssd_streaming = false; /* the export graph always runs with streaming disabled */
+    g.ssd_streaming_cold = false;
+    g.streaming_preload_experts = 0;
+    g.power_percent = (uint32_t)e->power_percent;
+
+    const uint64_t hc_dim = (uint64_t)DS4_N_HC * DS4_N_EMBD;
+    float *hc_rows = xmalloc((size_t)prefill_cap * (size_t)hc_dim * sizeof(hc_rows[0])); /* readback buffer sized for one full chunk */
+    uint16_t *target_chunk = xmalloc((size_t)prefill_cap *
+                                     (size_t)cfg.n_mtp_layers *
+                                     (size_t)DS4_N_EMBD *
+                                     sizeof(target_chunk[0]));
+    uint16_t *last_chunk = xmalloc((size_t)prefill_cap *
+                                   (size_t)DS4_N_EMBD *
+                                   sizeof(last_chunk[0]));
+
+    fprintf(stderr,
+            "ds4: exporting DeepSpec DSpark target cache from %s (model=%s, target_layers=[%u,%u,%u], ctx=%d, chunk=%u)\n",
+            dataset_path,
+            DS4_MODEL_SHAPE_NAME,
+            cfg.target_layer_ids[0],
+            cfg.target_layer_ids[1],
+            cfg.target_layer_ids[2],
+            ctx_size,
+            prefill_cap);
+
+    int prompts_done = 0;
+    int tokens_done = 0;
+    char *cursor = dataset;
+    const char *marker_lit = "===== DS4_IMATRIX_PROMPT";
+    while (ok && *cursor) { /* one iteration per marker-delimited prompt block */
+        if (max_prompts > 0 && prompts_done >= max_prompts) break; /* max_prompts <= 0 means no prompt limit */
+        if (max_tokens > 0 && tokens_done >= max_tokens) break; /* max_tokens <= 0 means no token limit */
+
+        char *start = cursor; /* with no marker anywhere, the whole text is treated as one prompt */
+        char *marker = strstr(cursor, marker_lit);
+        if (marker) {
+            char *nl = strchr(marker, '\n');
+            if (!nl) break;
+            start = nl + 1; /* prompt text begins on the line after the marker line */
+        } else if (prompts_done != 0) {
+            break;
+        }
+
+        char *next = strstr(start, marker_lit);
+        char *end = next ? next : dataset + dataset_len;
+        char saved = *end; /* byte at the block end, put back once the block has been processed */
+        char *prompt_text = imatrix_trim_block(start, end);
+        if (prompt_text[0] != '\0') {
+            token_vec prompt = {0};
+            ds4_tokenize_rendered_chat(e, prompt_text, &prompt);
+            if (prompt.len > ctx_size) prompt.len = ctx_size; /* truncate to the context size */
+            if (max_tokens > 0 && prompt.len > max_tokens - tokens_done) {
+                prompt.len = max_tokens - tokens_done; /* and to the remaining max_tokens budget */
+            }
+            if (prompt.len > 0) {
+                uint16_t *last_full = xmalloc((size_t)prompt.len *
+                                              (size_t)DS4_N_EMBD *
+                                              sizeof(last_full[0]));
+                int32_t *ids = xmalloc((size_t)prompt.len * sizeof(ids[0]));
+                uint8_t *mask = xmalloc((size_t)prompt.len * sizeof(mask[0]));
+                for (int i = 0; i < prompt.len; i++) {
+                    ids[i] = (int32_t)prompt.v[i];
+                    mask[i] = 1; /* the same all-ones buffer is written as attention_mask and as loss_mask */
+                }
+
+                uint64_t input_ids_offset = 0;
+                uint64_t attention_mask_offset = 0;
+                uint64_t loss_mask_offset = 0;
+                uint64_t target_hidden_states_offset = 0;
+                uint64_t target_last_hidden_states_offset = 0;
+                ok = dspark_target_cache_file_pos(shard, &input_ids_offset) && /* each offset is the shard position just before its section is written */
+                     dspark_target_cache_write_all(shard,
+                                                   ids,
+                                                   (size_t)prompt.len * sizeof(ids[0]),
+                                                   "input_ids");
+                ok = ok && dspark_target_cache_file_pos(shard, &attention_mask_offset) &&
+                     dspark_target_cache_write_all(shard,
+                                                   mask,
+                                                   (size_t)prompt.len * sizeof(mask[0]),
+                                                   "attention_mask");
+                ok = ok && dspark_target_cache_file_pos(shard, &loss_mask_offset) &&
+                     dspark_target_cache_write_all(shard,
+                                                   mask,
+                                                   (size_t)prompt.len * sizeof(mask[0]),
+                                                   "loss_mask");
+                ok = ok && dspark_target_cache_file_pos(shard, &target_hidden_states_offset);
+
+                if (ok && !metal_graph_reset_prefill_state(&g)) { /* graph prefill state is reset before every prompt */
+                    fprintf(stderr, "ds4: failed to reset DSpark target cache graph state\n");
+                    ok = false;
+                }
+                for (uint32_t pos = 0; ok && pos < (uint32_t)prompt.len;) { /* process the prompt in chunks of at most prefill_cap tokens */
+                    uint32_t chunk = (uint32_t)prompt.len - pos;
+                    if (chunk > prefill_cap) chunk = prefill_cap;
+                    memset(target_chunk,
+                           0,
+                           (size_t)chunk * (size_t)cfg.n_mtp_layers *
+                           (size_t)DS4_N_EMBD * sizeof(target_chunk[0]));
+                    ok = dspark_target_cache_encode_chunk(&g,
+                                                          model,
+                                                          weights,
+                                                          &cfg,
+                                                          &prompt,
+                                                          pos,
+                                                          chunk,
+                                                          hc_rows,
+                                                          target_chunk,
+                                                          last_chunk);
+                    if (ok) {
+                        ok = dspark_target_cache_write_all(shard,
+                                                           target_chunk,
+                                                           (size_t)chunk *
+                                                           (size_t)cfg.n_mtp_layers *
+                                                           (size_t)DS4_N_EMBD *
+                                                           sizeof(target_chunk[0]),
+                                                           "target_hidden_states");
+                    }
+                    if (ok) {
+                        memcpy(last_full + (uint64_t)pos * DS4_N_EMBD, /* buffered so the last-hidden rows land after all target_hidden_states */
+                               last_chunk,
+                               (size_t)chunk * (size_t)DS4_N_EMBD * sizeof(last_chunk[0]));
+                    }
+                    pos += chunk;
+                }
+                ok = ok && dspark_target_cache_file_pos(shard, &target_last_hidden_states_offset) &&
+                     dspark_target_cache_write_all(shard,
+                                                   last_full,
+                                                   (size_t)prompt.len *
+                                                   (size_t)DS4_N_EMBD *
+                                                   sizeof(last_full[0]),
+                                                   "target_last_hidden_states");
+                ok = ok && dspark_target_cache_write_index_record(index,
+                                                                  (uint64_t)prompts_done,
+                                                                  0, /* shard_id: every sample goes to shard 0 */
+                                                                  (uint32_t)prompt.len,
+                                                                  input_ids_offset,
+                                                                  attention_mask_offset,
+                                                                  loss_mask_offset,
+                                                                  target_hidden_states_offset,
+                                                                  target_last_hidden_states_offset);
+                if (ok) {
+                    prompts_done++;
+                    tokens_done += prompt.len;
+                    fprintf(stderr,
+                            "ds4: DSpark target cache prompts=%d tokens=%d\r",
+                            prompts_done,
+                            tokens_done);
+                    fflush(stderr);
+                }
+                free(mask);
+                free(ids);
+                free(last_full);
+            }
+            token_vec_free(&prompt);
+        }
+        *end = saved;
+        if (!next) break; /* no further marker: this was the last block */
+        cursor = next;
+    }
+    fputc('\n', stderr); /* finish the \r-updated progress line */
+
+    if (fflush(shard) != 0 || fsync(fileno(shard)) != 0) {
+        fprintf(stderr, "ds4: failed to flush DSpark target cache shard %s: %s\n",
+                shard_path, strerror(errno));
+        ok = false;
+    }
+    if (fflush(index) != 0 || fsync(fileno(index)) != 0) {
+        fprintf(stderr, "ds4: failed to flush DSpark target cache index %s: %s\n",
+                index_path, strerror(errno));
+        ok = false;
+    }
+    if (fclose(index) != 0) ok = false;
+    if (fclose(shard) != 0) ok = false;
+
+    if (ok) ok = dspark_target_cache_write_manifest(output_dir, /* manifest is written only if shard and index were flushed and closed cleanly */
+                                                    dataset_path,
+                                                    target_model_name_or_path,
+                                                    chat_template,
+                                                    model,
+                                                    weights,
+                                                    &cfg,
+                                                    (uint64_t)prompts_done,
+                                                    (uint64_t)tokens_done);
+    if (ok) {
+        fprintf(stderr,
+                "ds4: wrote DeepSpec DSpark target cache %s from %d prompts and %d tokens\n",
+                output_dir,
+                prompts_done,
+                tokens_done);
+    }
+
+    free(last_chunk);
+    free(target_chunk);
+    free(hc_rows);
+    metal_graph_free(&g);
+    free(dataset);
+    return ok ? 0 : 1;
+#endif
+}
+
 int ds4_engine_generate_argmax(
         ds4_engine        *e,
         const ds4_tokens  *prompt,
@@ -25690,9 +27686,22 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
         model_open(&e->mtp_model, opt->mtp_path, graph_backend, true);
         mtp_weights_bind(&e->mtp_weights, &e->mtp_model);
         e->mtp_ready = true;
-        fprintf(stderr, "ds4: MTP support model loaded: %s (draft=%d)\n",
+        if ((e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK || e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) &&
+            (opt->mtp_draft_tokens <= 0 || opt->mtp_draft_tokens == 1)) {
+            e->mtp_draft_tokens = (int)e->mtp_weights.dspark.block_size;
+        }
+        fprintf(stderr, "ds4: draft model loaded: %s (kind=%s, draft=%d, runtime_mtp=%s)\n",
                 opt->mtp_path,
-                e->mtp_draft_tokens);
+                ds4_mtp_draft_kind_name(e->mtp_weights.kind),
+                e->mtp_draft_tokens,
+                ds4_engine_has_mtp(e) ? "yes" : "no");
+        const ds4_dspark_spec_gate spec_gate = ds4_dspark_speculative_gate(e->mtp_weights.kind,
+                                                                           e->mtp_ready,
+                                                                           e->mtp_draft_tokens);
+        if (spec_gate == DS4_DSPARK_SPEC_DSPARK_NOT_READY ||
+            spec_gate == DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY) {
+            fprintf(stderr, "ds4: %s\n", ds4_dspark_spec_gate_reason(spec_gate));
+        }
     }
 
 #ifndef DS4_NO_GPU
@@ -25902,7 +27911,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
             *out = NULL;
             return 1;
         }
-        if (e->mtp_ready &&
+        if (ds4_engine_has_mtp(e) &&
             !ds4_gpu_set_model_map_range(e->mtp_model.map,
                                            e->mtp_model.size,
                                            e->mtp_model.tensor_data_pos,
@@ -25945,7 +27954,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
         free(load_sizes);
         /* Also apply explicit optional Q8 preload settings to the MTP support
          * model when loaded. */
-        if (e->mtp_ready) {
+        if (ds4_engine_has_mtp(e)) {
             (void)ds4_gpu_set_model_fd_for_map(e->mtp_model.fd, e->mtp_model.map);
             if (!accelerator_cache_model_tensors(e->backend, &e->mtp_model,
                                                  NULL, NULL, 0)) {
@@ -26072,7 +28081,8 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) {
         return 1;
     }
     if (!metal_graph_alloc_raw_cap(&s->graph, &e->weights, shape_layer,
-                                   raw_cap, (uint32_t)ctx_size, s->prefill_cap, e->mtp_ready))
+                                   &e->mtp_weights, raw_cap, (uint32_t)ctx_size,
+                                   s->prefill_cap, ds4_engine_has_mtp(e)))
     {
         free(s);
         return 1;
@@ -26091,9 +28101,16 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) {
         return 1;
     }
     s->logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(s->logits[0]));
-    if (e->mtp_ready) {
+    if (ds4_engine_has_mtp(e)) {
         s->mtp_logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(s->mtp_logits[0]));
         s->mtp_draft_token = -1;
+        /* Allocate B2 draft logits buffer when DS4_SPEC_TEMP is set and DSpark is active. */
+        if (e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK && getenv("DS4_SPEC_TEMP")) {
+            const uint32_t block_size = e->mtp_weights.dspark.block_size > 0
+                ? e->mtp_weights.dspark.block_size : 16;
+            s->dspark_b2_draft_logits = xmalloc(
+                (size_t)block_size * DS4_N_VOCAB * sizeof(float));
+        }
     }
     if (e->distributed.role == DS4_DISTRIBUTED_COORDINATOR) {
         char err[256];
@@ -26110,6 +28127,7 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) {
             metal_graph_free(&s->graph);
             free(s->logits);
             free(s->mtp_logits);
+            free(s->dspark_b2_draft_logits);
             free(s);
             return 1;
         }
@@ -26134,6 +28152,7 @@ void ds4_session_free(ds4_session *s) {
     token_vec_free(&s->checkpoint);
     free(s->logits);
     free(s->mtp_logits);
+    free(s->dspark_b2_draft_logits);
     free(s);
 }
 
@@ -27107,7 +29126,7 @@ static int ds4_session_eval_internal(ds4_session *s, int token, bool probe_mtp,
     ds4_engine *e = s->engine;
     const bool mtp_probe_log = getenv("DS4_MTP_PROBE") != NULL;
     const bool mtp_should_draft =
-        probe_mtp && e->mtp_ready && s->mtp_logits &&
+        probe_mtp && ds4_engine_has_mtp(e) && s->mtp_logits &&
         (e->mtp_draft_tokens > 1 || mtp_probe_log);
     if (probe_mtp && s->mtp_draft_valid) {
         if (mtp_probe_log) {
@@ -27133,20 +29152,45 @@ static int ds4_session_eval_internal(ds4_session *s, int token, bool probe_mtp,
     }
     token_vec_push(&s->checkpoint, token);
     if (mtp_should_draft) {
-        int mtp_top = -1;
-        if (metal_graph_eval_mtp_draft(&s->graph,
-                                       &e->model,
-                                       &e->weights,
-                                       &e->mtp_model,
-                                       &e->mtp_weights,
-                                       token,
-                                       (uint32_t)(s->checkpoint.len - 1),
-                                       getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL,
-                                       &mtp_top)) {
-            s->mtp_draft_token = mtp_top >= 0 ? mtp_top : sample_argmax(s->mtp_logits, DS4_N_VOCAB);
-            s->mtp_draft_valid = true;
-        } else if (getenv("DS4_MTP_PROBE")) {
-            fprintf(stderr, "ds4: mtp probe draft failed\n");
+        if (e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK) {
+            int draft_n = 0;
+            uint32_t base_real = 0;
+            if (metal_graph_eval_dspark_draft_block(&s->graph,
+                                                    &e->model,
+                                                    &e->weights,
+                                                    &e->mtp_model,
+                                                    &e->mtp_weights,
+                                                    token,
+                                                    (uint32_t)(s->checkpoint.len - 1),
+                                                    (uint32_t)e->mtp_draft_tokens,
+                                                    s->dspark_draft_tokens,
+                                                    &draft_n,
+                                                    &base_real,
+                                                    getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL,
+                                                    s->dspark_b2_draft_logits)) {
+                s->dspark_draft_count = draft_n;
+                s->dspark_draft_base_real = base_real;
+                s->mtp_draft_token = draft_n > 0 ? s->dspark_draft_tokens[0] : -1;
+                s->mtp_draft_valid = draft_n > 0;
+            } else if (getenv("DS4_MTP_PROBE") || getenv("DS4_MTP_SPEC_LOG")) {
+                fprintf(stderr, "ds4: DSpark draft block failed\n");
+            }
+        } else {
+            int mtp_top = -1;
+            if (metal_graph_eval_mtp_draft(&s->graph,
+                                           &e->model,
+                                           &e->weights,
+                                           &e->mtp_model,
+                                           &e->mtp_weights,
+                                           token,
+                                           (uint32_t)(s->checkpoint.len - 1),
+                                           getenv("DS4_MTP_FULL_LOGITS") ? s->mtp_logits : NULL,
+                                           &mtp_top)) {
+                s->mtp_draft_token = mtp_top >= 0 ? mtp_top : sample_argmax(s->mtp_logits, DS4_N_VOCAB);
+                s->mtp_draft_valid = true;
+            } else if (getenv("DS4_MTP_PROBE")) {
+                fprintf(stderr, "ds4: mtp probe draft failed\n");
+            }
         }
     }
     return 0;
@@ -27204,7 +29248,7 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token,
     accepted[n_accept++] = first_token;
     if (first_token == eos_token || max_tokens == 1 || n_accept >= accepted_cap) return n_accept;
 
-    if (!e->mtp_ready || !s->mtp_draft_valid || e->mtp_draft_tokens <= 1) return n_accept;
+    if (!ds4_engine_has_mtp(e) || !s->mtp_draft_valid || e->mtp_draft_tokens <= 1) return n_accept;
 
     int draft_cap = e->mtp_draft_tokens;
     if (draft_cap > max_tokens - n_accept) draft_cap = max_tokens - n_accept;
@@ -27213,6 +29257,303 @@ int ds4_session_eval_speculative_argmax(ds4_session *s, int first_token,
     if (draft_cap > room - 1) draft_cap = room - 1;
     if (draft_cap <= 0) return n_accept;
 
+    if (e->mtp_weights.kind == DS4_MTP_DRAFT_DSPARK) {
+        int drafts[16];
+        const int drafts_room = (int)(sizeof(drafts) / sizeof(drafts[0])); /* capacity of the local copy */
+        int draft_n = s->dspark_draft_count < draft_cap ? s->dspark_draft_count : draft_cap;
+        if (draft_n > drafts_room) draft_n = drafts_room; /* block_size-derived counts must not overrun drafts[] in the memcpy below */
+        /* Adaptive block size: conservative-then-aggressive.
+         * Start at block=2 (near-baseline, safe). Escalate to full block
+         * ONLY after seeing a full commit (high acceptance detected).
+         * Drop back to block=2 on any partial commit.
+         *
+         * This makes DSpark net-positive across ALL workloads:
+         *   structured: escalates to block=5 after 1st full commit → +8% speedup
+         *   creative:   stays at block=2 → ~95-100% of baseline (no waste)
+         * DS4_DSPARK_ADAPTIVE=1 enables this. */
+        if (getenv("DS4_DSPARK_ADAPTIVE") && draft_n > 2) {
+            if (s->dspark_prev_drafted == 0) {
+                draft_n = 2;  /* first cycle → conservative */
+            } else if (s->dspark_prev_accepted == s->dspark_prev_drafted) {
+                /* previous was full commit → escalate (keep full block) */
+            } else {
+                draft_n = 2;  /* previous was partial → conservative */
+            }
+        }
+        if (draft_n <= 0) {
+            s->mtp_draft_valid = false;
+            return n_accept;
+        }
+        memcpy(drafts, s->dspark_draft_tokens, (size_t)draft_n * sizeof(drafts[0]));
+        s->mtp_draft_valid = false;
+        s->dspark_draft_count = 0;
+
+        const bool mtp_timing = getenv("DS4_MTP_TIMING") != NULL;
+        const double mtp_t0 = mtp_timing ? now_sec() : 0.0;
+#define DS4_DSPARK_KEEP_ACCEPTED(n_) do { \
+            uint32_t keep_ = s->dspark_draft_base_real + 1u + (uint32_t)(n_); \
+            if (keep_ > DS4_N_SWA) keep_ = 0; \
+            s->graph.dspark_n_real = keep_; \
+            s->dspark_prev_accepted = (int)(n_); \
+            s->dspark_prev_drafted = draft_n; \
+        } while (0)
+
+        /* B2 rejection sampling: parse DS4_SPEC_TEMP for stochastic path.
+         * When set (temp > 0), uses B2 to produce lossless samples from
+         * the target model's distribution. Default (unset or <=0): greedy.
+         *
+         * ARDD adversarial review fix: RNG state persisted in session struct
+         * (not reseeded per call) to avoid correlated random sequences when
+         * multiple speculative eval calls happen within the same second. */
+        float b2_temp = 0.0f;
+        const char *spec_temp_env = getenv("DS4_SPEC_TEMP");
+        if (spec_temp_env && spec_temp_env[0]) {
+            char *end = NULL;
+            float v = strtof(spec_temp_env, &end);
+            if (end != spec_temp_env && v > 0.0f) b2_temp = v;
+        }
+        if (b2_temp > 0.0f && s->dspark_b2_rng == 0) {
+            const char *seed_env = getenv("DS4_SPEC_RNG_SEED");
+            if (seed_env && seed_env[0]) {
+                s->dspark_b2_rng = (uint64_t)strtoull(seed_env, NULL, 0);
+            }
+            if (s->dspark_b2_rng == 0) {
+                s->dspark_b2_rng = (uint64_t)time(NULL) ^
+                                   ((uint64_t)getpid() << 32) ^
+                                   (uint64_t)clock();
+            }
+        }
+
+        /* Greedy first-draft gate: the argmax verifier below commits drafts[0] unverified, so
+         * check it here whenever B2 will not run (temp<=0 or no draft-logits buffer). */
+        if ((b2_temp <= 0.0f || !s->dspark_b2_draft_logits) &&
+            sample_argmax(s->logits, DS4_N_VOCAB) != drafts[0]) {
+            DS4_DSPARK_KEEP_ACCEPTED(0);
+            if (getenv("DS4_MTP_SPEC_LOG")) {
+                fprintf(stderr, "ds4: dspark spec miss first draft=%d\n", drafts[0]);
+            }
+            return n_accept;
+        }
+        if (drafts[0] == eos_token) draft_n = 1;
+
+        ds4_spec_frontier frontier;
+        memset(&frontier, 0, sizeof(frontier));
+        int *row_tops = xmalloc((size_t)draft_n * sizeof(row_tops[0]));
+        float *row_logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(row_logits[0]));
+
+        /* For B2 at temp>0, allocate target logits buffer for ALL draft positions.
+         * At temp=0 this stays NULL and we use the existing argmax path.
+         *
+         * CRITICAL (ARDD adversarial review fix — off-by-one in target logits):
+         * metal_graph_verify_suffix_tops row[i] = target logits AFTER processing
+         * drafts[i] → predicts drafts[i+1], NOT drafts[i].
+         * Correct mapping: drafts[0] → s->logits (previous target eval),
+         *                  drafts[j>0] → verify_row[j-1].
+         * We store raw verify output in b2_verify_logits, then shift into
+         * b2_target_logits with s->logits prepended as row 0. */
+        const uint64_t row_bytes = (uint64_t)DS4_N_VOCAB * sizeof(float);
+        const uint64_t all_logits_bytes = (uint64_t)draft_n * row_bytes;
+        float *b2_verify_logits = (b2_temp > 0.0f)
+            ? xmalloc((size_t)all_logits_bytes) : NULL;
+        float *b2_target_logits = (b2_temp > 0.0f)
+            ? xmalloc((size_t)all_logits_bytes) : NULL;
+
+        const int start = s->checkpoint.len;
+        const double snapshot_t0 = mtp_timing ? now_sec() : 0.0;
+        bool have_frontier = spec_frontier_snapshot(&frontier, s);
+        bool ok = have_frontier;
+        const double snapshot_done = mtp_timing ? now_sec() : 0.0;
+        if (ok) {
+            for (int i = 0; i < draft_n; i++) token_vec_push(&s->checkpoint, drafts[i]);
+            ok = metal_graph_verify_suffix_tops(&s->graph,
+                                                &e->model,
+                                                &e->weights,
+                                                &s->checkpoint,
+                                                (uint32_t)start,
+                                                (uint32_t)draft_n,
+                                                false,
+                                                row_tops,
+                                                b2_verify_logits);
+        }
+        /* Assemble shifted target logits for B2:
+         * row 0 = s->logits (target prediction for drafts[0])
+         * row j = verify_row[j-1] (target prediction for drafts[j]) */
+        if (ok && b2_verify_logits && b2_target_logits) {
+            memcpy(b2_target_logits, s->logits, (size_t)row_bytes);
+            if (draft_n > 1) {
+                memcpy(b2_target_logits + DS4_N_VOCAB,
+                       b2_verify_logits,
+                       (size_t)(draft_n - 1) * (size_t)row_bytes);
+            }
+        }
+        const double verify_done = mtp_timing ? now_sec() : 0.0;
+        if (ok) {
+            int commit_drafts;
+
+            if (b2_temp > 0.0f && b2_target_logits && s->dspark_b2_draft_logits) {
+                /* ---- B2 stochastic path ---- */
+                b2_result b2r = b2_rejection_sample(
+                    drafts,
+                    s->dspark_b2_draft_logits,
+                    b2_target_logits,
+                    DS4_N_VOCAB,
+                    draft_n,
+                    b2_temp,
+                    &s->dspark_b2_rng);
+                commit_drafts = b2r.n_accepted;
+
+                if (getenv("DS4_MTP_SPEC_LOG")) {
+                    fprintf(stderr,
+                            "ds4: dspark b2 accepted=%d/%d correction=%s temp=%.2f\n",
+                            b2r.n_accepted, draft_n,
+                            b2r.has_correction ? "yes" : "no",
+                            b2_temp);
+                }
+
+                if (commit_drafts == draft_n && !b2r.has_correction) {
+                    /* All draft tokens accepted — fast commit path. */
+                    ok = metal_graph_dspark_refresh_verified_rows(&s->graph,
+                                                                  &e->mtp_model,
+                                                                  &e->mtp_weights,
+                                                                  s->dspark_draft_base_real + 1u,
+                                                                  (uint32_t)start,
+                                                                  (uint32_t)draft_n);
+                    if (ok) ok = metal_graph_read_spec_logits_row(&s->graph,
+                                                                   (uint32_t)(draft_n - 1),
+                                                                   row_logits);
+                    if (ok) {
+                        memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0]));
+                        for (int i = 0; i < draft_n && n_accept < accepted_cap; i++) {
+                            accepted[n_accept++] = drafts[i];
+                            if (drafts[i] == eos_token) break;
+                        }
+                        s->checkpoint_valid = true;
+                        s->mtp_draft_valid = false;
+                        DS4_DSPARK_KEEP_ACCEPTED(draft_n);
+                        if (mtp_timing) {
+                            fprintf(stderr,
+                                    "ds4: dspark b2 timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms total=%.3f ms\n",
+                                    draft_n, draft_n,
+                                    (snapshot_done - snapshot_t0) * 1000.0,
+                                    (verify_done - snapshot_done) * 1000.0,
+                                    (now_sec() - mtp_t0) * 1000.0);
+                        }
+                        spec_frontier_free(&frontier);
+                        free(row_logits);
+                        free(row_tops);
+                        free(b2_target_logits); free(b2_verify_logits);
+                        return n_accept;
+                    }
+                }
+                /* B2 partial accept or correction: fall through to replay path.
+                 * If B2 produced a correction token, replace the first rejected
+                 * draft with it so the replay commits the corrected sequence. */
+                if (b2r.has_correction && commit_drafts < draft_n) {
+                    drafts[commit_drafts] = b2r.correction_token;
+                    commit_drafts++; /* include the correction in replay */
+                }
+            } else {
+                /* ---- Greedy argmax path (unchanged) ---- */
+                commit_drafts = 1;
+                for (int i = 1; i < draft_n; i++) {
+                    if (row_tops[i - 1] != drafts[i]) break;
+                    commit_drafts++;
+                }
+                if (commit_drafts == draft_n) {
+                    ok = metal_graph_dspark_refresh_verified_rows(&s->graph,
+                                                                  &e->mtp_model,
+                                                                  &e->mtp_weights,
+                                                                  s->dspark_draft_base_real + 1u,
+                                                                  (uint32_t)start,
+                                                                  (uint32_t)draft_n);
+                    if (ok) ok = metal_graph_read_spec_logits_row(&s->graph,
+                                                                   (uint32_t)(draft_n - 1),
+                                                                   row_logits);
+                    if (ok) {
+                        memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0]));
+                        for (int i = 0; i < draft_n && n_accept < accepted_cap; i++) {
+                            accepted[n_accept++] = drafts[i];
+                            if (drafts[i] == eos_token) break;
+                        }
+                        s->checkpoint_valid = true;
+                        s->mtp_draft_valid = false;
+                        DS4_DSPARK_KEEP_ACCEPTED(draft_n);
+                        if (mtp_timing) {
+                            fprintf(stderr,
+                                    "ds4: dspark timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms total=%.3f ms\n",
+                                    draft_n, draft_n,
+                                    (snapshot_done - snapshot_t0) * 1000.0,
+                                    (verify_done - snapshot_done) * 1000.0,
+                                    (now_sec() - mtp_t0) * 1000.0);
+                        }
+                        spec_frontier_free(&frontier);
+                        free(row_logits);
+                        free(row_tops);
+                        free(b2_target_logits); free(b2_verify_logits);
+                        return n_accept;
+                    }
+                }
+            }
+
+            /* Partial commit: restore frontier, replay accepted tokens one-by-one. */
+            s->checkpoint.len = start;
+            ok = have_frontier && spec_frontier_restore(&frontier, s);
+            int replayed = 0;
+            for (; ok && replayed < commit_drafts; replayed++) {
+                ok = metal_graph_eval_token_raw_swa(&s->graph,
+                                                    &e->model,
+                                                    &e->weights,
+                                                    drafts[replayed],
+                                                    (uint32_t)(start + replayed),
+                                                    row_logits);
+                if (ok) {
+                    token_vec_push(&s->checkpoint, drafts[replayed]);
+                    ok = metal_graph_dspark_refresh_current_row(&s->graph,
+                                                                &e->mtp_model,
+                                                                &e->mtp_weights,
+                                                                s->dspark_draft_base_real + 1u + (uint32_t)replayed,
+                                                                (uint32_t)(start + replayed));
+                }
+            }
+            if (ok) { /* row_logits is only written by the replay loop; with zero replays leave s->logits untouched */
+                if (replayed > 0) memcpy(s->logits, row_logits, (size_t)DS4_N_VOCAB * sizeof(s->logits[0]));
+                for (int i = 0; i < replayed && n_accept < accepted_cap; i++) {
+                    accepted[n_accept++] = drafts[i];
+                    if (drafts[i] == eos_token) break;
+                }
+                s->checkpoint_valid = true;
+                s->mtp_draft_valid = false;
+                DS4_DSPARK_KEEP_ACCEPTED(replayed);
+                if (mtp_timing) {
+                    fprintf(stderr,
+                            "ds4: dspark timing drafted=%d committed=%d snapshot=%.3f ms verify=%.3f ms replay=%.3f ms total=%.3f ms\n",
+                            draft_n,
+                            replayed,
+                            (snapshot_done - snapshot_t0) * 1000.0,
+                            (verify_done - snapshot_done) * 1000.0,
+                            (now_sec() - verify_done) * 1000.0,
+                            (now_sec() - mtp_t0) * 1000.0);
+                }
+                spec_frontier_free(&frontier);
+                free(row_logits);
+                free(row_tops);
+                free(b2_target_logits); free(b2_verify_logits);
+                return n_accept;
+            }
+        }
+        s->checkpoint.len = start;
+        if (have_frontier) (void)spec_frontier_restore(&frontier, s);
+        snprintf(err, errlen, "DSpark verifier failed");
+        s->checkpoint_valid = false;
+        DS4_DSPARK_KEEP_ACCEPTED(0);
+        spec_frontier_free(&frontier);
+        free(row_logits);
+        free(row_tops);
+        free(b2_target_logits); free(b2_verify_logits);
+        return -1;
+#undef DS4_DSPARK_KEEP_ACCEPTED
+    }
+
     int drafts[16];
     int draft_n = 1;
     drafts[0] = s->mtp_draft_token;
@@ -27769,6 +30110,7 @@ void ds4_session_invalidate(ds4_session *s) {
     s->checkpoint_valid = false;
     s->checkpoint.len = 0;
     s->mtp_draft_valid = false;
+    s->dspark_draft_count = 0;
 }
 
 void ds4_session_rewind(ds4_session *s, int pos) {
@@ -27776,6 +30118,7 @@ void ds4_session_rewind(ds4_session *s, int pos) {
     if (pos > s->checkpoint.len) pos = s->checkpoint.len;
     s->checkpoint.len = pos;
     s->mtp_draft_valid = false;
+    s->dspark_draft_count = 0;
 }
 
 int ds4_session_pos(ds4_session *s) {
diff --git a/ds4.h b/ds4.h
index 9d040c92b..4ec3ad6cb 100644
--- a/ds4.h
+++ b/ds4.h
@@ -56,6 +56,32 @@ typedef struct {
 #define DS4_DEFAULT_TOP_P 1.0f
 #define DS4_DEFAULT_MIN_P 0.05f
 
+
+typedef enum {
+    DS4_MTP_DRAFT_NONE = 0,
+    DS4_MTP_DRAFT_LEGACY,
+    DS4_MTP_DRAFT_DSPARK,
+    DS4_MTP_DRAFT_DSPARK_NONSEQ,
+} ds4_mtp_draft_kind;
+
+typedef struct {
+    uint32_t n_mtp_layers;
+    uint32_t block_size;
+    uint32_t noise_token_id;
+    uint32_t markov_rank;
+    uint32_t target_layer_ids[3];
+} ds4_dspark_config;
+
+void ds4_dspark_config_init_defaults(ds4_dspark_config *cfg);
+const char *ds4_mtp_draft_kind_name(ds4_mtp_draft_kind kind);
+/* Classify draft GGUF layout from presence markers (unit-testable, no model load). */
+ds4_mtp_draft_kind ds4_mtp_draft_kind_guess(bool has_e_proj, bool has_main_proj, bool has_markov_w1);
+ds4_mtp_draft_kind ds4_mtp_draft_kind_guess_ex(bool has_e_proj,
+                                                bool has_main_proj,
+                                                bool has_markov_w1,
+                                                bool markov_rank_set,
+                                                uint32_t markov_rank);
+
 typedef struct ds4_engine ds4_engine;
 typedef struct ds4_session ds4_session;
 
@@ -186,6 +212,14 @@ int ds4_engine_collect_imatrix(ds4_engine *e,
                                int ctx_size,
                                int max_prompts,
                                int max_tokens);
+int ds4_engine_collect_dspark_target_cache(ds4_engine *e,
+                                           const char *dataset_path,
+                                           const char *output_dir,
+                                           const char *target_model_name_or_path,
+                                           const char *chat_template,
+                                           int ctx_size,
+                                           int max_prompts,
+                                           int max_tokens);
 void ds4_engine_dump_tokens(ds4_engine *e, const ds4_tokens *tokens);
 int ds4_dump_text_tokenization(const char *model_path, const char *text, FILE *fp);
 int ds4_engine_head_test(ds4_engine *e, const ds4_tokens *prompt);
@@ -273,7 +307,13 @@ int ds4_session_ctx(ds4_session *s);
 int ds4_session_prefill_cap(ds4_session *s);
 int ds4_engine_routed_quant_bits(ds4_engine *e);
 bool ds4_engine_has_output_head(ds4_engine *e);
+/* True when speculative decode has a real proposer and target verifier. */
+bool ds4_mtp_speculative_draft_ready(ds4_mtp_draft_kind kind);
+bool ds4_mtp_draft_runtime_supported(ds4_backend backend,
+                                     ds4_mtp_draft_kind kind);
 bool ds4_engine_has_mtp(ds4_engine *e);
+ds4_mtp_draft_kind ds4_engine_mtp_draft_kind(ds4_engine *e);
+
 int ds4_engine_mtp_draft_tokens(ds4_engine *e);
 const ds4_tokens *ds4_session_tokens(ds4_session *s);
 
diff --git a/ds4_cli.c b/ds4_cli.c
index 4ad2240e8..61de77021 100644
--- a/ds4_cli.c
+++ b/ds4_cli.c
@@ -43,6 +43,12 @@ typedef struct {
     const char *imatrix_output_path;
     int imatrix_max_prompts;
     int imatrix_max_tokens;
+    const char *dspark_target_cache_dataset_path;
+    const char *dspark_target_cache_output_dir;
+    const char *dspark_target_cache_target_model;
+    const char *dspark_target_cache_chat_template;
+    int dspark_target_cache_max_prompts;
+    int dspark_target_cache_max_tokens;
     ds4_think_mode think_mode;
     bool head_test;
     bool first_token_test;
@@ -1562,6 +1568,18 @@ static cli_config parse_options(int argc, char **argv) {
             c.gen.imatrix_max_prompts = parse_int(need_arg(&i, argc, argv, arg), arg);
         } else if (!strcmp(arg, "--imatrix-max-tokens")) {
             c.gen.imatrix_max_tokens = parse_int(need_arg(&i, argc, argv, arg), arg);
+        } else if (!strcmp(arg, "--dspark-target-cache-dataset")) {
+            c.gen.dspark_target_cache_dataset_path = need_arg(&i, argc, argv, arg);
+        } else if (!strcmp(arg, "--dspark-target-cache-out")) {
+            c.gen.dspark_target_cache_output_dir = need_arg(&i, argc, argv, arg);
+        } else if (!strcmp(arg, "--dspark-target-cache-target-model")) {
+            c.gen.dspark_target_cache_target_model = need_arg(&i, argc, argv, arg);
+        } else if (!strcmp(arg, "--dspark-target-cache-chat-template")) {
+            c.gen.dspark_target_cache_chat_template = need_arg(&i, argc, argv, arg);
+        } else if (!strcmp(arg, "--dspark-target-cache-max-prompts")) {
+            c.gen.dspark_target_cache_max_prompts = parse_int(need_arg(&i, argc, argv, arg), arg);
+        } else if (!strcmp(arg, "--dspark-target-cache-max-tokens")) {
+            c.gen.dspark_target_cache_max_tokens = parse_int(need_arg(&i, argc, argv, arg), arg);
         } else if (!strcmp(arg, "--think")) {
             c.gen.think_mode = DS4_THINK_HIGH;
         } else if (!strcmp(arg, "--think-max")) {
@@ -1621,6 +1639,24 @@ static cli_config parse_options(int argc, char **argv) {
         fprintf(stderr, "ds4: --imatrix-dataset requires --imatrix-out\n");
         exit(2);
     }
+    if (c.gen.dspark_target_cache_output_dir && !c.gen.dspark_target_cache_dataset_path) {
+        fprintf(stderr, "ds4: --dspark-target-cache-out requires --dspark-target-cache-dataset\n");
+        exit(2);
+    }
+    if (c.gen.dspark_target_cache_dataset_path && !c.gen.dspark_target_cache_output_dir) {
+        fprintf(stderr, "ds4: --dspark-target-cache-dataset requires --dspark-target-cache-out\n");
+        exit(2);
+    }
+    if (c.gen.dspark_target_cache_output_dir && c.gen.prompt) {
+        fprintf(stderr, "ds4: --dspark-target-cache-out does not use -p/--prompt-file\n");
+        exit(2);
+    }
+    if (c.gen.dspark_target_cache_output_dir &&
+        (!c.gen.dspark_target_cache_target_model ||
+         !c.gen.dspark_target_cache_target_model[0])) {
+        fprintf(stderr, "ds4: --dspark-target-cache-out requires --dspark-target-cache-target-model\n");
+        exit(2);
+    }
     if (c.gen.perplexity_file_path && c.gen.prompt) {
         fprintf(stderr, "ds4: --perplexity-file does not use -p/--prompt-file\n");
         exit(2);
@@ -1693,6 +1729,15 @@ int main(int argc, char **argv) {
                                         cfg.gen.ctx_size,
                                         cfg.gen.imatrix_max_prompts,
                                         cfg.gen.imatrix_max_tokens);
+    } else if (cfg.gen.dspark_target_cache_output_dir) {
+        rc = ds4_engine_collect_dspark_target_cache(engine,
+                                                    cfg.gen.dspark_target_cache_dataset_path,
+                                                    cfg.gen.dspark_target_cache_output_dir,
+                                                    cfg.gen.dspark_target_cache_target_model,
+                                                    cfg.gen.dspark_target_cache_chat_template,
+                                                    cfg.gen.ctx_size,
+                                                    cfg.gen.dspark_target_cache_max_prompts,
+                                                    cfg.gen.dspark_target_cache_max_tokens);
     } else if (cfg.gen.perplexity_file_path) {
         rc = run_perplexity_file(engine, &cfg);
     } else if (cfg.gen.prompt == NULL) {
diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 188b341ad..688507a44 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -8917,6 +8917,25 @@ extern "C" int ds4_gpu_attention_decode_raw_batch_heads_tensor(
                                       n_head, head_dim);
 }
 
+extern "C" int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( /* CUDA placeholder for the non-causal raw-batch attention entry point */
+        ds4_gpu_tensor       *heads,
+        const void             *model_map,
+        uint64_t                model_size,
+        uint64_t                sinks_offset,
+        const ds4_gpu_tensor *q,
+        const ds4_gpu_tensor *raw_kv,
+        uint32_t                n_tokens,
+        uint32_t                n_raw,
+        uint32_t                raw_cap,
+        uint32_t                raw_start,
+        uint32_t                n_head,
+        uint32_t                head_dim) {
+    (void)heads; (void)model_map; (void)model_size; (void)sinks_offset; /* every argument is deliberately unused */
+    (void)q; (void)raw_kv; (void)n_tokens; (void)n_raw; (void)raw_cap;
+    (void)raw_start; (void)n_head; (void)head_dim;
+    return 0; /* no kernel is launched; 0 is the value the Metal implementation of this entry point returns on failure */
+}
+
 extern "C" int ds4_gpu_attention_decode_mixed_batch_heads_tensor(
         ds4_gpu_tensor       *heads,
         const void             *model_map,
diff --git a/ds4_dspark_runtime.c b/ds4_dspark_runtime.c
new file mode 100644
index 000000000..cf6c5434e
--- /dev/null
+++ b/ds4_dspark_runtime.c
@@ -0,0 +1,41 @@
+#include "ds4_dspark_runtime.h"
+
+#include <string.h>
+
+/* Widen a 16-bit bf16 bit pattern to float by placing it in the upper half of a 32-bit word. */
+float ds4_dspark_bf16_to_f32(uint16_t h) {
+    uint32_t bits = (uint32_t)h << 16; /* the low 16 bits of the result are zero */
+    float f;
+    memcpy(&f, &bits, sizeof(f)); /* copy the bit pattern into the float instead of casting pointers */
+    return f;
+}
+
+/* Pick the speculative-decode gate for a draft kind. DISABLED is returned unless mtp_ready is
+ * true and mtp_draft_tokens is greater than 1; otherwise the gate is chosen from kind alone. */
+ds4_dspark_spec_gate ds4_dspark_speculative_gate(ds4_mtp_draft_kind kind,
+                                                 bool mtp_ready,
+                                                 int mtp_draft_tokens) {
+    if (!mtp_ready || mtp_draft_tokens <= 1) return DS4_DSPARK_SPEC_DISABLED; /* precondition check runs before kind is inspected */
+    if (kind == DS4_MTP_DRAFT_LEGACY) return DS4_DSPARK_SPEC_LEGACY_MTP;
+    if (kind == DS4_MTP_DRAFT_DSPARK) return DS4_DSPARK_SPEC_DSPARK_ENABLED;
+    if (kind == DS4_MTP_DRAFT_DSPARK_NONSEQ) return DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY; /* nonseq drafts are never enabled here */
+    return DS4_DSPARK_SPEC_DISABLED; /* any other kind value */
+}
+
+const char *ds4_dspark_spec_gate_reason(ds4_dspark_spec_gate gate) { /* return a static human-readable explanation for a gate value */
+    switch (gate) {
+    case DS4_DSPARK_SPEC_LEGACY_MTP:
+        return "legacy MTP draft path (DSpark block draft not engaged)";
+    case DS4_DSPARK_SPEC_DSPARK_ENABLED:
+        return "DSpark block speculative decode enabled";
+    case DS4_DSPARK_SPEC_DSPARK_NOT_READY: /* has a message here, but ds4_dspark_speculative_gate() in this file never returns it */
+        return "DSpark draft graph has not been validated on real DSpark GGUF weights; "
+               "speculative decode stays off (no fake draft tokens)";
+    case DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY:
+        return "DSpark nonseq draft head has not been validated on real trained DSpark GGUF weights; "
+               "speculative decode stays off (no fake draft tokens)";
+    case DS4_DSPARK_SPEC_DISABLED:
+    default: /* values outside the enum share the DISABLED text */
+        return "speculative draft disabled";
+    }
+}
\ No newline at end of file
diff --git a/ds4_dspark_runtime.h b/ds4_dspark_runtime.h
new file mode 100644
index 000000000..c70384b3e
--- /dev/null
+++ b/ds4_dspark_runtime.h
@@ -0,0 +1,29 @@
+#ifndef DS4_DSPARK_RUNTIME_H
+#define DS4_DSPARK_RUNTIME_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "ds4.h"
+
+/* Result of ds4_dspark_speculative_gate(); ds4_dspark_spec_gate_reason() turns each value into a message. */
+typedef enum {
+    DS4_DSPARK_SPEC_DISABLED = 0, /* no speculative drafting */
+    DS4_DSPARK_SPEC_LEGACY_MTP, /* legacy MTP draft path */
+    DS4_DSPARK_SPEC_DSPARK_ENABLED, /* DSpark block speculative decode is on */
+    DS4_DSPARK_SPEC_DSPARK_NOT_READY, /* DSpark draft graph not validated; speculation stays off */
+    DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY, /* DSpark nonseq draft head not validated; speculation stays off */
+} ds4_dspark_spec_gate;
+
+
+/* Convert a 16-bit bf16 bit pattern to float. */
+float ds4_dspark_bf16_to_f32(uint16_t h);
+
+/* Select the speculative-decode gate from the draft kind, readiness flag and requested draft token count. */
+ds4_dspark_spec_gate ds4_dspark_speculative_gate(ds4_mtp_draft_kind kind,
+                                                 bool mtp_ready,
+                                                 int mtp_draft_tokens);
+
+const char *ds4_dspark_spec_gate_reason(ds4_dspark_spec_gate gate); /* static explanatory string for a gate value */
+
+#endif
\ No newline at end of file
diff --git a/ds4_gpu.h b/ds4_gpu.h
index b58aca9bd..6651a2880 100644
--- a/ds4_gpu.h
+++ b/ds4_gpu.h
@@ -623,6 +623,22 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor(
         uint32_t                n_head,
         uint32_t                head_dim);
 
+/* Non-causal variant (mask = all-attend): every query attends to every key in the gathered window. Used by the
+ * DSpark drafter's block attention. The Metal build returns 1 on success and 0 on failure; the CUDA build is a stub returning 0. */
+int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor(
+        ds4_gpu_tensor       *heads,
+        const void             *model_map,
+        uint64_t                model_size,
+        uint64_t                sinks_offset,
+        const ds4_gpu_tensor *q,
+        const ds4_gpu_tensor *raw_kv,
+        uint32_t                n_tokens,
+        uint32_t                n_raw,
+        uint32_t                raw_cap,
+        uint32_t                raw_start,
+        uint32_t                n_head,
+        uint32_t                head_dim);
+
 int ds4_gpu_attention_decode_mixed_batch_heads_tensor(
         ds4_gpu_tensor       *heads,
         const void             *model_map,
diff --git a/ds4_help.c b/ds4_help.c
index d32e088cf..aae4b24a5 100644
--- a/ds4_help.c
+++ b/ds4_help.c
@@ -170,11 +170,11 @@ static void print_model_runtime(FILE *fp, const help_colors *c,
     opt(fp, c, "--prefill-chunk N", "Metal graph prefill chunk size. Default: auto (PRO long prompts use 8192; others use 4096).");
     if (full) {
         if (tool != DS4_HELP_BENCH) {
-            opt(fp, c, "--mtp FILE", "Optional MTP support GGUF used for draft-token probes.");
+            opt(fp, c, "--mtp FILE", "Optional speculative draft GGUF: legacy MTP or experimental converted DSpark/DeepSpec on Metal.");
         }
         if (tool == DS4_HELP_DS4 || tool == DS4_HELP_AGENT || tool == DS4_HELP_SERVER) {
-            opt(fp, c, "--mtp-draft N", "Maximum autoregressive MTP draft tokens. Default: 1");
-            opt(fp, c, "--mtp-margin F", "Verifier confidence margin for fast MTP acceptance. Default: 3");
+            opt(fp, c, "--mtp-draft N", "Maximum speculative draft tokens. Legacy default: 1; DSpark uses GGUF block size.");
+            opt(fp, c, "--mtp-margin F", "Verifier confidence margin for legacy fast MTP acceptance. Default: 3");
         }
         opt(fp, c, "--quality", "Prefer exact kernels where faster approximate paths exist.");
         opt(fp, c, "--warm-weights", "Touch mapped tensor pages at startup to reduce first-use stalls.");
@@ -254,6 +254,12 @@ static void print_cli_diagnostics(FILE *fp, const help_colors *c) {
     opt(fp, c, "--imatrix-out FILE", "Write llama-compatible routed-MoE imatrix .dat.");
     opt(fp, c, "--imatrix-max-prompts N", "Stop imatrix collection after N prompts.");
     opt(fp, c, "--imatrix-max-tokens N", "Stop imatrix collection after N prompt tokens.");
+    opt(fp, c, "--dspark-target-cache-dataset FILE", "Rendered prompt dataset for DeepSpec DSpark target-cache export.");
+    opt(fp, c, "--dspark-target-cache-out DIR", "Write DeepSpec DSpark target cache manifest/index/shard.");
+    opt(fp, c, "--dspark-target-cache-target-model HF_OR_PATH", "Required DeepSpec target model name/path stored in the target-cache manifest.");
+    opt(fp, c, "--dspark-target-cache-chat-template NAME", "DeepSpec chat template name stored in the target-cache manifest.");
+    opt(fp, c, "--dspark-target-cache-max-prompts N", "Stop target-cache export after N prompts.");
+    opt(fp, c, "--dspark-target-cache-max-tokens N", "Stop target-cache export after N prompt tokens.");
     opt(fp, c, "--head-test", "Run the output HC/logits head after the native slice.");
     opt(fp, c, "--first-token-test", "Run exact CPU whole-model pass for the first prompt token.");
     opt(fp, c, "--metal-graph-test", "Compare first GPU-resident graph stages with CPU.");
diff --git a/ds4_metal.m b/ds4_metal.m
index 7e3f8bd5c..c43762e0e 100644
--- a/ds4_metal.m
+++ b/ds4_metal.m
@@ -17050,6 +17050,13 @@ static void ds4_gpu_fill_raw_decode_batch_mask(
     }
 }
 
+static void ds4_gpu_fill_raw_decode_batch_all_mask( /* zero every entry of an n_tokens x n_raw mask */
+        uint16_t *mask,
+        uint32_t  n_tokens,
+        uint32_t  n_raw) {
+    memset(mask, 0, (size_t)n_tokens * n_raw * sizeof(mask[0])); /* chosen on the noncausal path; presumably a zero entry means "attend" -- confirm against the kernel */
+}
+
 static void ds4_gpu_fill_mixed_decode_batch_mask(
         uint16_t *mask,
         uint32_t  n_tokens,
@@ -18432,6 +18439,7 @@ static int ds4_gpu_encode_flash_attention_decode_raw_batch_heads(
         uint32_t               raw_cap,
         uint32_t               raw_start,
         uint32_t               window,
+        bool                   noncausal,
         uint32_t               n_head,
         uint32_t               head_dim) {
     if (head_dim != 512 || n_head == 0 || n_tokens == 0 ||
@@ -18528,11 +18536,17 @@ static int ds4_gpu_encode_flash_attention_decode_raw_batch_heads(
         return 0;
     }
 
-    ds4_gpu_fill_raw_decode_batch_mask((uint16_t *)[mask_buffer contents],
-                                         n_tokens,
-                                         n_raw,
-                                         pos0,
-                                         window);
+    if (noncausal) {
+        ds4_gpu_fill_raw_decode_batch_all_mask((uint16_t *)[mask_buffer contents],
+                                                 n_tokens,
+                                                 n_raw);
+    } else {
+        ds4_gpu_fill_raw_decode_batch_mask((uint16_t *)[mask_buffer contents],
+                                             n_tokens,
+                                             n_raw,
+                                             pos0,
+                                             window);
+    }
 
     id<MTLComputePipelineState> pad_pipeline = nil;
     if (has_kvpad) {
@@ -18693,6 +18707,7 @@ static int ds4_gpu_encode_flash_attention_decode_mixed_batch_heads(
                                                                        raw_cap,
                                                                        raw_start,
                                                                        window,
+                                                                       false,
                                                                        n_head,
                                                                        head_dim);
     }
@@ -19052,6 +19067,7 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor(
                                                                      raw_cap,
                                                                      raw_start,
                                                                      window,
+                                                                     false,
                                                                      n_head,
                                                                      head_dim)) {
             return 0;
@@ -19063,6 +19079,66 @@ int ds4_gpu_attention_decode_raw_batch_heads_tensor(
     return 1;
 }
 
+int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor( /* Metal entry point for non-causal raw-batch attention; returns 1 on success, 0 on any failure */
+        ds4_gpu_tensor       *heads,
+        const void             *model_map,
+        uint64_t                model_size,
+        uint64_t                sinks_offset,
+        const ds4_gpu_tensor *q,
+        const ds4_gpu_tensor *raw_kv,
+        uint32_t                n_tokens,
+        uint32_t                n_raw,
+        uint32_t                raw_cap,
+        uint32_t                raw_start,
+        uint32_t                n_head,
+        uint32_t                head_dim) {
+    if (!g_initialized && !ds4_gpu_init()) return 0; /* initialise the backend on first use */
+    if (!heads || !q || !raw_kv || !model_map || n_tokens == 0 ||
+        n_raw == 0 || raw_cap < n_raw || raw_start >= raw_cap) { /* reject null pointers, empty batches and out-of-range raw window values */
+        return 0;
+    }
+
+    @autoreleasepool {
+        if (sinks_offset > model_size || (uint64_t)n_head * sizeof(float) > model_size - sinks_offset) { /* n_head floats must fit inside the mapped model starting at sinks_offset */
+            fprintf(stderr, "ds4: Metal attention sinks range is outside the mapped model\n");
+            return 0;
+        }
+
+        uint64_t sinks_inner = 0; /* filled in by the wrap call and passed on as the offset into sinks_buf */
+        id<MTLBuffer> sinks_buf = ds4_gpu_wrap_model_range(model_map, model_size,
+                                                             sinks_offset,
+                                                             (uint64_t)n_head * sizeof(float),
+                                                             &sinks_inner);
+        if (!sinks_buf) return 0;
+
+        int owned = 0;
+        id<MTLCommandBuffer> cb = ds4_gpu_command_buffer(&owned);
+        if (!cb) return 0;
+
+        if (!ds4_gpu_encode_flash_attention_decode_raw_batch_heads(cb,
+                                                                     heads,
+                                                                     sinks_buf,
+                                                                     (NSUInteger)sinks_inner,
+                                                                     q,
+                                                                     raw_kv,
+                                                                     n_tokens,
+                                                                     0, /* presumably pos0 -- confirm against the encoder's parameter list */
+                                                                     n_raw,
+                                                                     raw_cap,
+                                                                     raw_start,
+                                                                     0, /* window */
+                                                                     true, /* noncausal: the encoder fills the mask with the all-zero helper */
+                                                                     n_head,
+                                                                     head_dim)) {
+            return 0;
+        }
+
+        if (!ds4_gpu_finish_command_buffer(cb, owned, "dspark noncausal batch attention heads")) return 0;
+    }
+
+    return 1;
+}
+
 int ds4_gpu_attention_decode_mixed_batch_heads_tensor(
         ds4_gpu_tensor       *heads,
         const void             *model_map,
diff --git a/gguf-tools/README.md b/gguf-tools/README.md
index f692a86d1..1636f4f4f 100644
--- a/gguf-tools/README.md
+++ b/gguf-tools/README.md
@@ -13,6 +13,9 @@ The important pieces are:
   importance with `ds4`.
 - `quality-testing/`: prompts and scripts used to compare local GGUF variants
   against official DeepSeek V4 Flash continuations.
+- `deepspec/ds4_deepspec.py`: validates DS4 target-cache exports against the
+  DeepSpec v2 manifest/index/shard contract and emits the DS4-side non-Markov
+  DeepSpec config scaffold before external training.
 
 ## Build
 
@@ -108,6 +111,29 @@ gguf-tools/deepseek4-quantize \
 `--compare-tensor` regenerates a single tensor and byte-compares it against the
 template or `--compare-gguf`.  `--threads N` controls routed-expert workers.
 
+## Generate A DSpark/DeepSpec Draft GGUF
+
+Official DeepSeek-V4-Flash DSpark/DeepSpec Markov draft weights are stored in
+separate Hugging Face safetensor shards under the `mtp.*` namespace. Convert
+those shards into a DS4 auxiliary MTP GGUF with `--dspark-only`; the main Flash
+template supplies tokenizer metadata, tensor order, and GGUF layout:
+
+```sh
+gguf-tools/deepseek4-quantize \
+  --hf gguf/dspark-hf \
+  --template gguf/ds4flash.gguf \
+  --out gguf/deepseek4.dspark.gguf \
+  --dspark-only
+```
+
+The converter detects the official Markov layout from `mtp.0.main_proj.weight`
+plus `mtp.2.markov_head.markov_w1.weight` and `mtp.2.confidence_head.proj.weight`,
+stores the rank-256 Markov weights as F16, emits `deepseek4.dspark.*` metadata,
+and reads `config.json` from the model repository root as a fallback when
+`inference/config.json` is not present. Use `--dry-run` before writing and
+`--self-test-dspark-map` after changing tensor mapping rules.
+
+
 ## When No Imatrix Is Given
 
 `iq2_xxs` requires an importance vector.  If `--imatrix` is not provided and
diff --git a/gguf-tools/deepseek4-quantize.c b/gguf-tools/deepseek4-quantize.c
index 3955b4352..c32053a8e 100644
--- a/gguf-tools/deepseek4-quantize.c
+++ b/gguf-tools/deepseek4-quantize.c
@@ -36,6 +36,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <pthread.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
 #if defined(_WIN32)
 #error "deepseek4-quantize.c currently targets POSIX systems"
@@ -47,6 +49,13 @@
 #define DS4_KV_QUANTIZE_IMATRIX_N_CHUNKS  "quantize.imatrix.chunks_count"
 #define DS4_GGUF_DEFAULT_ALIGNMENT 32
 
+#define DS4_KV_DSPARK_N_MTP_LAYERS       "deepseek4.dspark.n_mtp_layers"
+#define DS4_KV_DSPARK_BLOCK_SIZE         "deepseek4.dspark.block_size"
+#define DS4_KV_DSPARK_NOISE_TOKEN_ID     "deepseek4.dspark.noise_token_id"
+#define DS4_KV_DSPARK_MARKOV_RANK        "deepseek4.dspark.markov_rank"
+#define DS4_KV_DSPARK_TARGET_LAYER_ID    "deepseek4.dspark.target_layer_ids"
+#define DS4_DSPARK_TARGET_LAYER_COUNT    3
+
 typedef enum {
     GGUF_TYPE_UINT8   = 0,
     GGUF_TYPE_INT8    = 1,
@@ -142,6 +151,24 @@ static char *read_file(const char *path, size_t *len_out) {
     return buf;
 }
 
+static char *read_optional_file(const char *path, size_t *len_out) { /* read a whole file into a NUL-terminated buffer; a missing file yields NULL */
+    FILE *fp = fopen(path, "rb");
+    if (!fp) {
+        if (errno == ENOENT) return NULL; /* only "does not exist" is tolerated */
+        die_errno("open", path); /* presumably does not return -- confirm */
+    }
+    if (fseeko(fp, 0, SEEK_END) != 0) die_errno("seek", path);
+    off_t n = ftello(fp); /* file size taken from the end offset */
+    if (n < 0) die_errno("tell", path);
+    if (fseeko(fp, 0, SEEK_SET) != 0) die_errno("seek", path);
+    char *buf = xmalloc((size_t)n + 1); /* one extra byte for the terminator written below */
+    if (n && fread(buf, 1, (size_t)n, fp) != (size_t)n) die_errno("read", path); /* empty files skip the read */
+    buf[n] = '\0';
+    fclose(fp);
+    if (len_out) *len_out = (size_t)n; /* length excludes the terminator; len_out may be NULL */
+    return buf;
+}
+
 static uint64_t read_u64_le_fp(FILE *fp, const char *what) {
     uint8_t b[8];
     if (fread(b, 1, sizeof(b), fp) != sizeof(b)) {
@@ -874,24 +901,28 @@ typedef enum { EXP_NONE, EXP_W1, EXP_W2, EXP_W3 } expert_part;
 
 typedef struct {
     bool is_expert;
+    bool is_mtp; /* set when the tensor name matched the "mtp.%d." pattern instead of "blk.%d." */
     int layer;
     expert_part part;
 } expert_tensor;
 
-static expert_tensor parse_expert_tensor(const char *name) {
-    expert_tensor e = {0};
+static bool parse_expert_tensor_as(const char *name, const char *fmt, bool is_mtp, expert_tensor *out) { /* try one sscanf pattern; *out is written only on success */
     int layer = -1;
     char kind[16];
     int rest = 0;
-    if (sscanf(name, "blk.%d.ffn_%15[^_]_exps.weight%n", &layer, kind, &rest) == 2
-        && rest == (int)strlen(name))
-    {
-        if (strcmp(kind, "gate") == 0 || strcmp(kind, "down") == 0 || strcmp(kind, "up") == 0) {
-            e.is_expert = true;
-            e.layer = layer;
-            e.part = strcmp(kind, "gate") == 0 ? EXP_W1 : strcmp(kind, "down") == 0 ? EXP_W2 : EXP_W3;
-        }
-    }
+    if (sscanf(name, fmt, &layer, kind, &rest) != 2 || rest != (int)strlen(name)) return false; /* the %n offset must equal the full length, i.e. the whole name matched */
+    if (strcmp(kind, "gate") != 0 && strcmp(kind, "down") != 0 && strcmp(kind, "up") != 0) return false; /* only gate/down/up are routed-expert parts */
+    out->is_expert = true;
+    out->is_mtp = is_mtp;
+    out->layer = layer;
+    out->part = strcmp(kind, "gate") == 0 ? EXP_W1 : strcmp(kind, "down") == 0 ? EXP_W2 : EXP_W3; /* gate -> W1, down -> W2, up -> W3 */
+    return true;
+}
+
+static expert_tensor parse_expert_tensor(const char *name) { /* classify a GGUF tensor name; a zeroed result means "not a routed expert tensor" */
+    expert_tensor e = {0};
+    if (parse_expert_tensor_as(name, "blk.%d.ffn_%15[^_]_exps.weight%n", false, &e)) return e; /* main-model layers */
+    if (parse_expert_tensor_as(name, "mtp.%d.ffn_%15[^_]_exps.weight%n", true, &e)) return e; /* mtp.* layers */
     return e;
 }
 
@@ -905,6 +936,16 @@ static const char *expert_part_name(expert_part p) {
     return "";
 }
 
+static void expert_hf_prefix(char *buf, size_t cap, /* format the HF name prefix of one expert weight into buf (at most cap bytes) */
+                             const expert_tensor *e, int xid,
+                             const char *wid) {
+    if (e->is_mtp) {
+        snprintf(buf, cap, "mtp.%d.ffn.experts.%d.%s", e->layer, xid, wid); /* mtp.<layer>.ffn.experts.<xid>.<wid> */
+    } else {
+        snprintf(buf, cap, "layers.%d.ffn.experts.%d.%s", e->layer, xid, wid); /* layers.<layer>.ffn.experts.<xid>.<wid> */
+    }
+}
+
 typedef struct {
     const char *gguf;
     const char *hf;
@@ -950,34 +991,203 @@ static const name_map layer_map[] = {
     { "ffn_up_shexp.weight",              "ffn.shared_experts.w3.weight" },
     { "ffn_down_shexp.weight",            "ffn.shared_experts.w2.weight" },
     { "ffn_gate_inp.weight",              "ffn.gate.weight" },
+    { "ffn_gate_exps.weight",             "ffn.experts.*.w1.weight" },
+    { "ffn_up_exps.weight",               "ffn.experts.*.w3.weight" },
+    { "ffn_down_exps.weight",             "ffn.experts.*.w2.weight" },
     { "exp_probs_b.bias",                 "ffn.gate.bias" },
     { "ffn_gate_tid2eid.weight",          "ffn.gate.tid2eid" },
 };
 
-static char *hf_name_for_regular(const char *gguf_name) {
-    for (size_t i = 0; i < sizeof(top_map) / sizeof(top_map[0]); i++) {
-        if (strcmp(gguf_name, top_map[i].gguf) == 0) return xstrdup(top_map[i].hf);
-    }
+/* Extra per-layer suffix pairs (GGUF suffix, HF suffix) that hf_name_for_regular() passes for "mtp.<N>." tensors. */
+static const name_map dspark_mtp_map[] = {
+    { "main_proj.weight",                 "main_proj.weight" },
+    { "main_norm.weight",                 "main_norm.weight" },
+    { "norm.weight",                      "norm.weight" },
+    { "markov_head.markov_w1.weight",     "markov_head.markov_w1.weight" },
+    { "markov_head.markov_w2.weight",     "markov_head.markov_w2.weight" },
+    { "confidence_head.proj.weight",      "confidence_head.proj.weight" },
+    { "hc_head_base.weight",              "hc_head_base" }, /* the HF side of the hc_head_* entries has no ".weight" suffix */
+    { "hc_head_fn.weight",                "hc_head_fn" },
+    { "hc_head_scale.weight",             "hc_head_scale" },
+};
+
+static char *hf_name_for_mapped_layer( /* map "<gguf_prefix>.<N>.<suffix>" to a newly allocated "<hf_prefix>.<N>.<hf suffix>", or NULL when nothing matches */
+        const char     *gguf_name,
+        const char     *gguf_prefix,
+        const char     *hf_prefix,
+        const name_map *extra_map,
+        size_t          extra_map_len) {
     int layer = -1;
-    const char *p = gguf_name;
-    if (sscanf(p, "blk.%d.", &layer) != 1) {
-        fprintf(stderr, "error: cannot map GGUF tensor to HF tensor: %s\n", gguf_name);
-        exit(1);
-    }
-    const char *rest = strchr(p + 4, '.');
+    char scan_fmt[32];
+    snprintf(scan_fmt, sizeof(scan_fmt), "%s.%%d.", gguf_prefix); /* yields e.g. "blk.%d." */
+    if (sscanf(gguf_name, scan_fmt, &layer) != 1) return NULL; /* name does not start with "<prefix>.<int>" */
+
+    const char *rest = strchr(gguf_name + strlen(gguf_prefix) + 1, '.'); /* skip the prefix and its dot, then find the dot that ends the layer number */
     if (!rest) die("bad layer tensor name");
     rest++;
+
+    for (size_t i = 0; i < extra_map_len; i++) { /* extra_map entries are tried before layer_map; not entered when extra_map_len is 0 */
+        if (strcmp(rest, extra_map[i].gguf) == 0) {
+            char buf[512];
+            snprintf(buf, sizeof(buf), "%s.%d.%s", hf_prefix, layer, extra_map[i].hf);
+            return xstrdup(buf);
+        }
+    }
     for (size_t i = 0; i < sizeof(layer_map) / sizeof(layer_map[0]); i++) {
         if (strcmp(rest, layer_map[i].gguf) == 0) {
             char buf[512];
-            snprintf(buf, sizeof(buf), "layers.%d.%s", layer, layer_map[i].hf);
+            snprintf(buf, sizeof(buf), "%s.%d.%s", hf_prefix, layer, layer_map[i].hf); /* HF prefix is now a parameter instead of the fixed "layers" */
             return xstrdup(buf);
         }
     }
+    return NULL; /* suffix is in neither table */
+}
+
+static char *hf_name_for_regular(const char *gguf_name) { /* resolve a non-expert GGUF tensor name to its HF name; exits with status 1 if unmapped */
+    for (size_t i = 0; i < sizeof(top_map) / sizeof(top_map[0]); i++) {
+        if (strcmp(gguf_name, top_map[i].gguf) == 0) return xstrdup(top_map[i].hf); /* exact match on top_map entries */
+    }
+
+    char *hf_name = hf_name_for_mapped_layer(gguf_name, "blk", "layers", NULL, 0); /* "blk.<N>." names map to "layers.<N>." using layer_map only */
+    if (hf_name) return hf_name;
+
+    hf_name = hf_name_for_mapped_layer(gguf_name, "mtp", "mtp", /* "mtp.<N>." names keep their prefix and also consult dspark_mtp_map */
+                                       dspark_mtp_map,
+                                       sizeof(dspark_mtp_map) / sizeof(dspark_mtp_map[0]));
+    if (hf_name) return hf_name;
+
     fprintf(stderr, "error: cannot map GGUF tensor to HF tensor: %s\n", gguf_name);
     exit(1);
 }
 
+static void expect_hf_name(const char *gguf, const char *want) { /* self-test helper: check that gguf maps to want, exit(1) on mismatch */
+    char *got = hf_name_for_regular(gguf); /* exits itself when the name cannot be mapped at all */
+    if (strcmp(got, want) != 0) {
+        fprintf(stderr, "error: map %s -> %s, expected %s\n", gguf, got, want);
+        exit(1);
+    }
+    free(got);
+}
+
+typedef struct { /* DSpark settings; defaults come from dspark_metadata_defaults() and may be overridden from config.json */
+    uint32_t block_size; /* config key "dspark_block_size" */
+    uint32_t noise_token_id; /* config key "dspark_noise_token_id" */
+    uint32_t markov_rank; /* config key "dspark_markov_rank" */
+    uint32_t n_mtp_layers; /* config key "n_mtp_layers" */
+    uint32_t target_layer_ids[DS4_DSPARK_TARGET_LAYER_COUNT]; /* config key "dspark_target_layer_ids" (array) */
+} dspark_metadata;
+
+typedef enum { /* layout classification produced by dspark_hf_layout_guess() */
+    DS4_DSPARK_HF_NONE = 0, /* no recognised DSpark layout */
+    DS4_DSPARK_HF_MARKOV, /* main_proj plus both the Markov w1 and confidence projection tensors */
+    DS4_DSPARK_HF_NONSEQ, /* main_proj only, with markov_rank explicitly set to 0 */
+} dspark_hf_layout;
+
+static const char *dspark_hf_layout_name(dspark_hf_layout layout) { /* lowercase label for a layout value */
+    switch (layout) {
+    case DS4_DSPARK_HF_MARKOV: return "markov";
+    case DS4_DSPARK_HF_NONSEQ: return "nonseq";
+    case DS4_DSPARK_HF_NONE:
+    default: return "none"; /* unknown values are reported as "none" */
+    }
+}
+
+static bool is_mtp_tensor_name(const char *name) { /* true when name begins with "mtp." */
+    return str_starts(name, "mtp.");
+}
+
+static bool is_dspark_special_tensor(const char *name) { /* true when name contains any of the listed suffix strings (substring match, any position) */
+    return strstr(name, ".main_proj.weight") != NULL ||
+           strstr(name, ".main_norm.weight") != NULL ||
+           strstr(name, ".attn_norm.weight") != NULL ||
+           strstr(name, ".attn_q_a_norm.weight") != NULL ||
+           strstr(name, ".attn_kv_a_norm.weight") != NULL ||
+           strstr(name, ".ffn_norm.weight") != NULL ||
+           strstr(name, ".markov_head.markov_w1.weight") != NULL ||
+           strstr(name, ".markov_head.markov_w2.weight") != NULL ||
+           strstr(name, ".confidence_head.proj.weight") != NULL;
+}
+
+static bool is_dspark_kv_key(const char *key) { /* true for the deepseek4.dspark.* metadata keys defined by the DS4_KV_DSPARK_* macros */
+    return strcmp(key, DS4_KV_DSPARK_N_MTP_LAYERS) == 0 ||
+           strcmp(key, DS4_KV_DSPARK_BLOCK_SIZE) == 0 ||
+           strcmp(key, DS4_KV_DSPARK_NOISE_TOKEN_ID) == 0 ||
+           strcmp(key, DS4_KV_DSPARK_MARKOV_RANK) == 0 ||
+           strncmp(key, DS4_KV_DSPARK_TARGET_LAYER_ID, strlen(DS4_KV_DSPARK_TARGET_LAYER_ID)) == 0; /* prefix match: keys that merely start with the target-layer key also count */
+}
+
+static dspark_hf_layout dspark_hf_layout_guess(bool has_main_proj, /* classify a checkpoint from tensor-presence flags and the configured Markov rank */
+                                               bool has_markov_w1,
+                                               bool has_confidence_proj,
+                                               bool markov_rank_set,
+                                               uint32_t markov_rank) {
+    if (!has_main_proj) return DS4_DSPARK_HF_NONE; /* main_proj is required for every layout */
+    if (has_markov_w1 && has_confidence_proj) return DS4_DSPARK_HF_MARKOV; /* both head tensors present */
+    if (!has_markov_w1 && !has_confidence_proj && markov_rank_set && markov_rank == 0) { /* both absent and rank explicitly configured as 0 */
+        return DS4_DSPARK_HF_NONSEQ;
+    }
+    return DS4_DSPARK_HF_NONE; /* only one head tensor present, or rank not explicitly 0 */
+}
+
+static dspark_hf_layout db_dspark_hf_layout(const st_db *db, bool markov_rank_set, uint32_t markov_rank) { /* look up the three marker tensors in db and classify the layout */
+    return dspark_hf_layout_guess(db_has(db, "mtp.0.main_proj.weight"),
+                                  db_has(db, "mtp.2.markov_head.markov_w1.weight"), /* NOTE(review): layer index 2 is fixed here regardless of n_mtp_layers */
+                                  db_has(db, "mtp.2.confidence_head.proj.weight"),
+                                  markov_rank_set,
+                                  markov_rank);
+}
+
+static dspark_metadata dspark_metadata_defaults(void) { /* built-in values used when config.json does not provide a field */
+    dspark_metadata m = {
+        .block_size = 5,
+        .noise_token_id = 128799,
+        .markov_rank = 256,
+        .n_mtp_layers = 3,
+        .target_layer_ids = {40, 41, 42},
+    };
+    return m;
+}
+
+static void dspark_metadata_apply_hf_config_path(dspark_metadata *m, const char *cfg_path, bool *markov_rank_set) { /* overlay fields found in one JSON config file onto *m; a missing file is a no-op */
+    size_t len = 0;
+    char *jtext = read_optional_file(cfg_path, &len);
+    if (!jtext) return; /* file does not exist: leave *m and *markov_rank_set untouched */
+    json_doc d = json_parse_text(jtext, len);
+    int block = json_obj_get(&d, 0, "dspark_block_size"); /* each lookup result is used only when it is >= 0 */
+    int noise = json_obj_get(&d, 0, "dspark_noise_token_id");
+    int rank = json_obj_get(&d, 0, "dspark_markov_rank");
+    int n_mtp = json_obj_get(&d, 0, "n_mtp_layers");
+    int layers = json_obj_get(&d, 0, "dspark_target_layer_ids");
+    if (block >= 0) m->block_size = (uint32_t)json_i64(&d, block); /* NOTE(review): cast to uint32_t has no range check */
+    if (noise >= 0) m->noise_token_id = (uint32_t)json_i64(&d, noise);
+    if (rank >= 0) {
+        m->markov_rank = (uint32_t)json_i64(&d, rank);
+        if (markov_rank_set) *markov_rank_set = true; /* records that the rank came from a config file; pointer may be NULL */
+    }
+    if (n_mtp >= 0) m->n_mtp_layers = (uint32_t)json_i64(&d, n_mtp);
+    if (layers >= 0 && d.v[layers].type == JT_ARRAY) {
+        int n = 0;
+        for (int i = layers + 1; i < d.len && d.v[i].parent == layers && n < DS4_DSPARK_TARGET_LAYER_COUNT;) { /* copy at most DS4_DSPARK_TARGET_LAYER_COUNT elements; slots not reached keep their previous values */
+            m->target_layer_ids[n++] = (uint32_t)json_i64(&d, i);
+            i = json_skip(&d, i); /* the loop advances only through json_skip */
+        }
+    }
+    json_free(&d);
+    free(jtext);
+}
+
+static dspark_metadata dspark_metadata_from_hf_config(const char *hf_dir, bool *markov_rank_set) { /* defaults, then <hf_dir>/config.json, then <hf_dir>/inference/config.json */
+    if (markov_rank_set) *markov_rank_set = false; /* reset before either file is consulted */
+    dspark_metadata m = dspark_metadata_defaults();
+    char *root_cfg_path = path_join(hf_dir, "config.json");
+    dspark_metadata_apply_hf_config_path(&m, root_cfg_path, markov_rank_set);
+    free(root_cfg_path);
+    char *inference_cfg_path = path_join(hf_dir, "inference/config.json");
+    dspark_metadata_apply_hf_config_path(&m, inference_cfg_path, markov_rank_set); /* applied second, so its fields override the root config */
+    free(inference_cfg_path);
+    return m;
+}
+
 typedef struct {
     char *prefix;
     ds4q_type type;
@@ -1000,14 +1210,27 @@ static bool is_attention_tensor(const char *name) {
     return strstr(name, ".attn") || strstr(name, "attn_") || strstr(name, ".indexer") || strstr(name, "indexer_");
 }
 
+static bool is_norm_tensor(const char *name) { /* true for "output_norm.weight" or any name containing "_norm.weight" or ".norm.weight" */
+    return strcmp(name, "output_norm.weight") == 0 ||
+           strstr(name, "_norm.weight") != NULL ||
+           strstr(name, ".norm.weight") != NULL;
+}
+
 static bool is_shared_expert(const char *name) {
     return strstr(name, "_shexp.") != NULL;
 }
-
 static bool is_output_tensor(const char *name) {
     return str_starts(name, "output.");
 }
 
+static bool is_loader_plain_f16_tensor(const char *name) { /* name-based list of tensors; presumably those the loader expects as plain F16 -- confirm against the loader */
+    return strcmp(name, "output_hc_fn.weight") == 0 ||
+           strstr(name, ".hc_attn_fn.weight") != NULL ||
+           strstr(name, ".hc_ffn_fn.weight") != NULL ||
+           strstr(name, ".hc_head_fn.weight") != NULL ||
+           strstr(name, ".ffn_gate_inp.weight") != NULL;
+}
+
 typedef struct {
     char *name;
     int n_dims;
@@ -1041,6 +1264,19 @@ static ds4q_type policy_type(const quant_policy *p, const char *name, const tens
         tmpl->type != DS4Q_TYPE_BF16 && !ds4q_can_quantize(tmpl->type)) {
         return tmpl->type;
     }
+    if (is_mtp_tensor_name(name) && is_dspark_special_tensor(name)) {
+        if (strstr(name, ".confidence_head.proj.weight")) return DS4Q_TYPE_F32;
+        if (strstr(name, ".main_proj.weight")) return DS4Q_TYPE_Q8_0;
+        if (strstr(name, ".main_norm.weight") || strstr(name, ".attn_norm.weight") ||
+            strstr(name, ".attn_q_a_norm.weight") || strstr(name, ".attn_kv_a_norm.weight") ||
+            strstr(name, ".ffn_norm.weight")) return DS4Q_TYPE_F32;
+        if (strstr(name, ".markov_head.markov_w1.weight") ||
+            strstr(name, ".markov_head.markov_w2.weight")) {
+            return tmpl->type == DS4Q_TYPE_F32 ? DS4Q_TYPE_F32 : DS4Q_TYPE_F16;
+        }
+    }
+    if (is_loader_plain_f16_tensor(name)) return DS4Q_TYPE_F16;
+    if (is_norm_tensor(name)) return DS4Q_TYPE_F32;
     if (tensor_n_dims(tmpl) <= 1) return tmpl->type;
     if (strcmp(name, "token_embd.weight") == 0 && p->embedding != DS4Q_TYPE_COUNT) return p->embedding;
     if (is_output_tensor(name) && p->output != DS4Q_TYPE_COUNT) return p->output;
@@ -1051,6 +1287,148 @@ static ds4q_type policy_type(const quant_policy *p, const char *name, const tens
     return tmpl->type;
 }
 
+/* Self-test helper: policy_type() on a synthetic 4096x4096 2-D template must yield `want`, else exit(1). */
+static void expect_policy_type(const quant_policy *p, const char *name, ds4q_type tmpl_type, ds4q_type want) {
+    tensor_meta probe = {
+        .name = (char *)name,
+        .n_dims = 2,
+        .ne = {4096, 4096, 1, 1},
+        .type = tmpl_type,
+    };
+    const ds4q_type actual = policy_type(p, name, &probe);
+    if (actual == want) return;
+    fprintf(stderr, "error: policy %s -> %s, expected %s\n",
+            name, ds4q_type_name(actual), ds4q_type_name(want));
+    exit(1);
+}
+
+static void self_test_dspark_only_args(void);
+static ds4q_type dspark_template_for_name(const char *name, ds4q_type hf_type);
+
+/* Self-test helper: exits with a diagnostic unless dspark_template_for_name(name, hf_type) is `want`. */
+static void expect_dspark_template_type(const char *name, ds4q_type hf_type, ds4q_type want) {
+    const ds4q_type actual = dspark_template_for_name(name, hf_type);
+    if (actual == want) return;
+    fprintf(stderr, "error: DSpark template %s -> %s, expected %s\n",
+            name, ds4q_type_name(actual), ds4q_type_name(want));
+    exit(1);
+}
+
+/* --self-test-dspark-map body: checks name map, quant policy, template types, layout guess and config parsing; dies on first failure. */
+static void self_test_dspark_map(void) {
+    expect_hf_name("mtp.0.hc_attn_base.weight", "mtp.0.hc_attn_base");
+    expect_hf_name("mtp.0.main_proj.weight", "mtp.0.main_proj.weight");
+    expect_hf_name("mtp.2.markov_head.markov_w1.weight", "mtp.2.markov_head.markov_w1.weight");
+    expect_hf_name("mtp.2.confidence_head.proj.weight", "mtp.2.confidence_head.proj.weight");
+    expert_tensor routed = parse_expert_tensor("mtp.2.ffn_down_exps.weight");
+    if (!routed.is_expert || !routed.is_mtp || routed.layer != 2 || routed.part != EXP_W2) {
+        die("bad DSpark MTP routed expert parse");
+    }
+    char eprefix[256];
+    expert_hf_prefix(eprefix, sizeof(eprefix), &routed, 7, expert_part_name(routed.part));
+    if (strcmp(eprefix, "mtp.2.ffn.experts.7.w2") != 0) {
+        die("bad DSpark MTP expert HF prefix");
+    }
+    quant_policy pol = {0};
+    pol.dense = DS4Q_TYPE_Q4_K;
+    expect_policy_type(&pol, "mtp.0.main_proj.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_Q8_0);
+    expect_policy_type(&pol, "mtp.2.markov_head.markov_w1.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16);
+    expect_policy_type(&pol, "mtp.2.confidence_head.proj.weight", DS4Q_TYPE_F32, DS4Q_TYPE_F32);
+    expect_policy_type(&pol, "mtp.2.hc_head_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16);
+    expect_policy_type(&pol, "mtp.0.hc_attn_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16);
+    expect_policy_type(&pol, "mtp.0.ffn_gate_inp.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16);
+    expect_policy_type(&pol, "blk.0.hc_ffn_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16);
+    expect_policy_type(&pol, "output_hc_fn.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16);
+    expect_policy_type(&pol, "blk.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_policy_type(&pol, "blk.0.attn_q_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_policy_type(&pol, "blk.0.attn_kv_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_policy_type(&pol, "blk.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    pol.dense = DS4Q_TYPE_COUNT; /* NOTE(review): COUNT appears to mean "no override", as for p->embedding in policy_type() - confirm */
+    expect_policy_type(&pol, "mtp.0.main_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_policy_type(&pol, "mtp.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_policy_type(&pol, "mtp.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.hc_attn_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.hc_attn_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.attn_sinks.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.attn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.attn_q_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.attn_kv_a_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.hc_ffn_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.hc_ffn_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.ffn_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.exp_probs_b.bias", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.2.main_norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.2.norm.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.0.ffn_gate_inp.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F16);
+    expect_dspark_template_type("mtp.2.hc_head_base.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.2.hc_head_scale.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    expect_dspark_template_type("mtp.2.confidence_head.proj.weight", DS4Q_TYPE_BF16, DS4Q_TYPE_F32);
+    if (dspark_hf_layout_guess(true, true, true, false, 0) != DS4_DSPARK_HF_MARKOV) {
+        die("official DSpark HF layout not detected");
+    }
+    if (dspark_hf_layout_guess(true, false, false, true, 0) != DS4_DSPARK_HF_NONSEQ) {
+        die("nonseq DSpark HF layout not detected");
+    }
+    if (dspark_hf_layout_guess(true, false, false, false, 0) != DS4_DSPARK_HF_NONE) {
+        die("main-proj-only DSpark layout detected without markov_rank=0 metadata");
+    }
+    char tmpdir[] = "/tmp/ds4q-config-XXXXXX"; /* case 1: DSpark keys live in the root config.json */
+    char *dir = mkdtemp(tmpdir);
+    if (!dir) die_errno("mkdtemp", tmpdir);
+    char *cfg_path = path_join(dir, "config.json");
+    FILE *cfp = fopen(cfg_path, "wb");
+    if (!cfp) die_errno("create config", cfg_path);
+    fputs("{\"dspark_block_size\":7,\"dspark_noise_token_id\":9,\"dspark_markov_rank\":0,"
+          "\"n_mtp_layers\":3,\"dspark_target_layer_ids\":[5,6,7]}", cfp);
+    if (fclose(cfp) != 0) die_errno("close config", cfg_path);
+    bool rank_set = false;
+    dspark_metadata fm = dspark_metadata_from_hf_config(dir, &rank_set);
+    if (!rank_set || fm.block_size != 7 || fm.noise_token_id != 9 || fm.markov_rank != 0 ||
+        fm.n_mtp_layers != 3 || fm.target_layer_ids[0] != 5 || fm.target_layer_ids[2] != 7) {
+        die("bad DSpark root config metadata parse");
+    }
+    unlink(cfg_path);
+    free(cfg_path);
+    rmdir(dir);
+    char tmpdir_inference[] = "/tmp/ds4q-config-merge-XXXXXX"; /* case 2: DSpark keys only in inference/config.json */
+    char *dir_inference = mkdtemp(tmpdir_inference);
+    if (!dir_inference) die_errno("mkdtemp", tmpdir_inference);
+    char *root_cfg_path = path_join(dir_inference, "config.json");
+    FILE *root_cfp = fopen(root_cfg_path, "wb");
+    if (!root_cfp) die_errno("create root config", root_cfg_path);
+    fputs("{\"num_nextn_predict_layers\":1}", root_cfp);
+    if (fclose(root_cfp) != 0) die_errno("close root config", root_cfg_path);
+    char *inf_dir = path_join(dir_inference, "inference");
+    if (mkdir(inf_dir, 0700) != 0) die_errno("mkdir", inf_dir);
+    char *inf_cfg_path = path_join(inf_dir, "config.json");
+    FILE *inf_cfp = fopen(inf_cfg_path, "wb");
+    if (!inf_cfp) die_errno("create inference config", inf_cfg_path);
+    fputs("{\"dspark_block_size\":8,\"dspark_noise_token_id\":11,\"dspark_markov_rank\":0,"
+          "\"n_mtp_layers\":3,\"dspark_target_layer_ids\":[40,41,42]}", inf_cfp);
+    if (fclose(inf_cfp) != 0) die_errno("close inference config", inf_cfg_path);
+    rank_set = false;
+    fm = dspark_metadata_from_hf_config(dir_inference, &rank_set);
+    if (!rank_set || fm.block_size != 8 || fm.noise_token_id != 11 || fm.markov_rank != 0 ||
+        fm.n_mtp_layers != 3 || fm.target_layer_ids[0] != 40 || fm.target_layer_ids[2] != 42) {
+        die("bad DSpark inference config metadata merge");
+    }
+    unlink(inf_cfg_path); /* remove files before their directories so rmdir() can succeed */
+    unlink(root_cfg_path);
+    rmdir(inf_dir);
+    rmdir(dir_inference);
+    free(inf_cfg_path);
+    free(inf_dir);
+    free(root_cfg_path);
+    dspark_metadata dm = dspark_metadata_defaults(); /* case 3: built-in defaults */
+    if (dm.block_size != 5 || dm.noise_token_id != 128799 || dm.markov_rank != 256 ||
+        dm.n_mtp_layers != 3 || dm.target_layer_ids[0] != 40) {
+        die("bad DSpark metadata defaults");
+    }
+    self_test_dspark_only_args();
+    puts("dspark_map: OK");
+}
+
+
 static ds4q_type parse_type(const char *raw) {
     char wanted[64];
     size_t n = 0;
@@ -1150,18 +1528,23 @@ static size_t tensor_nbytes(ds4q_type type, const int64_t *ne, int n_dims) {
     return nbytes;
 }
 
+/* True when `info` has rank nd and its dims, read last-to-first, equal tmpl->ne[0..nd). */
+static bool reversed_shape_matches(const st_info *info, const tensor_meta *tmpl, int nd) {
+    bool same = info->n_dims == nd;
+    /* Walk the HF shape from its last axis while the template index counts up. */
+    for (int k = nd - 1, d = 0; same && k >= 0; k--, d++) same = tmpl->ne[d] == info->shape[k];
+    return same;
+}
+
 static void check_reversed_shape(const char *gguf_name, const st_info *info, const tensor_meta *tmpl) {
-    int nd = tensor_n_dims(tmpl);
-    if (info->n_dims != nd) {
+    if (reversed_shape_matches(info, tmpl, tmpl->n_dims)) return; /* accept a match at the template's declared rank */
+    if (reversed_shape_matches(info, tmpl, tensor_n_dims(tmpl))) return; /* or at the rank tensor_n_dims() reports */
+    if (info->n_dims != tmpl->n_dims && info->n_dims != tensor_n_dims(tmpl)) { /* HF rank equals neither candidate */
         fprintf(stderr, "error: rank mismatch for %s\n", gguf_name);
         exit(1);
     }
-    for (int i = 0; i < nd; i++) {
-        if (tmpl->ne[i] != info->shape[nd - 1 - i]) {
-            fprintf(stderr, "error: shape mismatch for %s\n", gguf_name);
-            exit(1);
-        }
-    }
+    fprintf(stderr, "error: shape mismatch for %s\n", gguf_name); /* rank matched a candidate but the dims differ */
+    exit(1);
 }
 
 static byte_buf generate_regular(st_db *db, const char *gguf_name, const tensor_meta *tmpl,
@@ -1223,7 +1606,7 @@ typedef struct {
 
 static void generate_one_expert(expert_job *j, int xid) {
     char prefix[256];
-    snprintf(prefix, sizeof(prefix), "layers.%d.ffn.experts.%d.%s", j->expert.layer, xid, j->wid);
+    expert_hf_prefix(prefix, sizeof(prefix), &j->expert, xid, j->wid);
     char weight_name[320];
     char scale_name[320];
     snprintf(weight_name, sizeof(weight_name), "%s.weight", prefix);
@@ -1339,6 +1722,8 @@ typedef struct {
     size_t data_offset;
     size_t tensor_bytes;
     size_t alignment;
+    bool write_dspark;
+    dspark_metadata dspark;
 } output_context;
 
 static size_t gguf_scalar_size(uint32_t type) {
@@ -1455,6 +1840,62 @@ static void write_imatrix_kvs(FILE *fp, const imatrix_store *im) {
     }
 }
 
+/* Size of a scalar KV value: 4 bytes (the u32 type tag write_dspark_kvs() emits first) plus the payload. */
+static size_t gguf_kv_scalar_size(uint32_t type) {
+    return 4 + gguf_scalar_size(type);
+}
+
+/* Size of one uint32 KV record: the encoded key string plus a GGUF_TYPE_UINT32 scalar value. */
+static size_t gguf_kv_u32_size(const char *key) {
+    return gguf_string_size(key) + gguf_kv_scalar_size(GGUF_TYPE_UINT32);
+}
+
+static uint64_t extra_dspark_kv_count(bool enabled) {
+    /* Four scalar keys plus one per target layer, matching what write_dspark_kvs() emits. */
+    return enabled ? (uint64_t)(4 + DS4_DSPARK_TARGET_LAYER_COUNT) : 0;
+}
+
+/* Serialized byte size of the records write_dspark_kvs() emits; 0 when DSpark output is disabled. */
+static size_t extra_dspark_kv_size(bool enabled) {
+    if (!enabled) return 0;
+    size_t total = gguf_kv_u32_size(DS4_KV_DSPARK_N_MTP_LAYERS) +
+                   gguf_kv_u32_size(DS4_KV_DSPARK_BLOCK_SIZE) +
+                   gguf_kv_u32_size(DS4_KV_DSPARK_NOISE_TOKEN_ID) +
+                   gguf_kv_u32_size(DS4_KV_DSPARK_MARKOV_RANK);
+    char key[64];
+    for (int layer = 0; layer < DS4_DSPARK_TARGET_LAYER_COUNT; layer++) {
+        snprintf(key, sizeof(key), "%s.%d", DS4_KV_DSPARK_TARGET_LAYER_ID, layer);
+        total += gguf_kv_u32_size(key);
+    }
+    return total;
+}
+
+/*
+ * Append the DSpark metadata to `fp` as GGUF uint32 KV records: the four
+ * scalar keys first, then one "<target-layer-key>.<i>" record per target layer.
+ * extra_dspark_kv_count()/extra_dspark_kv_size() must stay in step with this.
+ */
+static void write_dspark_kvs(FILE *fp, const dspark_metadata *m) {
+    const struct { const char *key; uint32_t value; } scalars[] = {
+        { DS4_KV_DSPARK_N_MTP_LAYERS,   m->n_mtp_layers },
+        { DS4_KV_DSPARK_BLOCK_SIZE,     m->block_size },
+        { DS4_KV_DSPARK_NOISE_TOKEN_ID, m->noise_token_id },
+        { DS4_KV_DSPARK_MARKOV_RANK,    m->markov_rank },
+    };
+    for (size_t k = 0; k < sizeof(scalars) / sizeof(scalars[0]); k++) {
+        write_gguf_string(fp, scalars[k].key);
+        write_u32(fp, GGUF_TYPE_UINT32);
+        write_u32(fp, scalars[k].value);
+    }
+    char key[64];
+    for (int layer = 0; layer < DS4_DSPARK_TARGET_LAYER_COUNT; layer++) {
+        snprintf(key, sizeof(key), "%s.%d", DS4_KV_DSPARK_TARGET_LAYER_ID, layer);
+        write_gguf_string(fp, key);
+        write_u32(fp, GGUF_TYPE_UINT32);
+        write_u32(fp, m->target_layer_ids[layer]);
+    }
+}
+
 static gguf_file load_gguf_metadata(const char *path) {
     gguf_file g = {0};
     g.path = xstrdup(path);
@@ -1499,7 +1940,7 @@ static gguf_file load_gguf_metadata(const char *path) {
          * otherwise the output can contain duplicate GGUF metadata with stale
          * and new values.
          */
-        if (!is_imatrix_kv_key(key)) {
+        if (!is_imatrix_kv_key(key) && !is_dspark_kv_key(key)) {
             kv_keep[n_kv_keep++] = (byte_span){
                 .start = (size_t)(rec_start - kv_start),
                 .end = (size_t)(rec_end - kv_start),
@@ -1549,6 +1990,149 @@ static gguf_file load_gguf_metadata(const char *path) {
     return g;
 }
 
+static void gguf_replace_tensors_start(gguf_file *g) {
+    /* Release every tensor name, then the array and the name map, leaving `g` with no tensors. */
+    for (uint64_t left = g->n_tensors; left > 0; left--) free(g->tensors[left - 1].name);
+    free(g->tensors);
+    g->tensors = NULL;
+    g->n_tensors = g->data_offset = 0;
+    hmap_free(&g->tensor_map);
+}
+
+/* Append one tensor record to g->tensors: the name is copied and the byte size derived from type and dims. */
+static void gguf_add_tensor_meta(gguf_file *g, const char *name, int n_dims, const int64_t *ne, ds4q_type type) {
+    const size_t slot = (size_t)g->n_tensors;
+    g->tensors = xrealloc(g->tensors, (slot + 1) * sizeof(g->tensors[0]));
+    g->n_tensors++;
+    tensor_meta *rec = &g->tensors[slot];
+    *rec = (tensor_meta){ .name = xstrdup(name), .n_dims = n_dims, .type = type }; /* unnamed fields start zeroed */
+    for (int d = 0; d < n_dims; d++) rec->ne[d] = ne[d];
+    rec->size = tensor_nbytes(type, rec->ne, rec->n_dims);
+}
+
+static ds4q_type template_type_for_hf_dtype(const char *dtype) {
+    static const struct { const char *hf; ds4q_type tmpl; } known[] = { /* HF dtype string -> starting template type */
+        {"F32", DS4Q_TYPE_F32}, {"BF16", DS4Q_TYPE_BF16}, {"F8_E4M3", DS4Q_TYPE_F16}, {"I8", DS4Q_TYPE_Q4_K}, {"I64", DS4Q_TYPE_I32},
+    };
+    for (size_t k = 0; k < sizeof(known) / sizeof(known[0]); k++)
+        if (strcmp(dtype, known[k].hf) == 0) return known[k].tmpl;
+    fprintf(stderr, "error: unsupported HF dtype for DSpark template: %s\n", dtype); /* any other dtype is fatal */
+    exit(1);
+}
+
+/*
+ * Exact-name test for the non-routed per-stage tensors: `rest` is a layer_map
+ * GGUF name without the "mtp.<stage>." prefix that gguf_add_dspark_stage()
+ * adds before copying the tensor through gguf_add_regular_from_hf().
+ */
+static bool is_dspark_required_stage_tensor(const char *rest) {
+    static const char *const required[] = {
+        "hc_attn_fn.weight", "hc_attn_scale.weight", "hc_attn_base.weight",
+        "attn_norm.weight",
+        "attn_q_a.weight", "attn_q_a_norm.weight", "attn_q_b.weight",
+        "attn_kv.weight", "attn_kv_a_norm.weight",
+        "attn_sinks.weight",
+        "attn_output_a.weight", "attn_output_b.weight",
+        "hc_ffn_fn.weight", "hc_ffn_scale.weight", "hc_ffn_base.weight",
+        "ffn_norm.weight",
+        "ffn_gate_inp.weight", "exp_probs_b.bias",
+        "ffn_gate_shexp.weight", "ffn_up_shexp.weight", "ffn_down_shexp.weight",
+    };
+    for (size_t k = 0; k < sizeof(required) / sizeof(required[0]); k++) {
+        if (strcmp(rest, required[k]) == 0) return true;
+    }
+    return false;
+}
+
+static bool is_dspark_routed_stage_tensor(const char *rest) {
+    /* Exact match on the three routed-expert tensor names (gate, up, down). */
+    static const char *const routed[] = {"ffn_gate_exps.weight", "ffn_up_exps.weight", "ffn_down_exps.weight"};
+    return !strcmp(rest, routed[0]) || !strcmp(rest, routed[1]) || !strcmp(rest, routed[2]);
+}
+
+/* True for DSpark tensor names that dspark_template_for_name() pins to an F32 template type. */
+static bool is_dspark_loader_f32_tensor(const char *name) {
+    static const char *const f32_markers[] = {
+        ".main_norm.weight", ".attn_norm.weight", ".ffn_norm.weight",
+        ".attn_q_a_norm.weight", ".attn_kv_a_norm.weight",
+        ".hc_attn_scale.weight", ".hc_attn_base.weight", ".attn_sinks.weight",
+        ".hc_ffn_scale.weight", ".hc_ffn_base.weight",
+        ".hc_head_base.weight", ".hc_head_scale.weight",
+        ".exp_probs_b.bias", ".confidence_head.proj.weight",
+    };
+    /* A bare ".norm.weight" only qualifies for names that start with "mtp.". */
+    if (strstr(name, ".norm.weight") && str_starts(name, "mtp.")) return true;
+    for (size_t k = 0; k < sizeof(f32_markers) / sizeof(f32_markers[0]); k++) {
+        if (strstr(name, f32_markers[k])) return true;
+    }
+    return false;
+}
+
+/* Pick the template type for a DSpark tensor name; rules are tried in order and hf_type is the fallback. */
+static ds4q_type dspark_template_for_name(const char *name, ds4q_type hf_type) {
+    if (is_dspark_loader_f32_tensor(name)) return DS4Q_TYPE_F32;
+    const bool markov = strstr(name, ".markov_head.markov_w1.weight") ||
+                        strstr(name, ".markov_head.markov_w2.weight");
+    const bool plain_f16 = strstr(name, ".hc_head_fn.weight") || strstr(name, ".hc_attn_fn.weight") ||
+                           strstr(name, ".hc_ffn_fn.weight") || strstr(name, ".ffn_gate_inp.weight");
+    if (markov || plain_f16) return DS4Q_TYPE_F16;
+    if (is_attention_projection(name) || is_shared_expert(name)) return DS4Q_TYPE_Q8_0;
+    if (parse_expert_tensor(name).is_expert) return DS4Q_TYPE_Q4_K;
+    return hf_type;
+}
+
+/* Add template metadata for `gguf_name`, taking rank, shape (axes reversed) and dtype from its HF tensor. */
+static void gguf_add_regular_from_hf(gguf_file *g, st_db *db, const char *gguf_name) {
+    char *hf_name = hf_name_for_regular(gguf_name);
+    const tensor_entry *src = db_tensor(db, hf_name, NULL);
+    int64_t dims[DS4Q_MAX_DIMS] = {0};
+    for (int d = 0, s = src->info.n_dims - 1; s >= 0; d++, s--) dims[d] = src->info.shape[s];
+    const ds4q_type tmpl_type = dspark_template_for_name(gguf_name, template_type_for_hf_dtype(src->info.dtype));
+    gguf_add_tensor_meta(g, gguf_name, src->info.n_dims, dims, tmpl_type);
+    free(hf_name);
+}
+
+/* Add the 3-D Q4_K template entry for a routed-expert tensor, sized from expert 0's 2-D HF weight. */
+static void gguf_add_expert_from_hf(gguf_file *g, st_db *db, const char *gguf_name, int n_experts) {
+    char prefix[256], weight_name[320];
+    expert_tensor parsed = parse_expert_tensor(gguf_name);
+    if (!parsed.is_expert) die("internal error: expected routed expert tensor");
+    expert_hf_prefix(prefix, sizeof(prefix), &parsed, 0, expert_part_name(parsed.part));
+    snprintf(weight_name, sizeof(weight_name), "%s.weight", prefix);
+    const tensor_entry *w0 = db_tensor(db, weight_name, NULL);
+    if (w0->info.n_dims != 2) die("bad DSpark routed expert rank");
+    const int64_t dims[3] = { w0->info.shape[1] * 2, w0->info.shape[0], n_experts };
+    gguf_add_tensor_meta(g, gguf_name, 3, dims, DS4Q_TYPE_Q4_K);
+}
+
+/* Add each layer_map tensor a DSpark stage carries, named "mtp.<stage>.<tensor>"; other entries are skipped. */
+static void gguf_add_dspark_stage(gguf_file *g, st_db *db, uint32_t stage, int n_experts) {
+    char name[256];
+    for (size_t k = 0; k < sizeof(layer_map) / sizeof(layer_map[0]); k++) {
+        const char *rest = layer_map[k].gguf;
+        snprintf(name, sizeof(name), "mtp.%u.%s", stage, rest);
+        if (is_dspark_routed_stage_tensor(rest)) gguf_add_expert_from_hf(g, db, name, n_experts);
+        else if (is_dspark_required_stage_tensor(rest)) gguf_add_regular_from_hf(g, db, name);
+    }
+}
+
+/*
+ * Swap g's tensor list for the DSpark MTP set read from the HF checkpoint:
+ * mtp.0 main projection/norm, every target stage, then the mtp.2 norm and
+ * hc_head tensors. Markov/confidence heads are added only for the MARKOV layout.
+ */
+static void gguf_use_dspark_mtp_template(gguf_file *g, st_db *db, int n_experts, dspark_hf_layout layout) {
+    static const char *const head[] = {"mtp.0.main_proj.weight", "mtp.0.main_norm.weight"};
+    static const char *const tail[] = {"mtp.2.norm.weight", "mtp.2.hc_head_base.weight", "mtp.2.hc_head_fn.weight", "mtp.2.hc_head_scale.weight"};
+    static const char *const markov[] = {"mtp.2.markov_head.markov_w1.weight", "mtp.2.markov_head.markov_w2.weight", "mtp.2.confidence_head.proj.weight"};
+    if (layout == DS4_DSPARK_HF_NONE) die("--dspark-only requires DSpark HF tensors");
+    gguf_replace_tensors_start(g);
+    for (size_t k = 0; k < sizeof(head) / sizeof(head[0]); k++) gguf_add_regular_from_hf(g, db, head[k]);
+    for (uint32_t s = 0; s < DS4_DSPARK_TARGET_LAYER_COUNT; s++) gguf_add_dspark_stage(g, db, s, n_experts);
+    for (size_t k = 0; k < sizeof(tail) / sizeof(tail[0]); k++) gguf_add_regular_from_hf(g, db, tail[k]);
+    for (size_t k = 0; layout == DS4_DSPARK_HF_MARKOV && k < sizeof(markov) / sizeof(markov[0]); k++) gguf_add_regular_from_hf(g, db, markov[k]);
+}
+
 static byte_buf read_gguf_tensor_data(const gguf_file *g, const char *path, const char *name) {
     int idx = hmap_get(&g->tensor_map, name);
     if (idx < 0) {
@@ -1574,11 +2158,15 @@ static uint64_t fnv1a64_bytes(const uint8_t *data, size_t n) {
     return h;
 }
 
-static output_context build_output_context(const gguf_file *tmpl, const quant_policy *policy, const imatrix_store *im) {
+static output_context build_output_context(const gguf_file *tmpl, const quant_policy *policy,
+                                           const imatrix_store *im, bool write_dspark,
+                                           const dspark_metadata *dspark) {
     output_context out = {0};
     out.n_tensors = tmpl->n_tensors;
-    out.n_kv_extra = extra_imatrix_kv_count(im);
+    out.n_kv_extra = extra_imatrix_kv_count(im) + extra_dspark_kv_count(write_dspark);
     out.alignment = tmpl->alignment;
+    out.write_dspark = write_dspark;
+    if (write_dspark && dspark) out.dspark = *dspark;
     out.tensors = xcalloc((size_t)out.n_tensors, sizeof(out.tensors[0]));
     size_t tensor_info = 0;
     size_t off = 0;
@@ -1598,7 +2186,8 @@ static output_context build_output_context(const gguf_file *tmpl, const quant_po
         tensor_info += gguf_string_size(dst->name) + 4 + (size_t)dst->n_dims * 8 + 4 + 8;
     }
     out.tensor_bytes = off;
-    out.meta_size = 4 + 4 + 8 + 8 + tmpl->kv_raw_len + extra_imatrix_kv_size(im) + tensor_info;
+    out.meta_size = 4 + 4 + 8 + 8 + tmpl->kv_raw_len + extra_imatrix_kv_size(im) +
+                    extra_dspark_kv_size(write_dspark) + tensor_info;
     out.data_offset = ds4q_pad(out.meta_size, tmpl->alignment);
     return out;
 }
@@ -1623,6 +2212,7 @@ static void write_full_gguf(st_db *db, const gguf_file *tmpl, const output_conte
     write_u64(fp, tmpl->n_kv + out_ctx->n_kv_extra);
     if (fwrite(tmpl->kv_raw, 1, tmpl->kv_raw_len, fp) != tmpl->kv_raw_len) die("write GGUF KV failed");
     write_imatrix_kvs(fp, imatrix);
+    if (out_ctx->write_dspark) write_dspark_kvs(fp, &out_ctx->dspark);
     for (uint64_t i = 0; i < out_ctx->n_tensors; i++) {
         const tensor_meta *t = &out_ctx->tensors[i];
         write_gguf_string(fp, t->name);
@@ -1646,10 +2236,9 @@ static void write_full_gguf(st_db *db, const gguf_file *tmpl, const output_conte
             fprintf(stderr, "error: generated size mismatch for %s: got %zu expected %zu\n", dst->name, data.size, expected);
             exit(1);
         }
-        if (fwrite(data.data, 1, data.size, fp) != data.size) die_errno("write tensor", out_path);
-        size_t padded = ds4q_pad(data.size, out_ctx->alignment);
+        if (fwrite(data.data, 1, data.size, fp) != data.size) die("write tensor data failed");
+        const size_t padded = ds4q_pad(data.size, out_ctx->alignment);
         write_padding(fp, padded - data.size);
-        fprintf(stderr, "       generated %.2f MiB\n", (double)data.size / 1048576.0);
         free(data.data);
     }
     fclose(fp);
@@ -1691,6 +2280,8 @@ typedef struct {
     bool dry_run;
     bool overwrite;
     bool imatrix_strict;
+    bool dspark_only;
+    bool self_test_dspark_map;
 } params;
 
 static void usage(const char *argv0) {
@@ -1704,6 +2295,8 @@ static void usage(const char *argv0) {
     printf("  --compare-tensor NAME  regenerate one tensor, byte-compare, and exit\n");
     printf("  --overwrite            replace --out if it already exists\n");
     printf("  --dry-run              print output plan without reading HF tensor data\n");
+    printf("  --self-test-dspark-map validate DSpark HF map, policy, and metadata defaults\n");
+    printf("  --dspark-only          replace template tensors with official DSpark MTP tensors\n");
     printf("  --imatrix FILE         legacy .dat imatrix from ds4 --imatrix-out\n");
     printf("  --imatrix-strict       fail if a quantized tensor has no matching imatrix vector\n");
     printf("  --experts TYPE         set routed w1/w2/w3 expert tensors to TYPE\n");
@@ -1762,6 +2355,10 @@ static params parse_args(int argc, char **argv) {
             p.compare_tensor = need_value(argc, argv, &i, arg);
         } else if (strcmp(arg, "--overwrite") == 0) {
             p.overwrite = true;
+        } else if (strcmp(arg, "--self-test-dspark-map") == 0) {
+            p.self_test_dspark_map = true;
+        } else if (strcmp(arg, "--dspark-only") == 0) {
+            p.dspark_only = true;
         } else if (strcmp(arg, "--dry-run") == 0) {
             p.dry_run = true;
         } else if (strcmp(arg, "--imatrix") == 0) {
@@ -1805,6 +2402,7 @@ static params parse_args(int argc, char **argv) {
             exit(1);
         }
     }
+    if (p.self_test_dspark_map) return p;
     if (!p.hf_dir) die("--hf is required");
     if (!p.template_gguf) die("--template is required");
     if (!p.dry_run && !p.compare_tensor && !p.out_gguf) die("--out is required unless --dry-run or --compare-tensor is used");
@@ -1813,6 +2411,18 @@ static params parse_args(int argc, char **argv) {
     return p;
 }
 
+/*
+ * Self-test: parse_args() must accept --dspark-only next to
+ * --self-test-dspark-map without --hf/--template and set both flags.
+ */
+static void self_test_dspark_only_args(void) {
+    char *fake_argv[] = {"deepseek4-quantize", "--self-test-dspark-map", "--dspark-only"};
+    const int fake_argc = (int)(sizeof(fake_argv) / sizeof(fake_argv[0]));
+    const params parsed = parse_args(fake_argc, fake_argv);
+    const bool both_set = parsed.self_test_dspark_map && parsed.dspark_only;
+    if (!both_set) die("bad --dspark-only self-test parsing");
+}
+
 static void free_gguf_file(gguf_file *g) {
     free(g->path);
     free(g->kv_raw);
@@ -1866,6 +2476,10 @@ static void compare_one_tensor(st_db *db, const gguf_file *tmpl, const output_co
 
 int main(int argc, char **argv) {
     params p = parse_args(argc, argv);
+    if (p.self_test_dspark_map) {
+        self_test_dspark_map();
+        return 0;
+    }
     imatrix_store imatrix = {0};
     if (p.imatrix_file) imatrix_load(&imatrix, p.imatrix_file, p.imatrix_strict);
 
@@ -1881,12 +2495,31 @@ int main(int argc, char **argv) {
     } else {
         fprintf(stderr, "using %d routed experts from --n-experts\n", p.n_experts);
     }
-    output_context out_ctx = build_output_context(&tmpl, &p.policy, &imatrix);
-    print_plan(&tmpl, &out_ctx);
-    if (p.dry_run) return 0;
 
     st_db db;
+    bool write_dspark = false;
+    dspark_metadata dspark_meta = dspark_metadata_defaults();
+    bool markov_rank_set = false;
+    dspark_meta = dspark_metadata_from_hf_config(p.hf_dir, &markov_rank_set);
     db_open(&db, p.hf_dir);
+    dspark_hf_layout dspark_layout = db_dspark_hf_layout(&db, markov_rank_set, dspark_meta.markov_rank);
+    if (dspark_layout != DS4_DSPARK_HF_NONE) {
+        write_dspark = true;
+        fprintf(stderr, "DSpark HF %s layout detected; writing deepseek4.dspark.* metadata\n",
+                dspark_hf_layout_name(dspark_layout));
+    }
+    if (p.dspark_only) {
+        gguf_use_dspark_mtp_template(&tmpl, &db, p.n_experts, dspark_layout);
+        write_dspark = true;
+    }
+    output_context out_ctx = build_output_context(&tmpl, &p.policy, &imatrix, write_dspark, &dspark_meta);
+    print_plan(&tmpl, &out_ctx);
+    if (p.dry_run) {
+        db_close(&db);
+        free_gguf_file(&tmpl);
+        free(out_ctx.tensors);
+        return 0;
+    }
     if (p.compare_tensor) {
         compare_one_tensor(&db, &tmpl, &out_ctx, &p, &imatrix);
         db_close(&db);
diff --git a/gguf-tools/deepspec/ds4_deepspec.py b/gguf-tools/deepspec/ds4_deepspec.py
new file mode 100755
index 000000000..b76f85a73
--- /dev/null
+++ b/gguf-tools/deepspec/ds4_deepspec.py
@@ -0,0 +1,505 @@
+#!/usr/bin/env python3
+"""DS4 helpers for DeepSpec target-cache interoperability."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import struct
+import sys
+import tempfile
+import textwrap
+from pathlib import Path
+
+INDEX_RECORD_STRUCT = struct.Struct("<QIIQQQQQ")
+TARGET_CACHE_VERSION = 2
+EXPECTED_HIDDEN_DTYPE = "bfloat16"
+EXPECTED_TOKEN_DTYPE = "int32"
+EXPECTED_MASK_DTYPE = "uint8"
+DEFAULT_TARGET_MODEL = "deepseek-ai/DeepSeek-V4-Flash"
+DEFAULT_CHAT_TEMPLATE = "deepseek_v4_rendered"
+DEFAULT_DSPARK_BLOCK_SIZE = 5
+DEFAULT_DSPARK_MTP_LAYERS = 3
+DEFAULT_TARGET_LAYER_IDS = [40, 41, 42]
+DEFAULT_MASK_TOKEN_ID = 128799
+
+
+class CacheValidationError(RuntimeError):
+    """Raised when a target cache, its manifest, or a CLI argument fails a validation check."""
+
+
+def _require(condition: bool, message: str) -> None:
+    if not condition:  # validation guard: any falsy condition aborts with message
+        raise CacheValidationError(message)
+
+
+def _read_json(path: Path) -> dict:
+    """Load path as a UTF-8 JSON object; any read or parse failure becomes CacheValidationError."""
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except OSError as exc:
+        raise CacheValidationError(f"cannot read {path}: {exc}") from exc
+    except (json.JSONDecodeError, UnicodeDecodeError) as exc:  # undecodable bytes are bad JSON too
+        raise CacheValidationError(f"invalid JSON in {path}: {exc}") from exc
+    _require(isinstance(data, dict), f"{path} is not a JSON object")
+    return data
+
+
+def _required_int(manifest: dict, key: str) -> int:
+    value = manifest.get(key)  # bool is an int subclass; the exact-type test rejects JSON true/false
+    _require(type(value) is int and value >= 0, f"manifest.{key} must be a non-negative integer")
+    return value
+
+
+def _validate_manifest(manifest: dict,
+                       expected_target_model: str | None,
+                       expected_chat_template: str | None) -> tuple[int, list[int], int, list[dict]]:
+    """Check the manifest and return (hidden_size, target_layer_ids, num_samples, shards).
+
+    Raises CacheValidationError at the first failed check. The two expected_* values
+    are compared against the manifest only when they are not None.
+    """
+    _require(manifest.get("version") == TARGET_CACHE_VERSION,
+             f"manifest.version must be {TARGET_CACHE_VERSION}")
+    # "format" is optional; when the key is present it must carry the expected tag.
+    _require(manifest.get("format", "deepspec-target-cache") == "deepspec-target-cache",
+             "manifest.format must be deepspec-target-cache")
+    for key, wanted in (("hidden_dtype", EXPECTED_HIDDEN_DTYPE),
+                        ("token_dtype", EXPECTED_TOKEN_DTYPE),
+                        ("mask_dtype", EXPECTED_MASK_DTYPE),
+                        ("index_record_size", INDEX_RECORD_STRUCT.size)):
+        _require(manifest.get(key) == wanted, f"manifest.{key} must be {wanted}")
+
+    hidden_size = _required_int(manifest, "hidden_size")
+    _require(hidden_size > 0, "manifest.hidden_size must be positive")
+    num_samples = _required_int(manifest, "num_samples")
+    num_shards = _required_int(manifest, "num_shards")
+
+    layer_ids = manifest.get("target_layer_ids")
+    _require(isinstance(layer_ids, list) and len(layer_ids) > 0,
+             "manifest.target_layer_ids must be a non-empty list")
+    _require(all(isinstance(item, int) and item >= 0 for item in layer_ids),
+             "manifest.target_layer_ids must contain non-negative integers")
+    _require(len(layer_ids) == len(set(layer_ids)), "manifest.target_layer_ids must not contain duplicates")
+    _require(sorted(layer_ids) == layer_ids, "manifest.target_layer_ids must be sorted in capture order")
+
+    declared_count = manifest.get("target_hidden_layers")  # optional cross-check of the list length
+    _require(declared_count is None or declared_count == len(layer_ids),
+             "manifest.target_hidden_layers must match target_layer_ids length")
+
+    if expected_target_model is not None:
+        _require(manifest.get("target_model_name_or_path") == expected_target_model,
+                 "manifest.target_model_name_or_path does not match expected target model")
+
+    if expected_chat_template is not None:
+        input_convention = manifest.get("input_convention")
+        _require(isinstance(input_convention, dict), "manifest.input_convention must be an object")
+        _require(input_convention.get("chat_template") == expected_chat_template,
+                 "manifest.input_convention.chat_template does not match expected template")
+
+    shard_entries = manifest.get("shards")
+    _require(isinstance(shard_entries, list), "manifest.shards must be a list")
+    _require(len(shard_entries) == num_shards, "manifest.num_shards must match shards length")
+    _require(num_samples == 0 or num_shards > 0, "manifest with samples must contain at least one shard")
+    return hidden_size, layer_ids, num_samples, shard_entries
+
+
+def _load_shard_map(cache_dir: Path, shards: list[dict]) -> dict[int, Path]:
+    """Map each shard_id to cache_dir / file_name, requiring unique ids and existing files."""
+    by_id: dict[int, Path] = {}
+    for shard_entry in shards:
+        _require(isinstance(shard_entry, dict), "manifest.shards entries must be objects")
+        shard_id, file_name = shard_entry.get("shard_id"), shard_entry.get("file_name")
+        _require(isinstance(shard_id, int) and shard_id >= 0, "shard_id must be a non-negative integer")
+        _require(isinstance(file_name, str) and file_name, "shard file_name must be a non-empty string")
+        _require(shard_id not in by_id, f"duplicate shard_id {shard_id}")
+        shard_path = cache_dir / file_name
+        _require(shard_path.is_file(), f"missing shard file {shard_path}")
+        by_id[shard_id] = shard_path
+    return by_id
+
+
+def _intervals_for_record(seq_len: int,
+                          hidden_size: int,
+                          num_layers: int,
+                          offsets: tuple[int, int, int, int, int]) -> list[tuple[str, int, int]]:
+    """Pair each of the five payload offsets with its tensor name and byte length."""
+    ids_at, attn_at, loss_at, hidden_at, last_at = offsets
+    row_bytes = seq_len * hidden_size * 2  # one hidden vector per token at 2 bytes per element
+    return [
+        ("input_ids", ids_at, seq_len * 4),
+        ("attention_mask", attn_at, seq_len),
+        ("loss_mask", loss_at, seq_len),
+        ("target_hidden_states", hidden_at, row_bytes * num_layers),
+        ("target_last_hidden_states", last_at, row_bytes),
+    ]
+
+
+def _validate_record(cache_dir: Path,
+                     record_index: int,
+                     record: tuple[int, int, int, int, int, int, int, int],
+                     shard_map: dict[int, Path],
+                     hidden_size: int,
+                     num_layers: int) -> None:
+    """Validate one samples.idx record: header fields first, then that the five tensor
+    payloads fit inside the referenced shard without overlapping each other.
+    """
+    sample_id, shard_id, seq_len, *offsets = record
+    _require(sample_id == record_index,
+             f"record {record_index} sample_id is {sample_id}, expected {record_index}")
+    _require(seq_len > 0, f"record {record_index} seq_len must be positive")
+    _require(shard_id in shard_map, f"record {record_index} references unknown shard_id {shard_id}")
+    shard = shard_map[shard_id]
+    shard_size = shard.stat().st_size
+    try:
+        shard_label = shard.relative_to(cache_dir)
+    except ValueError:  # shard listed by a path outside cache_dir: report it unshortened
+        shard_label = shard
+    intervals = _intervals_for_record(seq_len, hidden_size, num_layers, tuple(offsets))
+    sorted_intervals = sorted(intervals, key=lambda item: item[1])
+    for name, offset, size in sorted_intervals:
+        _require(offset >= 0, f"record {record_index} {name} offset must be non-negative")
+        _require(size > 0, f"record {record_index} {name} size must be positive")
+        _require(offset + size <= shard_size,
+                 f"record {record_index} {name} extends beyond shard {shard_label}")
+    for (_, prev_offset, prev_size), (name, offset, _) in zip(sorted_intervals, sorted_intervals[1:]):
+        _require(prev_offset + prev_size <= offset,
+                 f"record {record_index} {name} overlaps previous tensor payload")
+
+
+def validate_target_cache(cache_dir: Path,
+                          expected_target_model: str | None = None,
+                          expected_chat_template: str | None = None) -> dict:
+    """Validate a target-cache directory (manifest.json, samples.idx, shard files).
+
+    Returns a summary dict of the validated cache; raises CacheValidationError on the
+    first problem found.
+    """
+    cache_dir = cache_dir.resolve()
+    _require(cache_dir.is_dir(), f"cache directory does not exist: {cache_dir}")
+    manifest = _read_json(cache_dir / "manifest.json")
+    hidden_size, layers, num_samples, shards = _validate_manifest(
+        manifest, expected_target_model, expected_chat_template)
+    shard_map = _load_shard_map(cache_dir, shards)
+    index_path = cache_dir / "samples.idx"
+    _require(index_path.is_file(), f"missing index file {index_path}")
+    record_size = INDEX_RECORD_STRUCT.size
+    _require(index_path.stat().st_size == num_samples * record_size,
+             "samples.idx size must equal num_samples * index_record_size")
+    num_layers = len(layers)
+    with index_path.open("rb") as index_file:
+        for record_index in range(num_samples):
+            raw = index_file.read(record_size)
+            _require(len(raw) == record_size, f"short samples.idx record {record_index}")
+            fields = INDEX_RECORD_STRUCT.unpack(raw)
+            _validate_record(cache_dir, record_index, fields, shard_map, hidden_size, num_layers)
+    return {
+        "cache_dir": str(cache_dir),
+        "num_samples": num_samples,
+        "num_shards": len(shards),
+        "hidden_size": hidden_size,
+        "target_layer_ids": layers,
+        "index_record_size": record_size,
+    }
+
+def render_nonseq_config(target_cache_path: str | None = None,
+                         target_model_name_or_path: str = DEFAULT_TARGET_MODEL,
+                         chat_template: str = DEFAULT_CHAT_TEMPLATE,
+                         target_layer_ids: list[int] | None = None,
+                         max_train_steps: int | None = None,
+                         global_batch_size: int = 512,
+                         local_batch_size: int = 1) -> str:
+    """Return the source text of a DeepSpec config for a DeepSeek-V4 non-Markov DSpark pilot."""
+    # Callers that pass no layer list get the module-level default list.
+    target_layer_ids = DEFAULT_TARGET_LAYER_IDS if target_layer_ids is None else target_layer_ids
+    _require(len(target_layer_ids) > 0, "target_layer_ids must not be empty")
+    return textwrap.dedent(f"""\
+        # Generated by ds4_deepspec.py for DS4 DeepSpec training.
+        import os
+
+        try:
+            from deepspec.trainer import DeepSeekV4DSparkTrainer
+        except ImportError as exc:
+            raise RuntimeError(
+                "DS4 DeepSeek-V4 DSpark training needs a DeepSpec checkout/fork "
+                "that provides DeepSeekV4DSparkTrainer; upstream DeepSpec main "
+                "currently ships Qwen3/Gemma trainers only."
+            ) from exc
+
+        BASE_TB_DIR = os.path.expanduser("~/tensorboard")
+        BASE_CKPT_DIR = os.path.expanduser("~/checkpoints")
+
+        seed = 42
+        project_name = "deepspec"
+        exp_name = "dspark_block5_deepseek_v4_flash_nonseq"
+
+        model = dict(
+            target_model_name_or_path={target_model_name_or_path!r},
+            block_size={DEFAULT_DSPARK_BLOCK_SIZE},
+            num_draft_layers={len(target_layer_ids)},
+            target_layer_ids={target_layer_ids!r},
+            mask_token_id={DEFAULT_MASK_TOKEN_ID},
+            num_anchors=512,
+            markov_rank=0,
+            markov_head_type="vanilla",
+            confidence_head_alpha=0.0,
+            confidence_head_with_markov=False,
+        )
+
+        train = dict(
+            trainer_cls=DeepSeekV4DSparkTrainer,
+            lr=6.0e-4,
+            warmup_ratio=0.04,
+            weight_decay=0.0,
+            precision="bf16",
+            local_batch_size={local_batch_size},
+            global_batch_size={global_batch_size},
+            num_train_epochs=10,
+            max_train_steps={max_train_steps!r},
+            max_grad_norm=1.0,
+            sharding_strategy="no_shard",
+            torch_compile=False,
+            loss_decay_gamma=None,
+            ce_loss_alpha=1.0,
+            l1_loss_alpha=0.0,
+        )
+
+        logging = dict(
+            logging_steps=10,
+            checkpointing_steps=3000,
+        )
+
+        data = dict(
+            target_cache_path={target_cache_path!r},
+            chat_template={chat_template!r},
+            max_length=4096,
+            num_workers=4,
+        )
+
+        def finalize_cfg(cfg):
+            logging_cfg = dict(cfg["logging"])
+            project = str(cfg["project_name"])
+            exp = str(cfg["exp_name"])
+            logging_cfg["checkpoint_dir"] = os.path.join(BASE_CKPT_DIR, project, exp)
+            logging_cfg["tensorboard_dir"] = os.path.join(BASE_TB_DIR, project, exp)
+            cfg["logging"] = logging_cfg
+            return cfg
+        """)
+
+
+def _target_cache_config_defaults(target_cache_path: str,
+                                  target_model_name_or_path: str | None,
+                                  chat_template: str | None) -> tuple[str, str, list[int]]:
+    """Resolve (target model, chat template, target layer ids) from arguments and the cache manifest."""
+    manifest = _read_json(Path(target_cache_path) / "manifest.json")
+
+    manifest_target = manifest.get("target_model_name_or_path")
+    target_is_text = isinstance(manifest_target, str) and manifest_target
+    # An explicit argument is kept, but it must agree with any value the manifest carries.
+    if target_model_name_or_path is None:
+        _require(target_is_text,
+                 "manifest.target_model_name_or_path is required to emit a config without --target-model")
+        target_model_name_or_path = manifest_target
+    elif manifest_target is not None:
+        _require(target_is_text,
+                 "manifest.target_model_name_or_path must be a non-empty string when present")
+        _require(manifest_target == target_model_name_or_path,
+                 "manifest.target_model_name_or_path does not match expected target model")
+
+    convention = manifest.get("input_convention")
+    _require(convention is None or isinstance(convention, dict), "manifest.input_convention must be an object")
+    manifest_template = None if convention is None else convention.get("chat_template")
+    template_is_text = isinstance(manifest_template, str) and manifest_template
+    _require(manifest_template is None or template_is_text,
+             "manifest.input_convention.chat_template must be a non-empty string when present")
+    if chat_template is None:
+        _require(template_is_text,
+                 "manifest.input_convention.chat_template is required to emit a config without --chat-template")
+        chat_template = manifest_template
+    elif manifest_template is not None:
+        _require(manifest_template == chat_template,
+                 "manifest.input_convention.chat_template does not match expected template")
+
+    _, target_layer_ids, _, _ = _validate_manifest(manifest, None, None)
+    return target_model_name_or_path, chat_template, target_layer_ids
+
+
+def write_nonseq_config(path: Path,
+                        target_cache_path: str | None = None,
+                        target_model_name_or_path: str | None = None,
+                        chat_template: str | None = None,
+                        max_train_steps: int | None = None,
+                        global_batch_size: int = 512,
+                        local_batch_size: int = 1,
+                        overwrite: bool = False) -> dict:
+    """Render a non-Markov DSpark config for target_cache_path and write it to path.
+
+    Returns a summary dict of what was written. Raises CacheValidationError for an existing
+    path without overwrite, invalid arguments, a bad manifest, or an unwritable path.
+    """
+    if path.exists() and not overwrite:
+        raise CacheValidationError(f"refusing to overwrite existing config: {path}")
+    _require(target_cache_path is not None and target_cache_path != "",
+             "--target-cache is required with --emit-nonseq-config")
+    _require(max_train_steps is None or max_train_steps > 0, "--max-train-steps must be positive")
+    _require(global_batch_size > 0, "--global-batch-size must be positive")
+    _require(local_batch_size > 0, "--local-batch-size must be positive")
+    target_model_name_or_path, chat_template, target_layer_ids = _target_cache_config_defaults(
+        target_cache_path, target_model_name_or_path, chat_template)
+    config = render_nonseq_config(target_cache_path, target_model_name_or_path, chat_template,
+                                  target_layer_ids, max_train_steps, global_batch_size, local_batch_size)
+    try:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        path.write_text(config, encoding="utf-8")
+    except OSError as exc:  # report write failures the same way read failures are reported
+        raise CacheValidationError(f"cannot write {path}: {exc}") from exc
+    return {
+        "config": str(path),
+        "target_model_name_or_path": target_model_name_or_path,
+        "chat_template": chat_template,
+        "target_cache_path": target_cache_path,
+        "markov_rank": 0,
+    }
+
+
+def _write_self_test_cache(cache_dir: Path,
+                           target_model_name_or_path: str = DEFAULT_TARGET_MODEL,
+                           chat_template: str = DEFAULT_CHAT_TEMPLATE,
+                           include_optional_config: bool = True) -> None:
+    """Write a one-sample, one-shard synthetic target cache into cache_dir.
+
+    include_optional_config controls whether the manifest carries the target model
+    name and the chat template entries.
+    """
+    hidden_size = 4
+    layers = [1, 2, 3]
+    seq_len = 2
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    shard = cache_dir / "shard-00000.bin"
+    payloads = [
+        struct.pack("<ii", 11, 12),
+        b"\x01\x01",
+        b"\x01\x01",
+        b"\x00\x3f" * (seq_len * len(layers) * hidden_size),
+        b"\x00\x3f" * (seq_len * hidden_size),
+    ]
+    # Payloads are stored back to back, so each offset is the total size of the earlier ones.
+    offsets = [sum(len(blob) for blob in payloads[:i]) for i in range(len(payloads))]
+    shard.write_bytes(b"".join(payloads))
+    index_record = INDEX_RECORD_STRUCT.pack(0, 0, seq_len, *offsets)
+    (cache_dir / "samples.idx").write_bytes(index_record)
+    manifest_data = {
+        "version": TARGET_CACHE_VERSION,
+        "format": "deepspec-target-cache",
+        "producer": "ds4-self-test",
+        "num_samples": 1,
+        "num_shards": 1,
+        "target_layer_ids": layers,
+        "hidden_size": hidden_size,
+        "target_hidden_layers": len(layers),
+        "hidden_dtype": EXPECTED_HIDDEN_DTYPE,
+        "token_dtype": EXPECTED_TOKEN_DTYPE,
+        "mask_dtype": EXPECTED_MASK_DTYPE,
+        "index_record_size": INDEX_RECORD_STRUCT.size,
+        "shards": [{"file_name": shard.name, "shard_id": 0}],
+    }
+    if include_optional_config:
+        manifest_data["target_model_name_or_path"] = target_model_name_or_path
+        manifest_data["input_convention"] = {"chat_template": chat_template}
+    manifest_path = cache_dir / "manifest.json"
+    manifest_path.write_text(json.dumps(manifest_data, indent=2), encoding="utf-8")
+
+
+def self_test() -> dict:
+    """Build synthetic caches in a temp dir and check validation plus config emission.
+
+    Returns the validation summary with the emitted-config summary under "nonseq_config".
+    """
+    with tempfile.TemporaryDirectory(prefix="ds4-deepspec-cache-") as tmp:
+        root = Path(tmp)
+        # Case 1: the manifest carries target model and chat template; the config inherits them.
+        cache_dir, config_path = root / "cache", root / "dspark_v4_nonseq.py"
+        manifest_model, manifest_template = "local/self-test-target", "self_test_template"
+        _write_self_test_cache(cache_dir, target_model_name_or_path=manifest_model,
+                               chat_template=manifest_template)
+        cache_result = validate_target_cache(cache_dir, expected_target_model=manifest_model,
+                                             expected_chat_template=manifest_template)
+        config_result = write_nonseq_config(config_path, target_cache_path=str(cache_dir), max_train_steps=1)
+        config_text = config_path.read_text(encoding="utf-8")
+        # compile() only checks that the emitted file parses as Python; it is never executed.
+        compile(config_text, str(config_path), "exec")
+        _require(f"target_model_name_or_path={manifest_model!r}" in config_text,
+                 "emitted config must inherit target model from cache manifest")
+        _require(f"chat_template={manifest_template!r}" in config_text,
+                 "emitted config must inherit chat template from cache manifest")
+        _require("block_size=5" in config_text, "emitted config must use DeepSeek-V4 DSpark block_size=5")
+        _require("num_draft_layers=3" in config_text, "emitted config must use the three DSpark MTP layers")
+        _require("target_layer_ids=[1, 2, 3]" in config_text,
+                 "emitted config must inherit target layers from cache manifest")
+        # Case 2: the manifest omits both optional entries; explicit arguments are used instead.
+        optional_cache_dir, optional_config_path = root / "optional-cache", root / "optional_nonseq.py"
+        explicit_model, explicit_template = "explicit/target", "explicit_template"
+        _write_self_test_cache(optional_cache_dir, include_optional_config=False)
+        optional_config = write_nonseq_config(optional_config_path,
+                                              target_cache_path=str(optional_cache_dir),
+                                              target_model_name_or_path=explicit_model,
+                                              chat_template=explicit_template,
+                                              max_train_steps=1)
+        optional_text = optional_config_path.read_text(encoding="utf-8")
+        compile(optional_text, str(optional_config_path), "exec")
+        _require(optional_config["target_model_name_or_path"] == explicit_model,
+                 "explicit target model must be accepted when optional manifest target is absent")
+        _require(optional_config["chat_template"] == explicit_template,
+                 "explicit chat template must be accepted when optional manifest template is absent")
+        _require(f"target_model_name_or_path={explicit_model!r}" in optional_text,
+                 "explicit target model must be emitted when optional manifest target is absent")
+        _require(f"chat_template={explicit_template!r}" in optional_text,
+                 "explicit chat template must be emitted when optional manifest template is absent")
+        cache_result["nonseq_config"] = config_result
+        return cache_result
+
+
+def _parse_args(argv: list[str]) -> argparse.Namespace:  # build the CLI parser and parse argv with it
+    parser = argparse.ArgumentParser(description="Validate DS4 DeepSpec target-cache artifacts.")
+    parser.add_argument("cache_dir", nargs="?", help="Directory containing manifest.json, samples.idx, and shard files.")
+    parser.add_argument("--target-model", help="Expected manifest target_model_name_or_path, or emitted config target model.")
+    parser.add_argument("--chat-template", help="Expected manifest input_convention.chat_template, or emitted config chat template.")
+    parser.add_argument("--self-test", action="store_true", help="Run the built-in synthetic cache/config compatibility smoke.")
+    parser.add_argument("--emit-nonseq-config", metavar="FILE", help="Write a DeepSeek-V4 non-Markov DSpark DeepSpec config.")
+    parser.add_argument("--target-cache", help="target_cache_path value for --emit-nonseq-config.")
+    parser.add_argument("--max-train-steps", type=int, help="Optional train.max_train_steps value for the emitted config.")
+    parser.add_argument("--global-batch-size", type=int, default=512, help="Emitted train.global_batch_size. Default: 512.")
+    parser.add_argument("--local-batch-size", type=int, default=1, help="Emitted train.local_batch_size. Default: 1.")
+    parser.add_argument("--overwrite", action="store_true", help="Allow --emit-nonseq-config to replace FILE.")
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str]) -> int:
+    """Run the CLI; print the JSON result and return 0, or report the error on stderr and return 1."""
+    args = _parse_args(argv)
+    try:
+        if args.emit_nonseq_config:
+            result = write_nonseq_config(Path(args.emit_nonseq_config),
+                                         target_cache_path=args.target_cache,
+                                         target_model_name_or_path=args.target_model,
+                                         chat_template=args.chat_template,
+                                         max_train_steps=args.max_train_steps,
+                                         global_batch_size=args.global_batch_size,
+                                         local_batch_size=args.local_batch_size,
+                                         overwrite=args.overwrite)
+        elif args.self_test:
+            result = self_test()
+        else:
+            _require(args.cache_dir is not None, "cache_dir is required unless --self-test or --emit-nonseq-config is used")
+            result = validate_target_cache(Path(args.cache_dir), expected_target_model=args.target_model,
+                                           expected_chat_template=args.chat_template)
+    except (CacheValidationError, OSError) as exc:  # OSError: unreadable index/shard, failed write
+        print(f"ds4-deepspec: {exc}", file=sys.stderr)
+        return 1
+    json.dump(result, sys.stdout, indent=2, sort_keys=True)
+    sys.stdout.write("\n")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
diff --git a/rocm/ds4_rocm_attention_launch.cuh b/rocm/ds4_rocm_attention_launch.cuh
index b9b43d958..0691db2e8 100644
--- a/rocm/ds4_rocm_attention_launch.cuh
+++ b/rocm/ds4_rocm_attention_launch.cuh
@@ -324,6 +324,25 @@ extern "C" int ds4_gpu_attention_decode_raw_batch_heads_tensor(
                                       n_head, head_dim);
 }
 
+extern "C" int ds4_gpu_attention_decode_raw_batch_heads_noncausal_tensor(
+        ds4_gpu_tensor       *heads,
+        const void             *model_map,
+        uint64_t                model_size,
+        uint64_t                sinks_offset,
+        const ds4_gpu_tensor *q,
+        const ds4_gpu_tensor *raw_kv,
+        uint32_t                n_tokens,
+        uint32_t                n_raw,
+        uint32_t                raw_cap,
+        uint32_t                raw_start,
+        uint32_t                n_head,
+        uint32_t                head_dim) {
+    /* ROCm placeholder: no kernel is launched; every argument is ignored and 0 is returned. */
+    (void)heads; (void)model_map; (void)model_size; (void)sinks_offset; (void)q; (void)raw_kv;
+    (void)n_tokens; (void)n_raw; (void)raw_cap; (void)raw_start; (void)n_head; (void)head_dim;
+    return 0;
+}
+
 extern "C" int ds4_gpu_attention_decode_mixed_batch_heads_tensor(
         ds4_gpu_tensor       *heads,
         const void             *model_map,
diff --git a/tests/ds4_test.c b/tests/ds4_test.c
index ea1e52487..eea2db8ea 100644
--- a/tests/ds4_test.c
+++ b/tests/ds4_test.c
@@ -1,9 +1,11 @@
 #define DS4_SERVER_TEST
 #define DS4_SERVER_TEST_NO_MAIN
 #include "../ds4_server.c"
+#include "../ds4_dspark_runtime.h"
 #ifndef DS4_NO_GPU
 #include "../ds4_gpu.h"
 #include <math.h>
+#include <sys/wait.h>
 
 static ds4_engine *test_engine_fast;
 static ds4_engine *test_engine_quality;
@@ -85,11 +87,24 @@ static void test_restore_canonical_streaming_prefill(
                      saved.batch_selected_addr);
 }
 
+static ds4_backend test_backend(void) {
+#ifdef __APPLE__
+    return DS4_BACKEND_METAL;   /* Apple builds are tested against the Metal backend */
+#else
+    return DS4_BACKEND_CUDA;    /* every non-Apple build is reported as CUDA */
+#endif
+}
+
+
 static ds4_engine *test_open_engine(bool quality) {
     ds4_engine *engine = NULL;
-    /* DS4_TEST_MTP loads the MTP head on the fast engine so the speculative
-     * verify regression can reuse it; draft=4 hits the multi-row verify path. */
-    const char *mtp = getenv("DS4_TEST_MTP");
+    /* DS4_TEST_MTP loads the legacy MTP head on the fast engine so the speculative
+     * verify regression can reuse it; draft=4 hits the multi-row verify path.
+     * DS4_TEST_DSPARK loads an official DSpark draft GGUF and lets metadata choose
+     * the block size. */
+    const char *dspark = getenv("DS4_TEST_DSPARK");
+    const char *mtp = (dspark && dspark[0]) ? dspark : getenv("DS4_TEST_MTP");
+    const bool use_mtp = mtp && mtp[0] && !quality;
     ds4_engine_options opt = {
         .model_path = test_model_path(),
 #ifdef __APPLE__
@@ -106,8 +121,8 @@ static ds4_engine *test_open_engine(bool quality) {
             test_env_gib("DS4_TEST_SSD_STREAMING_CACHE_GB"),
         .ssd_streaming_preload_experts =
             test_env_u32("DS4_TEST_SSD_STREAMING_PRELOAD_EXPERTS"),
-        .mtp_path = (mtp && mtp[0] && !quality) ? mtp : NULL,
-        .mtp_draft_tokens = (mtp && mtp[0] && !quality) ? 4 : 0,
+        .mtp_path = use_mtp ? mtp : NULL,
+        .mtp_draft_tokens = use_mtp && !(dspark && dspark[0]) ? 4 : 0,
     };
     TEST_ASSERT(ds4_engine_open(&engine, &opt) == 0);
     return engine;
@@ -2174,8 +2189,368 @@ static void test_mtp_verify_depth(void) {
     free(spec);
     ds4_tokens_free(&prompt);
 }
+
+/* DSpark speculative regression; skipped unless DS4_TEST_DSPARK is set and test_backend() supports the DSpark runtime. */
+static void test_dspark_speculative_block(void) {
+    const char *dspark = getenv("DS4_TEST_DSPARK");
+    if (!dspark || !dspark[0]) {
+        fprintf(stderr, "ds4-test: dspark-speculative-block skipped (set DS4_TEST_DSPARK to a DSpark GGUF)\n");
+        return;
+    }
+
+    ds4_engine *engine = test_get_engine(false);
+    const ds4_mtp_draft_kind draft_kind = ds4_engine_mtp_draft_kind(engine);
+    TEST_ASSERT(draft_kind == DS4_MTP_DRAFT_DSPARK);
+    if (!ds4_mtp_draft_runtime_supported(test_backend(), draft_kind)) {
+        fprintf(stderr, "ds4-test: dspark-speculative-block skipped (backend does not support DSpark runtime)\n");
+        return;
+    }
+    TEST_ASSERT(ds4_engine_has_mtp(engine));
+    TEST_ASSERT(ds4_engine_mtp_draft_tokens(engine) == 5); /* test_open_engine passes 0 for DSpark, so this value comes from the engine */
+
+    ds4_tokens prompt = {0};
+    ds4_chat_begin(engine, &prompt);
+    ds4_chat_append_message(engine, &prompt, "user", test_mtp_copy_prompt());
+    ds4_chat_append_assistant_prefix(engine, &prompt, DS4_THINK_NONE);
+    TEST_ASSERT(prompt.len > 0);
+
+    int *spec = malloc((size_t)TEST_MTP_MAXGEN * sizeof(*spec));
+    TEST_ASSERT(spec != NULL);
+    if (spec && prompt.len > 0) {
+        int nspec = 0, max_chunk = 0;
+        const bool ok_spec = test_mtp_capture_speculative(engine, &prompt, 96,
+                                                          spec, &nspec, &max_chunk);
+        TEST_ASSERT(ok_spec);
+        TEST_ASSERT(max_chunk > 1); /* NOTE(review): presumably the largest chunk committed in one step - confirm in the helper */
+
+        float worst_gap = 0.0f;
+        int worst_at = -1;
+        const bool ok_check = test_mtp_worst_argmax_gap(engine, &prompt, spec, nspec,
+                                                        &worst_gap, &worst_at);
+        TEST_ASSERT(ok_check);
+        fprintf(stderr, "ds4-test: dspark-speculative-block nspec=%d max_chunk=%d worst_argmax_gap=%.3f at=%d\n",
+                nspec, max_chunk, worst_gap, worst_at);
+        TEST_ASSERT(worst_gap <= 2.0f); /* tolerance on the worst gap reported by the helper above */
+    }
+
+    free(spec); /* free(NULL) is a no-op, so this is safe when malloc failed */
+    ds4_tokens_free(&prompt);
+}
+
 #endif
 
+static void test_dspark_binder_helpers(void) { /* config defaults plus draft-kind guess/name helpers; opens no model file */
+    ds4_dspark_config cfg;
+    ds4_dspark_config_init_defaults(&cfg); /* the asserts below pin each default field value */
+    TEST_ASSERT(cfg.n_mtp_layers == 3);
+    TEST_ASSERT(cfg.block_size == 5);
+    TEST_ASSERT(cfg.noise_token_id == 128799u);
+    TEST_ASSERT(cfg.markov_rank == 256);
+    TEST_ASSERT(cfg.target_layer_ids[0] == 40);
+    TEST_ASSERT(cfg.target_layer_ids[1] == 41);
+    TEST_ASSERT(cfg.target_layer_ids[2] == 42);
+
+    TEST_ASSERT(ds4_mtp_draft_kind_guess(false, false, false) == DS4_MTP_DRAFT_NONE); /* classification of flag combinations */
+    TEST_ASSERT(ds4_mtp_draft_kind_guess(true, false, false) == DS4_MTP_DRAFT_LEGACY);
+    TEST_ASSERT(ds4_mtp_draft_kind_guess(false, true, true) == DS4_MTP_DRAFT_DSPARK);
+    TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, true, 0) ==
+                DS4_MTP_DRAFT_DSPARK_NONSEQ);
+    TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, false, 0) ==
+                DS4_MTP_DRAFT_NONE);
+    TEST_ASSERT(ds4_mtp_draft_kind_guess_ex(false, true, false, true, 256) ==
+                DS4_MTP_DRAFT_NONE);
+    TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_DSPARK_NONSEQ), /* display names */
+                        "dspark-nonseq"));
+    TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_DSPARK), "dspark"));
+    TEST_ASSERT(!strcmp(ds4_mtp_draft_kind_name(DS4_MTP_DRAFT_LEGACY), "legacy-mtp"));
+}
+
+/* Spot-checks ds4_dspark_bf16_to_f32: 0x3fc0 must decode to 1.5 and 0xbe80 to -0.25 (within 1e-3). */
+static void test_dspark_markov_bf16_helpers(void) {
+    TEST_ASSERT(fabsf(ds4_dspark_bf16_to_f32(0x3fc0u) - 1.5f) < 0.001f);
+    TEST_ASSERT(fabsf(ds4_dspark_bf16_to_f32(0xbe80u) + 0.25f) < 0.001f);
+}
+
+/* Covers the DSpark speculative gate, its reason strings, and draft readiness/backend support. */
+static void test_dspark_runtime_helpers(void) {
+    /* Gate outcome for each draft kind. */
+    TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_LEGACY, true, 4) ==
+                DS4_DSPARK_SPEC_LEGACY_MTP);
+    TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK, true, 5) ==
+                DS4_DSPARK_SPEC_DSPARK_ENABLED);
+    TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK_NONSEQ, true, 5) ==
+                DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY);
+    TEST_ASSERT(ds4_dspark_speculative_gate(DS4_MTP_DRAFT_DSPARK, true, 1) ==
+                DS4_DSPARK_SPEC_DISABLED);
+    /* Reason strings must mention the state they describe. */
+    TEST_ASSERT(strstr(ds4_dspark_spec_gate_reason(DS4_DSPARK_SPEC_DSPARK_ENABLED),
+                       "enabled") != NULL);
+    TEST_ASSERT(strstr(ds4_dspark_spec_gate_reason(DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY),
+                       "nonseq") != NULL);
+    TEST_ASSERT(strstr(ds4_dspark_spec_gate_reason(DS4_DSPARK_SPEC_DSPARK_NONSEQ_NOT_READY),
+                       "not been validated") != NULL);
+    /* Draft readiness per kind, then runtime support per (backend, kind) pair. */
+    TEST_ASSERT(ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_LEGACY));
+    TEST_ASSERT(!ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_NONE));
+    TEST_ASSERT(ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_DSPARK));
+    TEST_ASSERT(!ds4_mtp_speculative_draft_ready(DS4_MTP_DRAFT_DSPARK_NONSEQ));
+    TEST_ASSERT(ds4_mtp_draft_runtime_supported(DS4_BACKEND_METAL,
+                                                DS4_MTP_DRAFT_DSPARK));
+    TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CUDA,
+                                                 DS4_MTP_DRAFT_DSPARK));
+    TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CUDA,
+                                                 DS4_MTP_DRAFT_DSPARK_NONSEQ));
+    TEST_ASSERT(!ds4_mtp_draft_runtime_supported(DS4_BACKEND_CPU,
+                                                 DS4_MTP_DRAFT_LEGACY));
+    TEST_ASSERT(ds4_engine_has_mtp(NULL) == false);
+}
+
+/* Decode the four bytes at p as an unsigned 32-bit little-endian integer. */
+static uint32_t test_le32(const unsigned char *p) {
+    uint32_t value = 0;
+    for (int i = 3; i >= 0; i--) value = (value << 8) | p[i];
+    return value;
+}
+
+/* Decode the eight bytes at p as an unsigned 64-bit little-endian integer:
+ * p[0] supplies the least significant byte and p[7] the most significant,
+ * so the bytes are folded in starting from p[7]. */
+static uint64_t test_le64(const unsigned char *p) {
+    uint64_t value = 0;
+    for (int i = 7; i >= 0; i--) {
+        value = (value << 8) | p[i];
+    }
+    return value;
+}
+
+static bool test_file_size(const char *path, uint64_t *size_out) {
+    struct stat info; /* filled by stat(); *size_out is written only on success */
+    const bool ok = stat(path, &info) == 0 && info.st_size >= 0;
+    if (ok) *size_out = (uint64_t)info.st_size;
+    return ok;
+}
+
+/* Scan `bytes` bytes at `offset` in `path` as 16-bit little-endian BF16 bit
+ * patterns. Succeeds only if the whole region could be read, no value has all
+ * exponent bits set (mask 0x7f80, i.e. Inf/NaN), and at least one pattern is
+ * non-zero. A NULL path, an empty or odd-sized region, and any I/O error all
+ * report failure. */
+static bool test_bf16_region_nonzero_finite(const char *path,
+                                            uint64_t offset,
+                                            uint64_t bytes) {
+    if (!path || bytes == 0 || (bytes & 1u) != 0) return false;
+    FILE *in = fopen(path, "rb");
+    if (!in) return false;
+    bool ok = fseeko(in, (off_t)offset, SEEK_SET) == 0;
+    unsigned char block[4096];
+    uint64_t left = bytes;
+    uint64_t seen = 0;
+    uint64_t nonzero = 0;
+    while (ok && left > 0) {
+        size_t want = left < sizeof(block) ? (size_t)left : sizeof(block);
+        want &= ~(size_t)1; /* read whole 16-bit values only */
+        ok = want != 0 && fread(block, 1, want, in) == want;
+        for (size_t i = 0; ok && i < want; i += 2) {
+            const uint16_t bits = (uint16_t)(block[i] | (block[i + 1] << 8));
+            ok = (bits & 0x7f80u) != 0x7f80u; /* counters are ignored once ok is false */
+            nonzero += bits != 0;
+            seen++;
+        }
+        left -= want;
+    }
+    if (!ok) {
+        fclose(in);
+        return false;
+    }
+    return fclose(in) == 0 && seen == bytes / 2 && nonzero > 0;
+}
+
+/* Write the fixed one-prompt dataset file used by test_dspark_target_cache_export. */
+static bool test_write_dspark_target_cache_dataset(const char *path) {
+    static const char text[] = "===== DS4_IMATRIX_PROMPT 0 =====\nExplain target cache export in one short sentence.\n";
+    FILE *out = fopen(path, "wb");
+    if (!out) return false;
+    const bool wrote = fputs(text, out) >= 0; /* evaluated first so the stream is always closed */
+    return fclose(out) == 0 && wrote;
+}
+
+static int test_run_dspark_target_cache_cli(const char *dataset_path,
+                                            const char *output_dir) {
+    pid_t pid = fork(); /* child execs ./ds4 with the export flags; parent returns its exit status, or -1 on failure */
+    if (pid < 0) return -1;
+    if (pid == 0) {
+        execl("./ds4", "./ds4",
+              "-m", test_model_path(),
+              "--metal",
+              "--dspark-target-cache-dataset", dataset_path,
+              "--dspark-target-cache-out", output_dir,
+              "--dspark-target-cache-target-model", "deepseek-ai/DeepSeek-V4-Flash",
+              "--dspark-target-cache-chat-template", "deepseek_v4_rendered",
+              "--dspark-target-cache-max-prompts", "1",
+              "--dspark-target-cache-max-tokens", "8",
+              "--ctx", "128",
+              (char *)NULL);
+        _exit(127); /* reached only if execl() itself failed */
+    }
+    int status = 0;
+    while (waitpid(pid, &status, 0) < 0) {
+        if (errno != EINTR) return -1; /* retry only when the wait was interrupted */
+    }
+    if (!WIFEXITED(status)) return -1; /* child did not exit normally */
+    return WEXITSTATUS(status);
+}
+
+static int test_run_dspark_target_cache_cli_missing_target_model(const char *dataset_path,
+                                                                 const char *output_dir) {
+    pid_t pid = fork(); /* same invocation as test_run_dspark_target_cache_cli, minus --dspark-target-cache-target-model */
+    if (pid < 0) return -1;
+    if (pid == 0) {
+        execl("./ds4", "./ds4",
+              "-m", test_model_path(),
+              "--metal",
+              "--dspark-target-cache-dataset", dataset_path,
+              "--dspark-target-cache-out", output_dir,
+              "--dspark-target-cache-chat-template", "deepseek_v4_rendered",
+              "--dspark-target-cache-max-prompts", "1",
+              "--dspark-target-cache-max-tokens", "8",
+              "--ctx", "128",
+              (char *)NULL);
+        _exit(127); /* reached only if execl() itself failed */
+    }
+    int status = 0;
+    while (waitpid(pid, &status, 0) < 0) {
+        if (errno != EINTR) return -1; /* retry only when the wait was interrupted */
+    }
+    if (!WIFEXITED(status)) return -1; /* child did not exit normally */
+    return WEXITSTATUS(status);
+}
+
+/* Parse the base-10 unsigned number that follows the first occurrence of key in json. */
+static bool test_json_u64_field(const char *json, const char *key, uint64_t *out) {
+    const char *cursor = strstr(json, key);
+    if (!cursor) return false;
+    cursor += strlen(key);
+    cursor += strspn(cursor, " \t"); /* skip blanks between the key and its value */
+    char *stop = NULL;
+    const unsigned long long parsed = strtoull(cursor, &stop, 10);
+    if (stop != cursor) *out = (uint64_t)parsed; /* *out is untouched when nothing was parsed */
+    return stop != cursor;
+}
+
+/* Manual smoke test: runs ./ds4 twice in a fresh temp directory (once without a target model, which must
+ * fail, then with one) and checks the exported manifest.json, samples.idx and shard-00000.bin layout. */
+static void test_dspark_target_cache_export(void) {
+    char root_template[PATH_MAX];
+    snprintf(root_template, sizeof(root_template), "%s",
+             "/tmp/ds4-target-cache-test-XXXXXX");
+    char *root = mkdtemp(root_template);
+    TEST_ASSERT(root != NULL);
+    if (!root) return;
+
+    char dataset_path[PATH_MAX];
+    char output_dir[PATH_MAX];
+    char missing_target_output_dir[PATH_MAX];
+    char manifest_path[PATH_MAX];
+    char lock_path[PATH_MAX];
+    char index_path[PATH_MAX];
+    char shard_path[PATH_MAX];
+    TEST_ASSERT(snprintf(dataset_path, sizeof(dataset_path), "%s/prompts.txt", root) <
+                (int)sizeof(dataset_path));
+    TEST_ASSERT(snprintf(output_dir, sizeof(output_dir), "%s/cache", root) <
+                (int)sizeof(output_dir));
+    TEST_ASSERT(snprintf(missing_target_output_dir, sizeof(missing_target_output_dir),
+                         "%s/missing-target-cache", root) <
+                (int)sizeof(missing_target_output_dir));
+    TEST_ASSERT(snprintf(manifest_path, sizeof(manifest_path), "%s/manifest.json",
+                         output_dir) < (int)sizeof(manifest_path));
+    TEST_ASSERT(snprintf(index_path, sizeof(index_path), "%s/samples.idx", output_dir) <
+                (int)sizeof(index_path));
+    TEST_ASSERT(snprintf(shard_path, sizeof(shard_path), "%s/shard-00000.bin",
+                         output_dir) < (int)sizeof(shard_path));
+    TEST_ASSERT(snprintf(lock_path, sizeof(lock_path), "%s/ds4.lock", root) <
+                (int)sizeof(lock_path));
+    TEST_ASSERT(setenv("DS4_LOCK_FILE", lock_path, 1) == 0); /* inherited by the ./ds4 children spawned below */
+    TEST_ASSERT(test_write_dspark_target_cache_dataset(dataset_path));
+    const int missing_target_rc =
+        test_run_dspark_target_cache_cli_missing_target_model(dataset_path,
+                                                             missing_target_output_dir);
+    TEST_ASSERT(missing_target_rc != 0); /* the run without a target model must not succeed */
+
+    const int rc = test_run_dspark_target_cache_cli(dataset_path, output_dir);
+    TEST_ASSERT(rc == 0);
+    if (rc != 0) return;
+
+    char *manifest = test_read_file(manifest_path);
+    TEST_ASSERT(manifest != NULL);
+    if (!manifest) return;
+    uint64_t hidden_size = 0;
+    uint64_t target_hidden_layers = 0;
+    TEST_ASSERT(strstr(manifest, "\"version\": 2") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"format\": \"deepspec-target-cache\"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"producer\": \"ds4\"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"target_model_name_or_path\": \"deepseek-ai/DeepSeek-V4-Flash\"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"source_gguf_path\": \"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"chat_template\": \"deepseek_v4_rendered\"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"target_layer_ids\": [40, 41, 42]") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"hidden_dtype\": \"bfloat16\"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"token_dtype\": \"int32\"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"mask_dtype\": \"uint8\"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"index_record_size\": 56") != NULL);
+    TEST_ASSERT(test_json_u64_field(manifest, "\"target_hidden_layers\": ",
+                                    &target_hidden_layers));
+    TEST_ASSERT(target_hidden_layers == 3);
+    TEST_ASSERT(strstr(manifest, "\"sample_split_marker\": \"===== DS4_IMATRIX_PROMPT\"") != NULL);
+    TEST_ASSERT(strstr(manifest, "\"shard-00000.bin\"") != NULL);
+    TEST_ASSERT(test_json_u64_field(manifest, "\"hidden_size\": ", &hidden_size));
+    TEST_ASSERT(hidden_size > 0);
+    free(manifest);
+
+    uint64_t index_size = 0;
+    uint64_t shard_size = 0;
+    TEST_ASSERT(test_file_size(index_path, &index_size));
+    TEST_ASSERT(index_size == 56); /* exactly one 56-byte index record */
+    TEST_ASSERT(test_file_size(shard_path, &shard_size));
+    TEST_ASSERT(shard_size > 0);
+    if (index_size != 56 || shard_size == 0) return;
+
+    FILE *idx = fopen(index_path, "rb");
+    TEST_ASSERT(idx != NULL);
+    if (!idx) return;
+    unsigned char rec[56]; /* decoded below as little-endian fields at fixed byte offsets */
+    TEST_ASSERT(fread(rec, 1, sizeof(rec), idx) == sizeof(rec));
+    TEST_ASSERT(fclose(idx) == 0);
+
+    const uint64_t sample_id = test_le64(rec + 0);
+    const uint32_t shard_id = test_le32(rec + 8);
+    const uint32_t seq_len = test_le32(rec + 12);
+    const uint64_t input_ids_offset = test_le64(rec + 16);
+    const uint64_t attention_mask_offset = test_le64(rec + 24);
+    const uint64_t loss_mask_offset = test_le64(rec + 32);
+    const uint64_t target_hidden_states_offset = test_le64(rec + 40);
+    const uint64_t target_last_hidden_states_offset = test_le64(rec + 48);
+
+    TEST_ASSERT(sample_id == 0);
+    TEST_ASSERT(seq_len > 0 && seq_len <= 8); /* the CLI was run with --dspark-target-cache-max-tokens 8 */
+    TEST_ASSERT(shard_id == 0);
+    TEST_ASSERT(input_ids_offset == 0);
+    TEST_ASSERT(attention_mask_offset == (uint64_t)seq_len * sizeof(int32_t)); /* after seq_len int32 token ids */
+    TEST_ASSERT(loss_mask_offset == attention_mask_offset + seq_len); /* after seq_len one-byte mask entries */
+    TEST_ASSERT(target_hidden_states_offset == loss_mask_offset + seq_len); /* after seq_len one-byte mask entries */
+    const uint64_t target_hidden_bytes =
+        (uint64_t)seq_len * target_hidden_layers * hidden_size * sizeof(uint16_t);
+    TEST_ASSERT(target_last_hidden_states_offset ==
+                target_hidden_states_offset + target_hidden_bytes);
+    TEST_ASSERT(test_bf16_region_nonzero_finite(shard_path,
+                                                target_hidden_states_offset,
+                                                target_hidden_bytes));
+    const uint64_t target_last_hidden_bytes =
+        (uint64_t)seq_len * hidden_size * sizeof(uint16_t);
+    TEST_ASSERT(shard_size == target_last_hidden_states_offset + target_last_hidden_bytes); /* no trailing bytes in the shard */
+    TEST_ASSERT(test_bf16_region_nonzero_finite(shard_path,
+                                                target_last_hidden_states_offset,
+                                                target_last_hidden_bytes));
+}
+
+
 static void test_server_unit_group(void) {
     ds4_server_unit_tests_run();
 }
@@ -2202,18 +2577,31 @@ static const ds4_test_entry test_entries[] = {
     {"--metal-tensor-equivalence", "metal-tensor-equivalence", "fast/quality Metal prompt-logit and greedy equivalence", test_metal_mpp_equivalence},
     {"--streaming-decode-prefill-correctness", "streaming-decode-prefill-correctness", "streaming decode-style cold prefill drift and repeatability", test_streaming_decode_prefill_correctness},
     {"--mtp-verify-depth", "mtp-verify-depth", "MTP speculative verify commits autoregressive-identical tokens at draft depth > 2", test_mtp_verify_depth},
+    {"--dspark-speculative-block", "dspark-speculative-block", "DSpark block drafts commit only target-verified tokens", test_dspark_speculative_block},
 #endif
+    {"--dspark-binder", "dspark-binder", "DSpark draft kind/config defaults without GGUF", test_dspark_binder_helpers},
+    {"--dspark-markov-bf16", "dspark-markov-bf16", "DSpark Markov BF16 tensor decoding", test_dspark_markov_bf16_helpers},
+    {"--dspark-runtime", "dspark-runtime", "DSpark capture plan and speculative gate helpers", test_dspark_runtime_helpers},
+
     {"--server", "server", "server parser/rendering/cache unit tests", test_server_unit_group},
 };
 
+static const ds4_test_entry manual_test_entries[] = { /* searched by test_find_manual_entry(); printed under "Manual tests:" in --help */
+    {"--dspark-target-cache-export", "dspark-target-cache-export", "DeepSpec target-cache exporter smoke", test_dspark_target_cache_export},
+};
+
 static void test_print_help(const char *prog) {
     printf("Usage: %s [--all | TEST...]\n\n", prog);
     puts("Tests:");
     puts("  --all");
-    puts("      Run every test. This is the default, ordered from slower to faster.");
+    puts("      Run every default test. This is the default, ordered from slower to faster.");
     for (size_t i = 0; i < sizeof(test_entries) / sizeof(test_entries[0]); i++) {
         printf("  %-20s %s\n", test_entries[i].flag, test_entries[i].desc);
     }
+    puts("\nManual tests:");
+    for (size_t i = 0; i < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); i++) {
+        printf("  %-20s %s\n", manual_test_entries[i].flag, manual_test_entries[i].desc);
+    }
     puts("  --list");
     puts("      Print test names only.");
 #ifndef DS4_NO_GPU
@@ -2247,6 +2635,13 @@ static const ds4_test_entry *test_find_entry(const char *arg) {
     return NULL;
 }
 
+static const ds4_test_entry *test_find_manual_entry(const char *arg) {
+    const size_t n = sizeof(manual_test_entries) / sizeof(manual_test_entries[0]);
+    size_t i = 0; /* linear scan; the first entry whose flag equals arg wins */
+    while (i < n && strcmp(arg, manual_test_entries[i].flag) != 0) i++;
+    return i < n ? &manual_test_entries[i] : NULL;
+}
+
 static void test_run_entry(const ds4_test_entry *entry) {
     int before = test_failures;
     fprintf(stderr, "%s:\n", entry->name);
@@ -2262,6 +2657,7 @@ static void test_run_entry(const ds4_test_entry *entry) {
 int main(int argc, char **argv) {
     bool run_all = argc == 1;
     bool selected[sizeof(test_entries) / sizeof(test_entries[0])] = {0};
+    bool selected_manual[sizeof(manual_test_entries) / sizeof(manual_test_entries[0])] = {0};
 
     for (int i = 1; i < argc; i++) {
         if (!strcmp(argv[i], "--all")) {
@@ -2270,18 +2666,27 @@ int main(int argc, char **argv) {
             for (size_t j = 0; j < sizeof(test_entries) / sizeof(test_entries[0]); j++) {
                 puts(test_entries[j].flag);
             }
+            for (size_t j = 0; j < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); j++) {
+                puts(manual_test_entries[j].flag);
+            }
             return 0;
         } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) {
             test_print_help(argv[0]);
             return 0;
         } else {
             const ds4_test_entry *entry = test_find_entry(argv[i]);
-            if (!entry) {
-                fprintf(stderr, "ds4-test: unknown test switch: %s\n", argv[i]);
-                test_print_help(argv[0]);
-                return 2;
+            if (entry) {
+                selected[(size_t)(entry - test_entries)] = true;
+                continue;
             }
-            selected[(size_t)(entry - test_entries)] = true;
+            entry = test_find_manual_entry(argv[i]);
+            if (entry) {
+                selected_manual[(size_t)(entry - manual_test_entries)] = true;
+                continue;
+            }
+            fprintf(stderr, "ds4-test: unknown test switch: %s\n", argv[i]);
+            test_print_help(argv[0]);
+            return 2;
         }
     }
 
@@ -2293,6 +2698,9 @@ int main(int argc, char **argv) {
         for (size_t i = 0; i < sizeof(test_entries) / sizeof(test_entries[0]); i++) {
             if (selected[i]) test_run_entry(&test_entries[i]);
         }
+        for (size_t i = 0; i < sizeof(manual_test_entries) / sizeof(manual_test_entries[0]); i++) {
+            if (selected_manual[i]) test_run_entry(&manual_test_entries[i]);
+        }
     }
 
 #ifndef DS4_NO_GPU