From 96c5be9dd0fb9971b09df9e2b0647c749aa4fb0d Mon Sep 17 00:00:00 2001
From: wjinxu <1299461899@qq.com>
Date: Thu, 2 Jul 2026 15:24:03 +0800
Subject: [PATCH 1/4] spec: add DSpark speculative decoding

DSpark (DeepSpec, 2026) on top of the merged DFlash drafter. It reuses the
DFlash encoder/decoder graph, target feature extraction and KV-cache injection,
and the verify/accept path unchanged; the draft model is a new "dspark" arch
adding a low-rank Markov head (markov_w1/w2) and an optional (unused here)
confidence head. No new public APIs.

The draft proposal step is the only change: the block is anchor-first (position
0 already predicts the first draft) and the decoder graph applies a
semi-autoregressive, previous-token-conditioned logit bias in-graph, chained
per block position:

  logits'(i) = logits(i) + markov_w2 . markov_w1[prev(i)]
  prev(0)    = the block's anchor token, prev(i>0) = argmax(logits'(i-1))

vectorized across all blocks in the batch; the anchors are fed through a
dedicated graph input (token 0 of every block). Greedy stays lossless
(verify unchanged, same as DFlash).

- new arch "dspark" (llama_model_dspark : llama_model_dflash, reuses the graph,
  loads the markov/confidence tensors; shares the target's embed/lm_head).
- Qwen3DSparkModel converter.
- new spec type "draft-dspark" (common_speculative_impl_draft_dspark :
  common_speculative_impl_draft_dflash, overrides draft() only: submits whole
  anchor-first blocks and greedily reads back the biased logits).
---
 common/common.h                |   3 +-
 common/speculative.cpp         | 113 ++++++++++++++++++++++++++++-
 conversion/__init__.py         |   1 +
 conversion/qwen.py             |  50 +++++++++++++
 gguf-py/gguf/constants.py      |  28 ++++++++
 gguf-py/gguf/gguf_writer.py    |   3 +
 gguf-py/gguf/tensor_mapping.py |  12 ++++
 src/llama-arch.cpp             |  10 +++
 src/llama-arch.h               |   6 ++
 src/llama-context.cpp          |   2 +-
 src/llama-hparams.h            |   3 +
 src/llama-model.cpp            |   6 +-
 src/llama-model.h              |   6 ++
 src/models/dspark.cpp          | 126 +++++++++++++++++++++++++++++++++
 src/models/models.h            |  16 +++++
 tests/test-llama-archs.cpp     |   4 +-
 16 files changed, 381 insertions(+), 8 deletions(-)
 create mode 100644 src/models/dspark.cpp

diff --git a/common/common.h b/common/common.h
index 2adb310b83fe..a4cce86eb08f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -170,6 +170,7 @@ enum common_speculative_type {
     COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
     COMMON_SPECULATIVE_TYPE_DRAFT_MTP,     // Multi-token prediction
     COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH,  // DFlash speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK,  // DSpark speculative decoding (DFlash + Markov head)
     COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
@@ -385,7 +386,7 @@ struct common_params_speculative {
 
     uint32_t need_n_rs_seq() const {
         bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH;
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH || t == COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK;
         });
 
         return needs_rs_seq ? draft.n_max : 0u;
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 3951bbed5455..7fc6f9225c2e 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -34,6 +34,7 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
     {"draft-eagle3",  COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3},
     {"draft-mtp",     COMMON_SPECULATIVE_TYPE_DRAFT_MTP},
     {"draft-dflash",  COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH},
+    {"draft-dspark",  COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK},
     {"ngram-simple",  COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
     {"ngram-map-k",   COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
     {"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
@@ -921,8 +922,9 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
     // scratch buffer for concatenated target features [n_tokens, n_embd_enc]
     std::vector<float> features_buf;
 
-    common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq)
-        : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, n_seq)
+    common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq,
+            common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH)
+        : common_speculative_impl(type, n_seq)
         , params(params.draft)
     {
         auto * ctx_tgt = this->params.ctx_tgt;
@@ -1189,6 +1191,101 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
     }
 };
 
+// DSpark: DFlash backbone + a semi-autoregressive Markov head; reuses process(), only draft() differs.
+struct common_speculative_impl_draft_dspark : public common_speculative_impl_draft_dflash {
+    common_speculative_impl_draft_dspark(const common_params_speculative & params_in, uint32_t n_seq)
+        : common_speculative_impl_draft_dflash(params_in, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK)
+    {
+        auto * ctx_dft = params.ctx_dft;
+        const llama_model * model_dft = llama_get_model(ctx_dft);
+
+        char buf[32] = {};
+        if (llama_model_meta_val_str(model_dft, "dspark.block_size", buf, sizeof(buf)) < 0) {
+            GGML_ABORT("DSpark: missing required metadata key 'dspark.block_size'");
+        }
+        block_size = std::atoi(buf);
+        // atoi() yields 0 on non-numeric text; a non-positive block would clamp n_max to <= 0 and silently never draft
+        GGML_ASSERT(block_size > 0 && "DSpark: metadata key 'dspark.block_size' must be a positive integer");
+        if (params.n_max > block_size) {
+            params.n_max = block_size; // only the first block_size positions of a block are ever read back
+        }
+
+        LOG_INF("%s: adding speculative implementation 'draft-dspark'\n", __func__);
+        LOG_INF("%s: - block_size=%d, n_max=%d\n", __func__, block_size, params.n_max);
+    }
+
+    void draft(common_speculative_draft_params_vec & dparams) override {
+        auto * ctx_dft = params.ctx_dft;
+
+        common_batch_clear(batch);
+
+        std::vector<int32_t> i_block_beg(n_seq, -1); // batch index where each sequence's block starts, -1 = none submitted
+        std::vector<int32_t> n_block    (n_seq,  0); // number of drafts to read back for each sequence
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id];
+            if (!dp.drafting) {
+                continue;
+            }
+
+            common_sampler_reset(smpls[seq_id].get());
+
+            const int32_t n = (int32_t) dp.n_past;
+
+            int32_t n_draft = params.n_max;
+            if (dp.n_max > 0) {
+                n_draft = std::min(n_draft, dp.n_max);
+            }
+            n_draft = std::min(n_draft, block_size);
+            if (n_draft <= 0) {
+                continue;
+            }
+
+            // anchor-first block [id_last, <mask> * (block_size-1)]: submit the whole block so the
+            // in-graph Markov head can key anchors off the block boundaries; keep the first n_draft
+            i_block_beg[seq_id] = batch.n_tokens;
+            n_block    [seq_id] = n_draft;
+            for (int32_t i = 0; i < block_size; ++i) {
+                common_batch_add(batch, i == 0 ? dp.id_last : mask_token_id, n + i, { seq_id }, true);
+            }
+        }
+
+        if (batch.n_tokens == 0) {
+            return;
+        }
+
+        if (llama_decode(ctx_dft, batch) != 0) {
+            LOG_WRN("%s: llama_decode failed\n", __func__);
+            return;
+        }
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            if (i_block_beg[seq_id] < 0) {
+                continue;
+            }
+            auto & dp     = dparams[seq_id];
+            auto & result = *dp.result;
+
+            const int32_t beg = i_block_beg[seq_id];
+            const int32_t nb  = n_block[seq_id]; // drafts to keep (<= block_size)
+
+            auto * smpl = smpls[seq_id].get();
+            // greedily read block positions 0..nb-1 of this sequence (anchor-first: position 0 is already a draft)
+            for (int32_t i = 0; i < nb; ++i) {
+                common_sampler_sample(smpl, ctx_dft, beg + i, true);
+
+                const auto * cur_p = common_sampler_get_candidates(smpl, true);
+
+                const llama_token id = cur_p->data[0].id;
+
+                common_sampler_accept(smpl, id, true);
+
+                result.push_back(id);
+            }
+        }
+    }
+};
+
 struct common_speculative_impl_draft_mtp : public common_speculative_impl {
     common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft)
 
@@ -2133,6 +2230,7 @@ std::string common_speculative_type_to_str(common_speculative_type type) {
         case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3:  return "draft-eagle3";
         case COMMON_SPECULATIVE_TYPE_DRAFT_MTP:     return "draft-mtp";
         case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH:  return "draft-dflash";
+        case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK:  return "draft-dspark";
         case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:  return "ngram-simple";
         case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K:   return "ngram-map-k";
         case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v";
@@ -2186,6 +2284,7 @@ int32_t common_speculative_n_max(const common_params_speculative * spec) {
             case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3:
             case COMMON_SPECULATIVE_TYPE_DRAFT_MTP:
             case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH:
+            case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK:
                 n_max = std::max(n_max, std::max(0, spec->draft.n_max));
                 break;
             case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:
@@ -2224,6 +2323,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
         bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
         bool has_draft_mtp    = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP))    && params.draft.ctx_dft != nullptr;
         bool has_draft_dflash = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH)) && params.draft.ctx_dft != nullptr;
+        bool has_draft_dspark = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK)) && params.draft.ctx_dft != nullptr;
 
 
 
@@ -2234,7 +2334,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
         bool has_ngram_mod     = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MOD));
 
         // when adding a new type - update here the logic above
-        static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 10);
+        static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 11);
 
         // this list here defines the priority of the speculators
         // the one with highest priority are listed first
@@ -2267,6 +2367,9 @@ common_speculative * common_speculative_init(common_params_speculative & params,
         if (has_draft_dflash) {
             configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, params));
         }
+        if (has_draft_dspark) {
+            configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, params));
+        }
     }
 
     std::vector<std::unique_ptr<common_speculative_impl>> impls = {};
@@ -2291,6 +2394,10 @@ common_speculative * common_speculative_init(common_params_speculative & params,
                 impls.push_back(std::make_unique<common_speculative_impl_draft_dflash>(config.params, n_seq));
                 break;
             }
+            case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK: {
+                impls.push_back(std::make_unique<common_speculative_impl_draft_dspark>(config.params, n_seq));
+                break;
+            }
             case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
                 common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple);
 
diff --git a/conversion/__init__.py b/conversion/__init__.py
index 02ea6385208a..8c723597d7b2 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -51,6 +51,7 @@
     "DeepseekV3ForCausalLM": "deepseek",
     "DeepseekV32ForCausalLM": "deepseek",
     "DFlashDraftModel": "qwen",
+    "Qwen3DSparkModel": "qwen",
     "DeepseekV4ForCausalLM": "deepseek",
     "DistilBertForMaskedLM": "bert",
     "DistilBertForSequenceClassification": "bert",
diff --git a/conversion/qwen.py b/conversion/qwen.py
index 0356bd2da783..d91ed49cf280 100644
--- a/conversion/qwen.py
+++ b/conversion/qwen.py
@@ -673,3 +673,53 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
         if not name.startswith("model."):
             name = "model." + name
         return super().filter_tensors((name, gen))
+
+
+@ModelBase.register("Qwen3DSparkModel")
+class DSparkModel(Qwen3Model):
+    # DSpark = DFlash backbone + a semi-autoregressive Markov head (+ optional confidence head).
+    # The DeepSpec checkpoint stores its config flat (block_size / target_layer_ids / mask_token_id /
+    # markov_rank at top level). embed_tokens / lm_head are byte-identical to the target, so they are
+    # NOT emitted here -- the DSpark decoder shares the target's via ctx_other (same as DFlash).
+    model_arch = gguf.MODEL_ARCH.DSPARK
+
+    def set_vocab(self):
+        if self.target_model_dir is None:
+            raise ValueError(
+                "DSpark draft model requires --target-model-dir to be specified. "
+                "Please provide the path to the target model directory containing the tokenizer."
+            )
+        logger.info(f"DSpark: Using tokenizer from target model: {self.target_model_dir}")
+        original_dir = self.dir_model  # point dir_model at the target while the parent builds the vocab, then restore it
+        self.dir_model = self.target_model_dir
+        super().set_vocab()
+        self.dir_model = original_dir
+
+        mask_token_id = self.hparams.get("mask_token_id")  # optional: only written when the config provides it
+        if mask_token_id is not None:
+            self.gguf_writer.add_mask_token_id(mask_token_id)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        block_size = self.hparams.get("block_size", 7)  # NOTE(review): silently falls back to 7 when the config omits block_size -- confirm this default
+        self.gguf_writer.add_block_size(block_size)
+
+        # flat DeepSpec schema; mirror DFlash's +1 extract-layer convention (key is skipped when the list is empty)
+        target_layer_ids = self.hparams.get("target_layer_ids", [])
+        if target_layer_ids:
+            extract_layer_ids = [i + 1 for i in target_layer_ids]
+            self.gguf_writer.add_target_layers(extract_layer_ids)
+
+        markov_rank = self.hparams.get("markov_rank", 0)  # NOTE(review): a config without markov_rank is written as rank 0 -- confirm the loader accepts that
+        self.gguf_writer.add_markov_rank(markov_rank)
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, gen = item
+        # embed_tokens / lm_head are byte-identical to the target and shared at runtime -- drop them
+        if name.endswith(("embed_tokens.weight", "lm_head.weight")):
+            return None
+        if not name.startswith("model."):  # add the "model." prefix before delegating to the parent filter
+            name = "model." + name
+        return super().filter_tensors((name, gen))
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index cd4cdef8991f..626d6ed51af6 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -158,6 +158,7 @@ class LLM:
         TARGET_LAYERS                     = "{arch}.target_layers"
         TARGET_HIDDEN_SIZE                = "{arch}.target_hidden_size"
         BLOCK_SIZE                        = "{arch}.block_size"
+        MARKOV_RANK                       = "{arch}.markov_rank"
         NORM_BEFORE_RESIDUAL              = "{arch}.norm_before_residual"
 
     class Attention:
@@ -530,6 +531,7 @@ class MODEL_ARCH(IntEnum):
     MISTRAL3         = auto()
     EAGLE3           = auto()
     DFLASH           = auto()
+    DSPARK           = auto()
     MISTRAL4         = auto()
     PADDLEOCR        = auto()
     MIMO2            = auto()
@@ -953,6 +955,9 @@ class MODEL_TENSOR(IntEnum):
     # eagle3
     FC                     = auto()  # feature fusion layer
     D2T                    = auto()  # draft to target vocabulary mapping
+    DSPARK_MARKOV_W1       = auto()  # dspark markov head: prev-token embed
+    DSPARK_MARKOV_W2       = auto()  # dspark markov head: bias projection
+    DSPARK_CONF_PROJ       = auto()  # dspark confidence head: proj
     # lfm2 audio
     A_ENC_NORM_CONV        = auto()
     A_ENC_LINEAR_POS       = auto()
@@ -1111,6 +1116,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.MISTRAL3:         "mistral3",
     MODEL_ARCH.EAGLE3:           "eagle3",
     MODEL_ARCH.DFLASH:           "dflash",
+    MODEL_ARCH.DSPARK:           "dspark",
     MODEL_ARCH.MISTRAL4:         "mistral4",
     MODEL_ARCH.PADDLEOCR:        "paddleocr",
     MODEL_ARCH.MIMO2:            "mimo2",
@@ -1559,6 +1565,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD:    "blk.{bid}.nextn.shared_head_head",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM:    "blk.{bid}.nextn.shared_head_norm",
     MODEL_TENSOR.FC:                        "fc",
+    MODEL_TENSOR.DSPARK_MARKOV_W1:          "markov_w1",
+    MODEL_TENSOR.DSPARK_MARKOV_W2:          "markov_w2",
+    MODEL_TENSOR.DSPARK_CONF_PROJ:          "conf_proj",
     MODEL_TENSOR.D2T:                       "d2t",
 }
 
@@ -4204,6 +4213,25 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FC,
         MODEL_TENSOR.ENC_OUTPUT_NORM,
     ],
+    MODEL_ARCH.DSPARK: [
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FC,
+        MODEL_TENSOR.ENC_OUTPUT_NORM,
+        MODEL_TENSOR.DSPARK_MARKOV_W1,
+        MODEL_TENSOR.DSPARK_MARKOV_W2,
+        MODEL_TENSOR.DSPARK_CONF_PROJ,
+    ],
     MODEL_ARCH.MISTRAL4: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 1e277f0687c5..09522d79aab9 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -946,6 +946,9 @@ def add_sliding_window(self, value: int) -> None:
     def add_block_size(self, value: int) -> None:
         self.add_uint32(Keys.LLM.BLOCK_SIZE.format(arch=self.arch), value)
 
+    def add_markov_rank(self, value: int) -> None:  # writes value as a uint32 under the "{arch}.markov_rank" key
+        self.add_uint32(Keys.LLM.MARKOV_RANK.format(arch=self.arch), value)
+
     def add_target_layers(self, value: Sequence[int]) -> None:
         self.add_array(Keys.LLM.TARGET_LAYERS.format(arch=self.arch), value)
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 9efb36f8a447..011dae886789 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1290,6 +1290,18 @@ class TensorNameMap:
             "model.fc", # dflash
         ),
 
+        MODEL_TENSOR.DSPARK_MARKOV_W1: (
+            "model.markov_head.markov_w1", # dspark
+        ),
+
+        MODEL_TENSOR.DSPARK_MARKOV_W2: (
+            "model.markov_head.markov_w2", # dspark
+        ),
+
+        MODEL_TENSOR.DSPARK_CONF_PROJ: (
+            "model.confidence_head.proj", # dspark
+        ),
+
         MODEL_TENSOR.CLS: (
             "classifier",       # jina
             "classifier.dense", # roberta
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b890e66fcf6e..a66af71f7114 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -131,6 +131,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MISTRAL3,         "mistral3"         },
     { LLM_ARCH_EAGLE3,           "eagle3"           },
     { LLM_ARCH_DFLASH,           "dflash"           },
+    { LLM_ARCH_DSPARK,           "dspark"           },
     { LLM_ARCH_MISTRAL4,         "mistral4"         },
     { LLM_ARCH_PADDLEOCR,        "paddleocr"        },
     { LLM_ARCH_MIMO2,            "mimo2"            },
@@ -307,6 +308,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_TARGET_LAYERS,         "%s.target_layers"        },
     { LLM_KV_TARGET_HIDDEN_SIZE,    "%s.target_hidden_size"   },
+    { LLM_KV_MARKOV_RANK,           "%s.markov_rank"          },
+    { LLM_KV_BLOCK_SIZE,            "%s.block_size"           },
     { LLM_KV_NORM_BEFORE_RESIDUAL,  "%s.norm_before_residual" },
 
     { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
@@ -603,6 +606,9 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_MASKED_EMBD_ORDERING,                   "masked_embd_ordering" },
     { LLM_TENSOR_FC,                                     "fc" },
     { LLM_TENSOR_D2T,                                    "d2t" },
+    { LLM_TENSOR_DSPARK_MARKOV_W1,                       "markov_w1" },
+    { LLM_TENSOR_DSPARK_MARKOV_W2,                       "markov_w2" },
+    { LLM_TENSOR_DSPARK_CONF_PROJ,                       "conf_proj" },
 };
 
 // declare information about the model weight tensors:
@@ -854,6 +860,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // eagle3
     {LLM_TENSOR_FC,                         {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
     {LLM_TENSOR_D2T,                        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
+    // dspark
+    {LLM_TENSOR_DSPARK_MARKOV_W1,           {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_DSPARK_MARKOV_W2,           {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_DSPARK_CONF_PROJ,           {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
diff --git a/src/llama-arch.h b/src/llama-arch.h
index a4f5091e7170..ba6fdeaf2d0a 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -145,6 +145,7 @@ enum llm_arch {
     LLM_ARCH_MELLUM,
     LLM_ARCH_EAGLE3,
     LLM_ARCH_DFLASH,
+    LLM_ARCH_DSPARK,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -353,6 +354,8 @@ enum llm_kv {
 
     LLM_KV_TARGET_LAYERS,
     LLM_KV_TARGET_HIDDEN_SIZE,
+    LLM_KV_MARKOV_RANK,
+    LLM_KV_BLOCK_SIZE,
     LLM_KV_NORM_BEFORE_RESIDUAL,
 
     LLM_KV_SHORTCONV_L_CACHE,
@@ -611,6 +614,9 @@ enum llm_tensor {
     LLM_TENSOR_MASKED_EMBD_ORDERING,
     LLM_TENSOR_FC,
     LLM_TENSOR_D2T,
+    LLM_TENSOR_DSPARK_MARKOV_W1,
+    LLM_TENSOR_DSPARK_MARKOV_W2,
+    LLM_TENSOR_DSPARK_CONF_PROJ,
 };
 
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 0465430df43a..77b8450ea837 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -100,7 +100,7 @@ llama_context::llama_context(
         cparams.ctx_other = params.ctx_other;
     }
 
-    if (model.arch == LLM_ARCH_EAGLE3 || model.arch == LLM_ARCH_DFLASH) {
+    if (model.arch == LLM_ARCH_EAGLE3 || model.arch == LLM_ARCH_DFLASH || model.arch == LLM_ARCH_DSPARK) {
         if (model.tok_embd == nullptr || model.output == nullptr) {
             if (params.ctx_other == nullptr) {
                 throw std::runtime_error(model.arch_name() + " requires ctx_other to be set (this warning is normal during memory fitting)");
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 8be5f28f39e6..e0e4ef245bfd 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -187,6 +187,9 @@ struct llama_hparams {
     // for Classifiers
     uint32_t n_cls_out = 1;
 
+    // for DSpark: the trained draft block size, in tokens (anchor + n-1 masks)
+    uint32_t n_dspark_block = 0;
+
     // input embedding dimension (0 = use n_embd)
     uint32_t n_embd_inp_impl = 0;
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d58ebac28b9b..0598437b170b 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -296,6 +296,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_eagle3(params);
         case LLM_ARCH_DFLASH:
             return new llama_model_dflash(params);
+        case LLM_ARCH_DSPARK:
+            return new llama_model_dspark(params);
         case LLM_ARCH_MIMO2:
             return new llama_model_mimo2(params);
         case LLM_ARCH_KIMI_LINEAR:
@@ -2524,6 +2526,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_TALKIE:
         case LLM_ARCH_MELLUM:
         case LLM_ARCH_DFLASH:
+        case LLM_ARCH_DSPARK:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
@@ -2648,7 +2651,8 @@ bool llama_model_has_encoder(const llama_model * model) {
         case LLM_ARCH_T5:
         case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_EAGLE3:
-        case LLM_ARCH_DFLASH:    return true;
+        case LLM_ARCH_DFLASH:
+        case LLM_ARCH_DSPARK:    return true;
         default:                 return false;
     }
 }
diff --git a/src/llama-model.h b/src/llama-model.h
index 4800d2928c52..e7295db30ab6 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -599,6 +599,12 @@ struct llama_model {
     struct ggml_tensor * fc  = nullptr;  // feature fusion layer
     struct ggml_tensor * d2t = nullptr;  // draft to target vocabulary mapping
 
+    // dspark
+    struct ggml_tensor * dspark_markov_w1   = nullptr;
+    struct ggml_tensor * dspark_markov_w2   = nullptr;
+    struct ggml_tensor * dspark_conf_proj   = nullptr;
+    struct ggml_tensor * dspark_conf_proj_b = nullptr;
+
     // unified vector to store target-model extracted layer ids in eagle3, dflash, etc.
     std::vector<int32_t> target_layer_ids;
 
diff --git a/src/models/dspark.cpp b/src/models/dspark.cpp
new file mode 100644
index 000000000000..1c873407ab4a
--- /dev/null
+++ b/src/models/dspark.cpp
@@ -0,0 +1,126 @@
+#include "models.h"
+
+// DSpark = DFlash backbone + a semi-autoregressive Markov head applied in-graph by the decoder
+
+void llama_model_dspark::load_arch_hparams(llama_model_loader & ml) {
+    llama_model_dflash::load_arch_hparams(ml); // DFlash hparams first, then the DSpark-only key below
+
+    ml.get_key(LLM_KV_BLOCK_SIZE, hparams.n_dspark_block, /*required*/ true); // "%s.block_size"; read back by the decoder graph
+}
+
+void llama_model_dspark::load_arch_tensors(llama_model_loader & ml) {
+    llama_model_dflash::load_arch_tensors(ml); // DFlash tensors first, then the DSpark heads below
+
+    LLAMA_LOAD_LOCALS;
+
+    uint32_t markov_rank = 0;
+    ml.get_key(LLM_KV_MARKOV_RANK, markov_rank, /*required*/ true);
+    const int64_t R = (int64_t) markov_rank; // leading dimension shared by both Markov matrices
+
+    dspark_markov_w1 = create_tensor(tn(LLM_TENSOR_DSPARK_MARKOV_W1, "weight"), { R, n_vocab }, 0); // rows picked by token id (ggml_get_rows in the decoder graph)
+    dspark_markov_w2 = create_tensor(tn(LLM_TENSOR_DSPARK_MARKOV_W2, "weight"), { R, n_vocab }, 0); // maps a picked row to a per-vocab bias (ggml_mul_mat in the decoder graph)
+
+    dspark_conf_proj   = create_tensor(tn(LLM_TENSOR_DSPARK_CONF_PROJ, "weight"), { n_embd + R, 1 }, TENSOR_NOT_REQUIRED); // optional; not referenced by the graphs in this file
+    dspark_conf_proj_b = create_tensor(tn(LLM_TENSOR_DSPARK_CONF_PROJ, "bias"),   { 1 },             TENSOR_NOT_REQUIRED);
+}
+
+std::unique_ptr<llm_graph_context> llama_model_dspark::build_arch_graph(const llm_graph_params & params) const {
+    switch (params.gtype) { // pick the graph specialization from the requested graph type
+        case LLM_GRAPH_TYPE_ENCODER:
+            return std::make_unique<graph<true>>(*this, params);  // is_enc = true: forwards to the DFlash encoder graph
+        case LLM_GRAPH_TYPE_DEFAULT:
+        case LLM_GRAPH_TYPE_DECODER:
+            return std::make_unique<graph<false>>(*this, params); // is_enc = false: DFlash decoder + Markov logit bias
+        default:
+            GGML_ABORT("invalid graph type");
+    };
+}
+
+// DSpark encoder == DFlash encoder: the constructor only forwards to the DFlash encoder graph
+template <>
+llama_model_dspark::graph<true>::graph(const llama_model & model, const llm_graph_params & params)
+    : llama_model_dflash::graph<true>(model, params) {}
+
+// anchor (committed last) token of every draft block: token 0 of each block in the ubatch
+class llm_graph_input_dspark_anchor : public llm_graph_input_i {
+public:
+    llm_graph_input_dspark_anchor(uint32_t block_size) : block_size(block_size) {}
+    virtual ~llm_graph_input_dspark_anchor() = default;
+
+    void set_input(const llama_ubatch * ubatch) override { // copies token 0 of every block into the anchors tensor
+        GGML_ASSERT(ubatch->token); // token ids are required
+        const int64_t n_blocks = anchors->ne[0];
+        std::vector<int32_t> buf(n_blocks);
+        for (int64_t j = 0; j < n_blocks; ++j) {
+            buf[j] = ubatch->token[j*block_size]; // consecutive anchors are block_size tokens apart
+        }
+        ggml_backend_tensor_set(anchors, buf.data(), 0, n_blocks*sizeof(int32_t));
+    }
+
+    bool can_reuse(const llm_graph_params & params) override { // true only if the ubatch has token ids and holds exactly anchors->ne[0] blocks
+        return params.ubatch.token && anchors &&
+               anchors->ne[0]*(int64_t) block_size == (int64_t) params.ubatch.n_tokens;
+    }
+
+    ggml_tensor * anchors = nullptr; // I32 [n_blocks]
+
+    const uint32_t block_size; // tokens per block; the index stride used by set_input()
+};
+
+// DSpark decoder: DFlash decoder + Markov bias on the draft logits, chained per block position:
+//   logits'(i) = logits(i) + markov_w2 . markov_w1[prev(i)]
+//   prev(0)    = the block's anchor token, prev(i>0) = argmax(logits'(i-1))
+template <>
+llama_model_dspark::graph<false>::graph(const llama_model & model, const llm_graph_params & params)
+    : llama_model_dflash::graph<false>(model, params) {
+    // KV-injection (embd) batch: no logits to bias
+    if (ubatch.embd) {
+        return;
+    }
+
+    ggml_tensor * w1 = model.dspark_markov_w1;
+    ggml_tensor * w2 = model.dspark_markov_w2;
+    GGML_ASSERT(w1 && w2 && "DSpark markov weights not loaded");
+
+    ggml_tensor * base = res->t_logits; // [n_vocab, n_tokens]
+    const int64_t n_vocab = base->ne[0];
+    const int64_t n_tok   = base->ne[1];
+
+    const int64_t bs = model.hparams.n_dspark_block; // tokens per block
+    GGML_ASSERT(bs > 0);
+
+    // the drafting loop always submits whole anchor-first blocks; NOTE(review): any other size leaves res->t_logits unbiased -- confirm intended
+    if (n_tok % bs != 0) {
+        return;
+    }
+    const int64_t n_blocks = n_tok / bs;
+
+    auto inp = std::make_unique<llm_graph_input_dspark_anchor>((uint32_t) bs);
+    inp->anchors = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_blocks);
+    ggml_set_input(inp->anchors);
+    ggml_tensor * prev = inp->anchors; // I32 [n_blocks]
+    res->add_input(std::move(inp)); // inp is moved-from after this; prev was captured above
+
+    ggml_tensor * cat = nullptr; // biased logits, appended one block position per iteration
+    for (int64_t i = 0; i < bs; ++i) {
+        ggml_tensor * bias = ggml_mul_mat(ctx0, w2, ggml_get_rows(ctx0, w1, prev)); // [n_vocab, n_blocks]
+
+        // position i of every block: strided view [n_vocab, n_blocks]
+        ggml_tensor * base_i = ggml_view_2d(ctx0, base, n_vocab, n_blocks, bs*base->nb[1], i*base->nb[1]);
+        ggml_tensor * col    = ggml_add(ctx0, base_i, bias);
+
+        cat = cat ? ggml_concat(ctx0, cat, col, 1) : col;
+
+        if (i + 1 < bs) { // the last position has no successor to condition
+            prev = ggml_argmax(ctx0, col); // I32 [n_blocks]
+        }
+    }
+
+    // cat is position-major; restore the ubatch's block-major order
+    ggml_tensor * out = ggml_reshape_3d(ctx0, cat, n_vocab, n_blocks, bs);
+    out = ggml_cont(ctx0, ggml_permute(ctx0, out, 0, 2, 1, 3)); // [n_vocab, bs, n_blocks]
+    out = ggml_reshape_2d(ctx0, out, n_vocab, n_tok);
+
+    res->t_logits = out; // the biased logits replace the ones set by the base graph
+    ggml_build_forward_expand(gf, out);
+}
diff --git a/src/models/models.h b/src/models/models.h
index 7a52e7bc1ab7..d62e162862bb 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -1253,6 +1253,22 @@ struct llama_model_dflash : public llama_model_base {
 };
 
 
+struct llama_model_dspark : public llama_model_dflash {
+    llama_model_dspark(const struct llama_model_params & params) : llama_model_dflash(params) {}
+    // on top of what DFlash loads: the DSpark block size, the low-rank Markov head (markov_w1/w2) and the optional confidence head
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    // the DFlash graph for the same is_enc, with the Markov logit bias applied in-graph to the decoder's draft logits
+    template <bool is_enc>
+    struct graph : public llama_model_dflash::graph<is_enc> {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
 struct llama_model_mistral4 : public llama_model_deepseek2 {
     llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {}
     // reuse load_arch_hparams and load_arch_tensors from llama_model_deepseek2
diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
index f39abe773fc6..5a86d7365847 100644
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -454,7 +454,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
         if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
             continue; // FIXME: ISWA KV cache initialization needs more fixture params
         }
-        if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH) {
+        if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH || arch == LLM_ARCH_DSPARK) {
             continue;
         }
         for (bool moe : {false, true}) {
@@ -560,7 +560,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
         if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
             continue; // FIXME: ISWA KV cache initialization needs more fixture params
         }
-        if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH) {
+        if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH || arch == LLM_ARCH_DSPARK) {
             continue;
         }
 

From 8c548e7a8f1b6b72f5a26c69b8c7815b40bb3bd2 Mon Sep 17 00:00:00 2001
From: wjinxu <1299461899@qq.com>
Date: Sat, 4 Jul 2026 14:44:41 +0800
Subject: [PATCH 2/4] spec: read draft block size in the dflash impl

---
 common/speculative.cpp | 46 ++++++++++++++++--------------------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 7fc6f9225c2e..cb58d7b8b84c 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -923,7 +923,8 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
     std::vector<float> features_buf;
 
     common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq,
-            common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH)
+            common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH,
+            bool draft_at_anchor = false)
         : common_speculative_impl(type, n_seq)
         , params(params.draft)
     {
@@ -942,25 +943,30 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
         n_embd_dec    = llama_model_n_embd(model_dft);
         n_embd_enc    = (int32_t) target_layer_ids_n * n_embd_tgt;
 
-        // read the trained block size from the dflash.block_size metadata key
+        // read the trained block size from the <arch>.block_size metadata key
         block_size = 16;
         {
+            char arch[64] = {};
+            llama_model_meta_val_str(model_dft, "general.architecture", arch, sizeof(arch));
+
             char buf[32] = {};
-            if (llama_model_meta_val_str(model_dft, "dflash.block_size", buf, sizeof(buf)) >= 0) {
+            if (llama_model_meta_val_str(model_dft, (std::string(arch) + ".block_size").c_str(), buf, sizeof(buf)) >= 0) {
                 block_size = std::atoi(buf);
             }
         }
         mask_token_id = llama_vocab_mask(llama_model_get_vocab(model_dft));
 
-        LOG_INF("%s: adding speculative implementation 'draft-dflash'\n", __func__);
+        LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(type).c_str());
         LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min);
         LOG_INF("%s: - block_size=%d, mask_token_id=%d, n_extract=%u\n", __func__, block_size, mask_token_id, target_layer_ids_n);
 
-        // DFlash input is [id_last, <mask> * (block_size-1)], so it can draft at most block_size-1 tokens per step
-        if (this->params.n_max > block_size - 1) {
-            LOG_WRN("%s: requested draft size %d exceeds the trained DFlash block size %d -- clamping to %d draft tokens per step\n",
-                    __func__, this->params.n_max, block_size - 1, block_size - 1);
-            this->params.n_max = block_size - 1;
+        // the input block is [id_last, <mask> * (block_size-1)], so a step drafts at most block_size-1
+        // tokens from the mask positions, plus one more when the head also drafts at the anchor position
+        const int32_t n_draft_max = draft_at_anchor ? block_size : block_size - 1;
+        if (this->params.n_max > n_draft_max) {
+            LOG_WRN("%s: requested draft size %d exceeds the trained block size %d -- clamping to %d draft tokens per step\n",
+                    __func__, this->params.n_max, block_size, n_draft_max);
+            this->params.n_max = n_draft_max;
         }
 
         batch        = llama_batch_init(llama_n_batch(ctx_dft), 0,          n_seq);
@@ -1193,26 +1199,8 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
 
 // DSpark: DFlash backbone + a semi-autoregressive Markov head; reuses process(), only draft() differs
 struct common_speculative_impl_draft_dspark : public common_speculative_impl_draft_dflash {
-    common_speculative_impl_draft_dspark(const common_params_speculative & params_in, uint32_t n_seq)
-        : common_speculative_impl_draft_dflash(params_in, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK)
-    {
-        auto * ctx_dft = params.ctx_dft;
-        const llama_model * model_dft = llama_get_model(ctx_dft);
-
-        {
-            char buf[32] = {};
-            if (llama_model_meta_val_str(model_dft, "dspark.block_size", buf, sizeof(buf)) < 0) {
-                GGML_ABORT("DSpark: missing required metadata key 'dspark.block_size'");
-            }
-            block_size = std::atoi(buf);
-        }
-        if (params.n_max > block_size) {
-            params.n_max = block_size;
-        }
-
-        LOG_INF("%s: adding speculative implementation 'draft-dspark'\n", __func__);
-        LOG_INF("%s: - block_size=%d, n_max=%d\n", __func__, block_size, params.n_max);
-    }
+    common_speculative_impl_draft_dspark(const common_params_speculative & params, uint32_t n_seq)
+        : common_speculative_impl_draft_dflash(params, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, /*draft_at_anchor*/ true) {}
 
     void draft(common_speculative_draft_params_vec & dparams) override {
         auto * ctx_dft = params.ctx_dft;

From 47932db0e1e555ac7aa51b97b29e2eb402361712 Mon Sep 17 00:00:00 2001
From: wjinxu <1299461899@qq.com>
Date: Sat, 4 Jul 2026 14:44:53 +0800
Subject: [PATCH 3/4] docs: add DSpark section to speculative.md

---
 docs/speculative.md | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/docs/speculative.md b/docs/speculative.md
index 4100b92f8f18..810e05e60900 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -78,6 +78,32 @@ See:
 
 - #22105
 
+### DSpark (`draft-dspark`)
+
+DSpark extends DFlash with a semi-autoregressive _Markov head_: the draft still emits a whole
+block per forward pass, but each block position's logits are biased by a low-rank term keyed on
+the previous token, chained in-graph across the block. This keeps drafting at one decode per
+block while recovering some of the left-to-right signal that pure block diffusion loses.
+
+The draft is a small DeepSpec checkpoint trained for a specific target (for example
+[`deepseek-ai/dspark_qwen3_4b_block7`](https://huggingface.co/deepseek-ai/dspark_qwen3_4b_block7)
+for `Qwen/Qwen3-4B`). Convert it with `--target-model-dir` so it inherits the target's tokenizer
+and token embeddings:
+
+```bash
+python convert_hf_to_gguf.py deepseek-ai/dspark_qwen3_4b_block7 \
+    --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-DSpark.gguf
+
+llama-server -m Qwen3-4B.gguf -md Qwen3-4B-DSpark.gguf \
+    --spec-type draft-dspark --spec-draft-n-max 7 -fa on --jinja
+```
+
+`--spec-draft-n-max` is clamped to the draft model's trained block size.
+
+See:
+
+- #25173
+
 ### n-gram Cache (`ngram-cache`)
 
 An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
@@ -173,7 +199,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
 ### General Speculative Parameters
 
 ```
---spec-type [none|draft-simple|draft-eagle3|draft-dflash|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
+--spec-type [none|draft-simple|draft-eagle3|draft-dflash|draft-dspark|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                         comma-separated list of types of speculative decoding to use
                                         (default: none)
                                         (env: LLAMA_ARG_SPEC_TYPE)
@@ -314,6 +340,7 @@ Specifies a comma-separated list of speculative decoding types to use.
 | `draft-simple` | Use a simple draft model for speculation |
 | `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states |
 | `draft-dflash` | Use a DFlash block-diffusion draft model that emits a block per step |
+| `draft-dspark` | Use a DSpark draft model (DFlash backbone + semi-autoregressive Markov head) |
 | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
 | `ngram-cache` | Use n-gram cache lookup |
 | `ngram-simple` | Use simple n-gram pattern matching |

From 1ba891a03f0ba6fa7669160f9fade4f20dec3f6b Mon Sep 17 00:00:00 2001
From: wjinxu <1299461899@qq.com>
Date: Sat, 4 Jul 2026 14:59:45 +0800
Subject: [PATCH 4/4] spec: keep dspark block size read in the dspark impl

---
 common/speculative.cpp | 46 ++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index cb58d7b8b84c..7fc6f9225c2e 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -923,8 +923,7 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
     std::vector<float> features_buf;
 
     common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq,
-            common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH,
-            bool draft_at_anchor = false)
+            common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH)
         : common_speculative_impl(type, n_seq)
         , params(params.draft)
     {
@@ -943,30 +942,25 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
         n_embd_dec    = llama_model_n_embd(model_dft);
         n_embd_enc    = (int32_t) target_layer_ids_n * n_embd_tgt;
 
-        // read the trained block size from the <arch>.block_size metadata key
+        // read the trained block size from the dflash.block_size metadata key
         block_size = 16;
         {
-            char arch[64] = {};
-            llama_model_meta_val_str(model_dft, "general.architecture", arch, sizeof(arch));
-
             char buf[32] = {};
-            if (llama_model_meta_val_str(model_dft, (std::string(arch) + ".block_size").c_str(), buf, sizeof(buf)) >= 0) {
+            if (llama_model_meta_val_str(model_dft, "dflash.block_size", buf, sizeof(buf)) >= 0) {
                 block_size = std::atoi(buf);
             }
         }
         mask_token_id = llama_vocab_mask(llama_model_get_vocab(model_dft));
 
-        LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(type).c_str());
+        LOG_INF("%s: adding speculative implementation 'draft-dflash'\n", __func__);
         LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min);
         LOG_INF("%s: - block_size=%d, mask_token_id=%d, n_extract=%u\n", __func__, block_size, mask_token_id, target_layer_ids_n);
 
-        // the input block is [id_last, <mask> * (block_size-1)], so a step drafts at most block_size-1
-        // tokens from the mask positions, plus one more when the head also drafts at the anchor position
-        const int32_t n_draft_max = draft_at_anchor ? block_size : block_size - 1;
-        if (this->params.n_max > n_draft_max) {
-            LOG_WRN("%s: requested draft size %d exceeds the trained block size %d -- clamping to %d draft tokens per step\n",
-                    __func__, this->params.n_max, block_size, n_draft_max);
-            this->params.n_max = n_draft_max;
+        // DFlash input is [id_last, <mask> * (block_size-1)], so it can draft at most block_size-1 tokens per step
+        if (this->params.n_max > block_size - 1) {
+            LOG_WRN("%s: requested draft size %d exceeds the trained DFlash block size %d -- clamping to %d draft tokens per step\n",
+                    __func__, this->params.n_max, block_size - 1, block_size - 1);
+            this->params.n_max = block_size - 1;
         }
 
         batch        = llama_batch_init(llama_n_batch(ctx_dft), 0,          n_seq);
@@ -1199,8 +1193,26 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
 
 // DSpark: DFlash backbone + a semi-autoregressive Markov head; reuses process(), only draft() differs
 struct common_speculative_impl_draft_dspark : public common_speculative_impl_draft_dflash {
-    common_speculative_impl_draft_dspark(const common_params_speculative & params, uint32_t n_seq)
-        : common_speculative_impl_draft_dflash(params, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, /*draft_at_anchor*/ true) {}
+    common_speculative_impl_draft_dspark(const common_params_speculative & params_in, uint32_t n_seq)
+        : common_speculative_impl_draft_dflash(params_in, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK)
+    {
+        const llama_model * model_dft = llama_get_model(params.ctx_dft);
+
+        {
+            char buf[32] = {};
+            if (llama_model_meta_val_str(model_dft, "dspark.block_size", buf, sizeof(buf)) < 0) {
+                GGML_ABORT("DSpark: missing required metadata key 'dspark.block_size'");
+            }
+            block_size = std::atoi(buf);
+            GGML_ASSERT(block_size > 0 && "DSpark: invalid 'dspark.block_size'");
+        }
+        // the base ctor already clamped params.n_max to its own block_size-1 (dflash.block_size, or the default when the
+        // key is absent), so clamp the *requested* value instead: an anchor-first block drafts up to block_size tokens
+        params.n_max = params_in.draft.n_max > block_size ? block_size : params_in.draft.n_max;
+
+        LOG_INF("%s: adding speculative implementation 'draft-dspark'\n", __func__);
+        LOG_INF("%s: - block_size=%d, n_max=%d\n", __func__, block_size, params.n_max);
+    }
 
     void draft(common_speculative_draft_params_vec & dparams) override {
         auto * ctx_dft = params.ctx_dft;