From 96c5be9dd0fb9971b09df9e2b0647c749aa4fb0d Mon Sep 17 00:00:00 2001 From: wjinxu <1299461899@qq.com> Date: Thu, 2 Jul 2026 15:24:03 +0800 Subject: [PATCH 1/4] spec: add DSpark speculative decoding DSpark (DeepSpec, 2026) on top of the merged DFlash drafter. It reuses the DFlash encoder/decoder graph, target feature extraction and KV-cache injection, and the verify/accept path unchanged; the draft model is a new "dspark" arch adding a low-rank Markov head (markov_w1/w2) and an optional (unused here) confidence head. No new public APIs. The proposal is the only change: the block is anchor-first (position 0 already predicts the first draft) and the decoder graph applies a semi-autoregressive, previous-token conditioned logit bias in-graph, chained per block position: logits'(i) = logits(i) + markov_w2 . markov_w1[prev(i)] prev(0) = the block's anchor token, prev(i>0) = argmax(logits'(i-1)) vectorized across all blocks in the batch; the anchors are fed through a dedicated graph input (token 0 of every block). Greedy stays lossless (verify unchanged, same as DFlash). - new arch "dspark" (llama_model_dspark : llama_model_dflash, reuses the graph, loads the markov/confidence tensors; shares the target's embed/lm_head). - Qwen3DSparkModel converter. - new spec type "draft-dspark" (common_speculative_impl_draft_dspark : common_speculative_impl_draft_dflash, overrides draft() only: submits whole anchor-first blocks and greedily reads back the biased logits). 
--- common/common.h | 3 +- common/speculative.cpp | 113 ++++++++++++++++++++++++++++- conversion/__init__.py | 1 + conversion/qwen.py | 50 +++++++++++++ gguf-py/gguf/constants.py | 28 ++++++++ gguf-py/gguf/gguf_writer.py | 3 + gguf-py/gguf/tensor_mapping.py | 12 ++++ src/llama-arch.cpp | 10 +++ src/llama-arch.h | 6 ++ src/llama-context.cpp | 2 +- src/llama-hparams.h | 3 + src/llama-model.cpp | 6 +- src/llama-model.h | 6 ++ src/models/dspark.cpp | 126 +++++++++++++++++++++++++++++++++ src/models/models.h | 16 +++++ tests/test-llama-archs.cpp | 4 +- 16 files changed, 381 insertions(+), 8 deletions(-) create mode 100644 src/models/dspark.cpp diff --git a/common/common.h b/common/common.h index 2adb310b83fe..a4cce86eb08f 100644 --- a/common/common.h +++ b/common/common.h @@ -170,6 +170,7 @@ enum common_speculative_type { COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding COMMON_SPECULATIVE_TYPE_DRAFT_MTP, // Multi-token prediction COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, // DFlash speculative decoding + COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, // DSpark speculative decoding (DFlash + Markov head) COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values @@ -385,7 +386,7 @@ struct common_params_speculative { uint32_t need_n_rs_seq() const { bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) { - return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH; + return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH || t == COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK; }); return needs_rs_seq ? 
draft.n_max : 0u; diff --git a/common/speculative.cpp b/common/speculative.cpp index 3951bbed5455..7fc6f9225c2e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -34,6 +34,7 @@ const std::map common_speculative_type_fro {"draft-eagle3", COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3}, {"draft-mtp", COMMON_SPECULATIVE_TYPE_DRAFT_MTP}, {"draft-dflash", COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH}, + {"draft-dspark", COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK}, {"ngram-simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE}, {"ngram-map-k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K}, {"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V}, @@ -921,8 +922,9 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { // scratch buffer for concatenated target features [n_tokens, n_embd_enc] std::vector features_buf; - common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq) - : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, n_seq) + common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq, + common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH) + : common_speculative_impl(type, n_seq) , params(params.draft) { auto * ctx_tgt = this->params.ctx_tgt; @@ -1189,6 +1191,101 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { } }; +// DSpark: DFlash backbone + a semi-autoregressive Markov head; reuses process(), only draft() differs +struct common_speculative_impl_draft_dspark : public common_speculative_impl_draft_dflash { + common_speculative_impl_draft_dspark(const common_params_speculative & params_in, uint32_t n_seq) + : common_speculative_impl_draft_dflash(params_in, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK) + { + auto * ctx_dft = params.ctx_dft; + const llama_model * model_dft = llama_get_model(ctx_dft); + + { + char buf[32] = {}; + if (llama_model_meta_val_str(model_dft, "dspark.block_size", buf, sizeof(buf)) < 0) { + 
GGML_ABORT("DSpark: missing required metadata key 'dspark.block_size'"); + } + block_size = std::atoi(buf); + } + if (params.n_max > block_size) { + params.n_max = block_size; + } + + LOG_INF("%s: adding speculative implementation 'draft-dspark'\n", __func__); + LOG_INF("%s: - block_size=%d, n_max=%d\n", __func__, block_size, params.n_max); + } + + void draft(common_speculative_draft_params_vec & dparams) override { + auto * ctx_dft = params.ctx_dft; + + common_batch_clear(batch); + + std::vector i_block_beg(n_seq, -1); + std::vector n_block (n_seq, 0); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + auto & dp = dparams[seq_id]; + if (!dp.drafting) { + continue; + } + + common_sampler_reset(smpls[seq_id].get()); + + const int32_t n = (int32_t) dp.n_past; + + int32_t n_draft = params.n_max; + if (dp.n_max > 0) { + n_draft = std::min(n_draft, dp.n_max); + } + n_draft = std::min(n_draft, block_size); + if (n_draft <= 0) { + continue; + } + + // anchor-first block [id_last, * (block_size-1)]: submit the whole block so the + // in-graph Markov head can key anchors off the block boundaries; keep the first n_draft + i_block_beg[seq_id] = batch.n_tokens; + n_block [seq_id] = n_draft; + for (int32_t i = 0; i < block_size; ++i) { + common_batch_add(batch, i == 0 ? 
dp.id_last : mask_token_id, n + i, { seq_id }, true); + } + } + + if (batch.n_tokens == 0) { + return; + } + + if (llama_decode(ctx_dft, batch) != 0) { + LOG_WRN("%s: llama_decode failed\n", __func__); + return; + } + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_block_beg[seq_id] < 0) { + continue; + } + auto & dp = dparams[seq_id]; + auto & result = *dp.result; + + const int32_t beg = i_block_beg[seq_id]; + const int32_t nb = n_block[seq_id]; // drafts to keep (<= block_size) + + auto * smpl = smpls[seq_id].get(); + // greedily read the predicted block at this sequence's block positions 0..nb-1 (anchor-first: position 0 already predicts the first draft) + for (int32_t i = 0; i < nb; ++i) { + common_sampler_sample(smpl, ctx_dft, beg + i, true); + + const auto * cur_p = common_sampler_get_candidates(smpl, true); + + const llama_token id = cur_p->data[0].id; + + common_sampler_accept(smpl, id, true); + + result.push_back(id); + } + } + } +}; + struct common_speculative_impl_draft_mtp : public common_speculative_impl { common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft) @@ -2133,6 +2230,7 @@ std::string common_speculative_type_to_str(common_speculative_type type) { case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: return "draft-eagle3"; case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: return "draft-mtp"; case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH: return "draft-dflash"; + case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK: return "draft-dspark"; case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram-simple"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram-map-k"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v"; @@ -2186,6 +2284,7 @@ int32_t common_speculative_n_max(const common_params_speculative * spec) { case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH: + case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK: n_max = std::max(n_max, std::max(0, spec->draft.n_max)); break; case 
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: @@ -2224,6 +2323,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr; bool has_draft_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; bool has_draft_dflash = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH)) && params.draft.ctx_dft != nullptr; + bool has_draft_dspark = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK)) && params.draft.ctx_dft != nullptr; @@ -2234,7 +2334,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, bool has_ngram_mod = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MOD)); // when adding a new type - update here the logic above - static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 10); + static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 11); // this list here defines the priority of the speculators // the one with highest priority are listed first @@ -2267,6 +2367,9 @@ common_speculative * common_speculative_init(common_params_speculative & params, if (has_draft_dflash) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, params)); } + if (has_draft_dspark) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, params)); + } } std::vector> impls = {}; @@ -2291,6 +2394,10 @@ common_speculative * common_speculative_init(common_params_speculative & params, impls.push_back(std::make_unique(config.params, n_seq)); break; } + case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK: { + impls.push_back(std::make_unique(config.params, n_seq)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple); diff --git a/conversion/__init__.py b/conversion/__init__.py index 02ea6385208a..8c723597d7b2 100644 --- 
a/conversion/__init__.py +++ b/conversion/__init__.py @@ -51,6 +51,7 @@ "DeepseekV3ForCausalLM": "deepseek", "DeepseekV32ForCausalLM": "deepseek", "DFlashDraftModel": "qwen", + "Qwen3DSparkModel": "qwen", "DeepseekV4ForCausalLM": "deepseek", "DistilBertForMaskedLM": "bert", "DistilBertForSequenceClassification": "bert", diff --git a/conversion/qwen.py b/conversion/qwen.py index 0356bd2da783..d91ed49cf280 100644 --- a/conversion/qwen.py +++ b/conversion/qwen.py @@ -673,3 +673,53 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca if not name.startswith("model."): name = "model." + name return super().filter_tensors((name, gen)) + + +@ModelBase.register("Qwen3DSparkModel") +class DSparkModel(Qwen3Model): + # DSpark = DFlash backbone + a semi-autoregressive Markov head (+ optional confidence head). + # The DeepSpec checkpoint stores its config flat (block_size / target_layer_ids / mask_token_id / + # markov_rank at top level). embed_tokens / lm_head are byte-identical to the target, so they are + # NOT emitted here -- the DSpark decoder shares the target's via ctx_other (same as DFlash). + model_arch = gguf.MODEL_ARCH.DSPARK + + def set_vocab(self): + if self.target_model_dir is None: + raise ValueError( + "DSpark draft model requires --target-model-dir to be specified. " + "Please provide the path to the target model directory containing the tokenizer." 
+ ) + logger.info(f"DSpark: Using tokenizer from target model: {self.target_model_dir}") + original_dir = self.dir_model + self.dir_model = self.target_model_dir + super().set_vocab() + self.dir_model = original_dir + + mask_token_id = self.hparams.get("mask_token_id") + if mask_token_id is not None: + self.gguf_writer.add_mask_token_id(mask_token_id) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + block_size = self.hparams.get("block_size", 7) + self.gguf_writer.add_block_size(block_size) + + # flat DeepSpec schema; mirror DFlash's +1 extract-layer convention + target_layer_ids = self.hparams.get("target_layer_ids", []) + if target_layer_ids: + extract_layer_ids = [i + 1 for i in target_layer_ids] + self.gguf_writer.add_target_layers(extract_layer_ids) + + markov_rank = self.hparams.get("markov_rank", 0) + self.gguf_writer.add_markov_rank(markov_rank) + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, gen = item + # embed_tokens / lm_head are byte-identical to the target and shared at runtime -- drop them + if name.endswith(("embed_tokens.weight", "lm_head.weight")): + return None + if not name.startswith("model."): + name = "model." 
+ name + return super().filter_tensors((name, gen)) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index cd4cdef8991f..626d6ed51af6 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -158,6 +158,7 @@ class LLM: TARGET_LAYERS = "{arch}.target_layers" TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" BLOCK_SIZE = "{arch}.block_size" + MARKOV_RANK = "{arch}.markov_rank" NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" class Attention: @@ -530,6 +531,7 @@ class MODEL_ARCH(IntEnum): MISTRAL3 = auto() EAGLE3 = auto() DFLASH = auto() + DSPARK = auto() MISTRAL4 = auto() PADDLEOCR = auto() MIMO2 = auto() @@ -953,6 +955,9 @@ class MODEL_TENSOR(IntEnum): # eagle3 FC = auto() # feature fusion layer D2T = auto() # draft to target vocabulary mapping + DSPARK_MARKOV_W1 = auto() # dspark markov head: prev-token embed + DSPARK_MARKOV_W2 = auto() # dspark markov head: bias projection + DSPARK_CONF_PROJ = auto() # dspark confidence head: proj # lfm2 audio A_ENC_NORM_CONV = auto() A_ENC_LINEAR_POS = auto() @@ -1111,6 +1116,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.MISTRAL3: "mistral3", MODEL_ARCH.EAGLE3: "eagle3", MODEL_ARCH.DFLASH: "dflash", + MODEL_ARCH.DSPARK: "dspark", MODEL_ARCH.MISTRAL4: "mistral4", MODEL_ARCH.PADDLEOCR: "paddleocr", MODEL_ARCH.MIMO2: "mimo2", @@ -1559,6 +1565,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", MODEL_TENSOR.FC: "fc", + MODEL_TENSOR.DSPARK_MARKOV_W1: "markov_w1", + MODEL_TENSOR.DSPARK_MARKOV_W2: "markov_w2", + MODEL_TENSOR.DSPARK_CONF_PROJ: "conf_proj", MODEL_TENSOR.D2T: "d2t", } @@ -4204,6 +4213,25 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FC, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.DSPARK: [ + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + 
MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FC, + MODEL_TENSOR.ENC_OUTPUT_NORM, + MODEL_TENSOR.DSPARK_MARKOV_W1, + MODEL_TENSOR.DSPARK_MARKOV_W2, + MODEL_TENSOR.DSPARK_CONF_PROJ, + ], MODEL_ARCH.MISTRAL4: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 1e277f0687c5..09522d79aab9 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -946,6 +946,9 @@ def add_sliding_window(self, value: int) -> None: def add_block_size(self, value: int) -> None: self.add_uint32(Keys.LLM.BLOCK_SIZE.format(arch=self.arch), value) + def add_markov_rank(self, value: int) -> None: + self.add_uint32(Keys.LLM.MARKOV_RANK.format(arch=self.arch), value) + def add_target_layers(self, value: Sequence[int]) -> None: self.add_array(Keys.LLM.TARGET_LAYERS.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 9efb36f8a447..011dae886789 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1290,6 +1290,18 @@ class TensorNameMap: "model.fc", # dflash ), + MODEL_TENSOR.DSPARK_MARKOV_W1: ( + "model.markov_head.markov_w1", # dspark + ), + + MODEL_TENSOR.DSPARK_MARKOV_W2: ( + "model.markov_head.markov_w2", # dspark + ), + + MODEL_TENSOR.DSPARK_CONF_PROJ: ( + "model.confidence_head.proj", # dspark + ), + MODEL_TENSOR.CLS: ( "classifier", # jina "classifier.dense", # roberta diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b890e66fcf6e..a66af71f7114 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -131,6 +131,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_MISTRAL3, "mistral3" }, { LLM_ARCH_EAGLE3, "eagle3" }, { LLM_ARCH_DFLASH, "dflash" }, + { LLM_ARCH_DSPARK, "dspark" }, { LLM_ARCH_MISTRAL4, "mistral4" }, { LLM_ARCH_PADDLEOCR, "paddleocr" }, { LLM_ARCH_MIMO2, 
"mimo2" }, @@ -307,6 +308,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TARGET_LAYERS, "%s.target_layers" }, { LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_MARKOV_RANK, "%s.markov_rank" }, + { LLM_KV_BLOCK_SIZE, "%s.block_size" }, { LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, @@ -603,6 +606,9 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" }, { LLM_TENSOR_FC, "fc" }, { LLM_TENSOR_D2T, "d2t" }, + { LLM_TENSOR_DSPARK_MARKOV_W1, "markov_w1" }, + { LLM_TENSOR_DSPARK_MARKOV_W2, "markov_w2" }, + { LLM_TENSOR_DSPARK_CONF_PROJ, "conf_proj" }, }; // declare information about the model weight tensors: @@ -854,6 +860,10 @@ static const std::map LLM_TENSOR_INFOS = { // eagle3 {LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, + // dspark + {LLM_TENSOR_DSPARK_MARKOV_W1, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_DSPARK_MARKOV_W2, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_DSPARK_CONF_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index a4f5091e7170..ba6fdeaf2d0a 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -145,6 +145,7 @@ enum llm_arch { LLM_ARCH_MELLUM, LLM_ARCH_EAGLE3, LLM_ARCH_DFLASH, + LLM_ARCH_DSPARK, LLM_ARCH_UNKNOWN, }; @@ -353,6 +354,8 @@ enum llm_kv { LLM_KV_TARGET_LAYERS, LLM_KV_TARGET_HIDDEN_SIZE, + LLM_KV_MARKOV_RANK, + LLM_KV_BLOCK_SIZE, LLM_KV_NORM_BEFORE_RESIDUAL, LLM_KV_SHORTCONV_L_CACHE, @@ -611,6 +614,9 @@ enum llm_tensor { LLM_TENSOR_MASKED_EMBD_ORDERING, LLM_TENSOR_FC, LLM_TENSOR_D2T, + LLM_TENSOR_DSPARK_MARKOV_W1, + LLM_TENSOR_DSPARK_MARKOV_W2, + LLM_TENSOR_DSPARK_CONF_PROJ, }; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 
0465430df43a..77b8450ea837 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -100,7 +100,7 @@ llama_context::llama_context( cparams.ctx_other = params.ctx_other; } - if (model.arch == LLM_ARCH_EAGLE3 || model.arch == LLM_ARCH_DFLASH) { + if (model.arch == LLM_ARCH_EAGLE3 || model.arch == LLM_ARCH_DFLASH || model.arch == LLM_ARCH_DSPARK) { if (model.tok_embd == nullptr || model.output == nullptr) { if (params.ctx_other == nullptr) { throw std::runtime_error(model.arch_name() + " requires ctx_other to be set (this warning is normal during memory fitting)"); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 8be5f28f39e6..e0e4ef245bfd 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -187,6 +187,9 @@ struct llama_hparams { // for Classifiers uint32_t n_cls_out = 1; + // for DSpark: the trained draft block size, in tokens (anchor + n-1 masks) + uint32_t n_dspark_block = 0; + // input embedding dimension (0 = use n_embd) uint32_t n_embd_inp_impl = 0; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d58ebac28b9b..0598437b170b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -296,6 +296,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_eagle3(params); case LLM_ARCH_DFLASH: return new llama_model_dflash(params); + case LLM_ARCH_DSPARK: + return new llama_model_dspark(params); case LLM_ARCH_MIMO2: return new llama_model_mimo2(params); case LLM_ARCH_KIMI_LINEAR: @@ -2524,6 +2526,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_TALKIE: case LLM_ARCH_MELLUM: case LLM_ARCH_DFLASH: + case LLM_ARCH_DSPARK: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: @@ -2648,7 +2651,8 @@ bool llama_model_has_encoder(const llama_model * model) { case LLM_ARCH_T5: case LLM_ARCH_T5ENCODER: case LLM_ARCH_EAGLE3: - case LLM_ARCH_DFLASH: return true; + case LLM_ARCH_DFLASH: + case LLM_ARCH_DSPARK: return true; default: return false; } } 
diff --git a/src/llama-model.h b/src/llama-model.h index 4800d2928c52..e7295db30ab6 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -599,6 +599,12 @@ struct llama_model { struct ggml_tensor * fc = nullptr; // feature fusion layer struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping + // dspark + struct ggml_tensor * dspark_markov_w1 = nullptr; + struct ggml_tensor * dspark_markov_w2 = nullptr; + struct ggml_tensor * dspark_conf_proj = nullptr; + struct ggml_tensor * dspark_conf_proj_b = nullptr; + // unified vector to store target-model extracted layer ids in eagle3, dflash, etc. std::vector target_layer_ids; diff --git a/src/models/dspark.cpp b/src/models/dspark.cpp new file mode 100644 index 000000000000..1c873407ab4a --- /dev/null +++ b/src/models/dspark.cpp @@ -0,0 +1,126 @@ +#include "models.h" + +// DSpark = DFlash backbone + a semi-autoregressive Markov head applied in-graph by the decoder + +void llama_model_dspark::load_arch_hparams(llama_model_loader & ml) { + llama_model_dflash::load_arch_hparams(ml); + + ml.get_key(LLM_KV_BLOCK_SIZE, hparams.n_dspark_block, /*required*/ true); +} + +void llama_model_dspark::load_arch_tensors(llama_model_loader & ml) { + llama_model_dflash::load_arch_tensors(ml); + + LLAMA_LOAD_LOCALS; + + uint32_t markov_rank = 0; + ml.get_key(LLM_KV_MARKOV_RANK, markov_rank, /*required*/ true); + const int64_t R = (int64_t) markov_rank; + + dspark_markov_w1 = create_tensor(tn(LLM_TENSOR_DSPARK_MARKOV_W1, "weight"), { R, n_vocab }, 0); + dspark_markov_w2 = create_tensor(tn(LLM_TENSOR_DSPARK_MARKOV_W2, "weight"), { R, n_vocab }, 0); + + dspark_conf_proj = create_tensor(tn(LLM_TENSOR_DSPARK_CONF_PROJ, "weight"), { n_embd + R, 1 }, TENSOR_NOT_REQUIRED); + dspark_conf_proj_b = create_tensor(tn(LLM_TENSOR_DSPARK_CONF_PROJ, "bias"), { 1 }, TENSOR_NOT_REQUIRED); +} + +std::unique_ptr llama_model_dspark::build_arch_graph(const llm_graph_params & params) const { + switch (params.gtype) { + case 
LLM_GRAPH_TYPE_ENCODER: + return std::make_unique>(*this, params); + case LLM_GRAPH_TYPE_DEFAULT: + case LLM_GRAPH_TYPE_DECODER: + return std::make_unique>(*this, params); + default: + GGML_ABORT("invalid graph type"); + }; +} + +// DSpark encoder == DFlash encoder +template <> +llama_model_dspark::graph::graph(const llama_model & model, const llm_graph_params & params) + : llama_model_dflash::graph(model, params) {} + +// anchor (committed last) token of every draft block: token 0 of each block in the ubatch +class llm_graph_input_dspark_anchor : public llm_graph_input_i { +public: + llm_graph_input_dspark_anchor(uint32_t block_size) : block_size(block_size) {} + virtual ~llm_graph_input_dspark_anchor() = default; + + void set_input(const llama_ubatch * ubatch) override { + GGML_ASSERT(ubatch->token); + const int64_t n_blocks = anchors->ne[0]; + std::vector buf(n_blocks); + for (int64_t j = 0; j < n_blocks; ++j) { + buf[j] = ubatch->token[j*block_size]; + } + ggml_backend_tensor_set(anchors, buf.data(), 0, n_blocks*sizeof(int32_t)); + } + + bool can_reuse(const llm_graph_params & params) override { + return params.ubatch.token && anchors && + anchors->ne[0]*(int64_t) block_size == (int64_t) params.ubatch.n_tokens; + } + + ggml_tensor * anchors = nullptr; // I32 [n_blocks] + + const uint32_t block_size; +}; + +// DSpark decoder: DFlash decoder + Markov bias on the draft logits, chained per block position: +// logits'(i) = logits(i) + markov_w2 . 
markov_w1[prev(i)] +// prev(0) = the block's anchor token, prev(i>0) = argmax(logits'(i-1)) +template <> +llama_model_dspark::graph::graph(const llama_model & model, const llm_graph_params & params) + : llama_model_dflash::graph(model, params) { + // KV-injection (embd) batch: no logits to bias + if (ubatch.embd) { + return; + } + + ggml_tensor * w1 = model.dspark_markov_w1; + ggml_tensor * w2 = model.dspark_markov_w2; + GGML_ASSERT(w1 && w2 && "DSpark markov weights not loaded"); + + ggml_tensor * base = res->t_logits; // [n_vocab, n_tokens] + const int64_t n_vocab = base->ne[0]; + const int64_t n_tok = base->ne[1]; + + const int64_t bs = model.hparams.n_dspark_block; + GGML_ASSERT(bs > 0); + + // the drafting loop always submits whole anchor-first blocks + if (n_tok % bs != 0) { + return; + } + const int64_t n_blocks = n_tok / bs; + + auto inp = std::make_unique((uint32_t) bs); + inp->anchors = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_blocks); + ggml_set_input(inp->anchors); + ggml_tensor * prev = inp->anchors; // I32 [n_blocks] + res->add_input(std::move(inp)); + + ggml_tensor * cat = nullptr; + for (int64_t i = 0; i < bs; ++i) { + ggml_tensor * bias = ggml_mul_mat(ctx0, w2, ggml_get_rows(ctx0, w1, prev)); // [n_vocab, n_blocks] + + // position i of every block: strided view [n_vocab, n_blocks] + ggml_tensor * base_i = ggml_view_2d(ctx0, base, n_vocab, n_blocks, bs*base->nb[1], i*base->nb[1]); + ggml_tensor * col = ggml_add(ctx0, base_i, bias); + + cat = cat ? 
ggml_concat(ctx0, cat, col, 1) : col; + + if (i + 1 < bs) { + prev = ggml_argmax(ctx0, col); // I32 [n_blocks] + } + } + + // cat is position-major; restore the ubatch's block-major order + ggml_tensor * out = ggml_reshape_3d(ctx0, cat, n_vocab, n_blocks, bs); + out = ggml_cont(ctx0, ggml_permute(ctx0, out, 0, 2, 1, 3)); // [n_vocab, bs, n_blocks] + out = ggml_reshape_2d(ctx0, out, n_vocab, n_tok); + + res->t_logits = out; + ggml_build_forward_expand(gf, out); +} diff --git a/src/models/models.h b/src/models/models.h index 7a52e7bc1ab7..d62e162862bb 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1253,6 +1253,22 @@ struct llama_model_dflash : public llama_model_base { }; +struct llama_model_dspark : public llama_model_dflash { + llama_model_dspark(const struct llama_model_params & params) : llama_model_dflash(params) {} + // extend the DFlash hparams/tensors with the block size and the Markov / confidence heads + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + // the DFlash graphs plus the in-graph Markov head on the decoder's draft logits + template + struct graph : public llama_model_dflash::graph { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_mistral4 : public llama_model_deepseek2 { llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {} // reuse load_arch_hparams and load_arch_tensors from llama_model_deepseek2 diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index f39abe773fc6..5a86d7365847 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -454,7 +454,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache 
initialization needs more fixture params } - if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH) { + if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH || arch == LLM_ARCH_DSPARK) { continue; } for (bool moe : {false, true}) { @@ -560,7 +560,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } - if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH) { + if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH || arch == LLM_ARCH_DSPARK) { continue; } From 8c548e7a8f1b6b72f5a26c69b8c7815b40bb3bd2 Mon Sep 17 00:00:00 2001 From: wjinxu <1299461899@qq.com> Date: Sat, 4 Jul 2026 14:44:41 +0800 Subject: [PATCH 2/4] spec: read draft block size in the dflash impl --- common/speculative.cpp | 46 ++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 29 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 7fc6f9225c2e..cb58d7b8b84c 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -923,7 +923,8 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { std::vector features_buf; common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq, - common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH) + common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, + bool draft_at_anchor = false) : common_speculative_impl(type, n_seq) , params(params.draft) { @@ -942,25 +943,30 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { n_embd_dec = llama_model_n_embd(model_dft); n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt; - // read the trained block size from the dflash.block_size metadata key + // read the trained block size from the .block_size metadata key block_size = 16; { + char arch[64] = {}; + 
llama_model_meta_val_str(model_dft, "general.architecture", arch, sizeof(arch)); + char buf[32] = {}; - if (llama_model_meta_val_str(model_dft, "dflash.block_size", buf, sizeof(buf)) >= 0) { + if (llama_model_meta_val_str(model_dft, (std::string(arch) + ".block_size").c_str(), buf, sizeof(buf)) >= 0) { block_size = std::atoi(buf); } } mask_token_id = llama_vocab_mask(llama_model_get_vocab(model_dft)); - LOG_INF("%s: adding speculative implementation 'draft-dflash'\n", __func__); + LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(type).c_str()); LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min); LOG_INF("%s: - block_size=%d, mask_token_id=%d, n_extract=%u\n", __func__, block_size, mask_token_id, target_layer_ids_n); - // DFlash input is [id_last, * (block_size-1)], so it can draft at most block_size-1 tokens per step - if (this->params.n_max > block_size - 1) { - LOG_WRN("%s: requested draft size %d exceeds the trained DFlash block size %d -- clamping to %d draft tokens per step\n", - __func__, this->params.n_max, block_size - 1, block_size - 1); - this->params.n_max = block_size - 1; + // the input block is [id_last, * (block_size-1)], so a step drafts at most block_size-1 + // tokens from the mask positions, plus one more when the head also drafts at the anchor position + const int32_t n_draft_max = draft_at_anchor ? 
block_size : block_size - 1; + if (this->params.n_max > n_draft_max) { + LOG_WRN("%s: requested draft size %d exceeds the trained block size %d -- clamping to %d draft tokens per step\n", + __func__, this->params.n_max, block_size, n_draft_max); + this->params.n_max = n_draft_max; } batch = llama_batch_init(llama_n_batch(ctx_dft), 0, n_seq); @@ -1193,26 +1199,8 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { // DSpark: DFlash backbone + a semi-autoregressive Markov head; reuses process(), only draft() differs struct common_speculative_impl_draft_dspark : public common_speculative_impl_draft_dflash { - common_speculative_impl_draft_dspark(const common_params_speculative & params_in, uint32_t n_seq) - : common_speculative_impl_draft_dflash(params_in, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK) - { - auto * ctx_dft = params.ctx_dft; - const llama_model * model_dft = llama_get_model(ctx_dft); - - { - char buf[32] = {}; - if (llama_model_meta_val_str(model_dft, "dspark.block_size", buf, sizeof(buf)) < 0) { - GGML_ABORT("DSpark: missing required metadata key 'dspark.block_size'"); - } - block_size = std::atoi(buf); - } - if (params.n_max > block_size) { - params.n_max = block_size; - } - - LOG_INF("%s: adding speculative implementation 'draft-dspark'\n", __func__); - LOG_INF("%s: - block_size=%d, n_max=%d\n", __func__, block_size, params.n_max); - } + common_speculative_impl_draft_dspark(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl_draft_dflash(params, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, /*draft_at_anchor*/ true) {} void draft(common_speculative_draft_params_vec & dparams) override { auto * ctx_dft = params.ctx_dft; From 47932db0e1e555ac7aa51b97b29e2eb402361712 Mon Sep 17 00:00:00 2001 From: wjinxu <1299461899@qq.com> Date: Sat, 4 Jul 2026 14:44:53 +0800 Subject: [PATCH 3/4] docs: add DSpark section to speculative.md --- docs/speculative.md | 29 ++++++++++++++++++++++++++++- 1 file 
changed, 28 insertions(+), 1 deletion(-) diff --git a/docs/speculative.md b/docs/speculative.md index 4100b92f8f18..810e05e60900 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -78,6 +78,32 @@ See: - #22105 +### DSpark (`draft-dspark`) + +DSpark extends DFlash with a semi-autoregressive _Markov head_: the draft still emits a whole +block per forward pass, but each block position's logits are biased by a low-rank term keyed on +the previous token, chained in-graph across the block. This keeps drafting at one decode per +block while recovering some of the left-to-right signal that pure block diffusion loses. + +The draft is a small DeepSpec checkpoint trained for a specific target (for example +[`deepseek-ai/dspark_qwen3_4b_block7`](https://huggingface.co/deepseek-ai/dspark_qwen3_4b_block7) +for `Qwen/Qwen3-4B`). Convert it with `--target-model-dir` so it uses the target's tokenizer +(the token embeddings and output head are not stored in the draft GGUF; they are shared with the target model at runtime): + +```bash +python convert_hf_to_gguf.py deepseek-ai/dspark_qwen3_4b_block7 \ + --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-DSpark.gguf + +llama-server -m Qwen3-4B.gguf -md Qwen3-4B-DSpark.gguf \ + --spec-type draft-dspark --spec-draft-n-max 7 -fa on --jinja +``` + +`--spec-draft-n-max` is clamped to the draft model's trained block size. + +See: + +- #25173 + ### n-gram Cache (`ngram-cache`) An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences. 
@@ -173,7 +199,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|draft-simple|draft-eagle3|draft-dflash|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] +--spec-type [none|draft-simple|draft-eagle3|draft-dflash|draft-dspark|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] comma-separated list of types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) @@ -314,6 +340,7 @@ Specifies a comma-separated list of speculative decoding types to use. | `draft-simple` | Use a simple draft model for speculation | | `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states | | `draft-dflash` | Use a DFlash block-diffusion draft model that emits a block per step | +| `draft-dspark` | Use a DSpark draft model (DFlash backbone + semi-autoregressive Markov head) | | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | From 1ba891a03f0ba6fa7669160f9fade4f20dec3f6b Mon Sep 17 00:00:00 2001 From: wjinxu <1299461899@qq.com> Date: Sat, 4 Jul 2026 14:59:45 +0800 Subject: [PATCH 4/4] spec: keep dspark block size read in the dspark impl --- common/speculative.cpp | 46 ++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index cb58d7b8b84c..7fc6f9225c2e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -923,8 +923,7 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { std::vector features_buf; common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq, - common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, - bool draft_at_anchor = false) + common_speculative_type type = 
COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH) : common_speculative_impl(type, n_seq) , params(params.draft) { @@ -943,30 +942,25 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { n_embd_dec = llama_model_n_embd(model_dft); n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt; - // read the trained block size from the <arch>.block_size metadata key + // read the trained block size from the dflash.block_size metadata key block_size = 16; { - char arch[64] = {}; - llama_model_meta_val_str(model_dft, "general.architecture", arch, sizeof(arch)); - char buf[32] = {}; - if (llama_model_meta_val_str(model_dft, (std::string(arch) + ".block_size").c_str(), buf, sizeof(buf)) >= 0) { + if (llama_model_meta_val_str(model_dft, "dflash.block_size", buf, sizeof(buf)) >= 0) { block_size = std::atoi(buf); } } mask_token_id = llama_vocab_mask(llama_model_get_vocab(model_dft)); - LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(type).c_str()); + LOG_INF("%s: adding speculative implementation 'draft-dflash'\n", __func__); LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min); LOG_INF("%s: - block_size=%d, mask_token_id=%d, n_extract=%u\n", __func__, block_size, mask_token_id, target_layer_ids_n); - // the input block is [id_last, <mask> * (block_size-1)], so a step drafts at most block_size-1 - // tokens from the mask positions, plus one more when the head also drafts at the anchor position - const int32_t n_draft_max = draft_at_anchor ? 
block_size : block_size - 1; - if (this->params.n_max > n_draft_max) { - LOG_WRN("%s: requested draft size %d exceeds the trained block size %d -- clamping to %d draft tokens per step\n", - __func__, this->params.n_max, block_size, n_draft_max); - this->params.n_max = n_draft_max; + // DFlash input is [id_last, <mask> * (block_size-1)], so it can draft at most block_size-1 tokens per step + if (this->params.n_max > block_size - 1) { + LOG_WRN("%s: requested draft size %d exceeds the trained DFlash block size %d -- clamping to %d draft tokens per step\n", + __func__, this->params.n_max, block_size - 1, block_size - 1); + this->params.n_max = block_size - 1; } batch = llama_batch_init(llama_n_batch(ctx_dft), 0, n_seq); @@ -1199,8 +1193,26 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { // DSpark: DFlash backbone + a semi-autoregressive Markov head; reuses process(), only draft() differs struct common_speculative_impl_draft_dspark : public common_speculative_impl_draft_dflash { - common_speculative_impl_draft_dspark(const common_params_speculative & params, uint32_t n_seq) - : common_speculative_impl_draft_dflash(params, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, /*draft_at_anchor*/ true) {} + common_speculative_impl_draft_dspark(const common_params_speculative & params_in, uint32_t n_seq) + : common_speculative_impl_draft_dflash(params_in, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK) + { + auto * ctx_dft = params.ctx_dft; + const llama_model * model_dft = llama_get_model(ctx_dft); + + { + char buf[32] = {}; + if (llama_model_meta_val_str(model_dft, "dspark.block_size", buf, sizeof(buf)) < 0) { + GGML_ABORT("DSpark: missing required metadata key 'dspark.block_size'"); + } + block_size = std::atoi(buf); + } + if (params.n_max > block_size) { + params.n_max = block_size; + } + + LOG_INF("%s: adding speculative implementation 'draft-dspark'\n", __func__); + LOG_INF("%s: - block_size=%d, n_max=%d\n", __func__, block_size, params.n_max); + }
void draft(common_speculative_draft_params_vec & dparams) override { auto * ctx_dft = params.ctx_dft;