From 7f91a2046cce492ce89b00652da56773eb3c6fba Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Wed, 24 Jun 2026 18:10:29 +0000 Subject: [PATCH 1/3] spec: add DFlash v2 support --- common/common.h | 3 +- common/speculative.cpp | 303 ++++++++++++++++++++++++++++++++++++- conversion/__init__.py | 1 + conversion/qwen.py | 44 ++++++ gguf-py/gguf/constants.py | 18 +++ src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-context.cpp | 4 +- src/llama-graph.cpp | 7 +- src/llama-model.cpp | 6 +- src/models/dflash.cpp | 246 ++++++++++++++++++++++++++++++ src/models/models.h | 16 ++ tests/test-llama-archs.cpp | 4 +- 13 files changed, 646 insertions(+), 8 deletions(-) create mode 100644 src/models/dflash.cpp diff --git a/common/common.h b/common/common.h index 94147d5d8cf1..62b0ed9d1617 100644 --- a/common/common.h +++ b/common/common.h @@ -162,6 +162,7 @@ enum common_speculative_type { COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, // standalone draft model speculative decoding COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding COMMON_SPECULATIVE_TYPE_DRAFT_MTP, // Multi-token prediction + COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, // DFlash speculative decoding COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values @@ -377,7 +378,7 @@ struct common_params_speculative { uint32_t need_n_rs_seq() const { bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) { - return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3; + return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH; }); return needs_rs_seq ? 
draft.n_max : 0u; diff --git a/common/speculative.cpp b/common/speculative.cpp index c922a3f592a6..3d46d21a030f 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -26,6 +26,7 @@ const std::map common_speculative_type_fro {"draft-simple", COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE}, {"draft-eagle3", COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3}, {"draft-mtp", COMMON_SPECULATIVE_TYPE_DRAFT_MTP}, + {"draft-dflash", COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH}, {"ngram-simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE}, {"ngram-map-k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K}, {"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V}, @@ -893,6 +894,296 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } }; +// DFlash: block-diffusion drafting with a draft-side KV cache injection +struct common_speculative_impl_draft_dflash : public common_speculative_impl { + common_params_speculative_draft params; + + llama_batch batch; // noise tokens + llama_batch batch_inject; // target features for KV cache injection + + std::vector<common_sampler_ptr> smpls; + + int32_t n_embd_dec = 0; // draft hidden size + int32_t n_embd_enc = 0; // target_layer_ids_n * target_hidden_size + int32_t n_embd_tgt = 0; // target model hidden size + + int32_t block_size = 0; + llama_token mask_token_id = 0; + + const int32_t * target_layer_ids = nullptr; // model_dft's extract layer indices + uint32_t target_layer_ids_n = 0; + + // scratch buffer for concatenated target features [n_tokens, n_embd_enc] + std::vector<float> features_buf; + + common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, n_seq) + , params(params.draft) + { + auto * ctx_tgt = this->params.ctx_tgt; + auto * ctx_dft = this->params.ctx_dft; + GGML_ASSERT(ctx_tgt && ctx_dft && "DFlash requires ctx_tgt and ctx_dft to be set"); + + const llama_model * model_dft = llama_get_model(ctx_dft); + const llama_model * model_tgt = 
llama_get_model(ctx_tgt); + + target_layer_ids = llama_model_target_layer_ids (model_dft); + target_layer_ids_n = llama_model_target_layer_ids_n(model_dft); + GGML_ASSERT(target_layer_ids_n > 0 && "DFlash model has no target_layer_ids"); + + n_embd_tgt = llama_model_n_embd(model_tgt); + n_embd_dec = llama_model_n_embd(model_dft); + n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt; + + // read the trained block size from the dflash.block_size metadata key (keeps the default of 16 when the key is absent) + block_size = 16; + { + char buf[32] = {}; + if (llama_model_meta_val_str(model_dft, "dflash.block_size", buf, sizeof(buf)) >= 0) { + block_size = std::atoi(buf); + } + } + mask_token_id = llama_vocab_mask(llama_model_get_vocab(model_dft)); + + LOG_INF("%s: adding speculative implementation 'draft-dflash'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min); + LOG_INF("%s: - block_size=%d, mask_token_id=%d, n_extract=%u\n", __func__, block_size, mask_token_id, target_layer_ids_n); + + // DFlash input is [id_last, MASK * (block_size-1)], so it can draft at most block_size-1 tokens per step + if (this->params.n_max > block_size - 1) { + LOG_WRN("%s: requested draft size %d exceeds the trained DFlash block size %d -- clamping to %d draft tokens per step\n", + __func__, this->params.n_max, block_size, block_size - 1); + this->params.n_max = block_size - 1; + } + + batch = llama_batch_init(llama_n_batch(ctx_dft), 0, n_seq); + batch_inject = llama_batch_init(llama_n_batch(ctx_dft), n_embd_dec, n_seq); + + smpls.resize(n_seq); + for (auto & s : smpls) { + common_params_sampling sparams; + sparams.no_perf = false; + sparams.top_k = 1; + sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; + s.reset(common_sampler_init(model_dft, sparams)); + } + + // turn on extraction of the target layers' input embeddings + for (uint32_t k = 0; k < target_layer_ids_n; ++k) { + llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], 
true); + } + + llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); + llama_set_causal_attn(ctx_dft, false); // DFlash needs non-causal attention + } + + ~common_speculative_impl_draft_dflash() override { + llama_batch_free(batch); + llama_batch_free(batch_inject); + } + + void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { + return; + } + + const int32_t N = (int32_t) prompt.size(); + if (N <= 0) { + return; + } + + const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(params.ctx_dft), seq_id); + if (pos_max < N - 1) { + LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - process() did not run on every prefill ubatch. " + "Drafts may degrade.\n", + __func__, (int) pos_max, N - 1); + } + } + + bool process(const llama_batch & batch_in) override { + if (batch_in.n_tokens <= 0) { + return true; + } + + if (batch_in.token == nullptr || batch_in.embd != nullptr) { + return true; + } + + const int32_t n_tokens = batch_in.n_tokens; + + // per-seq inclusive batch range (assumes each seq's tokens are contiguous in the batch) + std::vector i_batch_beg(n_seq, -1); + std::vector i_batch_end(n_seq, -1); + for (int32_t k = 0; k < n_tokens; ++k) { + GGML_ASSERT(batch_in.n_seq_id[k] == 1); + const llama_seq_id seq_id = batch_in.seq_id[k][0]; + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { + continue; + } + i_batch_end[seq_id] = k; + if (i_batch_beg[seq_id] < 0) { + i_batch_beg[seq_id] = k; + } + } + + auto * ctx_tgt = this->params.ctx_tgt; + auto * ctx_dft = this->params.ctx_dft; + + const int32_t n_ubatch = (int32_t) llama_n_ubatch(ctx_dft); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_beg[seq_id] < 0) { + continue; + } + const int32_t n_rows = i_batch_end[seq_id] - i_batch_beg[seq_id] + 1; + + for (int32_t offset = 0; offset < n_rows; offset += n_ubatch) { + const int32_t n_chunk = std::min(n_ubatch, n_rows - offset); + + // gather this 
chunk's target features, interleaved by extract layer + features_buf.resize((size_t) n_chunk * n_embd_enc); + for (uint32_t k = 0; k < target_layer_ids_n; ++k) { + const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]); + if (!layer) { + GGML_ABORT("DFlash: target layer %d input not extracted.", target_layer_ids[k]); + } + for (int32_t i = 0; i < n_chunk; ++i) { + float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt; + const float * src = layer + (size_t) (i_batch_beg[seq_id] + offset + i) * n_embd_tgt; + std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float)); + } + } + + // fuse extracted features through DFlash encoder + llama_batch enc_batch = { + /*.n_tokens =*/ n_chunk, + /*.token =*/ nullptr, + /*.embd =*/ features_buf.data(), + /*.pos =*/ nullptr, + /*.n_seq_id =*/ nullptr, + /*.seq_id =*/ nullptr, + /*.logits =*/ nullptr, + }; + + int32_t rc = llama_encode(ctx_dft, enc_batch); + if (rc != 0) { + LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n", + __func__, rc, (int) n_chunk, (int) offset); + return false; + } + + const float * inp_g = llama_get_embeddings_nextn(ctx_dft); + GGML_ASSERT(inp_g && "DFlash encoder produced no output."); + + // inject the DFlash decoder K/V cache at the tokens' target positions + batch_inject.n_tokens = n_chunk; + std::memcpy(batch_inject.embd, inp_g, (size_t) n_chunk * n_embd_dec * sizeof(float)); + + for (int32_t i = 0; i < n_chunk; ++i) { + batch_inject.pos[i] = batch_in.pos[i_batch_beg[seq_id] + offset + i]; + batch_inject.n_seq_id[i] = 1; + batch_inject.seq_id[i][0] = seq_id; + batch_inject.logits[i] = false; + } + rc = llama_decode(ctx_dft, batch_inject); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n", + __func__, rc, (int) n_chunk, (int) offset); + return false; + } + } + } + + return true; + } + + void draft(common_speculative_draft_params_vec & dparams) override { + auto & 
ctx_dft = params.ctx_dft; + + common_batch_clear(batch); + + // build one batch holding every drafting sequence's noise block (a single decode) + // record where each block starts and its size + std::vector i_block_beg(n_seq, -1); + std::vector n_block (n_seq, 0); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + auto & dp = dparams[seq_id]; + if (!dp.drafting) { + continue; + } + + common_sampler_reset(smpls[seq_id].get()); + + const int32_t n = (int32_t) dp.n_past; + + int32_t n_draft = params.n_max; + if (dp.n_max > 0) { + n_draft = std::min(n_draft, dp.n_max); + } + + const int32_t n_block_tokens = n_draft + 1; // id_last + n_draft mask tokens + i_block_beg[seq_id] = batch.n_tokens; + n_block [seq_id] = n_block_tokens; + for (int32_t i = 0; i < n_block_tokens; ++i) { + common_batch_add(batch, i == 0 ? dp.id_last : mask_token_id, n + i, { seq_id }, true); + } + } + + if (batch.n_tokens == 0) { + return; + } + + // decode every sequence's noise block in a single batch + int ret = llama_decode(ctx_dft, batch); + if (ret != 0) { + LOG_WRN("%s: llama_decode returned %d\n", __func__, ret); + return; + } + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_block_beg[seq_id] < 0) { + continue; + } + auto & dp = dparams[seq_id]; + + const int32_t beg = i_block_beg[seq_id]; + const int32_t n_block_tokens = n_block[seq_id]; + + auto * smpl = smpls[seq_id].get(); + + auto & result = *dp.result; + + // greedily read the predicted block at this sequence's noise positions 1..n_block_tokens-1 + for (int32_t i = 1; i < n_block_tokens; ++i) { + common_sampler_sample(smpl, ctx_dft, beg + i, true); + + const auto * cur_p = common_sampler_get_candidates(smpl, true); + + for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { + LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", + seq_id, k, i - 1, cur_p->data[k].id, cur_p->data[k].p, + common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); + } + + 
const llama_token id = cur_p->data[0].id; + + common_sampler_accept(smpl, id, true); + + result.push_back(id); + } + } + } + + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { + // noop + } + + bool need_embd() const override { + return false; + } +}; + struct common_speculative_impl_draft_mtp : public common_speculative_impl { common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft) @@ -1836,6 +2127,7 @@ std::string common_speculative_type_to_str(common_speculative_type type) { case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: return "draft-simple"; case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: return "draft-eagle3"; case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: return "draft-mtp"; + case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH: return "draft-dflash"; case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram-simple"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram-map-k"; case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v"; @@ -1888,6 +2180,7 @@ int32_t common_speculative_n_max(const common_params_speculative * spec) { case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: + case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH: n_max = std::max(n_max, std::max(0, spec->draft.n_max)); break; case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: @@ -1925,6 +2218,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)); bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr; bool has_draft_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; + bool has_draft_dflash = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH)) && params.draft.ctx_dft != nullptr; @@ -1935,7 +2229,7 @@ 
common_speculative * common_speculative_init(common_params_speculative & params, bool has_ngram_mod = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MOD)); // when adding a new type - update here the logic above - static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 9); + static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 10); // this list here defines the priority of the speculators // the one with highest priority are listed first @@ -1965,6 +2259,9 @@ common_speculative * common_speculative_init(common_params_speculative & params, if (has_draft_mtp) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, params)); } + if (has_draft_dflash) { + configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, params)); + } } std::vector> impls = {}; @@ -1985,6 +2282,10 @@ common_speculative * common_speculative_init(common_params_speculative & params, impls.push_back(std::make_unique(config.params, n_seq)); break; } + case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH: { + impls.push_back(std::make_unique<common_speculative_impl_draft_dflash>(config.params, n_seq)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple); diff --git a/conversion/__init__.py b/conversion/__init__.py index 5aad203e53c3..4a1fd5bb70f0 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -50,6 +50,7 @@ "DeepseekV2ForCausalLM": "deepseek", "DeepseekV3ForCausalLM": "deepseek", "DeepseekV32ForCausalLM": "deepseek", + "DFlashDraftModel": "qwen", "DistilBertForMaskedLM": "bert", "DistilBertForSequenceClassification": "bert", "DistilBertModel": "bert", diff --git a/conversion/qwen.py b/conversion/qwen.py index 6b85eb9aaf88..cadcd8fef73e 100644 --- a/conversion/qwen.py +++ b/conversion/qwen.py @@ -625,3 +625,47 @@ class Qwen3_5TextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReor @ModelBase.register("Qwen3_5MoeForConditionalGeneration", "Qwen3_5MoeForCausalLM") class 
Qwen3_5MoeTextModel(_Qwen35MtpMixin, _Qwen35MRopeMixin, _LinearAttentionVReorderBase): model_arch = gguf.MODEL_ARCH.QWEN35MOE + + +@ModelBase.register("DFlashDraftModel") +class DFlashModel(Qwen3Model): + model_arch = gguf.MODEL_ARCH.DFLASH + + def set_vocab(self): + if self.target_model_dir is None: + raise ValueError( + "DFlash draft model requires --target-model-dir to be specified. " + "Please provide the path to the target model directory containing the tokenizer." + ) + logger.info(f"DFlash: Using tokenizer from target model: {self.target_model_dir}") + original_dir = self.dir_model + self.dir_model = self.target_model_dir + super().set_vocab() + self.dir_model = original_dir + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + block_size = self.hparams.get("block_size", 16) + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.block_size", block_size) + dflash_config = self.hparams.get("dflash_config", {}) + + target_layer_ids = dflash_config.get("target_layer_ids", []) + if target_layer_ids: + extract_layer_ids = [i + 1 for i in target_layer_ids] + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", extract_layer_ids) + + mask_token_id = dflash_config.get("mask_token_id", None) + if mask_token_id is not None: + self.gguf_writer.add_mask_token_id(mask_token_id) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name == "fc.weight": + yield (name, data_torch) + return + if name == "hidden_norm.weight": + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ENC_OUTPUT_NORM), data_torch) + return + if not name.startswith("model."): + name = "model." 
+ name + yield from super().modify_tensors(data_torch, name, bid) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 1bda9452dde2..bcd10beb0418 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -517,6 +517,7 @@ class MODEL_ARCH(IntEnum): PANGU_EMBED = auto() MISTRAL3 = auto() EAGLE3 = auto() + DFLASH = auto() MISTRAL4 = auto() PADDLEOCR = auto() MIMO2 = auto() @@ -1074,6 +1075,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", MODEL_ARCH.EAGLE3: "eagle3", + MODEL_ARCH.DFLASH: "dflash", MODEL_ARCH.MISTRAL4: "mistral4", MODEL_ARCH.PADDLEOCR: "paddleocr", MODEL_ARCH.MIMO2: "mimo2", @@ -4086,6 +4088,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FC, MODEL_TENSOR.D2T, ], + MODEL_ARCH.DFLASH: [ + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FC, + MODEL_TENSOR.ENC_OUTPUT_NORM, + ], MODEL_ARCH.MISTRAL4: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 4a52d977297c..d80915ffdba5 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -129,6 +129,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, { LLM_ARCH_EAGLE3, "eagle3" }, + { LLM_ARCH_DFLASH, "dflash" }, { LLM_ARCH_MISTRAL4, "mistral4" }, { LLM_ARCH_PADDLEOCR, "paddleocr" }, { LLM_ARCH_MIMO2, "mimo2" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 989da06d8d51..946518d5f224 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -143,6 +143,7 @@ enum llm_arch { LLM_ARCH_TALKIE, LLM_ARCH_MELLUM, LLM_ARCH_EAGLE3, + LLM_ARCH_DFLASH, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-context.cpp 
b/src/llama-context.cpp index 220240ea952b..aa2b83026825 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -100,10 +100,10 @@ llama_context::llama_context( cparams.ctx_other = params.ctx_other; } - if (model.arch == LLM_ARCH_EAGLE3) { + if (model.arch == LLM_ARCH_EAGLE3 || model.arch == LLM_ARCH_DFLASH) { if (model.tok_embd == nullptr || model.output == nullptr) { if (params.ctx_other == nullptr) { - throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)"); + throw std::runtime_error(model.arch_name() + " requires ctx_other to be set (this warning is normal during memory fitting)"); } cparams.ctx_other = params.ctx_other; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 68c9e606c3e3..3ded70bc0f71 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -486,7 +486,11 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) { mctx->set_input_k_idxs(self_k_idxs, ubatch); mctx->set_input_v_idxs(self_v_idxs, ubatch); - mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + // the mask is left unallocated when the graph only stores K/V without attending + // (e.g. 
DFlash's KV-injection pass) + if (self_kq_mask && self_kq_mask->buffer) { + mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + } if (self_k_rot) { mctx->set_input_k_rot(self_k_rot); @@ -904,6 +908,7 @@ void llm_graph_result::reset() { t_logits = nullptr; t_embd = nullptr; t_embd_pooled = nullptr; + t_h_nextn = nullptr; t_layer_inp.resize(LLAMA_MAX_LAYERS); std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d041a9ce3e27..7ac486249781 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -291,6 +291,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_mistral3(params); case LLM_ARCH_EAGLE3: return new llama_model_eagle3(params); + case LLM_ARCH_DFLASH: + return new llama_model_dflash(params); case LLM_ARCH_MIMO2: return new llama_model_mimo2(params); case LLM_ARCH_KIMI_LINEAR: @@ -2493,6 +2495,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_STEP35: case LLM_ARCH_TALKIE: case LLM_ARCH_MELLUM: + case LLM_ARCH_DFLASH: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: @@ -2616,7 +2619,8 @@ bool llama_model_has_encoder(const llama_model * model) { switch (model->arch) { case LLM_ARCH_T5: case LLM_ARCH_T5ENCODER: - case LLM_ARCH_EAGLE3: return true; + case LLM_ARCH_EAGLE3: + case LLM_ARCH_DFLASH: return true; default: return false; } } diff --git a/src/models/dflash.cpp b/src/models/dflash.cpp new file mode 100644 index 000000000000..40951d52414d --- /dev/null +++ b/src/models/dflash.cpp @@ -0,0 +1,246 @@ +#include "models.h" + +#include "llama-kv-cache.h" + +void llama_model_dflash::load_arch_hparams(llama_model_loader & ml) { + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) { + throw std::runtime_error("DFlash model requires 'target_layers' in GGUF metadata"); + } + + 
hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * hparams.n_embd; + + LLAMA_LOG_INFO("%s: DFlash extract_layers = [", __func__); + for (size_t i = 0; i < target_layer_ids.size(); ++i) { + LLAMA_LOG_INFO("%d%s", target_layer_ids[i], i + 1 < target_layer_ids.size() ? ", " : ""); + } + LLAMA_LOG_INFO("]\n"); + + type = LLM_TYPE_UNKNOWN; +} + +void llama_model_dflash::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + const int64_t n_embd_inp = hparams.n_embd_inp_enc(); + + fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), { n_embd_inp, n_embd }, 0); + output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), { n_embd }, 0); // encoder hidden_norm (after fc) + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); // decoder final norm + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + } +} + +std::unique_ptr 
llama_model_dflash::build_arch_graph(const llm_graph_params & params) const { + switch (params.gtype) { + case LLM_GRAPH_TYPE_ENCODER: + return std::make_unique>(*this, params); + case LLM_GRAPH_TYPE_DEFAULT: + case LLM_GRAPH_TYPE_DECODER: + return std::make_unique>(*this, params); + default: + GGML_ABORT("invalid graph type"); + }; +} + +template <> +ggml_tensor * llama_model_dflash::graph::build_inp_embd_enc() const { + auto inp_target = std::make_unique(hparams.n_embd_inp_enc()); + + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens); + ggml_set_input(inp_target->embd); + + ggml_tensor * cur = inp_target->embd; + cb(cur, "inp_embd", -1); + + res->add_input(std::move(inp_target)); + + return cur; +} + +// DFlash Encoder: processes target model features through feature fusion layer +template <> +llama_model_dflash::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + ggml_tensor * cur = build_inp_embd_enc(); + + cur = build_lora_mm(model.fc, cur); + cb(cur, "fc_out", -1); + + cur = build_norm(cur, model.output_norm_enc, NULL, LLM_NORM_RMS, -1); + cb(cur, "enc_norm_out", -1); + + ggml_set_output(cur); + res->t_h_nextn = cur; + + ggml_build_forward_expand(gf, cur); +} + +// DFlash decoder, dual-mode by batch type: +// * embd batch -> fused target features: project + inject K/V into the cache. +// * token batch -> noise-block diffusion: attend over [committed, MASK...] 
to generate draft tokens +template <> +llama_model_dflash::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv(); + + const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + + // KV cache injection + if (ubatch.embd) { + auto inp = std::make_unique(n_embd); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(inp->embd); + + ggml_tensor * inp_g = inp->embd; + cb(inp_g, "inp_g_embeddings", -1); + + res->add_input(std::move(inp)); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + + ggml_tensor * Kcur = build_lora_mm(layer.wk, inp_g); + ggml_tensor * Vcur = build_lora_mm(layer.wv, inp_g); + + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Kcur = build_norm(Kcur, layer.attn_k_norm, NULL, LLM_NORM_RMS, il); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_injected", il); + cb(Vcur, "Vcur_injected", il); + + ggml_build_forward_expand(gf, inp_attn->mctx->cpy_k(ctx0, Kcur, inp_attn->get_k_idxs(), il)); + ggml_build_forward_expand(gf, inp_attn->mctx->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il)); + } + + res->t_embd = inp_g; + + ggml_build_forward_expand(gf, inp_g); + return; + } + + // tok_embd from the target model (shared via ctx_other) + auto * tok_embd = model.tok_embd; + if (tok_embd == nullptr) { + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + GGML_ASSERT(model_other->tok_embd != nullptr && "DFlash decoder requires the target model's token embeddings"); + 
tok_embd = model_other->tok_embd; + } + + auto inp = std::make_unique(n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + ggml_tensor * inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens); + cb(inpL, "inp_noise_embd", -1); + + res->add_input(std::move(inp)); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + + ggml_tensor * noise_norm = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); + cb(noise_norm, "noise_norm", il); + + ggml_tensor * Qcur = build_lora_mm(layer.wq, noise_norm); + ggml_tensor * Kcur = build_lora_mm(layer.wk, noise_norm); + ggml_tensor * Vcur = build_lora_mm(layer.wv, noise_norm); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = build_norm(Qcur, layer.attn_q_norm, NULL, LLM_NORM_RMS, il); + Kcur = build_norm(Kcur, layer.attn_k_norm, NULL, LLM_NORM_RMS, il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // cache-aware, non-causal attention + ggml_tensor * cur = build_attn(inp_attn, + layer.wo, NULL, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + layer.ffn_up, NULL, NULL, + layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", 
il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + inpL = cur; + } + + ggml_tensor * cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + res->t_embd = cur; + + // lm_head from the target model (shared via ctx_other) + auto * output = model.output; + if (output == nullptr) { + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + GGML_ASSERT(model_other->output != nullptr && "DFlash decoder requires the target model's output projection"); + output = model_other->output; + } + + cur = build_lora_mm(output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} \ No newline at end of file diff --git a/src/models/models.h b/src/models/models.h index 2ac8415a3639..d89ab96d0271 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1122,6 +1122,22 @@ struct llama_model_eagle3 : public llama_model_base { }; +struct llama_model_dflash : public llama_model_base { + llama_model_dflash(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + template + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + + ggml_tensor * build_inp_embd_enc() const; + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_mistral4 : public llama_model_deepseek2 { llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {} // reuse load_arch_hparams and load_arch_tensors from llama_model_deepseek2 diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 524971ae4b30..c781d2903e3d 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -451,7 +451,7 @@ static int save_models(const 
llm_arch target_arch, const size_t seed, const ggml if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } - if (arch == LLM_ARCH_EAGLE3) { + if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH) { continue; } for (bool moe : {false, true}) { @@ -557,7 +557,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } - if (arch == LLM_ARCH_EAGLE3) { + if (arch == LLM_ARCH_EAGLE3 || arch == LLM_ARCH_DFLASH) { continue; } From 64a4744e6bec0aa0a0dd98c1f04347f0f0efee29 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Thu, 25 Jun 2026 15:20:59 +0000 Subject: [PATCH 2/3] dflash: support sliding window attention per layer_types --- common/speculative.cpp | 6 ++--- conversion/qwen.py | 8 +++++++ src/models/dflash.cpp | 52 +++++++++++++++++++++++++++++++++--------- 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 3d46d21a030f..afd36bc49555 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -986,7 +986,7 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { return; } - + const int32_t N = (int32_t) prompt.size(); if (N <= 0) { return; @@ -1064,7 +1064,7 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { /*.seq_id =*/ nullptr, /*.logits =*/ nullptr, }; - + int32_t rc = llama_encode(ctx_dft, enc_batch); if (rc != 0) { LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n", @@ -1078,7 +1078,7 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { // inject the DFlash decoder K/V cache at the tokens' target positions batch_inject.n_tokens = n_chunk; std::memcpy(batch_inject.embd, 
inp_g, (size_t) n_chunk * n_embd_dec * sizeof(float)); - + for (int32_t i = 0; i < n_chunk; ++i) { batch_inject.pos[i] = batch_in.pos[i_batch_beg[seq_id] + offset + i]; batch_inject.n_seq_id[i] = 1; diff --git a/conversion/qwen.py b/conversion/qwen.py index cadcd8fef73e..81f450e40957 100644 --- a/conversion/qwen.py +++ b/conversion/qwen.py @@ -659,6 +659,14 @@ def set_gguf_parameters(self): if mask_token_id is not None: self.gguf_writer.add_mask_token_id(mask_token_id) + use_sliding_window = self.hparams.get("use_sliding_window", False) + sliding_window = self.hparams.get("sliding_window") + layer_types = self.hparams.get("layer_types") + if use_sliding_window and sliding_window and layer_types: + is_swa = [lt == "sliding_attention" for lt in layer_types] + self.gguf_writer.add_sliding_window(sliding_window) + self.gguf_writer.add_sliding_window_pattern(is_swa) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name == "fc.weight": yield (name, data_torch) diff --git a/src/models/dflash.cpp b/src/models/dflash.cpp index 40951d52414d..a7b4f4435a88 100644 --- a/src/models/dflash.cpp +++ b/src/models/dflash.cpp @@ -1,6 +1,7 @@ #include "models.h" #include "llama-kv-cache.h" +#include "llama-kv-cache-iswa.h" void llama_model_dflash::load_arch_hparams(llama_model_loader & ml) { @@ -18,6 +19,15 @@ void llama_model_dflash::load_arch_hparams(llama_model_loader & ml) { } LLAMA_LOG_INFO("]\n"); + // optional interleaved sliding-window attention with per-layer pattern array. + // DFlash has a single rope, so the SWA rope == main rope. 
+ if (ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false) && hparams.n_swa > 0) { + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + } + type = LLM_TYPE_UNKNOWN; } @@ -104,7 +114,17 @@ llama_model_dflash::graph::graph(const llama_model & model, const llm_gra GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + + // optional iSWA: pick the matching attention input + const bool use_iswa = hparams.swa_type != LLAMA_SWA_TYPE_NONE; + + llm_graph_input_attn_kv * inp_attn = nullptr; + llm_graph_input_attn_kv_iswa * inp_attn_iswa = nullptr; + if (use_iswa) { + inp_attn_iswa = build_attn_inp_kv_iswa(); + } else { + inp_attn = build_attn_inp_kv(); + } const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); @@ -138,8 +158,18 @@ llama_model_dflash::graph::graph(const llama_model & model, const llm_gra cb(Kcur, "Kcur_injected", il); cb(Vcur, "Vcur_injected", il); - ggml_build_forward_expand(gf, inp_attn->mctx->cpy_k(ctx0, Kcur, inp_attn->get_k_idxs(), il)); - ggml_build_forward_expand(gf, inp_attn->mctx->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il)); + if (use_iswa) { + // route each layer's K/V to its sub-cache: SWA layers -> sliding cache, full -> dense + const bool is_swa = hparams.is_swa(il); + const auto * kv = is_swa ? inp_attn_iswa->mctx->get_swa() : inp_attn_iswa->mctx->get_base(); + ggml_tensor * k_idxs = is_swa ? inp_attn_iswa->get_k_idxs_swa() : inp_attn_iswa->get_k_idxs(); + ggml_tensor * v_idxs = is_swa ? 
inp_attn_iswa->get_v_idxs_swa() : inp_attn_iswa->get_v_idxs(); + ggml_build_forward_expand(gf, kv->cpy_k(ctx0, Kcur, k_idxs, il)); + ggml_build_forward_expand(gf, kv->cpy_v(ctx0, Vcur, v_idxs, il)); + } else { + ggml_build_forward_expand(gf, inp_attn->mctx->cpy_k(ctx0, Kcur, inp_attn->get_k_idxs(), il)); + ggml_build_forward_expand(gf, inp_attn->mctx->cpy_v(ctx0, Vcur, inp_attn->get_v_idxs(), il)); + } } res->t_embd = inp_g; @@ -153,19 +183,19 @@ llama_model_dflash::graph::graph(const llama_model & model, const llm_gra if (tok_embd == nullptr) { GGML_ASSERT(cparams.ctx_other != nullptr); const auto * model_other = llama_get_model(cparams.ctx_other); - + GGML_ASSERT(model_other->tok_embd != nullptr && "DFlash decoder requires the target model's token embeddings"); tok_embd = model_other->tok_embd; } auto inp = std::make_unique(n_embd); - + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_input(inp->tokens); - + ggml_tensor * inpL = ggml_get_rows(ctx0, tok_embd, inp->tokens); cb(inpL, "inp_noise_embd", -1); - + res->add_input(std::move(inp)); for (int il = 0; il < n_layer; ++il) { @@ -200,9 +230,9 @@ llama_model_dflash::graph::graph(const llama_model & model, const llm_gra cb(Vcur, "Vcur", il); // cache-aware, non-causal attention - ggml_tensor * cur = build_attn(inp_attn, - layer.wo, NULL, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + ggml_tensor * cur = use_iswa + ? 
build_attn(inp_attn_iswa, layer.wo, NULL, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il) + : build_attn(inp_attn, layer.wo, NULL, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); @@ -243,4 +273,4 @@ llama_model_dflash::graph::graph(const llama_model & model, const llm_gra res->t_logits = cur; ggml_build_forward_expand(gf, cur); -} \ No newline at end of file +} From bed37faf094f5426485c1efb32737daa484d0c1a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 26 Jun 2026 13:37:26 +0000 Subject: [PATCH 3/3] docs: add dflash section --- docs/speculative.md | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/docs/speculative.md b/docs/speculative.md index 8f91256c4a4d..4100b92f8f18 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -52,6 +52,32 @@ Supported EAGLE-3 draft models include: For the full and up-to-date list of supported models, see #18039. +### DFlash (`draft-dflash`) + +DFlash drafts an entire block of tokens in a single forward pass (block diffusion) instead of +drafting one token at a time, and injects the target model's hidden states into the draft model's +attention. This keeps the draft model small while making drafting GPU-friendly. Unlike EAGLE-3 +(a single-layer autoregressive draft), the DFlash draft uses several transformer layers but emits a +whole block per draft step. + +The draft is a small block-diffusion model trained for a specific target (for example +`z-lab/Qwen3-4B-DFlash` for `Qwen/Qwen3-4B`). 
Convert it with `--target-model-dir` so it inherits the +target's tokenizer and token embeddings, then pass the result to `llama-server` with `-md`: + +```bash +python convert_hf_to_gguf.py z-lab/Qwen3-4B-DFlash \ + --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-DFlash.gguf + +llama-server -m Qwen3-4B.gguf -md Qwen3-4B-DFlash.gguf \ + --spec-type draft-dflash --spec-draft-n-max 15 -fa on --jinja +``` + +`--spec-draft-n-max` is clamped to the draft model's trained block size. + +See: + +- #22105 + ### n-gram Cache (`ngram-cache`) An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences. @@ -147,7 +173,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] +--spec-type [none|draft-simple|draft-eagle3|draft-dflash|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] comma-separated list of types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) @@ -287,6 +313,7 @@ Specifies a comma-separated list of speculative decoding types to use. | `none` | No speculative decoding (default) | | `draft-simple` | Use a simple draft model for speculation | | `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states | +| `draft-dflash` | Use a DFlash block-diffusion draft model that emits a block per step | | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching |