Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ enum common_speculative_type {
COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding
COMMON_SPECULATIVE_TYPE_DRAFT_MTP, // Multi-token prediction
COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, // DFlash speculative decoding
COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, // DSpark speculative decoding (DFlash + Markov head)
COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only
COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
Expand Down Expand Up @@ -385,7 +386,7 @@ struct common_params_speculative {

uint32_t need_n_rs_seq() const {
bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH;
return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3 || t == COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH || t == COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK;
});

return needs_rs_seq ? draft.n_max : 0u;
Expand Down
113 changes: 110 additions & 3 deletions common/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ const std::map<std::string, common_speculative_type> common_speculative_type_fro
{"draft-eagle3", COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3},
{"draft-mtp", COMMON_SPECULATIVE_TYPE_DRAFT_MTP},
{"draft-dflash", COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH},
{"draft-dspark", COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK},
{"ngram-simple", COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE},
{"ngram-map-k", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K},
{"ngram-map-k4v", COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V},
Expand Down Expand Up @@ -921,8 +922,9 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
// scratch buffer for concatenated target features [n_tokens, n_embd_enc]
std::vector<float> features_buf;

common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq)
: common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, n_seq)
common_speculative_impl_draft_dflash(const common_params_speculative & params, uint32_t n_seq,
common_speculative_type type = COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH)
: common_speculative_impl(type, n_seq)
, params(params.draft)
{
auto * ctx_tgt = this->params.ctx_tgt;
Expand Down Expand Up @@ -1189,6 +1191,101 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl {
}
};

// DSpark: DFlash backbone + a semi-autoregressive Markov head; reuses process(), only draft() differs
struct common_speculative_impl_draft_dspark : public common_speculative_impl_draft_dflash {
    // Constructs the DFlash base tagged with the DSpark type, then reads the block size from the
    // draft model's "dspark.block_size" metadata and clamps params.n_max to it.
    // Aborts when the key is missing or does not hold a positive integer.
    common_speculative_impl_draft_dspark(const common_params_speculative & params_in, uint32_t n_seq)
        : common_speculative_impl_draft_dflash(params_in, n_seq, COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK)
    {
        auto * ctx_dft = params.ctx_dft;
        const llama_model * model_dft = llama_get_model(ctx_dft);

        {
            char buf[32] = {};
            if (llama_model_meta_val_str(model_dft, "dspark.block_size", buf, sizeof(buf)) < 0) {
                GGML_ABORT("DSpark: missing required metadata key 'dspark.block_size'");
            }
            block_size = std::atoi(buf);
            // std::atoi yields 0 for non-numeric text; a non-positive block size would silently
            // clamp n_max to <= 0 below and disable drafting, so fail loudly instead
            if (block_size <= 0) {
                GGML_ABORT("DSpark: invalid 'dspark.block_size' metadata value '%s' (expected a positive integer)", buf);
            }
        }
        if (params.n_max > block_size) {
            params.n_max = block_size;
        }

        LOG_INF("%s: adding speculative implementation 'draft-dspark'\n", __func__);
        LOG_INF("%s: - block_size=%d, n_max=%d\n", __func__, block_size, params.n_max);
    }

    // Drafts for every sequence in dparams that has `drafting` set: builds one batch holding a
    // full block per drafting sequence, decodes it once on the draft context, and appends up to
    // n_draft tokens per sequence to dp.result. Returns without appending anything when no
    // sequence is drafting or when llama_decode fails (a warning is logged in the latter case).
    void draft(common_speculative_draft_params_vec & dparams) override {
        auto * ctx_dft = params.ctx_dft;

        common_batch_clear(batch);

        // per-sequence bookkeeping: batch index where the block starts (-1 = nothing submitted)
        // and how many drafts to keep from that block
        std::vector<int32_t> i_block_beg(n_seq, -1);
        std::vector<int32_t> n_block    (n_seq,  0);

        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
            auto & dp = dparams[seq_id];
            if (!dp.drafting) {
                continue;
            }

            common_sampler_reset(smpls[seq_id].get());

            const int32_t n = (int32_t) dp.n_past;

            // effective draft length: global n_max, optionally lowered by the per-sequence
            // n_max, and never more than one block
            int32_t n_draft = params.n_max;
            if (dp.n_max > 0) {
                n_draft = std::min(n_draft, dp.n_max);
            }
            n_draft = std::min(n_draft, block_size);
            if (n_draft <= 0) {
                continue;
            }

            // anchor-first block [id_last, <mask> * (block_size-1)]: submit the whole block so the
            // in-graph Markov head can key anchors off the block boundaries; keep the first n_draft
            i_block_beg[seq_id] = batch.n_tokens;
            n_block    [seq_id] = n_draft;
            for (int32_t i = 0; i < block_size; ++i) {
                common_batch_add(batch, i == 0 ? dp.id_last : mask_token_id, n + i, { seq_id }, true);
            }
        }

        if (batch.n_tokens == 0) {
            return;
        }

        if (llama_decode(ctx_dft, batch) != 0) {
            LOG_WRN("%s: llama_decode failed\n", __func__);
            return;
        }

        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
            if (i_block_beg[seq_id] < 0) {
                continue;
            }
            auto & dp     = dparams[seq_id];
            auto & result = *dp.result;

            const int32_t beg = i_block_beg[seq_id];
            const int32_t nb  = n_block[seq_id]; // drafts to keep (<= block_size)

            auto * smpl = smpls[seq_id].get();
            // read the first nb outputs of this sequence's block (batch indices beg .. beg+nb-1),
            // taking the first entry of the sampler's candidate list at each one
            for (int32_t i = 0; i < nb; ++i) {
                common_sampler_sample(smpl, ctx_dft, beg + i, true);

                const auto * cur_p = common_sampler_get_candidates(smpl, true);

                const llama_token id = cur_p->data[0].id;

                common_sampler_accept(smpl, id, true);

                result.push_back(id);
            }
        }
    }
};

struct common_speculative_impl_draft_mtp : public common_speculative_impl {
common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft)

Expand Down Expand Up @@ -2133,6 +2230,7 @@ std::string common_speculative_type_to_str(common_speculative_type type) {
case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: return "draft-eagle3";
case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: return "draft-mtp";
case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH: return "draft-dflash";
case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK: return "draft-dspark";
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: return "ngram-simple";
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: return "ngram-map-k";
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: return "ngram-map-k4v";
Expand Down Expand Up @@ -2186,6 +2284,7 @@ int32_t common_speculative_n_max(const common_params_speculative * spec) {
case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3:
case COMMON_SPECULATIVE_TYPE_DRAFT_MTP:
case COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH:
case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK:
n_max = std::max(n_max, std::max(0, spec->draft.n_max));
break;
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE:
Expand Down Expand Up @@ -2224,6 +2323,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
bool has_draft_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
bool has_draft_dflash = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH)) && params.draft.ctx_dft != nullptr;
bool has_draft_dspark = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK)) && params.draft.ctx_dft != nullptr;



Expand All @@ -2234,7 +2334,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
bool has_ngram_mod = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MOD));

// when adding a new type - update here the logic above
static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 10);
static_assert(COMMON_SPECULATIVE_TYPE_COUNT == 11);

// this list here defines the priority of the speculators
// the one with highest priority are listed first
Expand Down Expand Up @@ -2267,6 +2367,9 @@ common_speculative * common_speculative_init(common_params_speculative & params,
if (has_draft_dflash) {
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_DFLASH, params));
}
if (has_draft_dspark) {
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK, params));
}
}

std::vector<std::unique_ptr<common_speculative_impl>> impls = {};
Expand All @@ -2291,6 +2394,10 @@ common_speculative * common_speculative_init(common_params_speculative & params,
impls.push_back(std::make_unique<common_speculative_impl_draft_dflash>(config.params, n_seq));
break;
}
case COMMON_SPECULATIVE_TYPE_DRAFT_DSPARK: {
impls.push_back(std::make_unique<common_speculative_impl_draft_dspark>(config.params, n_seq));
break;
}
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple);

Expand Down
1 change: 1 addition & 0 deletions conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
"DeepseekV3ForCausalLM": "deepseek",
"DeepseekV32ForCausalLM": "deepseek",
"DFlashDraftModel": "qwen",
"Qwen3DSparkModel": "qwen",
"DeepseekV4ForCausalLM": "deepseek",
"DistilBertForMaskedLM": "bert",
"DistilBertForSequenceClassification": "bert",
Expand Down
50 changes: 50 additions & 0 deletions conversion/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -673,3 +673,53 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
if not name.startswith("model."):
name = "model." + name
return super().filter_tensors((name, gen))


@ModelBase.register("Qwen3DSparkModel")
class DSparkModel(Qwen3Model):
    # DSpark = DFlash backbone + a semi-autoregressive Markov head (+ optional confidence head).
    # The DeepSpec checkpoint stores its config flat (block_size / target_layer_ids / mask_token_id /
    # markov_rank at top level). embed_tokens / lm_head are byte-identical to the target, so they are
    # NOT emitted here -- the DSpark decoder shares the target's via ctx_other (same as DFlash).
    model_arch = gguf.MODEL_ARCH.DSPARK

    def set_vocab(self):
        """Write the vocabulary using the target model's directory, then the mask token id.

        Raises:
            ValueError: if no target model directory was provided.
        """
        if self.target_model_dir is None:
            raise ValueError(
                "DSpark draft model requires --target-model-dir to be specified. "
                "Please provide the path to the target model directory containing the tokenizer."
            )
        logger.info(f"DSpark: Using tokenizer from target model: {self.target_model_dir}")
        original_dir = self.dir_model
        self.dir_model = self.target_model_dir
        try:
            super().set_vocab()
        finally:
            # always restore the draft directory, even if the parent's vocab handling raises,
            # so later steps never read from the target directory by accident
            self.dir_model = original_dir

        mask_token_id = self.hparams.get("mask_token_id")
        if mask_token_id is not None:
            self.gguf_writer.add_mask_token_id(mask_token_id)

    def set_gguf_parameters(self):
        """Write the parent's parameters plus block size, target layers and Markov rank."""
        super().set_gguf_parameters()

        # falls back to 7 when the checkpoint config has no block_size entry
        block_size = self.hparams.get("block_size", 7)
        self.gguf_writer.add_block_size(block_size)

        # flat DeepSpec schema; mirror DFlash's +1 extract-layer convention
        target_layer_ids = self.hparams.get("target_layer_ids", [])
        if target_layer_ids:
            extract_layer_ids = [i + 1 for i in target_layer_ids]
            self.gguf_writer.add_target_layers(extract_layer_ids)

        # a missing markov_rank is written as 0
        markov_rank = self.hparams.get("markov_rank", 0)
        self.gguf_writer.add_markov_rank(markov_rank)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Drop the shared embedding / head tensors and prefix remaining names with "model.".

        Returns None for dropped tensors; otherwise defers to the parent filter.
        """
        name, gen = item
        # embed_tokens / lm_head are byte-identical to the target and shared at runtime -- drop them
        if name.endswith(("embed_tokens.weight", "lm_head.weight")):
            return None
        if not name.startswith("model."):
            name = "model." + name
        return super().filter_tensors((name, gen))
29 changes: 28 additions & 1 deletion docs/speculative.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,32 @@ See:

- #22105

### DSpark (`draft-dspark`)

DSpark extends DFlash with a semi-autoregressive _Markov head_: the draft still emits a whole
block per forward pass, but each block position's logits are biased by a low-rank term keyed on
the previous token, chained in-graph across the block. This keeps drafting at one decode per
block while recovering some of the left-to-right signal that pure block diffusion loses.

The draft is a small DeepSpec checkpoint trained for a specific target (for example
[`deepseek-ai/dspark_qwen3_4b_block7`](https://huggingface.co/deepseek-ai/dspark_qwen3_4b_block7)
for `Qwen/Qwen3-4B`). Convert it with `--target-model-dir` so it uses the target's tokenizer; the
token embeddings and output head are not written to the draft GGUF and are shared with the target at runtime:

```bash
python convert_hf_to_gguf.py deepseek-ai/dspark_qwen3_4b_block7 \
--target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-DSpark.gguf

llama-server -m Qwen3-4B.gguf -md Qwen3-4B-DSpark.gguf \
--spec-type draft-dspark --spec-draft-n-max 7 -fa on --jinja
```

`--spec-draft-n-max` is clamped to the draft model's trained block size.

See:

- #25173

### n-gram Cache (`ngram-cache`)

An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
Expand Down Expand Up @@ -173,7 +199,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
### General Speculative Parameters

```
--spec-type [none|draft-simple|draft-eagle3|draft-dflash|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
--spec-type [none|draft-simple|draft-eagle3|draft-dflash|draft-dspark|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
comma-separated list of types of speculative decoding to use
(default: none)
(env: LLAMA_ARG_SPEC_TYPE)
Expand Down Expand Up @@ -314,6 +340,7 @@ Specifies a comma-separated list of speculative decoding types to use.
| `draft-simple` | Use a simple draft model for speculation |
| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states |
| `draft-dflash` | Use a DFlash block-diffusion draft model that emits a block per step |
| `draft-dspark` | Use a DSpark draft model (DFlash backbone + semi-autoregressive Markov head) |
| `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
| `ngram-cache` | Use n-gram cache lookup |
| `ngram-simple` | Use simple n-gram pattern matching |
Expand Down
28 changes: 28 additions & 0 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ class LLM:
TARGET_LAYERS = "{arch}.target_layers"
TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
BLOCK_SIZE = "{arch}.block_size"
MARKOV_RANK = "{arch}.markov_rank"
NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"

class Attention:
Expand Down Expand Up @@ -530,6 +531,7 @@ class MODEL_ARCH(IntEnum):
MISTRAL3 = auto()
EAGLE3 = auto()
DFLASH = auto()
DSPARK = auto()
MISTRAL4 = auto()
PADDLEOCR = auto()
MIMO2 = auto()
Expand Down Expand Up @@ -953,6 +955,9 @@ class MODEL_TENSOR(IntEnum):
# eagle3
FC = auto() # feature fusion layer
D2T = auto() # draft to target vocabulary mapping
DSPARK_MARKOV_W1 = auto() # dspark markov head: prev-token embed
DSPARK_MARKOV_W2 = auto() # dspark markov head: bias projection
DSPARK_CONF_PROJ = auto() # dspark confidence head: proj
# lfm2 audio
A_ENC_NORM_CONV = auto()
A_ENC_LINEAR_POS = auto()
Expand Down Expand Up @@ -1111,6 +1116,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.MISTRAL3: "mistral3",
MODEL_ARCH.EAGLE3: "eagle3",
MODEL_ARCH.DFLASH: "dflash",
MODEL_ARCH.DSPARK: "dspark",
MODEL_ARCH.MISTRAL4: "mistral4",
MODEL_ARCH.PADDLEOCR: "paddleocr",
MODEL_ARCH.MIMO2: "mimo2",
Expand Down Expand Up @@ -1559,6 +1565,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
MODEL_TENSOR.FC: "fc",
MODEL_TENSOR.DSPARK_MARKOV_W1: "markov_w1",
MODEL_TENSOR.DSPARK_MARKOV_W2: "markov_w2",
MODEL_TENSOR.DSPARK_CONF_PROJ: "conf_proj",
MODEL_TENSOR.D2T: "d2t",
}

Expand Down Expand Up @@ -4204,6 +4213,25 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FC,
MODEL_TENSOR.ENC_OUTPUT_NORM,
],
MODEL_ARCH.DSPARK: [
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FC,
MODEL_TENSOR.ENC_OUTPUT_NORM,
MODEL_TENSOR.DSPARK_MARKOV_W1,
MODEL_TENSOR.DSPARK_MARKOV_W2,
MODEL_TENSOR.DSPARK_CONF_PROJ,
],
MODEL_ARCH.MISTRAL4: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
Expand Down
3 changes: 3 additions & 0 deletions gguf-py/gguf/gguf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,6 +946,9 @@ def add_sliding_window(self, value: int) -> None:
def add_block_size(self, value: int) -> None:
    """Store `value` as a uint32 under this architecture's block-size key."""
    key = Keys.LLM.BLOCK_SIZE.format(arch=self.arch)
    self.add_uint32(key, value)

def add_markov_rank(self, value: int) -> None:
    """Store `value` as a uint32 under this architecture's Markov-rank key."""
    key = Keys.LLM.MARKOV_RANK.format(arch=self.arch)
    self.add_uint32(key, value)

def add_target_layers(self, value: Sequence[int]) -> None:
    """Store `value` as an array under this architecture's target-layers key."""
    key = Keys.LLM.TARGET_LAYERS.format(arch=self.arch)
    self.add_array(key, value)

Expand Down
Loading
Loading