From 41948ac48277ffd84afbf3472625e3814ef03ec9 Mon Sep 17 00:00:00 2001 From: mmca Date: Sun, 28 Jun 2026 00:51:43 -0700 Subject: [PATCH] dflash: fix VL/multimodal crash by clearing draft on position gap When image/audio embeddings are processed by the target model, the draft context's KV cache misses those positions because embedding-only batches are correctly skipped in process(). This causes a non-consecutive position error ("Y = X + 1" assertion) on the next text batch, crashing the server with "failed to process speculative batch". Fix: detect position gaps between the draft KV cache and incoming batch positions. When a gap is found (indicating image/audio tokens were processed in between), clear the draft sequence and let processing rebuild from the current text position. Speculative decoding gracefully degrades for the first few tokens after an image, then resumes normally. This is the same class of bug as: - EAGLE3 + VL (issue #24816) - MTP + VL (issue #22867, fixed by calvarado2004) Tested on Qwen3.6-27B VL with DFlash drafter: - Text-only: 37-64% draft acceptance, ~2.2x speedup - VL with images: no crash, face recognition working - BrightPath benchmark: 0.976 score (vs 0.965 baseline), 264s (vs 581s) Co-authored-by: Claude Opus 4.6 --- common/speculative.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/common/speculative.cpp b/common/speculative.cpp index afd36bc49555..79bd91cf8e7e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1031,6 +1031,28 @@ struct common_speculative_impl_draft_dflash : public common_speculative_impl { const int32_t n_ubatch = (int32_t) llama_n_ubatch(ctx_dft); + // VL support: handle position gaps from image/audio embeddings. + // Embedding batches are skipped in process(), so the draft KV cache + // misses positions filled by image/audio tokens in the target. 
Detect any + // non-consecutive position and clear the draft sequence so processing + // can rebuild from the current text position. + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_beg[seq_id] < 0) { + continue; + } + GGML_ASSERT(batch_in.embd == nullptr); + const llama_pos dft_pos_max = llama_memory_seq_pos_max( + llama_get_memory(ctx_dft), seq_id); + const llama_pos batch_pos = batch_in.pos[i_batch_beg[seq_id]]; + if (dft_pos_max >= 0 && batch_pos != dft_pos_max + 1) { + LOG_DBG("%s: non-consecutive pos for seq %d " + "(dft_pos_max=%d, batch_pos=%d) — clearing draft\n", + __func__, (int) seq_id, + (int) dft_pos_max, (int) batch_pos); + llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, -1, -1); + } + } + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { if (i_batch_beg[seq_id] < 0) { continue;