lightseekorg · mesaleh · May 26, 2026
diff --git a/docs/configuration/server.md b/docs/configuration/server.md
@@ -120,7 +120,7 @@ the values accepted by the bundled `tokenspeed-smg` package.
 | Parameter | Purpose |
 | --- | --- |
 | `--speculative-config` | JSON speculative decoding configuration. |
-| `--speculative-algorithm` | Speculative algorithm, such as `EAGLE3` or `MTP`. |
+| `--speculative-algorithm` | Speculative algorithm, such as `EAGLE3`, `MTP`, or `DFLASH`. |
 | `--speculative-draft-model-path` | Draft model path or repo ID. |
 | `--speculative-draft-model-quantization` | Draft model quantization. Defaults to `unquant`. |
 | `--speculative-num-steps` | Number of draft model steps. Defaults to `3`. |

diff --git a/docs/recipes/models.md b/docs/recipes/models.md
@@ -33,6 +33,38 @@ tokenspeed serve nvidia/Kimi-K2.5-NVFP4 \
 For K2.6, keep the same parameter shape and change the checkpoint and parser
 only if the model card requires a different value.
 
+To enable a compatible DFlash draft model, keep the target model's launch flags
+and add the draft model path plus the DFlash speculative decoding options:
+
+```bash
+tokenspeed serve nvidia/Kimi-K2.6-NVFP4 \
+  --served-model-name kimi-k2.6 \
+  --trust-remote-code \
+  --max-model-len 262144 \
+  --kv-cache-dtype fp8 \
+  --quantization nvfp4 \
+  --tensor-parallel-size 4 \
+  --enable-expert-parallel \
+  --chunked-prefill-size 8192 \
+  --max-num-seqs 256 \
+  --attention-backend tokenspeed_mla \
+  --moe-backend flashinfer_trtllm \
+  --reasoning-parser kimi_k25 \
+  --tool-call-parser kimik2 \
+  --speculative-algorithm DFLASH \
+  --speculative-draft-model-path /path/to/kimi-k2.6-dflash \
+  --speculative-num-draft-tokens 8 \
+  --speculative-num-steps 7 \
+  --drafter-attention-backend fa4 \
+  --host 0.0.0.0 \
+  --port 8000
+```
+
+Known limitation: native TokenSpeed DFlash currently uses full-history draft
+attention. It does not yet expose an equivalent of SGLang's
+`--speculative-dflash-draft-window-size`, so bounded draft attention is not
+available for long-context deployments until such a flag is added.
+
 ## Qwen3 Dense / Qwen3 30B-A3B
 
 Qwen2, dense Qwen3, and Qwen3 MoE checkpoints use different architecture names.

@@ -349,6 +349,44 @@ def _capture_one(self, bs: int):
             grammar_backend=self.grammar_backend,
         )
 
+        # Spec-decode capture runs a synthetic multi-token decode. Keep the
+        # dummy cache lengths internally consistent with that token count so
+        # attention warmup does not read an impossible q_len > seq_len state.
+        tokens_per_req = self.max_tokens_per_req
+        self.input_buffers.seq_lens_buf[:bs].fill_(tokens_per_req)
+        self.input_buffers.input_lengths_buf[:bs].fill_(tokens_per_req)
+
+        # Capture block tables point at synthetic per-request pages. Write the
+        # dummy KV tokens into those same slots so attention warmup/capture reads
+        # initialized keys instead of the reserved padding slot.
+        page_size = self.input_buffers.page_size
+        pages_per_req = (tokens_per_req + page_size - 1) // page_size
+        token_offsets = torch.arange(
+            tokens_per_req, dtype=torch.int32, device=self.device
+        )
+        request_offsets = (
+            torch.arange(bs, dtype=torch.int32, device=self.device).unsqueeze(1)
+            * pages_per_req
+            * page_size
+        )
+        self.input_buffers.out_cache_loc_buf[: bs * tokens_per_req].copy_(
+            (request_offsets + token_offsets).reshape(-1)
+        )
+
+        # Some fused decode kernels may read full page vectors during capture
+        # even when seq_lens bounds the logical context. Clear the synthetic
+        # pages so any padding read is deterministic zero, not allocator noise.
+        capture_slots = bs * pages_per_req * page_size
+        for pool in (self.token_to_kv_pool, self.draft_token_to_kv_pool):
+            if pool is None or not hasattr(pool, "kv_buffer"):
+                continue
+            for layer_buf in pool.kv_buffer:
+                if isinstance(layer_buf, (tuple, list)):
+                    for sub_buf in layer_buf:
+                        sub_buf[:capture_slots].zero_()
+                else:
+                    layer_buf[:capture_slots].zero_()
+
         self._init_capture_metadata(bs)
 
         def run_once():
@@ -379,6 +417,11 @@ def run_once():
         torch.cuda.synchronize()
         dist.barrier()
 
+        # Warmups can switch a backend back to eager metadata objects. Restore
+        # the graph-backed metadata immediately before capture so replay-time
+        # metadata refreshes update the same tensors recorded by the graph.
+        self._init_capture_metadata(bs)
+
         # Fill sampler buffers OUTSIDE the capture so RNG ops aren't recorded.
         if self.sampling_backend is not None:
             self.sampling_backend.prepare_capture(