leehack · leehack · Jul 5, 2026 · Jul 5, 2026
@@ -1,5 +1,9 @@
 ## Unreleased
 
+* Added llama.cpp ngram-simple speculative decoding via
+  `SpeculativeDecodingConfig.ngramSimple(...)`, including Dart routing, native
+  wrapper bindings, docs, and local benchmark matrix coverage.
+
 * Added `LlamaStructuredOutput` and `LlamaEngine.createStructuredJson(...)`
   helpers for strict JSON-object / JSON-schema generation with final-output
   validation and typed decoding.

diff --git a/README.md b/README.md
@@ -288,6 +288,31 @@ Higher `draftTokenMax` values can be faster on some models/devices, but they
 should be benchmarked with the target model because excess draft depth can add
 verification overhead.
 
+For GGUF models without an MTP or separate draft model, llama.cpp ngram-simple
+speculative decoding uses recent token history as the drafter:
+
+```dart
+params: const GenerationParams(
+  maxTokens: 128,
+  penalty: 1.0,
+  speculativeDecodingConfig: SpeculativeDecodingConfig.ngramSimple(
+    draftTokenMax: 4,
+    ngramSize: 12,
+  ),
+),
+```
+
+Set `ModelParams.speculativeRollbackTokenMax` to at least `draftTokenMax`
+before using ngram-simple. Deeper ngram-simple drafts (`draftTokenMax > 2`)
+require `GenerationParams.penalty: 1.0` (no repeat penalty); with a repeat
+penalty, keep `draftTokenMax <= 2` so rejected draft tails cannot disturb
+deterministic sampler parity. Ngram-simple is workload-dependent and can be
+slower than baseline decoding on prompts with little repetition, so validate it
+with your model and prompt shape. For local measurements, set
+`LLAMADART_MTP_BENCHMARK_NGRAM=true` and
+`LLAMADART_MTP_BENCHMARK_NGRAM_ONLY=true` when running
+`tool/testing/llama_cpp_mtp_benchmark.dart`.
+
 For target/draft model pairs, pass the separate drafter GGUF with
 `draftModelPath`:
 

@@ -159,6 +159,12 @@ dart run tool/testing/native_inference_benchmark.dart \
   --mode all \
   --runs 3 \
   --max-tokens 128
+
+LLAMADART_MTP_BENCHMARK_NGRAM=true \
+LLAMADART_MTP_BENCHMARK_NGRAM_ONLY=true \
+LLAMADART_MTP_BENCHMARK_NGRAM_SIZE=1 \
+  dart run tool/testing/llama_cpp_mtp_benchmark.dart \
+  models/Qwen3.5-0.8B-Q4_K_M.gguf - 128 3 1,2,4 1
 ```
 
 Use `--dry-run` first when a scenario starts servers, builds Flutter web, or

diff --git a/lib/src/backends/litert_lm/litert_lm_service.dart b/lib/src/backends/litert_lm/litert_lm_service.dart
@@ -1013,6 +1013,10 @@ class LiteRtLmService {
     if (config == null) {
       return;
     }
+    if (config.strategy != SpeculativeDecodingStrategy.backendDefault &&
+        config.strategy != SpeculativeDecodingStrategy.mtp) {
+      unsupported.add('speculativeDecodingConfig.strategy');
+    }
     if (config.draftTokenMax != null) {
       unsupported.add('speculativeDecodingConfig.draftTokenMax');
     }
@@ -1025,6 +1029,9 @@ class LiteRtLmService {
     if (config.draftModelPath != null) {
       unsupported.add('speculativeDecodingConfig.draftModelPath');
     }
+    if (config.ngramSize != null) {
+      unsupported.add('speculativeDecodingConfig.ngramSize');
+    }
   }
 
   int _defaultSamplerSeed() {

diff --git a/lib/src/backends/llama_cpp/bindings.dart b/lib/src/backends/llama_cpp/bindings.dart
@@ -7841,6 +7841,70 @@ external void llama_dart_mtp_accept(
   int accepted_count,
 );
 
+@ffi.Native<ffi.Pointer<llama_dart_ngram> Function(ffi.Int32, ffi.Int32)>()
+external ffi.Pointer<llama_dart_ngram> llama_dart_ngram_simple_init(
+  int ngram_size,
+  int draft_token_max,
+);
+
+@ffi.Native<ffi.Void Function(ffi.Pointer<llama_dart_ngram>)>()
+external void llama_dart_ngram_free(ffi.Pointer<llama_dart_ngram> ngram);
+
+@ffi.Native<
+  ffi.Bool Function(
+    ffi.Pointer<llama_dart_ngram>,
+    llama_seq_id,
+    ffi.Pointer<llama_token>,
+    ffi.Int32,
+  )
+>()
+external bool llama_dart_ngram_begin(
+  ffi.Pointer<llama_dart_ngram> ngram,
+  int seq_id,
+  ffi.Pointer<llama_token> prompt,
+  int prompt_count,
+);
+
+@ffi.Native<ffi.Bool Function(ffi.Pointer<llama_dart_ngram>, llama_batch)>()
+external bool llama_dart_ngram_process_batch(
+  ffi.Pointer<llama_dart_ngram> ngram,
+  llama_batch batch,
+);
+
+@ffi.Native<
+  ffi.Int32 Function(
+    ffi.Pointer<llama_dart_ngram>,
+    llama_seq_id,
+    llama_pos,
+    llama_token,
+    ffi.Pointer<llama_token>,
+    ffi.Int32,
+    ffi.Int32,
+    ffi.Pointer<llama_token>,
+    ffi.Int32,
+  )
+>()
+external int llama_dart_ngram_draft(
+  ffi.Pointer<llama_dart_ngram> ngram,
+  int seq_id,
+  int n_past,
+  int id_last,
+  ffi.Pointer<llama_token> prompt,
+  int prompt_count,
+  int draft_token_max,
+  ffi.Pointer<llama_token> out_tokens,
+  int out_capacity,
+);
+
+@ffi.Native<
+  ffi.Void Function(ffi.Pointer<llama_dart_ngram>, llama_seq_id, ffi.Uint16)
+>()
+external void llama_dart_ngram_accept(
+  ffi.Pointer<llama_dart_ngram> ngram,
+  int seq_id,
+  int accepted_count,
+);
+
 @ffi.Native<
   ffi.Int32 Function(
     ffi.Pointer<llama_sampler>,
@@ -10279,6 +10343,8 @@ final class mtmd_helper_video_init_params extends ffi.Struct {
 
 final class llama_dart_mtp extends ffi.Opaque {}
 
+final class llama_dart_ngram extends ffi.Opaque {}
+
 const int LLAMA_DEFAULT_SEED = 4294967295;
 
 const int LLAMA_TOKEN_NULL = -1;