Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
## Unreleased

* Added llama.cpp ngram-simple speculative decoding via
`SpeculativeDecodingConfig.ngramSimple(...)`, including Dart routing, native
wrapper bindings, docs, and local benchmark matrix coverage.

* Added `LlamaStructuredOutput` and `LlamaEngine.createStructuredJson(...)`
helpers for strict JSON-object / JSON-schema generation with final-output
validation and typed decoding.
Expand Down
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,31 @@ Higher `draftTokenMax` values can be faster on some models/devices, but they
should be benchmarked with the target model because excess draft depth can add
verification overhead.

For GGUF models without an MTP or separate draft model, llama.cpp ngram-simple
speculative decoding uses recent token history as the drafter:

```dart
params: const GenerationParams(
maxTokens: 128,
penalty: 1.0,
speculativeDecodingConfig: SpeculativeDecodingConfig.ngramSimple(
draftTokenMax: 4,
ngramSize: 12,
),
),
```

Set `ModelParams.speculativeRollbackTokenMax` to a value at least as large as
`draftTokenMax` before using ngram-simple. Deeper ngram-simple drafts
(`draftTokenMax > 2`) require `GenerationParams.penalty: 1.0`; with repeat
penalties, keep `draftTokenMax <= 2` so rejected draft tails cannot disturb
deterministic sampler parity. Ngram-simple is workload-dependent and can be
slower than baseline decoding on prompts with little repetition, so validate it
with your model and prompt shape. For local measurements, set
`LLAMADART_MTP_BENCHMARK_NGRAM=true` and
`LLAMADART_MTP_BENCHMARK_NGRAM_ONLY=true` when running
`tool/testing/llama_cpp_mtp_benchmark.dart`.

For target/draft model pairs, pass the separate drafter GGUF with
`draftModelPath`:

Expand Down
6 changes: 6 additions & 0 deletions doc/testing_matrix.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,12 @@ dart run tool/testing/native_inference_benchmark.dart \
--mode all \
--runs 3 \
--max-tokens 128

LLAMADART_MTP_BENCHMARK_NGRAM=true \
LLAMADART_MTP_BENCHMARK_NGRAM_ONLY=true \
LLAMADART_MTP_BENCHMARK_NGRAM_SIZE=1 \
dart run tool/testing/llama_cpp_mtp_benchmark.dart \
models/Qwen3.5-0.8B-Q4_K_M.gguf - 128 3 1,2,4 1
```

Use `--dry-run` first when a scenario starts servers, builds Flutter web, or
Expand Down
7 changes: 7 additions & 0 deletions lib/src/backends/litert_lm/litert_lm_service.dart
Original file line number Diff line number Diff line change
Expand Up @@ -1013,6 +1013,10 @@ class LiteRtLmService {
if (config == null) {
return;
}
if (config.strategy != SpeculativeDecodingStrategy.backendDefault &&
config.strategy != SpeculativeDecodingStrategy.mtp) {
unsupported.add('speculativeDecodingConfig.strategy');
}
if (config.draftTokenMax != null) {
unsupported.add('speculativeDecodingConfig.draftTokenMax');
}
Expand All @@ -1025,6 +1029,9 @@ class LiteRtLmService {
if (config.draftModelPath != null) {
unsupported.add('speculativeDecodingConfig.draftModelPath');
}
if (config.ngramSize != null) {
unsupported.add('speculativeDecodingConfig.ngramSize');
}
}

int _defaultSamplerSeed() {
Expand Down
66 changes: 66 additions & 0 deletions lib/src/backends/llama_cpp/bindings.dart
Original file line number Diff line number Diff line change
Expand Up @@ -7841,6 +7841,70 @@ external void llama_dart_mtp_accept(
int accepted_count,
);

/// Binds the native `llama_dart_ngram_simple_init` function.
///
/// Passes [ngram_size] and [draft_token_max] to native code as 32-bit signed
/// integers and returns a pointer to an opaque [llama_dart_ngram].
///
/// NOTE(review): presumably the returned handle must later be released with
/// [llama_dart_ngram_free], and may be a null pointer on failure — confirm
/// against the native wrapper.
@ffi.Native<ffi.Pointer<llama_dart_ngram> Function(ffi.Int32, ffi.Int32)>()
external ffi.Pointer<llama_dart_ngram> llama_dart_ngram_simple_init(
int ngram_size,
int draft_token_max,
);

/// Binds the native `llama_dart_ngram_free` function, which receives the
/// [ngram] handle and returns nothing.
///
/// NOTE(review): presumably releases a handle obtained from
/// [llama_dart_ngram_simple_init]; whether a null pointer is tolerated is not
/// visible from this binding — confirm against the native wrapper.
@ffi.Native<ffi.Void Function(ffi.Pointer<llama_dart_ngram>)>()
external void llama_dart_ngram_free(ffi.Pointer<llama_dart_ngram> ngram);

/// Binds the native `llama_dart_ngram_begin` function.
///
/// Forwards the [ngram] handle, a sequence id ([seq_id], marshalled as
/// `llama_seq_id`), a pointer to `llama_token` values ([prompt]), and
/// [prompt_count] as a 32-bit signed integer. Returns the native `bool` result.
///
/// NOTE(review): presumably [prompt] must point to at least [prompt_count]
/// tokens and the return value reports success — confirm against the native
/// wrapper.
@ffi.Native<
ffi.Bool Function(
ffi.Pointer<llama_dart_ngram>,
llama_seq_id,
ffi.Pointer<llama_token>,
ffi.Int32,
)
>()
external bool llama_dart_ngram_begin(
ffi.Pointer<llama_dart_ngram> ngram,
int seq_id,
ffi.Pointer<llama_token> prompt,
int prompt_count,
);

/// Binds the native `llama_dart_ngram_process_batch` function.
///
/// Forwards the [ngram] handle together with [batch], which is declared as a
/// `llama_batch` value rather than a pointer. Returns the native `bool` result.
///
/// NOTE(review): presumably the return value reports success — confirm against
/// the native wrapper.
@ffi.Native<ffi.Bool Function(ffi.Pointer<llama_dart_ngram>, llama_batch)>()
external bool llama_dart_ngram_process_batch(
ffi.Pointer<llama_dart_ngram> ngram,
llama_batch batch,
);

/// Binds the native `llama_dart_ngram_draft` function.
///
/// Forwards, in order: the [ngram] handle, [seq_id] (`llama_seq_id`), [n_past]
/// (`llama_pos`), [id_last] (`llama_token`), the [prompt] token pointer with
/// its [prompt_count], [draft_token_max], and the [out_tokens] token pointer
/// with its [out_capacity]. The three counts are marshalled as 32-bit signed
/// integers. Returns the native 32-bit signed integer result.
///
/// NOTE(review): presumably native code writes at most [out_capacity] drafted
/// tokens into [out_tokens] and returns how many were written (with a negative
/// value on error) — confirm against the native wrapper.
@ffi.Native<
ffi.Int32 Function(
ffi.Pointer<llama_dart_ngram>,
llama_seq_id,
llama_pos,
llama_token,
ffi.Pointer<llama_token>,
ffi.Int32,
ffi.Int32,
ffi.Pointer<llama_token>,
ffi.Int32,
)
>()
external int llama_dart_ngram_draft(
ffi.Pointer<llama_dart_ngram> ngram,
int seq_id,
int n_past,
int id_last,
ffi.Pointer<llama_token> prompt,
int prompt_count,
int draft_token_max,
ffi.Pointer<llama_token> out_tokens,
int out_capacity,
);

/// Binds the native `llama_dart_ngram_accept` function, which returns nothing.
///
/// Forwards the [ngram] handle, [seq_id] (`llama_seq_id`), and
/// [accepted_count]. Unlike the 32-bit counts used by the other
/// `llama_dart_ngram_*` bindings, [accepted_count] is marshalled as an unsigned
/// 16-bit integer, so callers should keep it within 0..65535.
///
/// NOTE(review): presumably reports how many drafted tokens the target model
/// accepted — confirm against the native wrapper.
@ffi.Native<
ffi.Void Function(ffi.Pointer<llama_dart_ngram>, llama_seq_id, ffi.Uint16)
>()
external void llama_dart_ngram_accept(
ffi.Pointer<llama_dart_ngram> ngram,
int seq_id,
int accepted_count,
);

@ffi.Native<
ffi.Int32 Function(
ffi.Pointer<llama_sampler>,
Expand Down Expand Up @@ -10279,6 +10343,8 @@ final class mtmd_helper_video_init_params extends ffi.Struct {

/// Opaque native type with no Dart-visible fields; only usable behind an
/// `ffi.Pointer`.
///
/// NOTE(review): presumably the handle type used by the `llama_dart_mtp_*`
/// bindings — confirm.
final class llama_dart_mtp extends ffi.Opaque {}

/// Opaque native handle type passed as `ffi.Pointer<llama_dart_ngram>` to the
/// `llama_dart_ngram_*` bindings; it exposes no fields to Dart.
final class llama_dart_ngram extends ffi.Opaque {}

/// Equals 0xFFFFFFFF, the largest unsigned 32-bit value.
///
/// NOTE(review): presumably mirrors the `LLAMA_DEFAULT_SEED` macro in
/// `llama.h` — confirm.
const int LLAMA_DEFAULT_SEED = 4294967295;

/// NOTE(review): presumably the sentinel meaning "no token", mirroring
/// `LLAMA_TOKEN_NULL` in `llama.h` — confirm.
const int LLAMA_TOKEN_NULL = -1;
Expand Down
Loading
Loading