From a36adcc74fb9633a4d14204138019b25d6c0c72a Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 24 Jun 2026 15:24:55 +0000 Subject: [PATCH 1/5] eagle3: accept Eagle3LlamaForCausalLM draft checkpoints --- conversion/__init__.py | 1 + conversion/llama.py | 1 + 2 files changed, 2 insertions(+) diff --git a/conversion/__init__.py b/conversion/__init__.py index 2bce1bbd7c02..a6d2fb7c704f 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -135,6 +135,7 @@ "LlamaModel": "llama", "Eagle3DraftModel": "llama", "Eagle3Speculator": "llama", + "Eagle3LlamaForCausalLM": "llama", "LlamaForCausalLMEagle3": "llama", "LlavaForConditionalGeneration": "llama", "LlavaStableLMEpochForCausalLM": "stablelm", diff --git a/conversion/llama.py b/conversion/llama.py index a0d39472ebb1..b43cc994aa3a 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -23,6 +23,7 @@ "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", "LlamaForCausalLMEagle3", + "Eagle3LlamaForCausalLM", "Eagle3Speculator", "Eagle3DraftModel", "IQuestCoderForCausalLM", From 5e579af58657e73aa44d484168da5fa2b492fe62 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Wed, 24 Jun 2026 15:24:57 +0000 Subject: [PATCH 2/5] docs: add eagle3 speculative decoding section --- docs/speculative.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/docs/speculative.md b/docs/speculative.md index 43d181858912..912e1efa5361 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -13,6 +13,25 @@ The `llama-server` application supports several implementations of speculative d A much smaller model (called the _draft model_) generates drafts. A draft model is the most used approach in speculative decoding. +### EAGLE-3 (`draft-eagle3`) + +EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it +reaches higher acceptance than a standalone draft model of the same size. 
The draft is a one-layer transformer +trained for a specific target model; it shares the target's tokenizer and (optionally) a reduced draft vocabulary +mapped back with a `d2t` table. + +Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer +indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM` +checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3) +for `Qwen/Qwen3-4B`): + +```bash +python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \ + --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf + +llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3 +``` + ### n-gram Cache (`ngram-cache`) An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences. @@ -108,7 +127,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] +--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] comma-separated list of types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) From 69a3498ae1f6c94c87d68800dad5815757b7dd2f Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 25 Jun 2026 10:16:06 +0000 Subject: [PATCH 3/5] docs: address eagle3 review comments --- docs/speculative.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/speculative.md b/docs/speculative.md index 912e1efa5361..5abd0d1d7f95 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -17,8 +17,8 @@ A draft model is the most used approach in speculative decoding. 
EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer -trained for a specific target model; it shares the target's tokenizer and (optionally) a reduced draft vocabulary -mapped back with a `d2t` table. +trained for a specific target model; it shares the target model's tokenizer and, optionally, uses a reduced draft +vocabulary with its own `lm_head`, which is mapped back using a `d2t` table. Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM` @@ -32,6 +32,18 @@ python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \ llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3 ``` +Supported EAGLE-3 draft models include: + +- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B) +- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B) +- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3) +- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3) +- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3) +- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3) +- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3) +- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3) +- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3) + ### n-gram Cache (`ngram-cache`) An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences. 
@@ -266,6 +278,7 @@ Specifies a comma-separated list of speculative decoding types to use. |------|-------------| | `none` | No speculative decoding (default) | | `draft-simple` | Use a simple draft model for speculation | +| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states | | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | From e97371696e16eaf9e1d580eaeaf84884f65dffd8 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 25 Jun 2026 10:19:03 +0000 Subject: [PATCH 4/5] docs: add more angelslim eagle3 models --- docs/speculative.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/speculative.md b/docs/speculative.md index 5abd0d1d7f95..5e226d8ed12f 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -40,9 +40,12 @@ Supported EAGLE-3 draft models include: - [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3) - [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3) - [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3) +- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3) +- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3) - [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3) - [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3) - [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3) +- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3) ### n-gram Cache (`ngram-cache`) From 77b457baeedc6c8bcc68c1d4f4cf41dfc63a73bc Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Thu, 25 Jun 2026 12:24:26 +0000 Subject: [PATCH 5/5] docs: add gpt-oss eagle3 models and link to pr 18039 --- docs/speculative.md | 5 +++++ 1 file 
changed, 5 insertions(+) diff --git a/docs/speculative.md b/docs/speculative.md index 5e226d8ed12f..8f91256c4a4d 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -46,6 +46,11 @@ Supported EAGLE-3 draft models include: - [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3) - [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3) - [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3) +- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3) +- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16) +- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context) + +For the full and up-to-date list of supported models, see [#18039](https://github.com/ggml-org/llama.cpp/pull/18039). ### n-gram Cache (`ngram-cache`)