From a36adcc74fb9633a4d14204138019b25d6c0c72a Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 24 Jun 2026 15:24:55 +0000
Subject: [PATCH 1/5] eagle3: accept Eagle3LlamaForCausalLM draft checkpoints

---
 conversion/__init__.py | 1 +
 conversion/llama.py    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2bce1bbd7c02..a6d2fb7c704f 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -135,6 +135,7 @@
     "LlamaModel": "llama",
     "Eagle3DraftModel": "llama",
     "Eagle3Speculator": "llama",
+    "Eagle3LlamaForCausalLM": "llama",
     "LlamaForCausalLMEagle3": "llama",
     "LlavaForConditionalGeneration": "llama",
     "LlavaStableLMEpochForCausalLM": "stablelm",
diff --git a/conversion/llama.py b/conversion/llama.py
index a0d39472ebb1..b43cc994aa3a 100644
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -23,6 +23,7 @@
     "LlavaForConditionalGeneration",
     "VoxtralForConditionalGeneration",
     "LlamaForCausalLMEagle3",
+    "Eagle3LlamaForCausalLM",
     "Eagle3Speculator",
     "Eagle3DraftModel",
     "IQuestCoderForCausalLM",

From 5e579af58657e73aa44d484168da5fa2b492fe62 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Wed, 24 Jun 2026 15:24:57 +0000
Subject: [PATCH 2/5] docs: add eagle3 speculative decoding section

---
 docs/speculative.md | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/docs/speculative.md b/docs/speculative.md
index 43d181858912..912e1efa5361 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -13,6 +13,25 @@ The `llama-server` application supports several implementations of speculative d
 A much smaller model (called the _draft model_) generates drafts.
 A draft model is the most used approach in speculative decoding.
 
+### EAGLE-3 (`draft-eagle3`)
+
+EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it
+reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer
+trained for a specific target model; it shares the target's tokenizer and (optionally) a reduced draft vocabulary
+mapped back with a `d2t` table.
+
+Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer
+indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM`
+checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
+for `Qwen/Qwen3-4B`):
+
+```bash
+python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \
+    --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf
+
+llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3
+```
+
 ### n-gram Cache (`ngram-cache`)
 
 An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
@@ -108,7 +127,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha
 ### General Speculative Parameters
 
 ```
---spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
+--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                         comma-separated list of types of speculative decoding to use
                                         (default: none)
                                         (env: LLAMA_ARG_SPEC_TYPE)

From 69a3498ae1f6c94c87d68800dad5815757b7dd2f Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 25 Jun 2026 10:16:06 +0000
Subject: [PATCH 3/5] docs: address eagle3 review comments

---
 docs/speculative.md | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/docs/speculative.md b/docs/speculative.md
index 912e1efa5361..5abd0d1d7f95 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -17,8 +17,8 @@ A draft model is the most used approach in speculative decoding.
 
 EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it
 reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer
-trained for a specific target model; it shares the target's tokenizer and (optionally) a reduced draft vocabulary
-mapped back with a `d2t` table.
+trained for a specific target model; it shares the target model's tokenizer and can optionally use a reduced draft
+vocabulary (with its own `lm_head`) that is mapped back to the target vocabulary using a `d2t` table.
 
 Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer
 indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM`
@@ -32,6 +32,18 @@ python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \
 llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3
 ```
 
+Supported EAGLE-3 draft models include:
+
+- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B)
+- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B)
+- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3)
+- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3)
+- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3)
+- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3)
+- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3)
+- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3)
+- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3)
+
 ### n-gram Cache (`ngram-cache`)
 
 An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences.
@@ -266,6 +278,7 @@ Specifies a comma-separated list of speculative decoding types to use.
 |------|-------------|
 | `none` | No speculative decoding (default) |
 | `draft-simple` | Use a simple draft model for speculation |
+| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states |
 | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model |
 | `ngram-cache` | Use n-gram cache lookup |
 | `ngram-simple` | Use simple n-gram pattern matching |

From e97371696e16eaf9e1d580eaeaf84884f65dffd8 Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 25 Jun 2026 10:19:03 +0000
Subject: [PATCH 4/5] docs: add more angelslim eagle3 models

---
 docs/speculative.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/speculative.md b/docs/speculative.md
index 5abd0d1d7f95..5e226d8ed12f 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -40,9 +40,12 @@ Supported EAGLE-3 draft models include:
 - [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3)
 - [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3)
 - [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3)
+- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3)
+- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3)
 - [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3)
 - [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3)
 - [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3)
+- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3)
 
 ### n-gram Cache (`ngram-cache`)
 

From 77b457baeedc6c8bcc68c1d4f4cf41dfc63a73bc Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Thu, 25 Jun 2026 12:24:26 +0000
Subject: [PATCH 5/5] docs: add gpt-oss eagle3 models and link to pr 18039

---
 docs/speculative.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/speculative.md b/docs/speculative.md
index 5e226d8ed12f..8f91256c4a4d 100644
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -46,6 +46,11 @@ Supported EAGLE-3 draft models include:
 - [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3)
 - [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3)
 - [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3)
+- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3)
+- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16)
+- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context)
+
+For the full and up-to-date list of supported models, see [#18039](https://github.com/ggml-org/llama.cpp/pull/18039).
 
 ### n-gram Cache (`ngram-cache`)