From 0c1c14d5dadb29810bbdc5cfae782704c25a01f8 Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Wed, 27 May 2026 15:15:50 +1000
Subject: [PATCH 1/9] Enable package-declared draft speculation

---
 crates/mesh-llm-cli/src/models.rs             |  15 +
 crates/mesh-llm-cli/src/parser.rs             |  20 +
 crates/mesh-llm-commands/src/model_package.rs |  15 +
 .../src/inference/skippy/mod.rs               |   1 +
 .../src/inference/skippy/package.rs           |   4 +
 .../inference/skippy/resolver/speculative.rs  |   6 +
 .../src/inference/skippy/resolver/tests.rs    |   1 +
 .../inference/skippy/resolver/translation.rs  |   2 +-
 .../src/inference/skippy/resolver/types.rs    |   2 +
 .../src/plugin/config.rs                      |   5 +-
 .../src/runtime/local.rs                      | 129 ++++++-
 .../src/runtime/split_planning.rs             |   1 +
 crates/mesh-llm/src/commands/models/mod.rs    |  10 +
 crates/model-package/src/prepare.rs           | 127 +++++-
 crates/model-package/src/script.rs            |   4 +
 .../src/scripts/split-model-job.sh            |  18 +-
 crates/skippy-model-package/README.md         |  12 +
 crates/skippy-model-package/src/main.rs       | 188 ++++++++-
 crates/skippy-runtime/src/package.rs          | 197 ++++++++++
 docs/LAYER_PACKAGE_REPOS.md                   |   8 +
 docs/MTP.md                                   | 362 ++++++++++++++++++
 docs/USAGE.md                                 |   3 +-
 docs/specs/layer-package-repos.md             |  61 +++
 23 files changed, 1178 insertions(+), 13 deletions(-)
 create mode 100644 docs/MTP.md
diff --git a/crates/mesh-llm-cli/src/models.rs b/crates/mesh-llm-cli/src/models.rs
index 95fbe8a2c..8895cffa0 100644
--- a/crates/mesh-llm-cli/src/models.rs
+++ b/crates/mesh-llm-cli/src/models.rs
@@ -30,6 +30,21 @@ pub enum ModelsCommand {
         /// Override model ID in the manifest.
         #[arg(long)]
         model_id: Option<String>,
+        /// Draft model ref to declare for package default speculative decoding.
+        #[arg(long = "spec-draft-model")]
+        spec_draft_model: Option<String>,
+        /// Package speculative strategy name.
+        #[arg(long = "spec-strategy", default_value = "draft")]
+        spec_strategy: String,
+        /// Initial adaptive speculative decode window for the package strategy.
+        #[arg(long = "spec-initial-window", default_value_t = 16)]
+        spec_initial_window: u32,
+        /// Minimum adaptive speculative decode window for the package strategy.
+        #[arg(long = "spec-min-window", default_value_t = 2)]
+        spec_min_window: u32,
+        /// Maximum adaptive speculative decode window for the package strategy.
+        #[arg(long = "spec-max-window", default_value_t = 16)]
+        spec_max_window: u32,
         /// HF Job hardware flavor. Use auto for the default CPU splitter baseline.
         #[arg(long, default_value = "auto")]
         flavor: String,
diff --git a/crates/mesh-llm-cli/src/parser.rs b/crates/mesh-llm-cli/src/parser.rs
index 350f20e8e..f30b3d510 100644
--- a/crates/mesh-llm-cli/src/parser.rs
+++ b/crates/mesh-llm-cli/src/parser.rs
@@ -859,6 +859,26 @@ pub enum Command {
         #[arg(long)]
         model_id: Option<String>,
 
+        /// Draft model ref to declare for package default speculative decoding.
+        #[arg(long = "spec-draft-model")]
+        spec_draft_model: Option<String>,
+
+        /// Package speculative strategy name.
+        #[arg(long = "spec-strategy", default_value = "draft")]
+        spec_strategy: String,
+
+        /// Initial adaptive speculative decode window for the package strategy.
+        #[arg(long = "spec-initial-window", default_value_t = 16)]
+        spec_initial_window: u32,
+
+        /// Minimum adaptive speculative decode window for the package strategy.
+        #[arg(long = "spec-min-window", default_value_t = 2)]
+        spec_min_window: u32,
+
+        /// Maximum adaptive speculative decode window for the package strategy.
+        #[arg(long = "spec-max-window", default_value_t = 16)]
+        spec_max_window: u32,
+
         /// HF Job hardware flavor. Use auto for the default CPU splitter baseline.
         #[arg(long, default_value = "auto")]
         flavor: String,
diff --git a/crates/mesh-llm-commands/src/model_package.rs b/crates/mesh-llm-commands/src/model_package.rs
index 15c52163c..8377cb429 100644
--- a/crates/mesh-llm-commands/src/model_package.rs
+++ b/crates/mesh-llm-commands/src/model_package.rs
@@ -13,6 +13,11 @@ pub struct ModelPrepareArgs<'a> {
     pub quant: Option<&'a str>,
     pub target: Option<&'a str>,
     pub model_id: Option<&'a str>,
+    pub spec_draft_model: Option<&'a str>,
+    pub spec_strategy: &'a str,
+    pub spec_initial_window: u32,
+    pub spec_min_window: u32,
+    pub spec_max_window: u32,
     pub flavor: &'a str,
     pub timeout: &'a str,
     pub mesh_llm_ref: &'a str,
@@ -34,6 +39,11 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> {
         quant,
         target,
         model_id,
+        spec_draft_model,
+        spec_strategy,
+        spec_initial_window,
+        spec_min_window,
+        spec_max_window,
         flavor,
         timeout,
         mesh_llm_ref,
@@ -119,6 +129,11 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> {
         quant: source_quant.map(|s| s.to_string()),
         target: target.map(|s| s.to_string()),
         model_id: model_id.map(|s| s.to_string()),
+        spec_draft_model: spec_draft_model.map(|s| s.to_string()),
+        spec_strategy: spec_strategy.to_string(),
+        spec_initial_window,
+        spec_min_window,
+        spec_max_window,
         flavor: flavor.to_string(),
         timeout_seconds,
         mesh_llm_ref: mesh_llm_ref.to_string(),
diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs
index 04cfb4e55..c9fbb6b0c 100644
--- a/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs
+++ b/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs
@@ -1017,6 +1017,7 @@ mod tests {
             layer_count,
             activation_width: 4096,
             tensor_count: 100,
+            speculative_decoding: None,
         }
     }
 
diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/package.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/package.rs
index 94d338f11..de6c94d77 100644
--- a/crates/mesh-llm-host-runtime/src/inference/skippy/package.rs
+++ b/crates/mesh-llm-host-runtime/src/inference/skippy/package.rs
@@ -7,6 +7,7 @@ use std::{
 use anyhow::{Context, Result};
 use serde::Serialize;
 use sha2::{Digest, Sha256};
+use skippy_runtime::package::SpeculativeDecodingConfig;
 
 #[derive(Clone, Debug, Eq, PartialEq)]
 pub struct SkippyPackageIdentity {
@@ -19,6 +20,7 @@ pub struct SkippyPackageIdentity {
     pub layer_count: u32,
     pub activation_width: u32,
     pub tensor_count: u64,
+    pub speculative_decoding: Option<SpeculativeDecodingConfig>,
 }
 
 #[derive(Clone, Debug, Eq, PartialEq, Serialize)]
@@ -109,6 +111,7 @@ pub fn synthetic_direct_gguf_package(
         layer_count: compact.layer_count,
         activation_width: compact.embedding_size,
         tensor_count,
+        speculative_decoding: None,
     })
 }
 
@@ -328,6 +331,7 @@ pub fn identity_from_layer_package(package_ref: &str) -> Result<SkippyPackageIde
         layer_count: info.layer_count,
         activation_width,
         tensor_count: info.layers.iter().map(|l| l.tensor_count as u64).sum(),
+        speculative_decoding: info.speculative_decoding,
     })
 }
 
diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/speculative.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/speculative.rs
index f9f2fd939..0834e28a0 100644
--- a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/speculative.rs
+++ b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/speculative.rs
@@ -95,6 +95,10 @@ pub(super) fn resolve_speculative_config(
     }
 
     let mut mode = mode;
+    let package_strategy = pick_owned(
+        model_config.and_then(|config| config.package_strategy.clone()),
+        global_config.and_then(|config| config.package_strategy.clone()),
+    );
     let mut draft_model_path = pick_owned(
         model_config.and_then(|config| config.draft_model_path.clone()),
         global_config.and_then(|config| config.draft_model_path.clone()),
@@ -147,9 +151,11 @@ pub(super) fn resolve_speculative_config(
     }
     Ok(ResolvedSpeculativeConfig {
         mode,
+        package_strategy,
         draft_model_path,
         pairing_fault,
         draft_max_tokens,
+        adaptive_window: false,
         explicit,
         draft_n_gpu_layers,
     })
diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/tests.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/tests.rs
index d6bbea83a..9e25b0f78 100644
--- a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/tests.rs
+++ b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/tests.rs
@@ -28,6 +28,7 @@ fn fake_package_identity(layer_count: u32) -> SkippyPackageIdentity {
         layer_count,
         activation_width: 4096,
         tensor_count: 100,
+        speculative_decoding: None,
     }
 }
 
diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/translation.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/translation.rs
index d963f2fae..bc788d795 100644
--- a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/translation.rs
+++ b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/translation.rs
@@ -234,7 +234,7 @@ impl ResolvedSkippyConfig {
             } else {
                 0
             },
-            adaptive_speculative_window: false,
+            adaptive_speculative_window: mode == "draft" && self.speculative.adaptive_window,
             draft_n_gpu_layers: if mode == "draft" {
                 self.speculative.draft_n_gpu_layers
             } else {
diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/types.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/types.rs
index 6e59fbb1c..98688d8c9 100644
--- a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/types.rs
+++ b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/types.rs
@@ -88,9 +88,11 @@ pub(crate) struct ResolvedSkippyExecutionConfig {
 #[derive(Clone, Debug, PartialEq)]
 pub(crate) struct ResolvedSpeculativeConfig {
     pub(crate) mode: String,
+    pub(crate) package_strategy: Option<String>,
     pub(crate) draft_model_path: Option<PathBuf>,
     pub(crate) pairing_fault: String,
     pub(crate) draft_max_tokens: u32,
+    pub(crate) adaptive_window: bool,
     pub(crate) explicit: bool,
     pub(crate) draft_n_gpu_layers: Option<i32>,
 }
diff --git a/crates/mesh-llm-host-runtime/src/plugin/config.rs b/crates/mesh-llm-host-runtime/src/plugin/config.rs
index ee85080cc..19a1cbf37 100644
--- a/crates/mesh-llm-host-runtime/src/plugin/config.rs
+++ b/crates/mesh-llm-host-runtime/src/plugin/config.rs
@@ -16,8 +16,9 @@ pub use mesh_llm_config::{
     MultimodalConfig, OwnerControlConfig, PluginConfigEditor, PluginConfigEntry,
     PluginStartupConfig, PrefixCacheConfig, ReasoningBudget, ReasoningEnabled,
     RequestDefaultsConfig, ReservedObjectConfig, SkippyConfig, SpeculativeConfig,
-    StringOrStringList, TelemetryConfig, TelemetryMetricsConfig, TensorSplitConfig,
-    ThroughputConfig, config_path, config_to_toml, load_config, parse_config_toml, validate_config,
+    SpeculativeConfigEditor, StringOrStringList, TelemetryConfig, TelemetryMetricsConfig,
+    TensorSplitConfig, ThroughputConfig, config_path, config_to_toml, load_config,
+    parse_config_toml, validate_config,
 };
 use mesh_llm_plugin::MeshVisibility;
 use std::collections::BTreeMap;
diff --git a/crates/mesh-llm-host-runtime/src/runtime/local.rs b/crates/mesh-llm-host-runtime/src/runtime/local.rs
index 70ce51505..d96f6e8b0 100644
--- a/crates/mesh-llm-host-runtime/src/runtime/local.rs
+++ b/crates/mesh-llm-host-runtime/src/runtime/local.rs
@@ -1205,7 +1205,7 @@ async fn load_split_runtime_generation_inner(
     spec: &SplitGenerationLoadSpec<'_>,
     cleanup_on_error: &mut bool,
 ) -> Result<SplitRuntimeGenerationHandle> {
-    let settings = split_generation_load_settings(spec)?;
+    let settings = split_generation_load_settings(spec).await?;
     anyhow::ensure!(
         settings.stage0.node_id == spec.node.id(),
         "split topology stage 0 moved to {}; local coordinator is {}",
@@ -1512,7 +1512,7 @@ fn split_runtime_stage_upstream(
     })
 }
 
-fn split_generation_load_settings<'a>(
+async fn split_generation_load_settings<'a>(
     spec: &'a SplitGenerationLoadSpec<'_>,
 ) -> Result<SplitGenerationLoadSettings<'a>> {
     let stage0 = spec
@@ -1554,6 +1554,7 @@ fn split_generation_load_settings<'a>(
     if let Some(gpu) = spec.pinned_gpu {
         resolved.hardware.device = Some(gpu.backend_device.clone());
     }
+    apply_package_speculative_defaults(&mut resolved, spec.package).await?;
     let embedded_openai = resolved.to_embedded_openai_args(activation_width, true)?;
     let runtime_options = resolved.to_embedded_runtime_options(
         &spec.skippy_telemetry,
@@ -1576,6 +1577,97 @@ fn split_generation_load_settings<'a>(
     })
 }
 
+async fn apply_package_speculative_defaults(
+    resolved: &mut skippy::ResolvedSkippyConfig,
+    package: &skippy::SkippyPackageIdentity,
+) -> Result<()> {
+    if resolved.speculative.explicit || resolved.speculative.mode == "draft" {
+        return Ok(());
+    }
+    let Some(config) = package.speculative_decoding.as_ref() else {
+        return Ok(());
+    };
+    let Some(strategy_name) = selected_package_speculative_strategy(config, resolved) else {
+        return Ok(());
+    };
+    let Some(strategy) = config.strategies.get(&strategy_name) else {
+        anyhow::bail!(
+            "speculative decoding strategy '{strategy_name}' is not declared by package {}",
+            package.package_ref
+        );
+    };
+    if strategy.strategy_type != "draft-model" {
+        tracing::warn!(
+            package_ref = package.package_ref,
+            strategy = strategy_name,
+            strategy_type = strategy.strategy_type,
+            "package speculative strategy is not supported by the embedded Skippy runtime"
+        );
+        return Ok(());
+    }
+    let draft_ref = strategy
+        .draft_model
+        .as_deref()
+        .context("draft-model speculative strategy is missing draft_model")?;
+    let draft_path = match models::download_model_ref_with_progress_details(draft_ref, true).await {
+        Ok((path, _)) => path,
+        Err(error) => {
+            tracing::warn!(
+                package_ref = package.package_ref,
+                strategy = strategy_name,
+                draft_ref,
+                error = %error,
+                "package draft model could not be resolved; starting without speculative decoding"
+            );
+            return Ok(());
+        }
+    };
+    let window = speculative_strategy_window(strategy)?;
+    let adaptive = strategy
+        .window_policy
+        .as_ref()
+        .and_then(|policy| policy.default.as_deref())
+        == Some("adaptive");
+    resolved.speculative.mode = "draft".to_string();
+    resolved.speculative.draft_model_path = Some(draft_path.clone());
+    resolved.speculative.draft_max_tokens = window;
+    resolved.speculative.adaptive_window = adaptive;
+    resolved.speculative.draft_n_gpu_layers = None;
+    tracing::info!(
+        package_ref = package.package_ref,
+        strategy = strategy_name,
+        draft_ref,
+        draft_path = %draft_path.display(),
+        window,
+        adaptive,
+        "enabled package-declared draft speculative decoding"
+    );
+    Ok(())
+}
+
+fn selected_package_speculative_strategy(
+    config: &skippy_runtime::package::SpeculativeDecodingConfig,
+    resolved: &skippy::ResolvedSkippyConfig,
+) -> Option<String> {
+    match resolved.speculative.package_strategy.as_deref() {
+        Some("default") | None => config.default.clone(),
+        Some(strategy) => Some(strategy.to_string()),
+    }
+}
+
+fn speculative_strategy_window(
+    strategy: &skippy_runtime::package::SpeculativeStrategyConfig,
+) -> Result<u32> {
+    let Some(policy) = strategy.window_policy.as_ref() else {
+        return Ok(0);
+    };
+    Ok(policy
+        .initial_window
+        .or(policy.fixed_default_window)
+        .or(policy.max_window)
+        .unwrap_or(0))
+}
+
 fn split_generation_load_mode(package: &skippy::SkippyPackageIdentity) -> LoadMode {
     if skippy::is_layer_package_ref(&package.package_ref) {
         LoadMode::LayerPackage
@@ -3629,7 +3721,7 @@ async fn start_runtime_layer_package_model(
 )> {
     let context_length = plan.context_length;
     let fallback_projector_path = mmproj_path_for_model(&model_name).filter(|path| path.exists());
-    let resolved = resolve_runtime_skippy_config(
+    let mut resolved = resolve_runtime_skippy_config(
         &spec,
         &model_name,
         package.source_model_bytes,
@@ -3637,6 +3729,7 @@ async fn start_runtime_layer_package_model(
         plan.slots,
         fallback_projector_path,
     )?;
+    apply_package_speculative_defaults(&mut resolved, &package).await?;
     tracing::info!(
         model = model_name,
         "KV cache: {} K + {} V, {}K context",
@@ -3809,9 +3902,34 @@ mod tests {
             layer_count,
             activation_width: 2048,
             tensor_count: 100,
+            speculative_decoding: None,
+        }
+    }
+
+    fn draft_spec_strategy() -> skippy_runtime::package::SpeculativeStrategyConfig {
+        skippy_runtime::package::SpeculativeStrategyConfig {
+            strategy_type: "draft-model".to_string(),
+            draft_model: Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abc123".to_string()),
+            model_family: None,
+            window_policy: Some(skippy_runtime::package::SpeculativeWindowPolicyConfig {
+                default: Some("adaptive".to_string()),
+                fixed_default_window: None,
+                initial_window: Some(16),
+                min_window: Some(2),
+                max_window: Some(16),
+                adaptive: None,
+            }),
+            identity: None,
         }
     }
 
+    #[test]
+    fn package_speculative_window_uses_adaptive_initial_window() {
+        let strategy = draft_spec_strategy();
+
+        assert_eq!(speculative_strategy_window(&strategy).unwrap(), 16);
+    }
+
     fn stage_load_request(load_mode: LoadMode) -> skippy::StageLoadRequest {
         skippy::StageLoadRequest {
             topology_id: "topology-a".to_string(),
@@ -4177,8 +4295,9 @@ stop = ["END"]
             skippy_telemetry: skippy::SkippyTelemetryOptions::off(),
             survey_telemetry: survey::SurveyTelemetry::disabled(),
         };
-        let settings =
-            split_generation_load_settings(&spec).expect("split settings should resolve");
+        let settings = split_generation_load_settings(&spec)
+            .await
+            .expect("split settings should resolve");
 
         assert_eq!(settings.load_mode, LoadMode::LayerPackage);
         assert_eq!(settings.activation_width, 2048);
diff --git a/crates/mesh-llm-host-runtime/src/runtime/split_planning.rs b/crates/mesh-llm-host-runtime/src/runtime/split_planning.rs
index eb86476cf..192fe65a3 100644
--- a/crates/mesh-llm-host-runtime/src/runtime/split_planning.rs
+++ b/crates/mesh-llm-host-runtime/src/runtime/split_planning.rs
@@ -551,6 +551,7 @@ mod tests {
             layer_count,
             activation_width: 896,
             tensor_count: 100,
+            speculative_decoding: None,
         }
     }
 
diff --git a/crates/mesh-llm/src/commands/models/mod.rs b/crates/mesh-llm/src/commands/models/mod.rs
index e8ad7a9e0..1d293d1c9 100644
--- a/crates/mesh-llm/src/commands/models/mod.rs
+++ b/crates/mesh-llm/src/commands/models/mod.rs
@@ -462,6 +462,11 @@ pub async fn dispatch_models_command(command: &ModelsCommand) -> Result<()> {
             quant,
             target,
             model_id,
+            spec_draft_model,
+            spec_strategy,
+            spec_initial_window,
+            spec_min_window,
+            spec_max_window,
             flavor,
             timeout,
             mesh_llm_ref,
@@ -481,6 +486,11 @@ pub async fn dispatch_models_command(command: &ModelsCommand) -> Result<()> {
                     quant: quant.as_deref(),
                     target: target.as_deref(),
                     model_id: model_id.as_deref(),
+                    spec_draft_model: spec_draft_model.as_deref(),
+                    spec_strategy,
+                    spec_initial_window: *spec_initial_window,
+                    spec_min_window: *spec_min_window,
+                    spec_max_window: *spec_max_window,
                     flavor,
                     timeout,
                     mesh_llm_ref,
diff --git a/crates/model-package/src/prepare.rs b/crates/model-package/src/prepare.rs
index 982936485..847865fe6 100644
--- a/crates/model-package/src/prepare.rs
+++ b/crates/model-package/src/prepare.rs
@@ -1,6 +1,6 @@
 use std::collections::HashMap;
 
-use anyhow::{Context, Result};
+use anyhow::{Context, Result, ensure};
 use futures::StreamExt;
 use hf_hub::HFClient;
 use hf_hub::repository::RepoTreeEntry;
@@ -19,6 +19,11 @@ pub struct PrepareParams {
     pub quant: Option<String>,
     pub target: Option<String>,
     pub model_id: Option<String>,
+    pub spec_draft_model: Option<String>,
+    pub spec_strategy: String,
+    pub spec_initial_window: u32,
+    pub spec_min_window: u32,
+    pub spec_max_window: u32,
     pub flavor: String,
     pub timeout_seconds: u64,
     pub mesh_llm_ref: String,
@@ -117,6 +122,8 @@ pub async fn resolve(
     params: PrepareParams,
     permissions: &PermissionCheck,
 ) -> Result<PrepareJob> {
+    validate_speculative_params(&params)?;
+
     let quant = params
         .quant
         .as_deref()
@@ -193,6 +200,16 @@ pub async fn resolve(
     environment.insert("MODEL_ID".into(), model_id.clone());
     environment.insert("SOURCE_REVISION".into(), "main".into());
     environment.insert("MESH_LLM_REF".into(), params.mesh_llm_ref.clone());
+    if let Some(spec_draft_model) = params.spec_draft_model.as_deref() {
+        environment.insert("SPEC_DRAFT_MODEL".into(), spec_draft_model.to_string());
+        environment.insert("SPEC_STRATEGY".into(), params.spec_strategy.clone());
+        environment.insert(
+            "SPEC_INITIAL_WINDOW".into(),
+            params.spec_initial_window.to_string(),
+        );
+        environment.insert("SPEC_MIN_WINDOW".into(), params.spec_min_window.to_string());
+        environment.insert("SPEC_MAX_WINDOW".into(), params.spec_max_window.to_string());
+    }
     environment.insert(
         "CATALOG_CREATE_PR".into(),
         if permissions.catalog_create_pr {
@@ -248,6 +265,41 @@ pub async fn resolve(
     })
 }
 
+fn validate_speculative_params(params: &PrepareParams) -> Result<()> {
+    let Some(draft_model) = params.spec_draft_model.as_deref() else {
+        return Ok(());
+    };
+    ensure!(
+        !draft_model.trim().is_empty(),
+        "--spec-draft-model must not be empty when set"
+    );
+    ensure!(
+        !params.spec_strategy.trim().is_empty(),
+        "--spec-strategy must not be empty"
+    );
+    ensure!(
+        params.spec_min_window > 0,
+        "--spec-min-window must be greater than zero"
+    );
+    ensure!(
+        params.spec_initial_window > 0,
+        "--spec-initial-window must be greater than zero"
+    );
+    ensure!(
+        params.spec_max_window > 0,
+        "--spec-max-window must be greater than zero"
+    );
+    ensure!(
+        params.spec_min_window <= params.spec_initial_window,
+        "--spec-min-window must not exceed --spec-initial-window"
+    );
+    ensure!(
+        params.spec_initial_window <= params.spec_max_window,
+        "--spec-initial-window must not exceed --spec-max-window"
+    );
+    Ok(())
+}
+
 /// Format a byte count as a human-readable size.
 pub fn format_size(bytes: u64) -> String {
     const KB: u64 = 1024;
@@ -313,4 +365,77 @@ mod tests {
             .collect::<Vec<_>>();
         assert_eq!(names, vec!["Q4_K_M", "Q8_0"]);
     }
+
+    #[test]
+    fn speculative_env_is_present_only_when_draft_model_is_set() {
+        let params = PrepareParams {
+            source_repo: "unsloth/Llama-3.3-70B-Instruct-GGUF".to_string(),
+            quant: Some("Q3_K_M".to_string()),
+            target: None,
+            model_id: None,
+            spec_draft_model: Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M".to_string()),
+            spec_strategy: "llama32-1b-q4".to_string(),
+            spec_initial_window: 16,
+            spec_min_window: 2,
+            spec_max_window: 16,
+            flavor: "auto".to_string(),
+            timeout_seconds: 3600,
+            mesh_llm_ref: "main".to_string(),
+            hf_token: None,
+        };
+
+        let mut environment = HashMap::new();
+        if let Some(spec_draft_model) = params.spec_draft_model.as_deref() {
+            environment.insert("SPEC_DRAFT_MODEL".to_string(), spec_draft_model.to_string());
+            environment.insert("SPEC_STRATEGY".to_string(), params.spec_strategy.clone());
+            environment.insert(
+                "SPEC_INITIAL_WINDOW".to_string(),
+                params.spec_initial_window.to_string(),
+            );
+            environment.insert(
+                "SPEC_MIN_WINDOW".to_string(),
+                params.spec_min_window.to_string(),
+            );
+            environment.insert(
+                "SPEC_MAX_WINDOW".to_string(),
+                params.spec_max_window.to_string(),
+            );
+        }
+
+        assert_eq!(
+            environment.get("SPEC_DRAFT_MODEL").map(String::as_str),
+            Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M")
+        );
+        assert_eq!(
+            environment.get("SPEC_STRATEGY").map(String::as_str),
+            Some("llama32-1b-q4")
+        );
+        assert_eq!(
+            environment.get("SPEC_INITIAL_WINDOW").map(String::as_str),
+            Some("16")
+        );
+        validate_speculative_params(&params).unwrap();
+    }
+
+    #[test]
+    fn speculative_window_validation_rejects_inverted_range() {
+        let params = PrepareParams {
+            source_repo: "unsloth/Llama-3.3-70B-Instruct-GGUF".to_string(),
+            quant: Some("Q3_K_M".to_string()),
+            target: None,
+            model_id: None,
+            spec_draft_model: Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M".to_string()),
+            spec_strategy: "draft".to_string(),
+            spec_initial_window: 1,
+            spec_min_window: 2,
+            spec_max_window: 16,
+            flavor: "auto".to_string(),
+            timeout_seconds: 3600,
+            mesh_llm_ref: "main".to_string(),
+            hf_token: None,
+        };
+
+        let err = validate_speculative_params(&params).unwrap_err();
+        assert!(err.to_string().contains("--spec-min-window"));
+    }
 }
diff --git a/crates/model-package/src/script.rs b/crates/model-package/src/script.rs
index 6a5ae549e..ec1fa6297 100644
--- a/crates/model-package/src/script.rs
+++ b/crates/model-package/src/script.rs
@@ -196,6 +196,10 @@ mod tests {
         assert!(EMBEDDED_SCRIPT.contains(r#"MOUNTED_SOURCE_PATH="/source/${SOURCE_FILE}""#));
         assert!(EMBEDDED_SCRIPT.contains(r#"WRITE_PACKAGE_INPUT="$MOUNTED_SOURCE_PATH""#));
         assert!(EMBEDDED_SCRIPT.contains(r#"--source-file "$SOURCE_FILE""#));
+        assert!(EMBEDDED_SCRIPT.contains(r#"WRITE_PACKAGE_SPEC_ARGS=()"#));
+        assert!(EMBEDDED_SCRIPT.contains(r#"--spec-draft-model "$SPEC_DRAFT_MODEL""#));
+        assert!(EMBEDDED_SCRIPT.contains(r#"--spec-strategy "${SPEC_STRATEGY:-draft}""#));
+        assert!(EMBEDDED_SCRIPT.contains(r#"--spec-initial-window "${SPEC_INITIAL_WINDOW:-16}""#));
         assert!(EMBEDDED_SCRIPT.contains(r#"time "$SLICER" write-package "$WRITE_PACKAGE_INPUT""#));
         assert!(!EMBEDDED_SCRIPT.contains(r#"time $SLICER write-package "$SOURCE_PATH""#));
     }
diff --git a/crates/model-package/src/scripts/split-model-job.sh b/crates/model-package/src/scripts/split-model-job.sh
index 5a44bfd61..ef0e7a21c 100755
--- a/crates/model-package/src/scripts/split-model-job.sh
+++ b/crates/model-package/src/scripts/split-model-job.sh
@@ -6,6 +6,8 @@ set -euo pipefail
 #
 # Environment variables (set by mesh-llm model-package job spec):
 #   SOURCE_REPO, SOURCE_FILE, SOURCE_QUANT, TARGET_REPO, MODEL_ID, SOURCE_REVISION
+#   SPEC_DRAFT_MODEL, SPEC_STRATEGY, SPEC_INITIAL_WINDOW, SPEC_MIN_WINDOW,
+#   SPEC_MAX_WINDOW — optional package-declared draft-model speculation
 #   MESH_LLM_REF — git ref to build from (default: main)
 #   CATALOG_CREATE_PR — "true" to open a PR for catalog updates (non-org members)
 #   HF_TOKEN — injected as a secret by HF Jobs
@@ -32,6 +34,9 @@ echo "║  Quant:  ${SOURCE_QUANT}"
 echo "║  Target: ${TARGET_REPO}"
 echo "║  Model:  ${MODEL_ID}"
 echo "║  Build:  mesh-llm @ ${MESH_LLM_REF}"
+if [ -n "${SPEC_DRAFT_MODEL:-}" ]; then
+    echo "║  Draft:  ${SPEC_DRAFT_MODEL}"
+fi
 echo "╚══════════════════════════════════════════════════════════╝"
 echo ""
 
@@ -287,6 +292,16 @@ else
     WRITE_PACKAGE_IDENTITY_ARGS=()
     echo "  Source mount: not available; falling back to Hugging Face cache download"
 fi
+WRITE_PACKAGE_SPEC_ARGS=()
+if [ -n "${SPEC_DRAFT_MODEL:-}" ]; then
+    WRITE_PACKAGE_SPEC_ARGS=(
+        --spec-draft-model "$SPEC_DRAFT_MODEL"
+        --spec-strategy "${SPEC_STRATEGY:-draft}"
+        --spec-initial-window "${SPEC_INITIAL_WINDOW:-16}"
+        --spec-min-window "${SPEC_MIN_WINDOW:-2}"
+        --spec-max-window "${SPEC_MAX_WINDOW:-16}"
+    )
+fi
 echo "  Hugging Face cache: $HF_HUB_CACHE"
 echo "  Package workspace: $PACKAGE_DIR"
 echo "  Temporary workspace: $TMPDIR"
@@ -309,7 +324,8 @@ set +e
 time "$SLICER" write-package "$WRITE_PACKAGE_INPUT" \
     --out-dir "$PACKAGE_DIR" \
     --after-artifact-command "$ARTIFACT_UPLOAD_HOOK" \
-    "${WRITE_PACKAGE_IDENTITY_ARGS[@]}"
+    "${WRITE_PACKAGE_IDENTITY_ARGS[@]}" \
+    "${WRITE_PACKAGE_SPEC_ARGS[@]}"
 WRITE_PACKAGE_STATUS=$?
 set -e
 stop_heartbeat
diff --git a/crates/skippy-model-package/README.md b/crates/skippy-model-package/README.md
index 5590dbcde..12bff1dde 100644
--- a/crates/skippy-model-package/README.md
+++ b/crates/skippy-model-package/README.md
@@ -74,6 +74,18 @@ them into `projectors/`, fingerprints them, records them as `kind: "mmproj"` in
 checksums and sizes. Package-backed serving uses the first declared projector
 when no explicit `projector_path` is supplied by the caller.
 
+Packages can also declare a default draft-model speculative decoding strategy:
+
+```bash
+skippy-model-package write-package org/repo:Q3_K_M \
+  --out-dir model-package/ \
+  --spec-draft-model unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M \
+  --spec-strategy llama32-1b-q4 \
+  --spec-initial-window 16 \
+  --spec-min-window 2 \
+  --spec-max-window 16
+```
+
 Local paths are only accepted for package creation when the caller supplies
 explicit provenance:
 
diff --git a/crates/skippy-model-package/src/main.rs b/crates/skippy-model-package/src/main.rs
index e35047093..f9e4b2048 100644
--- a/crates/skippy-model-package/src/main.rs
+++ b/crates/skippy-model-package/src/main.rs
@@ -75,6 +75,16 @@ enum Command {
         source_revision: Option<String>,
         #[arg(long)]
         source_file: Option<String>,
+        #[arg(long = "spec-draft-model")]
+        spec_draft_model: Option<String>,
+        #[arg(long = "spec-strategy", default_value = "draft")]
+        spec_strategy: String,
+        #[arg(long = "spec-initial-window", default_value_t = 16)]
+        spec_initial_window: u32,
+        #[arg(long = "spec-min-window", default_value_t = 2)]
+        spec_min_window: u32,
+        #[arg(long = "spec-max-window", default_value_t = 16)]
+        spec_max_window: u32,
     },
     Validate {
         full: PathBuf,
@@ -180,6 +190,8 @@ struct PackageManifest {
     layer_count: u32,
     #[serde(default, skip_serializing_if = "Option::is_none")]
     activation_width: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    generation: Option<PackageGeneration>,
     shared: PackageShared,
     #[serde(default, skip_serializing_if = "Vec::is_empty")]
     projectors: Vec<PackageProjector>,
@@ -188,6 +200,67 @@ struct PackageManifest {
     created_at_unix_secs: u64,
 }
 
+#[derive(Debug, Deserialize, Serialize)]
+struct PackageGeneration {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    speculative_decoding: Option<PackageSpeculativeDecoding>,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct PackageSpeculativeDecoding {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    default: Option<String>,
+    #[serde(default)]
+    strategies: BTreeMap<String, PackageSpeculativeStrategy>,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct PackageSpeculativeStrategy {
+    #[serde(rename = "type")]
+    strategy_type: String,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    draft_model: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    model_family: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    window_policy: Option<PackageSpeculativeWindowPolicy>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    identity: Option<PackageSpeculativeIdentity>,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct PackageSpeculativeWindowPolicy {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    default: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    fixed_default_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    initial_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    min_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    max_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    adaptive: Option<PackageSpeculativeAdaptiveWindow>,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct PackageSpeculativeAdaptiveWindow {
+    supported: bool,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    initial_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    min_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    max_window: Option<u32>,
+}
+
+#[derive(Debug, Deserialize, Serialize)]
+struct PackageSpeculativeIdentity {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    source_model_sha256: Option<String>,
+}
+
 #[derive(Debug, Deserialize, Serialize)]
 struct PackageSourceModel {
     path: String,
@@ -313,6 +386,15 @@ struct ExplicitSourceIdentity {
     source_file: Option<String>,
 }
 
+#[derive(Debug)]
+struct PackageSpeculativeOptions {
+    draft_model: Option<String>,
+    strategy: String,
+    initial_window: u32,
+    min_window: u32,
+    max_window: u32,
+}
+
 #[derive(Debug)]
 struct PackageInput {
     model_path: PathBuf,
@@ -372,6 +454,11 @@ fn main() -> Result<()> {
             source_repo,
             source_revision,
             source_file,
+            spec_draft_model,
+            spec_strategy,
+            spec_initial_window,
+            spec_min_window,
+            spec_max_window,
         } => write_package(
             model,
             out_dir,
@@ -385,6 +472,13 @@ fn main() -> Result<()> {
                 source_revision,
                 source_file,
             },
+            PackageSpeculativeOptions {
+                draft_model: spec_draft_model,
+                strategy: spec_strategy,
+                initial_window: spec_initial_window,
+                min_window: spec_min_window,
+                max_window: spec_max_window,
+            },
         ),
         Command::Validate { full, slices } => validate(full, slices),
         Command::ValidatePackage { full, package } => validate_package(full, package),
@@ -475,12 +569,73 @@ fn write_stages(model: PathBuf, stages: usize, out_dir: PathBuf) -> Result<()> {
     Ok(())
 }
 
+fn package_generation(options: PackageSpeculativeOptions) -> Result<Option<PackageGeneration>> {
+    let Some(draft_model) = options.draft_model else {
+        return Ok(None);
+    };
+    ensure!(
+        !draft_model.trim().is_empty(),
+        "--spec-draft-model must not be empty when set"
+    );
+    ensure!(
+        !options.strategy.trim().is_empty(),
+        "--spec-strategy must not be empty"
+    );
+    ensure!(
+        options.min_window > 0,
+        "--spec-min-window must be greater than zero"
+    );
+    ensure!(
+        options.initial_window > 0,
+        "--spec-initial-window must be greater than zero"
+    );
+    ensure!(
+        options.max_window > 0,
+        "--spec-max-window must be greater than zero"
+    );
+    ensure!(
+        options.min_window <= options.initial_window,
+        "--spec-min-window must not exceed --spec-initial-window"
+    );
+    ensure!(
+        options.initial_window <= options.max_window,
+        "--spec-initial-window must not exceed --spec-max-window"
+    );
+
+    let mut strategies = BTreeMap::new();
+    strategies.insert(
+        options.strategy.clone(),
+        PackageSpeculativeStrategy {
+            strategy_type: "draft-model".to_string(),
+            draft_model: Some(draft_model),
+            model_family: None,
+            window_policy: Some(PackageSpeculativeWindowPolicy {
+                default: Some("adaptive".to_string()),
+                fixed_default_window: None,
+                initial_window: Some(options.initial_window),
+                min_window: Some(options.min_window),
+                max_window: Some(options.max_window),
+                adaptive: None,
+            }),
+            identity: None,
+        },
+    );
+
+    Ok(Some(PackageGeneration {
+        speculative_decoding: Some(PackageSpeculativeDecoding {
+            default: Some(options.strategy),
+            strategies,
+        }),
+    }))
+}
+
 fn write_package(
     model: String,
     out_dir: PathBuf,
     projectors: Vec<PathBuf>,
     artifact_hook: ArtifactHook,
     explicit: ExplicitSourceIdentity,
+    speculative: PackageSpeculativeOptions,
 ) -> Result<()> {
     let input = resolve_package_input(model, explicit)?;
     fs::create_dir_all(&out_dir)
@@ -594,6 +749,7 @@ fn write_package(
         format: "layer-package".to_string(),
         layer_count,
         activation_width: Some(activation_width),
+        generation: package_generation(speculative)?,
         shared: PackageShared {
             metadata,
             embeddings,
@@ -1824,8 +1980,9 @@ fn hex_lower(bytes: &[u8]) -> String {
 #[cfg(test)]
 mod tests {
     use super::{
-        ExplicitSourceIdentity, activation_width, local_artifact_files, model_distribution_id,
-        resolve_gguf_shard_paths, resolve_local_package_input,
+        ExplicitSourceIdentity, PackageSpeculativeOptions, activation_width, local_artifact_files,
+        model_distribution_id, package_generation, resolve_gguf_shard_paths,
+        resolve_local_package_input,
     };
     use std::path::{Path, PathBuf};
 
@@ -1851,6 +2008,33 @@ mod tests {
         assert!(error.to_string().contains("requires --model-id"));
     }
 
+    #[test]
+    fn package_generation_emits_default_draft_strategy() {
+        let generation = package_generation(PackageSpeculativeOptions {
+            draft_model: Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abc123".to_string()),
+            strategy: "llama32-1b-q4".to_string(),
+            initial_window: 16,
+            min_window: 2,
+            max_window: 16,
+        })
+        .unwrap()
+        .unwrap();
+
+        let speculative = generation.speculative_decoding.unwrap();
+        assert_eq!(speculative.default.as_deref(), Some("llama32-1b-q4"));
+        let strategy = speculative.strategies.get("llama32-1b-q4").unwrap();
+        assert_eq!(strategy.strategy_type, "draft-model");
+        assert_eq!(
+            strategy.draft_model.as_deref(),
+            Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abc123")
+        );
+        let policy = strategy.window_policy.as_ref().unwrap();
+        assert_eq!(policy.default.as_deref(), Some("adaptive"));
+        assert_eq!(policy.initial_window, Some(16));
+        assert_eq!(policy.min_window, Some(2));
+        assert_eq!(policy.max_window, Some(16));
+    }
+
     #[test]
     fn local_package_input_uses_explicit_coordinate_identity() {
         let input = resolve_local_package_input(
diff --git a/crates/skippy-runtime/src/package.rs b/crates/skippy-runtime/src/package.rs
index dfa590708..3b8e7cf37 100644
--- a/crates/skippy-runtime/src/package.rs
+++ b/crates/skippy-runtime/src/package.rs
@@ -55,6 +55,63 @@ pub struct LayerPackageInfo {
     pub activation_width: Option<u32>,
     pub projectors: Vec<PackageProjectorInfo>,
     pub layers: Vec<LayerPackageLayerInfo>,
+    pub generation: Option<serde_json::Value>,
+    pub speculative_decoding: Option<SpeculativeDecodingConfig>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
+pub struct SpeculativeDecodingConfig {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub default: Option<String>,
+    #[serde(default)]
+    pub strategies: BTreeMap<String, SpeculativeStrategyConfig>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
+pub struct SpeculativeStrategyConfig {
+    #[serde(rename = "type")]
+    pub strategy_type: String,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub draft_model: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub model_family: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub window_policy: Option<SpeculativeWindowPolicyConfig>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub identity: Option<SpeculativeIdentityConfig>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
+pub struct SpeculativeWindowPolicyConfig {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub default: Option<String>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub fixed_default_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub initial_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub min_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub max_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub adaptive: Option<SpeculativeAdaptiveWindowConfig>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
+pub struct SpeculativeAdaptiveWindowConfig {
+    pub supported: bool,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub initial_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub min_window: Option<u32>,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub max_window: Option<u32>,
+}
+
+#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
+pub struct SpeculativeIdentityConfig {
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub source_model_sha256: Option<String>,
 }
 
 #[derive(Debug, Clone)]
@@ -139,6 +196,8 @@ struct PackageManifest {
     layer_count: u32,
     #[serde(default)]
     activation_width: Option<u32>,
+    #[serde(default)]
+    generation: Option<serde_json::Value>,
     shared: PackageShared,
     #[serde(default)]
     projectors: Vec<PackageProjector>,
@@ -442,6 +501,12 @@ pub fn inspect_layer_package(package_ref: &str) -> Result<LayerPackageInfo> {
         Some(width) => Some(width),
         None => infer_activation_width_from_layers(&package_dir, &manifest.layers)?,
     };
+    let generation = manifest.generation;
+    let speculative_decoding = generation
+        .as_ref()
+        .and_then(|value| value.get("speculative_decoding"))
+        .map(parse_speculative_decoding)
+        .transpose()?;
 
     Ok(LayerPackageInfo {
         package_dir,
@@ -476,9 +541,110 @@ pub fn inspect_layer_package(package_ref: &str) -> Result<LayerPackageInfo> {
                 artifact_bytes: layer.artifact_bytes,
             })
             .collect(),
+        generation,
+        speculative_decoding,
     })
 }
 
+fn parse_speculative_decoding(value: &serde_json::Value) -> Result<SpeculativeDecodingConfig> {
+    let config: SpeculativeDecodingConfig = serde_json::from_value(value.clone())?;
+    validate_speculative_decoding(&config)?;
+    Ok(config)
+}
+
+fn validate_speculative_decoding(config: &SpeculativeDecodingConfig) -> Result<()> {
+    if let Some(default) = config.default.as_deref()
+        && !config.strategies.contains_key(default)
+    {
+        bail!("speculative_decoding.default references missing strategy '{default}'");
+    }
+    for (name, strategy) in &config.strategies {
+        if name.trim().is_empty() {
+            bail!("speculative strategy names must not be empty");
+        }
+        match strategy.strategy_type.as_str() {
+            "draft-model" => {
+                let Some(draft_model) = strategy.draft_model.as_deref() else {
+                    bail!("draft-model speculative strategy '{name}' must declare draft_model");
+                };
+                if draft_model.trim().is_empty() {
+                    bail!(
+                        "draft-model speculative strategy '{name}' draft_model must not be empty"
+                    );
+                }
+            }
+            "ngram" => {}
+            unknown => bail!("unsupported speculative strategy type '{unknown}'"),
+        }
+        validate_window_policy(name, strategy.window_policy.as_ref())?;
+    }
+    Ok(())
+}
+
+fn validate_window_policy(
+    name: &str,
+    policy: Option<&SpeculativeWindowPolicyConfig>,
+) -> Result<()> {
+    let Some(policy) = policy else {
+        return Ok(());
+    };
+    if let Some(window) = policy.fixed_default_window
+        && window == 0
+    {
+        bail!("speculative strategy '{name}' fixed_default_window must be greater than zero");
+    }
+    if let Some(window) = policy.initial_window
+        && window == 0
+    {
+        bail!("speculative strategy '{name}' initial_window must be greater than zero");
+    }
+    if let Some(window) = policy.min_window
+        && window == 0
+    {
+        bail!("speculative strategy '{name}' min_window must be greater than zero");
+    }
+    if let Some(window) = policy.max_window
+        && window == 0
+    {
+        bail!("speculative strategy '{name}' max_window must be greater than zero");
+    }
+    if let (Some(min), Some(max)) = (policy.min_window, policy.max_window)
+        && min > max
+    {
+        bail!("speculative strategy '{name}' min_window must not exceed max_window");
+    }
+    if let (Some(initial), Some(min)) = (policy.initial_window, policy.min_window)
+        && initial < min
+    {
+        bail!("speculative strategy '{name}' initial_window must not be less than min_window");
+    }
+    if let (Some(initial), Some(max)) = (policy.initial_window, policy.max_window)
+        && initial > max
+    {
+        bail!("speculative strategy '{name}' initial_window must not exceed max_window");
+    }
+    if let Some(adaptive) = &policy.adaptive {
+        if let Some(window) = adaptive.min_window
+            && window == 0
+        {
+            bail!("speculative strategy '{name}' adaptive.min_window must be greater than zero");
+        }
+        if let Some(window) = adaptive.initial_window
+            && window == 0
+        {
+            bail!(
+                "speculative strategy '{name}' adaptive.initial_window must be greater than zero"
+            );
+        }
+        if let Some(window) = adaptive.max_window
+            && window == 0
+        {
+            bail!("speculative strategy '{name}' adaptive.max_window must be greater than zero");
+        }
+    }
+    Ok(())
+}
+
 fn infer_activation_width_from_layers(
     package_dir: &Path,
     layers: &[PackageLayer],
@@ -1147,6 +1313,23 @@ mod tests {
             "format": "layer-package",
             "layer_count": 1,
             "activation_width": 4096,
+            "generation": {
+                "speculative_decoding": {
+                    "default": "draft",
+                    "strategies": {
+                        "draft": {
+                            "type": "draft-model",
+                            "draft_model": "unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abcdef",
+                            "window_policy": {
+                                "default": "adaptive",
+                                "initial_window": 16,
+                                "min_window": 2,
+                                "max_window": 16
+                            }
+                        }
+                    }
+                }
+            },
             "shared": {
                 "metadata": {
                     "path": "metadata.gguf",
@@ -1386,6 +1569,20 @@ mod tests {
         assert_eq!(info.model_id, "model-a");
         assert_eq!(info.layer_count, 1);
         assert_eq!(info.activation_width, Some(4096));
+        assert_eq!(
+            info.generation
+                .as_ref()
+                .and_then(|generation| generation.pointer("/speculative_decoding/default"))
+                .and_then(serde_json::Value::as_str),
+            Some("draft")
+        );
+        let speculative = info.speculative_decoding.as_ref().unwrap();
+        let draft = speculative.strategies.get("draft").unwrap();
+        assert_eq!(draft.strategy_type, "draft-model");
+        assert_eq!(
+            draft.draft_model.as_deref(),
+            Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abcdef")
+        );
         assert_eq!(info.source_model_bytes, Some(123));
         assert_eq!(info.projectors.len(), 1);
         assert_eq!(info.projectors[0].kind, "mmproj");
diff --git a/docs/LAYER_PACKAGE_REPOS.md b/docs/LAYER_PACKAGE_REPOS.md
index 7c5afa8a7..e8a9d8596 100644
--- a/docs/LAYER_PACKAGE_REPOS.md
+++ b/docs/LAYER_PACKAGE_REPOS.md
@@ -75,6 +75,14 @@ Important options:
 
 - `--target <repo>`: destination Hugging Face package repo.
 - `--model-id <id>`: OpenAI-facing package model id.
+- `--spec-draft-model <ref>`: declare a package default draft model for
+  speculative decoding, using a normal model ref such as
+  `unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M`.
+- `--spec-strategy <name>`: name for that package-declared strategy, defaulting
+  to `draft`.
+- `--spec-initial-window <n>`, `--spec-min-window <n>`,
+  `--spec-max-window <n>`: adaptive speculative window defaults recorded in
+  `model-package.json`.
 - `--timeout <duration>`: HF Jobs timeout, defaulting to `1h` unless raised by
   size-based estimates.
 - `--dry-run`: print the resolved package plan and maximum cost without side effects.
diff --git a/docs/MTP.md b/docs/MTP.md
new file mode 100644
index 000000000..b8afa607a
--- /dev/null
+++ b/docs/MTP.md
@@ -0,0 +1,362 @@
+# MTP and Draft Speculation
+
+This document records the current decision: Skippy no longer carries its own
+MTP implementation. The upstream llama.cpp MTP path owns same-model MTP support.
+Skippy keeps only draft-model speculation work.
+
+## Decision
+
+- Delete Skippy-owned MTP ABI, CLI flags, staged protocol, package strategy
+  metadata, and Rust orchestration.
+- Keep draft-model speculation:
+  - single-stage native slot `draft-simple` through llama.cpp;
+  - multi-stage stage0 draft model proposals verified by downstream
+    `SpeculativeSpan` messages.
+- Do not put benchmark results in `model-package.json`.
+- Package speculative metadata may describe `draft-model` or `ngram`
+  strategies only.
+
+## Why
+
+The hand-rolled Skippy MTP path duplicated llama.cpp speculation machinery and
+kept drifting from the optimized server slot loop. That made performance and
+correctness hard to reason about. The useful Skippy work is draft-model
+speculation in distributed topologies, where stage0 can run a small draft model
+and ask the target pipeline to verify proposed spans.
+
+## Current Boundary
+
+Upstream llama.cpp:
+
+- owns same-GGUF MTP and any MTP-specific recurrent-state handling;
+- can be benchmarked directly with `llama-server` when we want MTP data.
+
+Skippy:
+
+- owns split serving, stage transport, telemetry, and draft-model speculative
+  verification;
+- does not expose `--mtp`, `draft-mtp`, `final-stage-head`, or
+  `DecodeReadoutMtp`;
+- sends draft proposals with `SpeculativeSpan`.
+
+## Benchmark Baseline
+
+Historical MTP and draft-spec benchmark outputs remain under
+`experiments/mtp/`. Treat them as experiment artifacts, not package metadata.
+New Skippy speculation runs should compare:
+
+1. vanilla target baseline;
+2. vanilla target plus draft model;
+3. Skippy single-stage baseline;
+4. Skippy single-stage plus draft model;
+5. Skippy multi-stage baseline;
+6. Skippy multi-stage plus draft model.
+
+Use the scripted harnesses so source sync, warmup, cooldown, labels, and metrics
+collection stay repeatable.
+
+## Target PR Direction
+
+The next review-ready PR should make package-declared draft speculation the
+normal path instead of an experiment-only flag set. The target scope is:
+
+- enable draft speculation from `model-package.json` when a package declares a
+  default draft strategy;
+- add or publish a layer package repository for the known-winning Llama 3.3 70B
+  Q3 target plus Llama 3.2 1B Q4 draft pair;
+- remove obsolete speculation experiments and cruft that are not part of
+  package-declared `draft-model` speculation;
+- sync with the main repository before review;
+- keep benchmark evidence in docs and experiment JSON, not in the package
+  manifest.
+
+The package manifest should stay small and declarative. For the winning pair,
+the intended shape is:
+
+```json
+{
+  "generation": {
+    "speculative_decoding": {
+      "default": "llama32-1b-q4",
+      "strategies": {
+        "llama32-1b-q4": {
+          "type": "draft-model",
+          "draft_model": "unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@<revision>",
+          "window_policy": {
+            "default": "adaptive",
+            "initial_window": 16,
+            "min_window": 2,
+            "max_window": 16
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+Important manifest decisions:
+
+- `draft_model` is a normal model resolver ref. It should download through the
+  same model cache/resolver path as other models.
+- The strategy's presence and selection as `default` are the compatibility
+  signal. Do not add a redundant `compatible: true`.
+- Adaptive is the default policy, but it starts at the proven W16 setting for
+  this pair.
+- Do not put fallback thresholds, cost breakers, lower-prefetch knobs, or
+  benchmark scores in `model-package.json`; those remain runtime policy.
+- If the draft model cannot be resolved or the user disables speculation, serve
+  the target package normally without failing model load.
+
+Runtime/user control should stay separate from the package manifest. The package
+declares available strategies; the existing speculative runtime config decides
+whether to use them:
+
+```toml
+[defaults.speculative]
+mode = "auto"      # auto | disabled | draft | ngram
+package_strategy = "default"
+```
+
+Serving behavior should stay simple:
+
+- `auto` may use the package default strategy when it is available and
+  resolvable;
+- `disabled` never uses package-declared speculation;
+- `package_strategy = "default"` uses the manifest default, while another
+  value selects a named package strategy;
+- explicit `draft` mode still uses the normal manually configured draft path.
+
+There should be no `required` draft mode. In `auto`, missing or unresolved draft
+models fall back to baseline serving.
+
+## Two-Stage Draft Prefetch
+
+The cross-host stage0 draft harness enables lower-stage prefetch by default for
+stage0 draft speculation. For hybrid recurrent models, useful lower prefetch
+requires enough recurrent rollback snapshots for both the current verify span
+and the next prefetched span. The harness now derives that automatically:
+
+- baseline keeps the requested `N_RS_SEQ` value, which defaults to `0`;
+- draft W2 needs `n_rs_seq >= 3`;
+- draft W3 needs `n_rs_seq >= 5`;
+- in general, fixed window `W` with lower prefetch needs
+  `n_rs_seq >= 2 * W - 1`.
+
+That rule exists because llama.cpp exposes a rollback horizon of
+`n_rs_seq + 1` tokens. Without the larger horizon, stage0 can verify the current
+span but cannot safely keep a prefetched lower-stage result for the next span.
+
+Smoke evidence from
+`experiments/mtp/crosshost/two-stage-lower-prefetch-auto-20260524T202515Z`:
+
+| Condition | Stage0 `n_rs_seq` | Lower prefetch useful | Tok/s | Speedup |
+| --- | ---: | ---: | ---: | ---: |
+| baseline | 0 | n/a | 9.27 | 1.00x |
+| draft W2 | 3 | 27 | 12.05 | 1.30x |
+| draft W3 | 5 | 15 | 9.66 | 1.04x |
+
+The W2 run matched the baseline output hash and had no replay commits, so the
+prefetch work was both useful and correctness-preserving in this smoke.
+
+## Draft Guard Finding
+
+Full-corpus W2 draft speculation needs a request-local fallback guard. The
+one-prompt smoke had high acceptance, but the 9-prompt corpus includes
+low-acceptance spans, especially the long code review prompt. Unguarded W2 with
+lower prefetch was unstable:
+
+| Run | Tok/s |
+| --- | ---: |
+| split60 baseline | 9.29 |
+| unguarded W2 attempt 1 | 6.27 |
+| unguarded W2 attempt 2 | 9.75 |
+| unguarded W2 attempt 3 | 7.82 |
+
+Skippy now disables stage0 draft speculation for the rest of a request once the
+request has at least `SKIPPY_STAGE0_SPEC_FALLBACK_MIN_WINDOWS` windows and the
+accepted/proposed ratio falls below
+`SKIPPY_STAGE0_SPEC_FALLBACK_MIN_ACCEPT_RATE`. Defaults are `32` windows and
+`0.90`. If lower prefetch already advanced stage0 when the guard trips, Skippy
+forces the verified-prefix commit path before continuing with normal one-token
+split decode.
+
+Guarded W2 confirmation from
+`experiments/mtp/crosshost/two-stage-w2-guard-confirm-20260524T214529Z`:
+
+| Condition | Attempts tok/s | Median tok/s | Speedup vs split baseline |
+| --- | --- | ---: | ---: |
+| split60 baseline | 9.32, 9.29, 9.29 | 9.29 | 1.00x |
+| W2 guarded lower prefetch | 8.17, 9.74, 9.50 | 9.50 | 1.02x |
+
+Controls:
+
+- `SPEC_DRAFT_P_MIN=0.90` was worse (`8.40 tok/s`).
+- W3 guarded was worse (`7.78 tok/s`).
+- Disabling lower prefetch was worse (`8.09 tok/s`).
+- Split62 was worse for both baseline (`9.15 tok/s`) and W2 (`7.78 tok/s`).
+- Disabling the lower-prefetch cooldown was better in a one-run check
+  (`9.82 tok/s`), so the default cooldown is `0`; the fallback guard is the
+  safety valve for low-acceptance prompts.
+- The default lower-prefetch policy is now aggressive: when lower prefetch is
+  enabled and the recurrent snapshot horizon allows the next span, Skippy
+  schedules it instead of using the EWMA net-benefit gate. The old cost gate is
+  still available for experiments with
+  `SKIPPY_STAGE0_LOWER_PREFETCH_COST_GATE=1`.
+- Aggressive lower prefetch confirmation from
+  `experiments/mtp/crosshost/two-stage-w2-aggressive-prefetch-confirm-20260524T221403Z`
+  scheduled all `611/611` lower-prefetch candidates and reduced measured
+  downstream wait to roughly `0.5s` on the two valid attempts. Those attempts
+  were `10.18 tok/s` and `10.18 tok/s`, or about `1.10x` over the split60
+  baseline. The third attempt was a slow outlier at `7.20 tok/s`; its downstream
+  wait stayed low, but lower-verify and lower-prefetch time rose from about
+  `56s` to about `97s`, so the remaining instability is exposed stage0 verify
+  cost rather than missed prefetch.
+- Skippy now keeps aggressive prefetch as the default and adds a lower-prefetch
+  cost breaker. It compares prefetched lower-verify cost against a warmed
+  stage0 verify baseline shared by requests in the same server process and
+  disables lower prefetch for the current request after repeated slow spans. If
+  the breaker fires after lower prefetch has advanced stage0, Skippy forces the
+  verified-prefix commit path before continuing. The draft fallback guard
+  remains separate and can still disable draft speculation for low-acceptance
+  requests.
+- Process-baseline breaker confirmation from
+  `experiments/mtp/crosshost/two-stage-w2-process-breaker-confirm-20260524T225636Z`
+  produced `10.18`, `10.23`, and `9.37 tok/s`, all valid, with median
+  `10.18 tok/s`. That is about `1.10x` over the split60 baseline. Lower
+  prefetch stayed fully scheduled (`611/611`) and downstream wait stayed low
+  (`0.40s` to `0.54s`). The breaker did not fire in this run because the
+  measured lower-prefetch cost never exceeded the warmed baseline threshold; the
+  third run was slower because exposed lower-prefetch time rose to about
+  `10.5s`, not because RTT waits returned.
+- Skippy also has a request-local exposed-cost budget for lower prefetch. It
+  tracks cumulative exposed lower-prefetch milliseconds per committed input and
+  throttles lower prefetch for the rest of the request once the budget is
+  exceeded. The default gate is enabled with
+  `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET=1`,
+  `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET_MIN_SPANS=32`,
+  `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET_MIN_MS=1000`, and
+  `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET_MS_PER_TOKEN=8.0`. Once tripped,
+  lower prefetch is attempted every
+  `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET_THROTTLE_STRIDE=4` eligible
+  spans instead of being disabled completely.
+- Exposed-budget confirmation from
+  `experiments/mtp/crosshost/two-stage-w2-exposed-budget-confirm-20260524T231848Z`
+  produced two clean attempts (`10.22`, `10.20 tok/s`) and one rejected slow
+  outlier (`7.79 tok/s`). The rejected attempt's short warmup was already bad
+  (`1.50 tok/s` vs the normal `~12 tok/s`), so this was a poisoned fresh process
+  before measurement rather than just a lower-prefetch policy miss.
+- The cross-host harness now supports warmup health gating before measurement:
+  `MEASURE_ATTEMPTS` can exceed `MEASURE_RUNS`, and attempts whose short warmup
+  falls below `WARMUP_MIN_TOK_S` are restarted after cooldown. The first gated W2
+  confirmation from
+  `experiments/mtp/crosshost/two-stage-w2-warmup-gated-confirm-20260524T233715Z`
+  used `MEASURE_RUNS=3`, `MEASURE_ATTEMPTS=5`, and `WARMUP_MIN_TOK_S=6.0`. It
+  accepted three attempts with healthy warmups (`12.35`, `12.41`, `11.99 tok/s`)
+  and measured `10.22`, `10.17`, and `9.35 tok/s`, median `10.17 tok/s`.
+- Adaptive lower-prefetch throttle confirmation from
+  `experiments/mtp/crosshost/two-stage-w2-throttle-confirm-20260524T235913Z`
+  used the same warmup-gated harness. Attempt 3 was rejected before measurement
+  because the 32-token warmup was only `2.12 tok/s`. The three measured attempts
+  were `10.18`, `10.21`, and `10.16 tok/s`, median `10.18 tok/s`; no measured
+  request tripped the exposed-budget throttle, so the change did not hurt the
+  fast path. Metrics reports now include a `request_speculation` section keyed by
+  request/session id for per-request stage0 speculation timing.
+
+The remaining bottleneck is exposed draft/lower-verify cost and its variance,
+not rollback correctness. W2 lower prefetch is useful, but the current two-stage
+draft loop does not yet recover enough of the split penalty to approach vanilla
+llama-server throughput.
+
+## Same-Host Two-Stage Window Finding
+
+For the Llama 3.3 70B Q3 target with the Llama 3.2 1B Q4 draft model on
+Studio, draft speculation now wins over the two-stage baseline when stage0
+lower prefetch is enabled. W8 established the first stable win; a follow-up
+window sweep found W16 to be the current best same-host setting for this pair.
+
+Key implementation decisions:
+
+- Stage0 draft sessions trim away a p-min-rejected candidate token instead of
+  leaving the draft KV advanced through a token that was never proposed.
+- Stage0 lower prefetch is enabled by default. It can be disabled with
+  `SKIPPY_STAGE0_LOWER_PREFETCH=0`.
+- Draft p-min defaults to `0.60` for Skippy draft speculation. The CLI flags
+  `--spec-draft-p-min` and `--openai-spec-draft-p-min` still override it for
+  other model pairs.
+
+Clean same-host Studio measurements, 2 prompts x 192 generated tokens:
+
+| Condition | Median tok/s | Median wall | Notes |
+| --- | ---: | ---: | --- |
+| two-stage baseline | 7.957 | 48.26s | no speculation |
+| W8, p-min 0.75, lower prefetch off | 8.134 | 47.21s | +2.2% vs baseline |
+| W8, p-min 0.75, lower prefetch on | 8.447 | 45.46s | +6.2% vs baseline |
+| W8, p-min 0.70, lower prefetch on | 8.449 | 45.45s | effectively tied with 0.75 |
+| W8, p-min 0.60, lower prefetch on | 8.708 | 44.10s | +9.4% vs baseline |
+| W16, p-min 0.60, lower prefetch on | 9.835 | 39.04s | +23.6% vs baseline |
+
+The p-min 0.60 run accepted about `266.7` of `289.7` proposed tokens
+(`~92.1%`) and committed about `3.31` inputs per speculative span. It is faster
+than p-min 0.70/0.75 because the larger useful spans amortize more downstream
+round trips, even though the lower-verify work is larger.
+
+W16 confirmation from
+`experiments/mtp/studio54/skippy-2stage-w16-defaultpmin-defaultlower-192tok-prompt2-3x-20260526T214535Z`
+ran three clean attempts at `9.832`, `9.835`, and `10.002 tok/s`. It accepted
+about `269.7` of `290.3` proposed tokens (`~92.9%`) and committed about `3.75`
+inputs per speculative span. Lower-prefetch waste was low (`0.33` spans/run),
+and downstream wait fell to about `30.5 ms` per committed input.
+
+One-shot window sweep evidence:
+
+| Window | Tok/s | Wall | Accepted / proposed | Committed inputs/span | Notes |
+| ---: | ---: | ---: | ---: | ---: | --- |
+| W6 | 8.963 | 42.84s | 270 / 287 | 3.14 | stable but smaller spans |
+| W8 | 8.822 | 43.53s | 267 / 289 | 3.42 | below confirmed W8 3-run median |
+| W10 | 9.284 | 41.36s | 269 / 291 | 3.36 | improvement over W8 |
+| W12 | 9.668 | 39.72s | 271 / 291 | 3.76 | near plateau |
+| W14 | 9.680 | 39.67s | 269 / 290 | 3.64 | near plateau |
+| W16 | 9.845 | 39.01s | 269 / 290 | 3.68 | best one-shot and confirmed by 3-run pass |
+| W20 | 3.699 | 103.82s | 269 / 290 | 3.84 | pathological exposed verify/wait outlier |
+| W24 | 9.235 | 41.58s | 271 / 291 | 4.11 | recovers, but slower than W16 |
+| adaptive W16 | 7.604 | 50.50s | 251 / 275 | 1.96 | current adaptive ramp is worse than baseline |
+
+Do not treat larger windows as automatically better. W20 showed that a wider
+window can preserve acceptance while still losing badly if lower verification
+or downstream wait becomes exposed. For this pair, W16 is the current best
+claim; future tuning should re-confirm around the plateau rather than pushing
+window size blindly upward.
+
+The old adaptive window policy was not a safe replacement for fixed W16 on this
+benchmark. It spent too much of the short request at small spans (`~1.96`
+committed inputs/span) and exposed more lower verify/downstream wait per
+committed input. Stage0 adaptive draft now starts at the configured max window
+by default, so `--openai-adaptive-speculative-window --openai-spec-draft-window
+16` begins at W16 instead of slowly ramping from W2. Set
+`SKIPPY_STAGE0_ADAPTIVE_START_MAX=0` to restore the old ramp-from-small
+behavior for experiments. A same-host smoke after this change produced
+`10.018 tok/s` and `38.33s` wall time at adaptive W16, back in the fixed-W16
+range.
+
+Clean apples-to-apples rerun from
+`experiments/mtp/studio54/clean-table-llama33q3-draft1b-w16-20260527` used the
+same source state on Studio, the same Llama 3.3 70B Q3 target, the same Llama
+3.2 1B Q4 draft, the same first 2 PR prompts, `192` generated tokens,
+temperature `0`, one warmup per attempt, `120s` cooldowns, and three clean
+measurement attempts per condition.
+
+| Runtime | Condition | Window | Median tok/s | Min-Max tok/s | Median wall | Acceptance | Valid | Speedup |
+| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
+| llama-server | baseline, no draft | - | 8.017 | 8.015-8.018 | 47.90s | n/a | 3/3 | 1.00x |
+| llama-server | draft model speculation | W16 | 11.041 | 11.041-11.044 | 34.78s | 0.870 | 3/3 | 1.38x |
+| skippy-server two-stage | baseline, no draft | - | 7.961 | 7.959-7.964 | 48.23s | n/a | 3/3 | 1.00x |
+| skippy-server two-stage | stage0-owned draft speculation | W16 | 10.024 | 9.853-10.025 | 38.31s | 0.899 | 3/3 | 1.26x |
+| skippy-server two-stage | adaptive stage0-owned draft speculation | W16 max | 10.020 | 9.867-10.036 | 38.33s | 0.899 | 3/3 | 1.26x |
+
+This is a real Skippy win: fixed W16 draft speculation improves same-host
+two-stage Skippy throughput by about `25.9%` and reduces median wall time by
+about `20.6%` versus the matching two-stage Skippy baseline. Vanilla
+llama-server remains the performance reference for this pair: Skippy two-stage
+W16 is still about `9.2%` behind vanilla W16, even though Skippy baseline is
+already close to vanilla baseline.
diff --git a/docs/USAGE.md b/docs/USAGE.md
index e4720e635..756803945 100644
--- a/docs/USAGE.md
+++ b/docs/USAGE.md
@@ -351,7 +351,8 @@ lifecycle_health_interval_ms    = 5000      # health-check interval (ms)
 
 # --- Speculative decoding ------------------------------------------------
 [defaults.speculative]
-mode                       = "auto"          # auto off draft ngram lookahead
+mode                       = "auto"          # auto disabled draft ngram
+package_strategy           = "default"       # package manifest default or named strategy
 draft_selection_policy     = "auto"          # auto manual heuristic
 pairing_fault              = "warn_disable"  # warn_disable fail_open fail_closed
 draft_max_tokens           = 16
diff --git a/docs/specs/layer-package-repos.md b/docs/specs/layer-package-repos.md
index 726650bf4..38b64f84c 100644
--- a/docs/specs/layer-package-repos.md
+++ b/docs/specs/layer-package-repos.md
@@ -275,6 +275,55 @@ multimodal model:
 Consumers MUST NOT infer projector identity from sibling files or filename
 stems. If a package needs a projector, the manifest must declare it explicitly.
 
+### Generation Capabilities
+
+`generation` is optional and defaults to no package-declared generation
+extensions. It describes capabilities and safe runtime policy defaults. It MUST
+NOT contain benchmark results. Benchmark evidence belongs in experiment
+artifacts, release notes, or documentation, not in `model-package.json`.
+
+For speculative decoding, the recommended shape is:
+
+```json
+{
+  "generation": {
+    "speculative_decoding": {
+      "default": "llama32-1b-q4",
+      "strategies": {
+        "llama32-1b-q4": {
+          "type": "draft-model",
+          "draft_model": "unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@<revision>",
+          "window_policy": {
+            "default": "adaptive",
+            "initial_window": 16,
+            "min_window": 2,
+            "max_window": 16
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+The selected strategy's presence means the package declares support for that
+strategy. There is no separate `compatible` boolean. `draft_model` is a normal
+model resolver ref, including a pinned revision when the package publisher wants
+reproducible downloads. Runtimes should resolve and cache the draft model using
+the same model download path as ordinary model artifacts.
+
+Runtime policy should still be able to disable speculation for user choice,
+unsupported hardware/topology, missing draft artifacts, or internal safety
+guards. Those controls are runtime behavior and should not be encoded as
+manifest fallback thresholds.
+
+Serving configuration should use the existing `[defaults.speculative]` and
+`[models.speculative]` control surface. In `mode = "auto"`, package-declared
+defaults may be used when available and resolvable. In `mode = "disabled"`,
+package-declared speculation is not used. `package_strategy = "default"` uses
+the manifest default, and any other value selects a named package strategy.
+There is no separate `required` mode.
+
 ## Publishing Rules
 
 Package creation SHOULD use:
@@ -283,6 +332,18 @@ Package creation SHOULD use:
 skippy-model-package write-package org/repo:distribution --out-dir model-package/
 ```
 
+Packages with a known-good draft pair SHOULD declare it at write time:
+
+```bash
+skippy-model-package write-package org/repo:distribution \
+  --spec-draft-model unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M \
+  --spec-strategy llama32-1b-q4 \
+  --spec-initial-window 16 \
+  --spec-min-window 2 \
+  --spec-max-window 16 \
+  --out-dir model-package/
+```
+
 Multimodal packages SHOULD declare projector artifacts at write time:
 
 ```bash

From f8f43150f3946d52f0f1f4cc9d0543779b62247d Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Wed, 27 May 2026 16:29:37 +1000
Subject: [PATCH 2/9] Retry HF artifact uploads during package jobs

---
 .../src/scripts/split-model-job.sh            | 34 +++++++++++++++----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/crates/model-package/src/scripts/split-model-job.sh b/crates/model-package/src/scripts/split-model-job.sh
index ef0e7a21c..90af3382b 100755
--- a/crates/model-package/src/scripts/split-model-job.sh
+++ b/crates/model-package/src/scripts/split-model-job.sh
@@ -239,19 +239,39 @@ cat > "$ARTIFACT_UPLOAD_SCRIPT" <<'PYTHON'
 from huggingface_hub import HfApi
 from pathlib import Path
 import os
+import time
 
 path = Path(os.environ["SKIPPY_PACKAGE_ARTIFACT_PATH"])
 relative = os.environ["SKIPPY_PACKAGE_ARTIFACT_RELATIVE_PATH"]
 target_repo = os.environ["TARGET_REPO"]
+max_attempts = int(os.environ.get("ARTIFACT_UPLOAD_ATTEMPTS", "4"))
 
 api = HfApi(token=os.environ["HF_TOKEN"])
-api.upload_file(
-    repo_id=target_repo,
-    path_or_fileobj=str(path),
-    path_in_repo=relative,
-    repo_type="model",
-    commit_message=f"Add package artifact {relative}",
-)
+last_error = None
+for attempt in range(1, max_attempts + 1):
+    try:
+        api.upload_file(
+            repo_id=target_repo,
+            path_or_fileobj=str(path),
+            path_in_repo=relative,
+            repo_type="model",
+            commit_message=f"Add package artifact {relative}",
+        )
+        last_error = None
+        break
+    except Exception as err:
+        last_error = err
+        if attempt == max_attempts:
+            break
+        delay = min(60, 5 * attempt)
+        print(
+            f"  Upload failed for {relative} on attempt {attempt}/{max_attempts}: {err}. "
+            f"Retrying in {delay}s...",
+            flush=True,
+        )
+        time.sleep(delay)
+if last_error is not None:
+    raise last_error
 size = path.stat().st_size
 path.unlink()
 print(f"  Uploaded and removed {relative} ({size} bytes)")

From f666e72730aed1afe89ece1d062ffecad505eefe Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Wed, 27 May 2026 18:13:14 +1000
Subject: [PATCH 3/9] Honor package draft disablement

---
 .../src/inference/skippy/mod.rs                 | 12 ++++++++++--
 .../mesh-llm-host-runtime/src/runtime/local.rs  | 17 +++++++++++++++--
 crates/mesh-llm-host-runtime/src/runtime/mod.rs | 12 ++++++++++++
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs
index c9fbb6b0c..1a571e291 100644
--- a/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs
+++ b/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs
@@ -783,8 +783,12 @@ pub(crate) fn single_stage_config(options: &SkippyModelLoadOptions) -> Result<St
             "skippy selected backend device must not be empty"
         );
     }
+    let path_ref = options.model_path.to_string_lossy();
+    let inferred_layer_package =
+        options.package_identity.is_none() && is_layer_package_ref(&path_ref);
     let package_identity = match options.package_identity.as_ref() {
         Some(identity) => identity.clone(),
+        None if inferred_layer_package => identity_from_layer_package(&path_ref)?,
         None => synthetic_direct_gguf_package(&options.model_id, &options.model_path)?,
     };
     let layer_start = options.layer_start;
@@ -832,10 +836,14 @@ pub(crate) fn single_stage_config(options: &SkippyModelLoadOptions) -> Result<St
         cache_type_k: options.cache_type_k.clone(),
         cache_type_v: options.cache_type_v.clone(),
         flash_attn_type: options.flash_attn_type,
-        filter_tensors_on_load: false,
+        filter_tensors_on_load: inferred_layer_package,
         selected_device: options.selected_device.clone().map(Into::into),
         kv_cache: None,
-        load_mode: LoadMode::RuntimeSlice,
+        load_mode: if inferred_layer_package {
+            LoadMode::LayerPackage
+        } else {
+            LoadMode::RuntimeSlice
+        },
         bind_addr: "127.0.0.1:0".to_string(),
         upstream: None,
         downstream: None,
diff --git a/crates/mesh-llm-host-runtime/src/runtime/local.rs b/crates/mesh-llm-host-runtime/src/runtime/local.rs
index d96f6e8b0..7cf87304e 100644
--- a/crates/mesh-llm-host-runtime/src/runtime/local.rs
+++ b/crates/mesh-llm-host-runtime/src/runtime/local.rs
@@ -197,6 +197,7 @@ pub(super) struct LocalRuntimeModelStartSpec<'a> {
     pub(super) n_ubatch_override: Option<u32>,
     pub(super) flash_attention_override: FlashAttentionType,
     pub(super) parallel_override: Option<usize>,
+    pub(super) package_speculative_defaults: bool,
     pub(super) openai_guardrail_policy: OpenAiGuardrailPolicyHandle,
     pub(super) skippy_telemetry: skippy::SkippyTelemetryOptions,
     pub(super) survey_telemetry: survey::SurveyTelemetry,
@@ -774,6 +775,7 @@ pub(super) async fn start_runtime_split_model(
         n_batch_override: spec.n_batch_override,
         n_ubatch_override: spec.n_ubatch_override,
         flash_attention_override: spec.flash_attention_override,
+        package_speculative_defaults: spec.package_speculative_defaults,
         openai_guardrail_policy: spec.openai_guardrail_policy.clone(),
         pinned_gpu: spec.pinned_gpu,
         slots,
@@ -804,6 +806,7 @@ pub(super) async fn start_runtime_split_model(
         n_batch_override: spec.n_batch_override,
         n_ubatch_override: spec.n_ubatch_override,
         flash_attention_override: spec.flash_attention_override,
+        package_speculative_defaults: spec.package_speculative_defaults,
         openai_guardrail_policy: spec.openai_guardrail_policy.clone(),
         pinned_gpu: spec.pinned_gpu.cloned(),
         slots,
@@ -1162,6 +1165,7 @@ struct SplitGenerationLoadSpec<'a> {
     n_batch_override: Option<u32>,
     n_ubatch_override: Option<u32>,
     flash_attention_override: FlashAttentionType,
+    package_speculative_defaults: bool,
     openai_guardrail_policy: OpenAiGuardrailPolicyHandle,
     skippy_telemetry: skippy::SkippyTelemetryOptions,
     survey_telemetry: survey::SurveyTelemetry,
@@ -1554,7 +1558,9 @@ async fn split_generation_load_settings<'a>(
     if let Some(gpu) = spec.pinned_gpu {
         resolved.hardware.device = Some(gpu.backend_device.clone());
     }
-    apply_package_speculative_defaults(&mut resolved, spec.package).await?;
+    if spec.package_speculative_defaults {
+        apply_package_speculative_defaults(&mut resolved, spec.package).await?;
+    }
     let embedded_openai = resolved.to_embedded_openai_args(activation_width, true)?;
     let runtime_options = resolved.to_embedded_runtime_options(
         &spec.skippy_telemetry,
@@ -1952,6 +1958,7 @@ struct SplitTopologyCoordinator {
     n_batch_override: Option<u32>,
     n_ubatch_override: Option<u32>,
     flash_attention_override: FlashAttentionType,
+    package_speculative_defaults: bool,
     openai_guardrail_policy: OpenAiGuardrailPolicyHandle,
     pinned_gpu: Option<crate::runtime::StartupPinnedGpuTarget>,
     slots: usize,
@@ -2441,6 +2448,7 @@ impl SplitTopologyCoordinator {
             n_batch_override: self.n_batch_override,
             n_ubatch_override: self.n_ubatch_override,
             flash_attention_override: self.flash_attention_override,
+            package_speculative_defaults: self.package_speculative_defaults,
             openai_guardrail_policy: self.openai_guardrail_policy.clone(),
             pinned_gpu: self.pinned_gpu.as_ref(),
             slots: self.slots,
@@ -3729,7 +3737,9 @@ async fn start_runtime_layer_package_model(
         plan.slots,
         fallback_projector_path,
     )?;
-    apply_package_speculative_defaults(&mut resolved, &package).await?;
+    if spec.package_speculative_defaults {
+        apply_package_speculative_defaults(&mut resolved, &package).await?;
+    }
     tracing::info!(
         model = model_name,
         "KV cache: {} K + {} V, {}K context",
@@ -4289,6 +4299,7 @@ stop = ["END"]
             n_batch_override: None,
             n_ubatch_override: None,
             flash_attention_override: FlashAttentionType::Auto,
+            package_speculative_defaults: true,
             openai_guardrail_policy: openai_guardrail_policy_handle(
                 openai_frontend::GuardrailMode::Disabled,
             ),
@@ -4387,6 +4398,7 @@ max_tokens = 222
             n_ubatch_override: None,
             flash_attention_override: FlashAttentionType::Auto,
             parallel_override: None,
+            package_speculative_defaults: true,
             openai_guardrail_policy: openai_guardrail_policy_handle(
                 openai_frontend::GuardrailMode::Disabled,
             ),
@@ -5551,6 +5563,7 @@ max_tokens = 222
             n_batch_override: None,
             n_ubatch_override: None,
             flash_attention_override: FlashAttentionType::Auto,
+            package_speculative_defaults: true,
             openai_guardrail_policy: openai_guardrail_policy_handle(
                 openai_frontend::GuardrailMode::Disabled,
             ),
diff --git a/crates/mesh-llm-host-runtime/src/runtime/mod.rs b/crates/mesh-llm-host-runtime/src/runtime/mod.rs
index 967fa528f..ae28756a8 100644
--- a/crates/mesh-llm-host-runtime/src/runtime/mod.rs
+++ b/crates/mesh-llm-host-runtime/src/runtime/mod.rs
@@ -1220,6 +1220,7 @@ struct StartupLocalModelTask {
     n_ubatch: Option<u32>,
     flash_attention: FlashAttentionType,
     parallel_override: Option<usize>,
+    package_speculative_defaults: bool,
     openai_guardrail_policy: OpenAiGuardrailPolicyHandle,
     split: bool,
     skippy_telemetry: skippy::SkippyTelemetryOptions,
@@ -1299,6 +1300,7 @@ struct StartupLoopContext<'a> {
     n_ubatch: Option<u32>,
     flash_attention: FlashAttentionType,
     parallel_override: Option<usize>,
+    package_speculative_defaults: bool,
     openai_guardrail_policy: OpenAiGuardrailPolicyHandle,
     skippy_telemetry: &'a skippy::SkippyTelemetryOptions,
     survey_telemetry: &'a survey::SurveyTelemetry,
@@ -1371,6 +1373,7 @@ struct StartupLaunchRuntimeContext<'a> {
     n_ubatch: Option<u32>,
     flash_attention: FlashAttentionType,
     parallel_override: Option<usize>,
+    package_speculative_defaults: bool,
     openai_guardrail_policy: OpenAiGuardrailPolicyHandle,
     skippy_telemetry: &'a skippy::SkippyTelemetryOptions,
     survey_telemetry: &'a survey::SurveyTelemetry,
@@ -1841,6 +1844,7 @@ async fn startup_handle_local_fallback_event(
             n_ubatch_override: ctx.n_ubatch,
             flash_attention_override: ctx.flash_attention,
             parallel_override: ctx.parallel_override,
+            package_speculative_defaults: ctx.package_speculative_defaults,
             openai_guardrail_policy: ctx.openai_guardrail_policy.clone(),
             skippy_telemetry: ctx.skippy_telemetry.clone(),
             survey_telemetry: ctx.survey_telemetry.clone(),
@@ -2173,6 +2177,7 @@ async fn startup_launch_runtime(
         n_ubatch,
         flash_attention,
         parallel_override,
+        package_speculative_defaults,
         openai_guardrail_policy,
         skippy_telemetry,
         survey_telemetry,
@@ -2200,6 +2205,7 @@ async fn startup_launch_runtime(
         n_ubatch_override: n_ubatch,
         flash_attention_override: flash_attention,
         parallel_override,
+        package_speculative_defaults,
         openai_guardrail_policy: openai_guardrail_policy.clone(),
         skippy_telemetry: skippy_telemetry.clone(),
         survey_telemetry: survey_telemetry.clone(),
@@ -2313,6 +2319,7 @@ async fn startup_local_model_loop(params: StartupLocalModelTask) {
         n_ubatch,
         flash_attention,
         parallel_override,
+        package_speculative_defaults,
         openai_guardrail_policy,
         split,
         skippy_telemetry,
@@ -2372,6 +2379,7 @@ async fn startup_local_model_loop(params: StartupLocalModelTask) {
             n_ubatch,
             flash_attention,
             parallel_override,
+            package_speculative_defaults,
             openai_guardrail_policy: openai_guardrail_policy.clone(),
             skippy_telemetry: &skippy_telemetry,
             survey_telemetry: &survey_telemetry,
@@ -2426,6 +2434,7 @@ async fn startup_local_model_loop(params: StartupLocalModelTask) {
         n_ubatch,
         flash_attention,
         parallel_override,
+        package_speculative_defaults,
         openai_guardrail_policy,
         skippy_telemetry: &skippy_telemetry,
         survey_telemetry: &survey_telemetry,
@@ -6777,6 +6786,7 @@ async fn spawn_run_auto_startup_model_tasks(
         n_ubatch: primary_n_ubatch,
         flash_attention: primary_flash_attention,
         parallel_override: primary_parallel_override,
+        package_speculative_defaults: !cli.no_draft,
         openai_guardrail_policy: runtime_state.openai_guardrail_policy.clone(),
         split: options.split,
         skippy_telemetry: skippy_telemetry.clone(),
@@ -7076,6 +7086,7 @@ async fn run_auto_load_runtime_model(
                 .and_then(|m| m.flash_attention)
                 .unwrap_or(FlashAttentionType::Auto),
             parallel_override,
+            package_speculative_defaults: !ctx.cli.no_draft,
             openai_guardrail_policy: ctx.openai_guardrail_policy.clone(),
             skippy_telemetry: skippy_telemetry_options(ctx.options),
             survey_telemetry: ctx.survey_telemetry.clone(),
@@ -7892,6 +7903,7 @@ async fn spawn_run_auto_additional_model_tasks(ctx: RunAutoAdditionalModelsConte
             n_ubatch: extra_model.n_ubatch,
             flash_attention: extra_model.flash_attention,
             parallel_override: extra_model.parallel.or(ctx.config.gpu.parallel),
+            package_speculative_defaults: !ctx.cli.no_draft,
             openai_guardrail_policy: ctx.openai_guardrail_policy.clone(),
             split: ctx.options.split,
             skippy_telemetry: ctx.skippy_telemetry.clone(),

From df287b8f3d438900eaa38cc8adc1117e5c552cd4 Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Wed, 27 May 2026 18:44:58 +1000
Subject: [PATCH 4/9] Add speculative config authoring API

---
 crates/mesh-llm-config/README.md        |  25 ++++++
 crates/mesh-llm-config/src/authoring.rs | 104 +++++++++++++++++++++++-
 crates/mesh-llm-config/src/lib.rs       |  67 ++++++++++++++-
 crates/mesh-llm-host-runtime/src/sdk.rs |   6 +-
 4 files changed, 197 insertions(+), 5 deletions(-)

diff --git a/crates/mesh-llm-config/README.md b/crates/mesh-llm-config/README.md
index ec76e7477..51cdcaa6a 100644
--- a/crates/mesh-llm-config/README.md
+++ b/crates/mesh-llm-config/README.md
@@ -130,6 +130,31 @@ store.update(|config| {
 })?;
 ```
 
+### Configure speculative decoding
+
+Use the editor API for speculative defaults or model-specific speculative
+settings. This keeps SDKs and apps on the same validated config schema as the
+CLI.
+
+```rust
+use mesh_llm_config::ConfigStore;
+
+let store = ConfigStore::default_path()?;
+store.update(|config| {
+    config
+        .upsert_model("meta-llama/Llama-3.3-70B-Instruct-GGUF:Q3_K_M")?
+        .speculative()
+        .mode("draft")
+        .draft_hf_source(
+            "unsloth/Llama-3.2-1B-Instruct-GGUF",
+            "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
+        )
+        .draft_selection_policy("manual")
+        .draft_max_tokens(16);
+    Ok(())
+})?;
+```
+
 ### Configure plugins
 
 Use plugin helpers for common cases instead of writing `[[plugin]]` tables.
diff --git a/crates/mesh-llm-config/src/authoring.rs b/crates/mesh-llm-config/src/authoring.rs
index e744fd747..478ea9f73 100644
--- a/crates/mesh-llm-config/src/authoring.rs
+++ b/crates/mesh-llm-config/src/authoring.rs
@@ -1,6 +1,7 @@
 use crate::{
     GpuAssignment, HardwareConfig, MeshConfig, ModelConfigDefaults, ModelConfigEntry,
-    ModelFitConfig, MultimodalConfig, PluginConfigEntry, RequestDefaultsConfig, ThroughputConfig,
+    ModelFitConfig, MultimodalConfig, PluginConfigEntry, RequestDefaultsConfig, SpeculativeConfig,
+    ThroughputConfig,
 };
 use anyhow::{Result, bail};
 use mesh_llm_types::runtime::ModelRuntimeKind;
@@ -256,6 +257,20 @@ impl ModelDefaultsEditor<'_> {
         self
     }
 
+    pub fn speculative(&mut self) -> SpeculativeConfigEditor<'_> {
+        SpeculativeConfigEditor {
+            speculative: self
+                .defaults
+                .speculative
+                .get_or_insert_with(Default::default),
+        }
+    }
+
+    pub fn clear_speculative(&mut self) -> &mut Self {
+        self.defaults.speculative = None;
+        self
+    }
+
     fn hardware(&mut self) -> &mut HardwareConfig {
         self.defaults.hardware.get_or_insert_with(Default::default)
     }
@@ -332,6 +347,17 @@ impl ModelConfigEditor<'_> {
         self
     }
 
+    pub fn speculative(&mut self) -> SpeculativeConfigEditor<'_> {
+        SpeculativeConfigEditor {
+            speculative: self.model.speculative.get_or_insert_with(Default::default),
+        }
+    }
+
+    pub fn clear_speculative(&mut self) -> &mut Self {
+        self.model.speculative = None;
+        self
+    }
+
     fn hardware(&mut self) -> &mut HardwareConfig {
         self.model.hardware.get_or_insert_with(Default::default)
     }
@@ -355,6 +381,82 @@ impl ModelConfigEditor<'_> {
     }
 }
 
+pub struct SpeculativeConfigEditor<'a> {
+    speculative: &'a mut SpeculativeConfig,
+}
+
+impl SpeculativeConfigEditor<'_> {
+    pub fn mode(&mut self, mode: impl Into<String>) -> &mut Self {
+        self.speculative.mode = Some(mode.into());
+        self
+    }
+
+    pub fn package_strategy(&mut self, strategy: impl Into<String>) -> &mut Self {
+        self.speculative.package_strategy = Some(strategy.into());
+        self
+    }
+
+    pub fn draft_model_path(&mut self, path: impl Into<String>) -> &mut Self {
+        self.speculative.draft_model_path = Some(path.into());
+        self
+    }
+
+    pub fn draft_hf_source(
+        &mut self,
+        repo: impl Into<String>,
+        file: impl Into<String>,
+    ) -> &mut Self {
+        self.speculative.draft_hf_repo = Some(repo.into());
+        self.speculative.draft_hf_file = Some(file.into());
+        self
+    }
+
+    pub fn draft_selection_policy(&mut self, policy: impl Into<String>) -> &mut Self {
+        self.speculative.draft_selection_policy = Some(policy.into());
+        self
+    }
+
+    pub fn pairing_fault(&mut self, policy: impl Into<String>) -> &mut Self {
+        self.speculative.pairing_fault = Some(policy.into());
+        self
+    }
+
+    pub fn draft_max_tokens(&mut self, tokens: u32) -> &mut Self {
+        self.speculative.draft_max_tokens = Some(tokens);
+        self
+    }
+
+    pub fn draft_min_tokens(&mut self, tokens: u32) -> &mut Self {
+        self.speculative.draft_min_tokens = Some(tokens);
+        self
+    }
+
+    pub fn draft_gpu_layers(&mut self, layers: i32) -> &mut Self {
+        self.speculative.draft_gpu_layers = Some(layers);
+        self
+    }
+
+    pub fn draft_device(&mut self, device: impl Into<String>) -> &mut Self {
+        self.speculative.draft_device = Some(device.into());
+        self
+    }
+
+    pub fn draft_threads(&mut self, threads: usize) -> &mut Self {
+        self.speculative.draft_threads = Some(threads);
+        self
+    }
+
+    pub fn draft_cache_types(
+        &mut self,
+        key: impl Into<String>,
+        value: impl Into<String>,
+    ) -> &mut Self {
+        self.speculative.draft_cache_type_k = Some(key.into());
+        self.speculative.draft_cache_type_v = Some(value.into());
+        self
+    }
+}
+
 pub struct PluginConfigEditor<'a> {
     plugin: &'a mut PluginConfigEntry,
 }
diff --git a/crates/mesh-llm-config/src/lib.rs b/crates/mesh-llm-config/src/lib.rs
index fd84d9fa2..dada57895 100644
--- a/crates/mesh-llm-config/src/lib.rs
+++ b/crates/mesh-llm-config/src/lib.rs
@@ -6,7 +6,7 @@ mod validate;
 
 pub use authoring::{
     ConfigEditor, LocalServingNodeConfig, ModelConfigEditor, ModelDefaultsEditor,
-    PluginConfigEditor,
+    PluginConfigEditor, SpeculativeConfigEditor,
 };
 pub use model::*;
 pub use store::{ConfigStore, config_path, config_to_toml, load_config, parse_config_toml};
@@ -204,6 +204,71 @@ ctx_size = 4096
         assert!(fs::read_to_string(path).unwrap().contains("[[plugin]]"));
     }
 
+    #[test]
+    fn config_editor_updates_model_speculative_without_callers_writing_toml() {
+        let temp_dir = TempDir::new().unwrap();
+        let path = temp_dir.path().join("config.toml");
+        let store = ConfigStore::open(&path);
+
+        let config = store
+            .update(|config| {
+                config
+                    .upsert_model("meta-llama/Llama-3.3-70B-Instruct-GGUF:Q3_K_M")?
+                    .speculative()
+                    .mode("draft")
+                    .draft_hf_source(
+                        "unsloth/Llama-3.2-1B-Instruct-GGUF",
+                        "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
+                    )
+                    .draft_selection_policy("manual")
+                    .pairing_fault("warn_disable")
+                    .draft_max_tokens(16)
+                    .draft_gpu_layers(-1);
+                Ok(())
+            })
+            .unwrap();
+
+        let speculative = config.models[0].speculative.as_ref().unwrap();
+        assert_eq!(speculative.mode.as_deref(), Some("draft"));
+        assert_eq!(speculative.draft_max_tokens, Some(16));
+        let raw = fs::read_to_string(path).unwrap();
+        assert!(raw.contains("[[models]]"));
+        assert!(raw.contains("[models.speculative]"));
+        assert!(raw.contains("draft_selection_policy = \"manual\""));
+    }
+
+    #[test]
+    fn config_editor_updates_default_speculative_package_strategy() {
+        let temp_dir = TempDir::new().unwrap();
+        let path = temp_dir.path().join("config.toml");
+        let store = ConfigStore::open(&path);
+
+        let config = store
+            .update(|config| {
+                config
+                    .defaults()
+                    .speculative()
+                    .mode("auto")
+                    .package_strategy("llama32-1b-q4");
+                Ok(())
+            })
+            .unwrap();
+
+        let speculative = config
+            .defaults
+            .as_ref()
+            .and_then(|defaults| defaults.speculative.as_ref())
+            .unwrap();
+        assert_eq!(speculative.mode.as_deref(), Some("auto"));
+        assert_eq!(
+            speculative.package_strategy.as_deref(),
+            Some("llama32-1b-q4")
+        );
+        let raw = fs::read_to_string(path).unwrap();
+        assert!(raw.contains("[defaults.speculative]"));
+        assert!(raw.contains("package_strategy = \"llama32-1b-q4\""));
+    }
+
     #[test]
     fn parse_config_toml_rejects_unknown_runtime_kind() {
         let err = parse_config_toml(
diff --git a/crates/mesh-llm-host-runtime/src/sdk.rs b/crates/mesh-llm-host-runtime/src/sdk.rs
index d2b214ce8..3ec6c0e73 100644
--- a/crates/mesh-llm-host-runtime/src/sdk.rs
+++ b/crates/mesh-llm-host-runtime/src/sdk.rs
@@ -27,9 +27,9 @@ pub mod config {
         ModelConfigEntry, ModelDefaultsEditor, ModelFitConfig, ModelRuntimeKind, MultimodalConfig,
         OwnerControlConfig, PluginConfigEditor, PluginConfigEntry, PrefixCacheConfig,
         ReasoningBudget, ReasoningEnabled, RequestDefaultsConfig, ReservedObjectConfig,
-        SkippyConfig, SpeculativeConfig, StringOrStringList, TelemetryConfig,
-        TelemetryMetricsConfig, TensorSplitConfig, ThroughputConfig, config_path, config_to_toml,
-        load_config, parse_config_toml, validate_config,
+        SkippyConfig, SpeculativeConfig, SpeculativeConfigEditor, StringOrStringList,
+        TelemetryConfig, TelemetryMetricsConfig, TensorSplitConfig, ThroughputConfig, config_path,
+        config_to_toml, load_config, parse_config_toml, validate_config,
     };
 }
 

From ff1a70ac8a079338daa273c5a695d5229e41c3a9 Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Wed, 27 May 2026 18:46:26 +1000
Subject: [PATCH 5/9] Document speculative config authoring re-exports

---
 crates/mesh-llm-host-runtime/README.md | 32 ++++++++++++++++++++++++++
 crates/mesh-llm/README.md              |  6 +++++
 2 files changed, 38 insertions(+)

diff --git a/crates/mesh-llm-host-runtime/README.md b/crates/mesh-llm-host-runtime/README.md
index d41df951b..505102404 100644
--- a/crates/mesh-llm-host-runtime/README.md
+++ b/crates/mesh-llm-host-runtime/README.md
@@ -7,3 +7,35 @@ binary.
 
 This crate is being split so reusable CLI, TUI, SDK, and embeddable runtime
 surfaces can be published and consumed independently.
+
+## Config API
+
+The host runtime re-exports the shared config authoring surface through
+`mesh_llm_host_runtime::sdk::config` and `mesh_llm_host_runtime::plugin`. Use
+that API when host-runtime consumers need to read or write `config.toml`; avoid
+hand-writing TOML or duplicating config structs in runtime callers.
+
+Speculative model settings use the same high-level editor API as other model
+settings:
+
+```rust
+use mesh_llm_host_runtime::sdk::config::ConfigStore;
+
+let store = ConfigStore::default_path()?;
+store.update(|config| {
+    config
+        .upsert_model("meta-llama/Llama-3.3-70B-Instruct-GGUF:Q3_K_M")?
+        .speculative()
+        .mode("draft")
+        .draft_hf_source(
+            "unsloth/Llama-3.2-1B-Instruct-GGUF",
+            "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
+        )
+        .draft_selection_policy("manual")
+        .draft_max_tokens(16);
+    Ok(())
+})?;
+```
+
+Schema ownership, validation, and persistence live in
+[`../mesh-llm-config`](../mesh-llm-config).
diff --git a/crates/mesh-llm/README.md b/crates/mesh-llm/README.md
index 49ba1b855..8b6eb1f47 100644
--- a/crates/mesh-llm/README.md
+++ b/crates/mesh-llm/README.md
@@ -23,4 +23,10 @@ own the reusable pieces:
 - [`../mesh-llm-plugin`](../mesh-llm-plugin) for plugin author API and plugin protobuf schema
 - existing `model-*`, `openai-frontend`, and `skippy-*` crates for their domains
 
+The compatibility re-export includes the shared config authoring API under
+`mesh_llm::sdk::config`. Embedders that need to update `config.toml`, including
+model-specific speculative decoding settings, should use `ConfigStore::update`
+and the typed editors from [`../mesh-llm-config`](../mesh-llm-config) instead
+of writing TOML directly.
+
 For install and end-user usage, see the [project README](../../README.md).

From 054b6ce844f02819488ad985be2ebee74a5b67c8 Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Wed, 27 May 2026 20:49:46 +1000
Subject: [PATCH 6/9] Fix package spec CI failures

---
 crates/mesh-llm-commands/src/model_package.rs | 45 ++++++++++---------
 .../src/runtime/local.rs                      | 20 ++++++++-
 2 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/crates/mesh-llm-commands/src/model_package.rs b/crates/mesh-llm-commands/src/model_package.rs
index 8377cb429..3193c5d4e 100644
--- a/crates/mesh-llm-commands/src/model_package.rs
+++ b/crates/mesh-llm-commands/src/model_package.rs
@@ -198,26 +198,7 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> {
     );
 
     if !submitting {
-        let redacted = redacted_spec(&job.spec);
-        if json {
-            println!(
-                "{}",
-                serde_json::to_string_pretty(&json!({
-                    "dryRun": true,
-                    "confirmRequired": true,
-                    "sourceRepo": job.source_repo,
-                    "sourceFile": job.source_file,
-                    "targetRepo": job.target_repo,
-                    "modelId": job.model_id,
-                    "jobPlan": job.job_plan,
-                    "spec": redacted,
-                }))?
-            );
-        } else {
-            eprintln!();
-            eprintln!("🔍 Dry run — no HF Job was submitted. Add --confirm to submit.");
-            println!("{}", serde_json::to_string_pretty(&redacted)?);
-        }
+        print_package_job_dry_run(&job, json)?;
         return Ok(());
     }
 
@@ -266,6 +247,30 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> {
     Ok(())
 }
 
+fn print_package_job_dry_run(job: &prepare::PrepareJob, json_output: bool) -> Result<()> {
+    let redacted = redacted_spec(&job.spec);
+    if json_output {
+        println!(
+            "{}",
+            serde_json::to_string_pretty(&json!({
+                "dryRun": true,
+                "confirmRequired": true,
+                "sourceRepo": job.source_repo,
+                "sourceFile": job.source_file,
+                "targetRepo": job.target_repo,
+                "modelId": job.model_id,
+                "jobPlan": job.job_plan,
+                "spec": redacted,
+            }))?
+        );
+    } else {
+        eprintln!();
+        eprintln!("🔍 Dry run — no HF Job was submitted. Add --confirm to submit.");
+        println!("{}", serde_json::to_string_pretty(&redacted)?);
+    }
+    Ok(())
+}
+
 async fn run_list_quants(
     client: &hf_hub::HFClient,
     source_repo: &str,
diff --git a/crates/mesh-llm-host-runtime/src/runtime/local.rs b/crates/mesh-llm-host-runtime/src/runtime/local.rs
index 7cf87304e..0c667ed05 100644
--- a/crates/mesh-llm-host-runtime/src/runtime/local.rs
+++ b/crates/mesh-llm-host-runtime/src/runtime/local.rs
@@ -5447,8 +5447,24 @@ max_tokens = 222
         );
     }
 
-    #[tokio::test]
-    async fn load_split_runtime_generation_stops_candidate_stages_after_partial_load_failure() {
+    #[test]
+    fn load_split_runtime_generation_stops_candidate_stages_after_partial_load_failure() {
+        std::thread::Builder::new()
+            .name("split-load-partial-failure-test".to_string())
+            .stack_size(16 * 1024 * 1024)
+            .spawn(|| {
+                tokio::runtime::Builder::new_current_thread()
+                    .enable_all()
+                    .build()
+                    .unwrap()
+                    .block_on(load_split_runtime_generation_partial_failure_case());
+            })
+            .unwrap()
+            .join()
+            .unwrap();
+    }
+
+    async fn load_split_runtime_generation_partial_failure_case() {
         let node = mesh::Node::new_for_tests(NodeRole::Host { http_port: 9337 })
             .await
             .unwrap();

From 926b5bb8c1545efc4ecc3d3d3af342c7d133d874 Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Fri, 5 Jun 2026 17:30:29 +1000
Subject: [PATCH 7/9] Fix package draft branch after main refactors

---
 crates/mesh-llm-config/src/model.rs             |  2 ++
 crates/mesh-llm-config/src/validate.rs          |  4 ++++
 crates/mesh-llm-host-runtime/src/runtime/mod.rs |  6 +++---
 crates/mesh-llm/src/commands/mod.rs             | 10 ++++++++++
 4 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/crates/mesh-llm-config/src/model.rs b/crates/mesh-llm-config/src/model.rs
index 59a65f0a0..2a81ff828 100644
--- a/crates/mesh-llm-config/src/model.rs
+++ b/crates/mesh-llm-config/src/model.rs
@@ -404,6 +404,8 @@ pub struct SpeculativeConfig {
     #[serde(default)]
     pub mode: Option<String>,
     #[serde(default)]
+    pub package_strategy: Option<String>,
+    #[serde(default)]
     pub draft_model_path: Option<String>,
     #[serde(default)]
     pub draft_hf_repo: Option<String>,
diff --git a/crates/mesh-llm-config/src/validate.rs b/crates/mesh-llm-config/src/validate.rs
index a379c27e3..ce5fd75fa 100644
--- a/crates/mesh-llm-config/src/validate.rs
+++ b/crates/mesh-llm-config/src/validate.rs
@@ -497,6 +497,10 @@ fn validate_speculative(config: &SpeculativeConfig, base_path: &str) -> Result<(
         &["auto", "disabled", "draft", "ngram"],
         &format!("{base_path}.mode"),
     )?;
+    validate_optional_non_empty(
+        config.package_strategy.as_deref(),
+        &format!("{base_path}.package_strategy"),
+    )?;
     validate_hf_pair(
         config.draft_hf_repo.as_deref(),
         config.draft_hf_file.as_deref(),
diff --git a/crates/mesh-llm-host-runtime/src/runtime/mod.rs b/crates/mesh-llm-host-runtime/src/runtime/mod.rs
index ae28756a8..a363f9a32 100644
--- a/crates/mesh-llm-host-runtime/src/runtime/mod.rs
+++ b/crates/mesh-llm-host-runtime/src/runtime/mod.rs
@@ -6786,7 +6786,7 @@ async fn spawn_run_auto_startup_model_tasks(
         n_ubatch: primary_n_ubatch,
         flash_attention: primary_flash_attention,
         parallel_override: primary_parallel_override,
-        package_speculative_defaults: !cli.no_draft,
+        package_speculative_defaults: !options.no_draft,
         openai_guardrail_policy: runtime_state.openai_guardrail_policy.clone(),
         split: options.split,
         skippy_telemetry: skippy_telemetry.clone(),
@@ -7086,7 +7086,7 @@ async fn run_auto_load_runtime_model(
                 .and_then(|m| m.flash_attention)
                 .unwrap_or(FlashAttentionType::Auto),
             parallel_override,
-            package_speculative_defaults: !ctx.cli.no_draft,
+            package_speculative_defaults: !ctx.options.no_draft,
             openai_guardrail_policy: ctx.openai_guardrail_policy.clone(),
             skippy_telemetry: skippy_telemetry_options(ctx.options),
             survey_telemetry: ctx.survey_telemetry.clone(),
@@ -7903,7 +7903,7 @@ async fn spawn_run_auto_additional_model_tasks(ctx: RunAutoAdditionalModelsConte
             n_ubatch: extra_model.n_ubatch,
             flash_attention: extra_model.flash_attention,
             parallel_override: extra_model.parallel.or(ctx.config.gpu.parallel),
-            package_speculative_defaults: !ctx.cli.no_draft,
+            package_speculative_defaults: !ctx.options.no_draft,
             openai_guardrail_policy: ctx.openai_guardrail_policy.clone(),
             split: ctx.options.split,
             skippy_telemetry: ctx.skippy_telemetry.clone(),
diff --git a/crates/mesh-llm/src/commands/mod.rs b/crates/mesh-llm/src/commands/mod.rs
index 86c16438b..5c835c720 100644
--- a/crates/mesh-llm/src/commands/mod.rs
+++ b/crates/mesh-llm/src/commands/mod.rs
@@ -103,6 +103,11 @@ async fn dispatch_model_prepare(cmd: &Command) -> Result<()> {
         quant,
         target,
         model_id,
+        spec_draft_model,
+        spec_strategy,
+        spec_initial_window,
+        spec_min_window,
+        spec_max_window,
         flavor,
         timeout,
         mesh_llm_ref,
@@ -126,6 +131,11 @@ async fn dispatch_model_prepare(cmd: &Command) -> Result<()> {
             quant: quant.as_deref(),
             target: target.as_deref(),
             model_id: model_id.as_deref(),
+            spec_draft_model: spec_draft_model.as_deref(),
+            spec_strategy,
+            spec_initial_window: *spec_initial_window,
+            spec_min_window: *spec_min_window,
+            spec_max_window: *spec_max_window,
             flavor,
             timeout,
             mesh_llm_ref,

From f56a90b1e2d81bda0cb298f297af6a26de019164 Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Fri, 5 Jun 2026 19:48:56 +1000
Subject: [PATCH 8/9] Fix native runtime loading for draft packages

---
 crates/mesh-llm-native-runtime/src/cache.rs | 47 +++++++++++++++-
 crates/skippy-ffi/src/lib.rs                | 52 +++++++++++++++++-
 crates/skippy-server/src/runtime_state.rs   | 60 ++++++++++++++++++++-
 scripts/package-native-runtime.sh           |  2 +-
 4 files changed, 156 insertions(+), 5 deletions(-)

diff --git a/crates/mesh-llm-native-runtime/src/cache.rs b/crates/mesh-llm-native-runtime/src/cache.rs
index 413e08591..d920626f9 100644
--- a/crates/mesh-llm-native-runtime/src/cache.rs
+++ b/crates/mesh-llm-native-runtime/src/cache.rs
@@ -203,7 +203,10 @@ fn copy_dir_recursive(source: &Path, target: &Path) -> Result<()> {
         let entry = entry?;
         let source_path = entry.path();
         let target_path = target.join(entry.file_name());
-        if entry.file_type()?.is_dir() {
+        let file_type = entry.file_type()?;
+        if file_type.is_symlink() {
+            copy_symlink(&source_path, &target_path)?;
+        } else if file_type.is_dir() {
             copy_dir_recursive(&source_path, &target_path)?;
         } else {
             fs::copy(&source_path, &target_path).with_context(|| {
@@ -218,6 +221,28 @@ fn copy_dir_recursive(source: &Path, target: &Path) -> Result<()> {
     Ok(())
 }
 
+#[cfg(unix)]
+fn copy_symlink(source: &Path, target: &Path) -> Result<()> {
+    let link = fs::read_link(source).with_context(|| format!("read link {}", source.display()))?;
+    std::os::unix::fs::symlink(&link, target)
+        .with_context(|| format!("symlink {} to {}", link.display(), target.display()))
+}
+
+#[cfg(windows)]
+fn copy_symlink(source: &Path, target: &Path) -> Result<()> {
+    let link = fs::read_link(source).with_context(|| format!("read link {}", source.display()))?;
+    let resolved = source
+        .parent()
+        .unwrap_or_else(|| Path::new("."))
+        .join(&link);
+    if resolved.is_dir() {
+        std::os::windows::fs::symlink_dir(&link, target)
+    } else {
+        std::os::windows::fs::symlink_file(&link, target)
+    }
+    .with_context(|| format!("symlink {} to {}", link.display(), target.display()))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -301,4 +326,24 @@ mod tests {
             ]
         );
     }
+
+    #[cfg(unix)]
+    #[test]
+    fn install_preserves_library_symlinks() {
+        let temp = tempfile::tempdir().unwrap();
+        let source = temp.path().join("source");
+        write_runtime(&source, "0.68.0", "meshllm-native-linux-x86_64-cpu");
+        std::os::unix::fs::symlink("libmeshllm_ffi.so", source.join("lib/libmeshllm_ffi.so.0"))
+            .unwrap();
+
+        let cache = NativeRuntimeCache::new(temp.path().join("cache"));
+        let installed = cache.install_from_dir(&source).unwrap();
+        let installed_alias = installed.path.join("lib/libmeshllm_ffi.so.0");
+
+        assert!(installed_alias.is_symlink());
+        assert_eq!(
+            fs::read_link(installed_alias).unwrap(),
+            Path::new("libmeshllm_ffi.so")
+        );
+    }
 }
diff --git a/crates/skippy-ffi/src/lib.rs b/crates/skippy-ffi/src/lib.rs
index 7ab529d57..91a7b9ccb 100644
--- a/crates/skippy-ffi/src/lib.rs
+++ b/crates/skippy-ffi/src/lib.rs
@@ -344,6 +344,8 @@ impl std::error::Error for NativeRuntimeLoadError {}
 mod dynamic {
     use super::*;
     use libloading::Library;
+    use std::collections::HashSet;
+    use std::path::{Path, PathBuf};
     use std::sync::OnceLock;
 
     static SYMBOLS: OnceLock<Symbols> = OnceLock::new();
@@ -392,12 +394,28 @@ mod dynamic {
             .into_iter()
             .map(|path| path.as_ref().to_path_buf())
             .collect::<Vec<_>>();
-        let symbols = unsafe { Symbols::load_paths(&collected) }?;
+        let symbols = unsafe { Symbols::load_paths(&unique_library_paths(&collected)) }?;
         SYMBOLS
             .set(symbols)
             .map_err(|_| NativeRuntimeLoadError::AlreadyLoaded)
     }
 
+    fn unique_library_paths<P>(paths: &[P]) -> Vec<PathBuf>
+    where
+        P: AsRef<Path>,
+    {
+        let mut seen = HashSet::new();
+        let mut unique = Vec::with_capacity(paths.len());
+        for path in paths {
+            let path = path.as_ref();
+            let identity = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
+            if seen.insert(identity) {
+                unique.push(path.to_path_buf());
+            }
+        }
+        unique
+    }
+
     fn symbols() -> &'static Symbols {
         SYMBOLS
             .get()
@@ -553,6 +571,38 @@ mod dynamic {
         mtmd_helper_eval_chunks(ctx: *mut MtmdContext, lctx: *mut Opaque, chunks: *const MtmdInputChunks, n_past: i32, seq_id: i32, n_batch: i32, logits_last: bool, new_n_past: *mut i32) -> c_int;
         mtmd_helper_eval_chunk_single(ctx: *mut MtmdContext, lctx: *mut Opaque, chunk: *const Opaque, n_past: i32, seq_id: i32, n_batch: i32, logits_last: bool, new_n_past: *mut i32) -> c_int;
     }
+
+    #[cfg(test)]
+    mod tests {
+        use super::unique_library_paths;
+        use std::fs;
+        use std::time::{SystemTime, UNIX_EPOCH};
+
+        #[cfg(unix)]
+        #[test]
+        fn unique_library_paths_deduplicates_symlinks() {
+            let root = std::env::temp_dir().join(format!(
+                "skippy-ffi-{}-{}",
+                std::process::id(),
+                SystemTime::now()
+                    .duration_since(UNIX_EPOCH)
+                    .unwrap()
+                    .as_nanos()
+            ));
+            fs::create_dir_all(&root).unwrap();
+            let real = root.join("libllama.0.0.dylib");
+            let soname = root.join("libllama.0.dylib");
+            let linker = root.join("libllama.dylib");
+            fs::write(&real, b"not a real dylib").unwrap();
+            std::os::unix::fs::symlink(&real, &soname).unwrap();
+            std::os::unix::fs::symlink(&real, &linker).unwrap();
+
+            let unique = unique_library_paths(&[&real, &soname, &linker]);
+
+            assert_eq!(unique, vec![real.clone()]);
+            fs::remove_dir_all(root).unwrap();
+        }
+    }
 }
 
 #[cfg(feature = "dynamic-runtime")]
diff --git a/crates/skippy-server/src/runtime_state.rs b/crates/skippy-server/src/runtime_state.rs
index ab44aeeb7..af467ad3f 100644
--- a/crates/skippy-server/src/runtime_state.rs
+++ b/crates/skippy-server/src/runtime_state.rs
@@ -904,6 +904,12 @@ fn runtime_config_from_stage_config(
         .map(u32::try_from)
         .transpose()
         .with_context(|| format!("n_threads_batch exceeds u32 for {}", config.stage_id))?;
+    let requires_tensor_filter = matches!(
+        config.load_mode,
+        LoadMode::LayerPackage | LoadMode::ArtifactSlice
+    ) || config.layer_start > 0
+        || config.downstream.is_some();
+
     Ok(RuntimeConfig {
         stage_index: config.stage_index,
         layer_start: config.layer_start,
@@ -934,7 +940,7 @@ fn runtime_config_from_stage_config(
         projector_path: config.projector_path.clone(),
         include_embeddings: config.layer_start == 0,
         include_output: config.downstream.is_none(),
-        filter_tensors_on_load: config.filter_tensors_on_load,
+        filter_tensors_on_load: config.filter_tensors_on_load || requires_tensor_filter,
     })
 }
 
@@ -1105,7 +1111,7 @@ mod tests {
             cache_type_k: "f16".to_string(),
             cache_type_v: "f16".to_string(),
             flash_attn_type: FlashAttentionType::Enabled,
-            filter_tensors_on_load: true,
+            filter_tensors_on_load: false,
             selected_device: Some(StageDevice {
                 backend_device: "Vulkan1".into(),
                 stable_id: Some("pci:0000:65:00.0".into()),
@@ -1186,6 +1192,7 @@ mod tests {
 
         assert!(!runtime_config.include_embeddings);
         assert!(runtime_config.include_output);
+        assert!(runtime_config.filter_tensors_on_load);
     }
 
     #[test]
@@ -1231,6 +1238,55 @@ mod tests {
         assert_eq!(runtime_config.n_threads_batch, None);
         assert_eq!(runtime_config.n_batch, None);
         assert_eq!(runtime_config.n_ubatch, None);
+        assert!(!runtime_config.filter_tensors_on_load);
+    }
+
+    #[test]
+    fn runtime_config_forces_tensor_filter_for_first_stage_with_downstream() {
+        let config = StageConfig {
+            run_id: "run-a".to_string(),
+            topology_id: "topology-a".to_string(),
+            model_id: "model-a".to_string(),
+            package_ref: Some("/tmp/package".to_string()),
+            manifest_sha256: Some("manifest".to_string()),
+            source_model_path: None,
+            source_model_sha256: None,
+            source_model_bytes: None,
+            materialized_path: None,
+            materialized_pinned: false,
+            model_path: Some("/tmp/package".to_string()),
+            projector_path: None,
+            stage_id: "stage-0".to_string(),
+            stage_index: 0,
+            layer_start: 0,
+            layer_end: 20,
+            ctx_size: 512,
+            lane_count: 1,
+            n_batch: None,
+            n_ubatch: None,
+            n_gpu_layers: -1,
+            cache_type_k: "f16".to_string(),
+            cache_type_v: "f16".to_string(),
+            flash_attn_type: FlashAttentionType::Auto,
+            filter_tensors_on_load: false,
+            selected_device: None,
+            kv_cache: None,
+            load_mode: LoadMode::RuntimeSlice,
+            bind_addr: "127.0.0.1:0".to_string(),
+            upstream: None,
+            downstream: Some(PeerConfig {
+                stage_id: "stage-1".to_string(),
+                stage_index: 1,
+                endpoint: "tcp://127.0.0.1:19001".to_string(),
+            }),
+        };
+
+        let runtime_config =
+            runtime_config_from_stage_config(&config, &RuntimeLaunchOverrides::default()).unwrap();
+
+        assert!(runtime_config.include_embeddings);
+        assert!(!runtime_config.include_output);
+        assert!(runtime_config.filter_tensors_on_load);
     }
 
     #[test]
diff --git a/scripts/package-native-runtime.sh b/scripts/package-native-runtime.sh
index 3786ae912..261865da2 100755
--- a/scripts/package-native-runtime.sh
+++ b/scripts/package-native-runtime.sh
@@ -288,7 +288,7 @@ mkdir -p "$stage_dir/lib"
 library_paths=()
 for library in "${runtime_libraries[@]}"; do
     name="$(basename "$library")"
-    cp "$library" "$stage_dir/lib/$name"
+    cp -P "$library" "$stage_dir/lib/$name"
     library_paths+=("lib/$name")
 done
 

From 63e3ee7be29dd1adb1b576a816da1a76fa8f599a Mon Sep 17 00:00:00 2001
From: James Dumay <jameswdumay@gmail.com>
Date: Sun, 7 Jun 2026 09:54:36 +1000
Subject: [PATCH 9/9] Tune speculative fallback for drafting

---
 .../src/frontend/embedded_generation.rs       | 14 +++-
 .../skippy-server/src/frontend/speculative.rs | 68 ++++++++++++++++--
 crates/skippy-server/src/frontend/tests.rs    | 72 +++++++++++++++++++
 3 files changed, 148 insertions(+), 6 deletions(-)

diff --git a/crates/skippy-server/src/frontend/embedded_generation.rs b/crates/skippy-server/src/frontend/embedded_generation.rs
index 847153510..020768a0c 100644
--- a/crates/skippy-server/src/frontend/embedded_generation.rs
+++ b/crates/skippy-server/src/frontend/embedded_generation.rs
@@ -718,7 +718,7 @@ impl StageOpenAiBackend {
             }
             let max_speculative_window = request.speculative_window.max(1);
             let mut adaptive_window = if request.adaptive_speculative_window {
-                max_speculative_window.min(4)
+                max_speculative_window.min(2)
             } else {
                 max_speculative_window
             };
@@ -767,6 +767,9 @@ impl StageOpenAiBackend {
                 if fused_reached_stop {
                     break;
                 }
+                if decoded_tokens >= request.max_tokens as usize {
+                    break;
+                }
                 if request
                     .cancellation
                     .is_some_and(openai_frontend::CancellationToken::is_cancelled)
@@ -774,6 +777,10 @@ impl StageOpenAiBackend {
                     break;
                 }
                 let token_timer = PhaseTimer::start();
+                if draft_guard.is_some() && speculative_stats.should_disable_for_request() {
+                    speculative_stats.mark_disabled_for_request();
+                    draft_guard = None;
+                }
                 if draft_guard.is_some() {
                     let remaining = request.max_tokens as usize - decoded_tokens;
                     if remaining == 0 {
@@ -985,6 +992,11 @@ impl StageOpenAiBackend {
                             }
                         }
 
+                        let remaining_commit_budget =
+                            (request.max_tokens as usize).saturating_sub(decoded_tokens);
+                        commit_tokens.truncate(remaining_commit_budget);
+                        speculative_stats.committed_tokens += commit_tokens.len();
+
                         let mut reached_stop = false;
                         for token in commit_tokens {
                             current = token;
diff --git a/crates/skippy-server/src/frontend/speculative.rs b/crates/skippy-server/src/frontend/speculative.rs
index b9791139f..f90b9e07c 100644
--- a/crates/skippy-server/src/frontend/speculative.rs
+++ b/crates/skippy-server/src/frontend/speculative.rs
@@ -1,9 +1,16 @@
 use super::*;
 
+const SPEC_FALLBACK_MIN_WINDOWS: usize = 4;
+const SPEC_FALLBACK_MIN_DRAFT_TOKENS: usize = 8;
+const SPEC_FALLBACK_ACCEPT_RATE: f64 = 0.75;
+const SPEC_FALLBACK_REPAIR_RATE: f64 = 0.50;
+const SPEC_FALLBACK_MAX_MS_PER_COMMITTED_TOKEN: f64 = 220.0;
+
 #[derive(Default)]
 pub(super) struct OpenAiSpeculativeStats {
     pub(super) windows: usize,
     pub(super) draft_tokens: usize,
+    pub(super) committed_tokens: usize,
     pub(super) accepted_tokens: usize,
     pub(super) rejected_tokens: usize,
     pub(super) full_accept_windows: usize,
@@ -47,9 +54,26 @@ pub(super) struct OpenAiSpeculativeStats {
     pub(super) adaptive_window_grows: usize,
     pub(super) adaptive_window_shrinks: usize,
     pub(super) adaptive_window_enabled: bool,
+    pub(super) disabled_for_request: bool,
 }
 
 impl OpenAiSpeculativeStats {
+    pub(super) fn should_disable_for_request(&self) -> bool {
+        if self.disabled_for_request
+            || self.windows < SPEC_FALLBACK_MIN_WINDOWS
+            || self.draft_tokens < SPEC_FALLBACK_MIN_DRAFT_TOKENS
+        {
+            return false;
+        }
+        self.accept_rate() < SPEC_FALLBACK_ACCEPT_RATE
+            || self.repair_rate() >= SPEC_FALLBACK_REPAIR_RATE
+            || self.speculative_ms_per_committed_token() > SPEC_FALLBACK_MAX_MS_PER_COMMITTED_TOKEN
+    }
+
+    pub(super) fn mark_disabled_for_request(&mut self) {
+        self.disabled_for_request = true;
+    }
+
     pub(super) fn observe_verify_decision(
         &mut self,
         decision: VerifySpanDecision,
@@ -99,6 +123,32 @@ impl OpenAiSpeculativeStats {
         }
     }
 
+    fn accept_rate(&self) -> f64 {
+        if self.draft_tokens == 0 {
+            0.0
+        } else {
+            self.accepted_tokens as f64 / self.draft_tokens as f64
+        }
+    }
+
+    fn repair_rate(&self) -> f64 {
+        if self.windows == 0 {
+            0.0
+        } else {
+            self.repair_required_windows as f64 / self.windows as f64
+        }
+    }
+
+    fn speculative_ms_per_committed_token(&self) -> f64 {
+        if self.committed_tokens == 0 {
+            0.0
+        } else {
+            let elapsed_ms =
+                self.draft_propose_ms + self.primary_verify_elapsed_ms + self.recovery_ms;
+            elapsed_ms / self.committed_tokens as f64
+        }
+    }
+
     pub(super) fn observe_reject(&mut self, decision: VerifySpanDecision) {
         if let Some(repair_input_count) = decision.repair_input_count {
             self.rejected_windows += 1;
@@ -155,17 +205,21 @@ impl OpenAiSpeculativeStats {
             "llama_stage.spec.accepted".to_string(),
             json!(self.accepted_tokens),
         );
+        attrs.insert(
+            "llama_stage.spec.committed".to_string(),
+            json!(self.committed_tokens),
+        );
         attrs.insert(
             "llama_stage.spec.rejected".to_string(),
             json!(self.rejected_tokens),
         );
         attrs.insert(
             "llama_stage.spec.accept_rate".to_string(),
-            json!(if self.draft_tokens == 0 {
-                0.0
-            } else {
-                self.accepted_tokens as f64 / self.draft_tokens as f64
-            }),
+            json!(self.accept_rate()),
+        );
+        attrs.insert(
+            "llama_stage.spec.ms_per_committed_token".to_string(),
+            json!(self.speculative_ms_per_committed_token()),
         );
         attrs.insert(
             "llama_stage.spec.full_accept_windows".to_string(),
@@ -291,6 +345,10 @@ impl OpenAiSpeculativeStats {
             "llama_stage.spec.window_shrinks".to_string(),
             json!(self.adaptive_window_shrinks),
         );
+        attrs.insert(
+            "llama_stage.spec.disabled_for_request".to_string(),
+            json!(self.disabled_for_request),
+        );
     }
 }
 
diff --git a/crates/skippy-server/src/frontend/tests.rs b/crates/skippy-server/src/frontend/tests.rs
index b148bd4bf..418c258cd 100644
--- a/crates/skippy-server/src/frontend/tests.rs
+++ b/crates/skippy-server/src/frontend/tests.rs
@@ -28,6 +28,78 @@ const MM_CTX_SIZE_ENV: &str = "SKIPPY_MM_CTX_SIZE";
 const MM_MAX_TOKENS_ENV: &str = "SKIPPY_MM_MAX_TOKENS";
 const MM_N_GPU_LAYERS_ENV: &str = "SKIPPY_MM_N_GPU_LAYERS";
 
+#[test]
+fn speculative_fallback_waits_for_enough_evidence() {
+    let stats = OpenAiSpeculativeStats {
+        windows: 3,
+        draft_tokens: 7,
+        accepted_tokens: 0,
+        repair_required_windows: 3,
+        ..OpenAiSpeculativeStats::default()
+    };
+
+    assert!(!stats.should_disable_for_request());
+}
+
+#[test]
+fn speculative_fallback_disables_low_acceptance_requests() {
+    let mut stats = OpenAiSpeculativeStats {
+        windows: 4,
+        draft_tokens: 8,
+        committed_tokens: 5,
+        accepted_tokens: 5,
+        repair_required_windows: 0,
+        ..OpenAiSpeculativeStats::default()
+    };
+
+    assert!(stats.should_disable_for_request());
+    stats.mark_disabled_for_request();
+    assert!(!stats.should_disable_for_request());
+}
+
+#[test]
+fn speculative_fallback_disables_repair_heavy_requests() {
+    let stats = OpenAiSpeculativeStats {
+        windows: 4,
+        draft_tokens: 8,
+        committed_tokens: 8,
+        accepted_tokens: 8,
+        repair_required_windows: 2,
+        ..OpenAiSpeculativeStats::default()
+    };
+
+    assert!(stats.should_disable_for_request());
+}
+
+#[test]
+fn speculative_fallback_keeps_high_acceptance_requests() {
+    let stats = OpenAiSpeculativeStats {
+        windows: 4,
+        draft_tokens: 8,
+        committed_tokens: 8,
+        accepted_tokens: 8,
+        repair_required_windows: 1,
+        primary_verify_elapsed_ms: 1_600.0,
+        ..OpenAiSpeculativeStats::default()
+    };
+
+    assert!(!stats.should_disable_for_request());
+}
+
+#[test]
+fn speculative_fallback_disables_slow_accepted_requests() {
+    let stats = OpenAiSpeculativeStats {
+        windows: 4,
+        draft_tokens: 8,
+        committed_tokens: 8,
+        accepted_tokens: 8,
+        primary_verify_elapsed_ms: 1_900.0,
+        ..OpenAiSpeculativeStats::default()
+    };
+
+    assert!(stats.should_disable_for_request());
+}
+
 #[test]
 fn proactive_eviction_attrs_are_bounded_and_request_free() {
     let attrs = proactive_eviction_attrs("error", Some("inactive_session"), 1024, 2, 768);