From 0c1c14d5dadb29810bbdc5cfae782704c25a01f8 Mon Sep 17 00:00:00 2001 From: James Dumay Date: Wed, 27 May 2026 15:15:50 +1000 Subject: [PATCH 1/9] Enable package-declared draft speculation --- crates/mesh-llm-cli/src/models.rs | 15 + crates/mesh-llm-cli/src/parser.rs | 20 + crates/mesh-llm-commands/src/model_package.rs | 15 + .../src/inference/skippy/mod.rs | 1 + .../src/inference/skippy/package.rs | 4 + .../inference/skippy/resolver/speculative.rs | 6 + .../src/inference/skippy/resolver/tests.rs | 1 + .../inference/skippy/resolver/translation.rs | 2 +- .../src/inference/skippy/resolver/types.rs | 2 + .../src/plugin/config.rs | 5 +- .../src/runtime/local.rs | 129 ++++++- .../src/runtime/split_planning.rs | 1 + crates/mesh-llm/src/commands/models/mod.rs | 10 + crates/model-package/src/prepare.rs | 127 +++++- crates/model-package/src/script.rs | 4 + .../src/scripts/split-model-job.sh | 18 +- crates/skippy-model-package/README.md | 12 + crates/skippy-model-package/src/main.rs | 188 ++++++++- crates/skippy-runtime/src/package.rs | 197 ++++++++++ docs/LAYER_PACKAGE_REPOS.md | 8 + docs/MTP.md | 362 ++++++++++++++++++ docs/USAGE.md | 3 +- docs/specs/layer-package-repos.md | 61 +++ 23 files changed, 1178 insertions(+), 13 deletions(-) create mode 100644 docs/MTP.md diff --git a/crates/mesh-llm-cli/src/models.rs b/crates/mesh-llm-cli/src/models.rs index 95fbe8a2c..8895cffa0 100644 --- a/crates/mesh-llm-cli/src/models.rs +++ b/crates/mesh-llm-cli/src/models.rs @@ -30,6 +30,21 @@ pub enum ModelsCommand { /// Override model ID in the manifest. #[arg(long)] model_id: Option, + /// Draft model ref to declare for package default speculative decoding. + #[arg(long = "spec-draft-model")] + spec_draft_model: Option, + /// Package speculative strategy name. + #[arg(long = "spec-strategy", default_value = "draft")] + spec_strategy: String, + /// Initial adaptive speculative decode window for the package strategy. + #[arg(long = "spec-initial-window", default_value_t = 16)] + spec_initial_window: u32, + /// Minimum adaptive speculative decode window for the package strategy. + #[arg(long = "spec-min-window", default_value_t = 2)] + spec_min_window: u32, + /// Maximum adaptive speculative decode window for the package strategy. + #[arg(long = "spec-max-window", default_value_t = 16)] + spec_max_window: u32, /// HF Job hardware flavor. Use auto for the default CPU splitter baseline. #[arg(long, default_value = "auto")] flavor: String, diff --git a/crates/mesh-llm-cli/src/parser.rs b/crates/mesh-llm-cli/src/parser.rs index 350f20e8e..f30b3d510 100644 --- a/crates/mesh-llm-cli/src/parser.rs +++ b/crates/mesh-llm-cli/src/parser.rs @@ -859,6 +859,26 @@ pub enum Command { #[arg(long)] model_id: Option, + /// Draft model ref to declare for package default speculative decoding. + #[arg(long = "spec-draft-model")] + spec_draft_model: Option, + + /// Package speculative strategy name. + #[arg(long = "spec-strategy", default_value = "draft")] + spec_strategy: String, + + /// Initial adaptive speculative decode window for the package strategy. + #[arg(long = "spec-initial-window", default_value_t = 16)] + spec_initial_window: u32, + + /// Minimum adaptive speculative decode window for the package strategy. + #[arg(long = "spec-min-window", default_value_t = 2)] + spec_min_window: u32, + + /// Maximum adaptive speculative decode window for the package strategy. + #[arg(long = "spec-max-window", default_value_t = 16)] + spec_max_window: u32, + /// HF Job hardware flavor. Use auto for the default CPU splitter baseline. #[arg(long, default_value = "auto")] flavor: String, diff --git a/crates/mesh-llm-commands/src/model_package.rs b/crates/mesh-llm-commands/src/model_package.rs index 15c52163c..8377cb429 100644 --- a/crates/mesh-llm-commands/src/model_package.rs +++ b/crates/mesh-llm-commands/src/model_package.rs @@ -13,6 +13,11 @@ pub struct ModelPrepareArgs<'a> { pub quant: Option<&'a str>, pub target: Option<&'a str>, pub model_id: Option<&'a str>, + pub spec_draft_model: Option<&'a str>, + pub spec_strategy: &'a str, + pub spec_initial_window: u32, + pub spec_min_window: u32, + pub spec_max_window: u32, pub flavor: &'a str, pub timeout: &'a str, pub mesh_llm_ref: &'a str, @@ -34,6 +39,11 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> { quant, target, model_id, + spec_draft_model, + spec_strategy, + spec_initial_window, + spec_min_window, + spec_max_window, flavor, timeout, mesh_llm_ref, @@ -119,6 +129,11 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> { quant: source_quant.map(|s| s.to_string()), target: target.map(|s| s.to_string()), model_id: model_id.map(|s| s.to_string()), + spec_draft_model: spec_draft_model.map(|s| s.to_string()), + spec_strategy: spec_strategy.to_string(), + spec_initial_window, + spec_min_window, + spec_max_window, flavor: flavor.to_string(), timeout_seconds, mesh_llm_ref: mesh_llm_ref.to_string(), diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs index 04cfb4e55..c9fbb6b0c 100644 --- a/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs +++ b/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs @@ -1017,6 +1017,7 @@ mod tests { layer_count, activation_width: 4096, tensor_count: 100, + speculative_decoding: None, } } diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/package.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/package.rs index 94d338f11..de6c94d77 100644 --- a/crates/mesh-llm-host-runtime/src/inference/skippy/package.rs +++ b/crates/mesh-llm-host-runtime/src/inference/skippy/package.rs @@ -7,6 +7,7 @@ use std::{ use anyhow::{Context, Result}; use serde::Serialize; use sha2::{Digest, Sha256}; +use skippy_runtime::package::SpeculativeDecodingConfig; #[derive(Clone, Debug, Eq, PartialEq)] pub struct SkippyPackageIdentity { @@ -19,6 +20,7 @@ pub struct SkippyPackageIdentity { pub layer_count: u32, pub activation_width: u32, pub tensor_count: u64, + pub speculative_decoding: Option, } #[derive(Clone, Debug, Eq, PartialEq, Serialize)] @@ -109,6 +111,7 @@ pub fn synthetic_direct_gguf_package( layer_count: compact.layer_count, activation_width: compact.embedding_size, tensor_count, + speculative_decoding: None, }) } @@ -328,6 +331,7 @@ pub fn identity_from_layer_package(package_ref: &str) -> Result SkippyPackageIdentity { layer_count, activation_width: 4096, tensor_count: 100, + speculative_decoding: None, } } diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/translation.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/translation.rs index d963f2fae..bc788d795 100644 --- a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/translation.rs +++ b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/translation.rs @@ -234,7 +234,7 @@ impl ResolvedSkippyConfig { } else { 0 }, - adaptive_speculative_window: false, + adaptive_speculative_window: mode == "draft" && self.speculative.adaptive_window, draft_n_gpu_layers: if mode == "draft" { self.speculative.draft_n_gpu_layers } else { diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/types.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/types.rs index 6e59fbb1c..98688d8c9 100644 --- a/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/types.rs +++ b/crates/mesh-llm-host-runtime/src/inference/skippy/resolver/types.rs @@ -88,9 +88,11 @@ pub(crate) struct ResolvedSkippyExecutionConfig { #[derive(Clone, Debug, PartialEq)] pub(crate) struct ResolvedSpeculativeConfig { pub(crate) mode: String, + pub(crate) package_strategy: Option, pub(crate) draft_model_path: Option, pub(crate) pairing_fault: String, pub(crate) draft_max_tokens: u32, + pub(crate) adaptive_window: bool, pub(crate) explicit: bool, pub(crate) draft_n_gpu_layers: Option, } diff --git a/crates/mesh-llm-host-runtime/src/plugin/config.rs b/crates/mesh-llm-host-runtime/src/plugin/config.rs index ee85080cc..19a1cbf37 100644 --- a/crates/mesh-llm-host-runtime/src/plugin/config.rs +++ b/crates/mesh-llm-host-runtime/src/plugin/config.rs @@ -16,8 +16,9 @@ pub use mesh_llm_config::{ MultimodalConfig, OwnerControlConfig, PluginConfigEditor, PluginConfigEntry, PluginStartupConfig, PrefixCacheConfig, ReasoningBudget, ReasoningEnabled, RequestDefaultsConfig, ReservedObjectConfig, SkippyConfig, SpeculativeConfig, - StringOrStringList, TelemetryConfig, TelemetryMetricsConfig, TensorSplitConfig, - ThroughputConfig, config_path, config_to_toml, load_config, parse_config_toml, validate_config, + SpeculativeConfigEditor, StringOrStringList, TelemetryConfig, TelemetryMetricsConfig, + TensorSplitConfig, ThroughputConfig, config_path, config_to_toml, load_config, + parse_config_toml, validate_config, }; use mesh_llm_plugin::MeshVisibility; use std::collections::BTreeMap; diff --git a/crates/mesh-llm-host-runtime/src/runtime/local.rs b/crates/mesh-llm-host-runtime/src/runtime/local.rs index 70ce51505..d96f6e8b0 100644 --- a/crates/mesh-llm-host-runtime/src/runtime/local.rs +++ b/crates/mesh-llm-host-runtime/src/runtime/local.rs @@ -1205,7 +1205,7 @@ async fn load_split_runtime_generation_inner( spec: &SplitGenerationLoadSpec<'_>, cleanup_on_error: &mut bool, ) -> Result { - let settings = split_generation_load_settings(spec)?; + let settings = split_generation_load_settings(spec).await?; anyhow::ensure!( settings.stage0.node_id == spec.node.id(), "split topology stage 0 moved to {}; local coordinator is {}", @@ -1512,7 +1512,7 @@ fn split_runtime_stage_upstream( }) } -fn split_generation_load_settings<'a>( +async fn split_generation_load_settings<'a>( spec: &'a SplitGenerationLoadSpec<'_>, ) -> Result> { let stage0 = spec @@ -1554,6 +1554,7 @@ fn split_generation_load_settings<'a>( if let Some(gpu) = spec.pinned_gpu { resolved.hardware.device = Some(gpu.backend_device.clone()); } + apply_package_speculative_defaults(&mut resolved, spec.package).await?; let embedded_openai = resolved.to_embedded_openai_args(activation_width, true)?; let runtime_options = resolved.to_embedded_runtime_options( &spec.skippy_telemetry, @@ -1576,6 +1577,97 @@ fn split_generation_load_settings<'a>( }) } +async fn apply_package_speculative_defaults( + resolved: &mut skippy::ResolvedSkippyConfig, + package: &skippy::SkippyPackageIdentity, +) -> Result<()> { + if resolved.speculative.explicit || resolved.speculative.mode == "draft" { + return Ok(()); + } + let Some(config) = package.speculative_decoding.as_ref() else { + return Ok(()); + }; + let Some(strategy_name) = selected_package_speculative_strategy(config, resolved) else { + return Ok(()); + }; + let Some(strategy) = config.strategies.get(&strategy_name) else { + anyhow::bail!( + "speculative decoding strategy '{strategy_name}' is not declared by package {}", + package.package_ref + ); + }; + if strategy.strategy_type != "draft-model" { + tracing::warn!( + package_ref = package.package_ref, + strategy = strategy_name, + strategy_type = strategy.strategy_type, + "package speculative strategy is not supported by the embedded Skippy runtime" + ); + return Ok(()); + } + let draft_ref = strategy + .draft_model + .as_deref() + .context("draft-model speculative strategy is missing draft_model")?; + let draft_path = match models::download_model_ref_with_progress_details(draft_ref, true).await { + Ok((path, _)) => path, + Err(error) => { + tracing::warn!( + package_ref = package.package_ref, + strategy = strategy_name, + draft_ref, + error = %error, + "package draft model could not be resolved; starting without speculative decoding" + ); + return Ok(()); + } + }; + let window = speculative_strategy_window(strategy)?; + let adaptive = strategy + .window_policy + .as_ref() + .and_then(|policy| policy.default.as_deref()) + == Some("adaptive"); + resolved.speculative.mode = "draft".to_string(); + resolved.speculative.draft_model_path = Some(draft_path.clone()); + resolved.speculative.draft_max_tokens = window; + resolved.speculative.adaptive_window = adaptive; + resolved.speculative.draft_n_gpu_layers = None; + tracing::info!( + package_ref = package.package_ref, + strategy = strategy_name, + draft_ref, + draft_path = %draft_path.display(), + window, + adaptive, + "enabled package-declared draft speculative decoding" + ); + Ok(()) +} + +fn selected_package_speculative_strategy( + config: &skippy_runtime::package::SpeculativeDecodingConfig, + resolved: &skippy::ResolvedSkippyConfig, +) -> Option { + match resolved.speculative.package_strategy.as_deref() { + Some("default") | None => config.default.clone(), + Some(strategy) => Some(strategy.to_string()), + } +} + +fn speculative_strategy_window( + strategy: &skippy_runtime::package::SpeculativeStrategyConfig, +) -> Result { + let Some(policy) = strategy.window_policy.as_ref() else { + return Ok(0); + }; + Ok(policy + .initial_window + .or(policy.fixed_default_window) + .or(policy.max_window) + .unwrap_or(0)) +} + fn split_generation_load_mode(package: &skippy::SkippyPackageIdentity) -> LoadMode { if skippy::is_layer_package_ref(&package.package_ref) { LoadMode::LayerPackage @@ -3629,7 +3721,7 @@ async fn start_runtime_layer_package_model( )> { let context_length = plan.context_length; let fallback_projector_path = mmproj_path_for_model(&model_name).filter(|path| path.exists()); - let resolved = resolve_runtime_skippy_config( + let mut resolved = resolve_runtime_skippy_config( &spec, &model_name, package.source_model_bytes, @@ -3637,6 +3729,7 @@ async fn start_runtime_layer_package_model( plan.slots, fallback_projector_path, )?; + apply_package_speculative_defaults(&mut resolved, &package).await?; tracing::info!( model = model_name, "KV cache: {} K + {} V, {}K context", @@ -3809,9 +3902,34 @@ mod tests { layer_count, activation_width: 2048, tensor_count: 100, + speculative_decoding: None, + } + } + + fn draft_spec_strategy() -> skippy_runtime::package::SpeculativeStrategyConfig { + skippy_runtime::package::SpeculativeStrategyConfig { + strategy_type: "draft-model".to_string(), + draft_model: Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abc123".to_string()), + model_family: None, + window_policy: Some(skippy_runtime::package::SpeculativeWindowPolicyConfig { + default: Some("adaptive".to_string()), + fixed_default_window: None, + initial_window: Some(16), + min_window: Some(2), + max_window: Some(16), + adaptive: None, + }), + identity: None, } } + #[test] + fn package_speculative_window_uses_adaptive_initial_window() { + let strategy = draft_spec_strategy(); + + assert_eq!(speculative_strategy_window(&strategy).unwrap(), 16); + } + fn stage_load_request(load_mode: LoadMode) -> skippy::StageLoadRequest { skippy::StageLoadRequest { topology_id: "topology-a".to_string(), @@ -4177,8 +4295,9 @@ stop = ["END"] skippy_telemetry: skippy::SkippyTelemetryOptions::off(), survey_telemetry: survey::SurveyTelemetry::disabled(), }; - let settings = - split_generation_load_settings(&spec).expect("split settings should resolve"); + let settings = split_generation_load_settings(&spec) + .await + .expect("split settings should resolve"); assert_eq!(settings.load_mode, LoadMode::LayerPackage); assert_eq!(settings.activation_width, 2048); diff --git a/crates/mesh-llm-host-runtime/src/runtime/split_planning.rs b/crates/mesh-llm-host-runtime/src/runtime/split_planning.rs index eb86476cf..192fe65a3 100644 --- a/crates/mesh-llm-host-runtime/src/runtime/split_planning.rs +++ b/crates/mesh-llm-host-runtime/src/runtime/split_planning.rs @@ -551,6 +551,7 @@ mod tests { layer_count, activation_width: 896, tensor_count: 100, + speculative_decoding: None, } } diff --git a/crates/mesh-llm/src/commands/models/mod.rs b/crates/mesh-llm/src/commands/models/mod.rs index e8ad7a9e0..1d293d1c9 100644 --- a/crates/mesh-llm/src/commands/models/mod.rs +++ b/crates/mesh-llm/src/commands/models/mod.rs @@ -462,6 +462,11 @@ pub async fn dispatch_models_command(command: &ModelsCommand) -> Result<()> { quant, target, model_id, + spec_draft_model, + spec_strategy, + spec_initial_window, + spec_min_window, + spec_max_window, flavor, timeout, mesh_llm_ref, @@ -481,6 +486,11 @@ pub async fn dispatch_models_command(command: &ModelsCommand) -> Result<()> { quant: quant.as_deref(), target: target.as_deref(), model_id: model_id.as_deref(), + spec_draft_model: spec_draft_model.as_deref(), + spec_strategy, + spec_initial_window: *spec_initial_window, + spec_min_window: *spec_min_window, + spec_max_window: *spec_max_window, flavor, timeout, mesh_llm_ref, diff --git a/crates/model-package/src/prepare.rs b/crates/model-package/src/prepare.rs index 982936485..847865fe6 100644 --- a/crates/model-package/src/prepare.rs +++ b/crates/model-package/src/prepare.rs @@ -1,6 +1,6 @@ use std::collections::HashMap; -use anyhow::{Context, Result}; +use anyhow::{Context, Result, ensure}; use futures::StreamExt; use hf_hub::HFClient; use hf_hub::repository::RepoTreeEntry; @@ -19,6 +19,11 @@ pub struct PrepareParams { pub quant: Option, pub target: Option, pub model_id: Option, + pub spec_draft_model: Option, + pub spec_strategy: String, + pub spec_initial_window: u32, + pub spec_min_window: u32, + pub spec_max_window: u32, pub flavor: String, pub timeout_seconds: u64, pub mesh_llm_ref: String, @@ -117,6 +122,8 @@ pub async fn resolve( params: PrepareParams, permissions: &PermissionCheck, ) -> Result { + validate_speculative_params(¶ms)?; + let quant = params .quant .as_deref() @@ -193,6 +200,16 @@ pub async fn resolve( environment.insert("MODEL_ID".into(), model_id.clone()); environment.insert("SOURCE_REVISION".into(), "main".into()); environment.insert("MESH_LLM_REF".into(), params.mesh_llm_ref.clone()); + if let Some(spec_draft_model) = params.spec_draft_model.as_deref() { + environment.insert("SPEC_DRAFT_MODEL".into(), spec_draft_model.to_string()); + environment.insert("SPEC_STRATEGY".into(), params.spec_strategy.clone()); + environment.insert( + "SPEC_INITIAL_WINDOW".into(), + params.spec_initial_window.to_string(), + ); + environment.insert("SPEC_MIN_WINDOW".into(), params.spec_min_window.to_string()); + environment.insert("SPEC_MAX_WINDOW".into(), params.spec_max_window.to_string()); + } environment.insert( "CATALOG_CREATE_PR".into(), if permissions.catalog_create_pr { @@ -248,6 +265,41 @@ pub async fn resolve( }) } +fn validate_speculative_params(params: &PrepareParams) -> Result<()> { + let Some(draft_model) = params.spec_draft_model.as_deref() else { + return Ok(()); + }; + ensure!( + !draft_model.trim().is_empty(), + "--spec-draft-model must not be empty when set" + ); + ensure!( + !params.spec_strategy.trim().is_empty(), + "--spec-strategy must not be empty" + ); + ensure!( + params.spec_min_window > 0, + "--spec-min-window must be greater than zero" + ); + ensure!( + params.spec_initial_window > 0, + "--spec-initial-window must be greater than zero" + ); + ensure!( + params.spec_max_window > 0, + "--spec-max-window must be greater than zero" + ); + ensure!( + params.spec_min_window <= params.spec_initial_window, + "--spec-min-window must not exceed --spec-initial-window" + ); + ensure!( + params.spec_initial_window <= params.spec_max_window, + "--spec-initial-window must not exceed --spec-max-window" + ); + Ok(()) +} + /// Format a byte count as a human-readable size. pub fn format_size(bytes: u64) -> String { const KB: u64 = 1024; @@ -313,4 +365,77 @@ mod tests { .collect::>(); assert_eq!(names, vec!["Q4_K_M", "Q8_0"]); } + + #[test] + fn speculative_env_is_present_only_when_draft_model_is_set() { + let params = PrepareParams { + source_repo: "unsloth/Llama-3.3-70B-Instruct-GGUF".to_string(), + quant: Some("Q3_K_M".to_string()), + target: None, + model_id: None, + spec_draft_model: Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M".to_string()), + spec_strategy: "llama32-1b-q4".to_string(), + spec_initial_window: 16, + spec_min_window: 2, + spec_max_window: 16, + flavor: "auto".to_string(), + timeout_seconds: 3600, + mesh_llm_ref: "main".to_string(), + hf_token: None, + }; + + let mut environment = HashMap::new(); + if let Some(spec_draft_model) = params.spec_draft_model.as_deref() { + environment.insert("SPEC_DRAFT_MODEL".to_string(), spec_draft_model.to_string()); + environment.insert("SPEC_STRATEGY".to_string(), params.spec_strategy.clone()); + environment.insert( + "SPEC_INITIAL_WINDOW".to_string(), + params.spec_initial_window.to_string(), + ); + environment.insert( + "SPEC_MIN_WINDOW".to_string(), + params.spec_min_window.to_string(), + ); + environment.insert( + "SPEC_MAX_WINDOW".to_string(), + params.spec_max_window.to_string(), + ); + } + + assert_eq!( + environment.get("SPEC_DRAFT_MODEL").map(String::as_str), + Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M") + ); + assert_eq!( + environment.get("SPEC_STRATEGY").map(String::as_str), + Some("llama32-1b-q4") + ); + assert_eq!( + environment.get("SPEC_INITIAL_WINDOW").map(String::as_str), + Some("16") + ); + validate_speculative_params(¶ms).unwrap(); + } + + #[test] + fn speculative_window_validation_rejects_inverted_range() { + let params = PrepareParams { + source_repo: "unsloth/Llama-3.3-70B-Instruct-GGUF".to_string(), + quant: Some("Q3_K_M".to_string()), + target: None, + model_id: None, + spec_draft_model: Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M".to_string()), + spec_strategy: "draft".to_string(), + spec_initial_window: 1, + spec_min_window: 2, + spec_max_window: 16, + flavor: "auto".to_string(), + timeout_seconds: 3600, + mesh_llm_ref: "main".to_string(), + hf_token: None, + }; + + let err = validate_speculative_params(¶ms).unwrap_err(); + assert!(err.to_string().contains("--spec-min-window")); + } } diff --git a/crates/model-package/src/script.rs b/crates/model-package/src/script.rs index 6a5ae549e..ec1fa6297 100644 --- a/crates/model-package/src/script.rs +++ b/crates/model-package/src/script.rs @@ -196,6 +196,10 @@ mod tests { assert!(EMBEDDED_SCRIPT.contains(r#"MOUNTED_SOURCE_PATH="/source/${SOURCE_FILE}""#)); assert!(EMBEDDED_SCRIPT.contains(r#"WRITE_PACKAGE_INPUT="$MOUNTED_SOURCE_PATH""#)); assert!(EMBEDDED_SCRIPT.contains(r#"--source-file "$SOURCE_FILE""#)); + assert!(EMBEDDED_SCRIPT.contains(r#"WRITE_PACKAGE_SPEC_ARGS=()"#)); + assert!(EMBEDDED_SCRIPT.contains(r#"--spec-draft-model "$SPEC_DRAFT_MODEL""#)); + assert!(EMBEDDED_SCRIPT.contains(r#"--spec-strategy "${SPEC_STRATEGY:-draft}""#)); + assert!(EMBEDDED_SCRIPT.contains(r#"--spec-initial-window "${SPEC_INITIAL_WINDOW:-16}""#)); assert!(EMBEDDED_SCRIPT.contains(r#"time "$SLICER" write-package "$WRITE_PACKAGE_INPUT""#)); assert!(!EMBEDDED_SCRIPT.contains(r#"time $SLICER write-package "$SOURCE_PATH""#)); } diff --git a/crates/model-package/src/scripts/split-model-job.sh b/crates/model-package/src/scripts/split-model-job.sh index 5a44bfd61..ef0e7a21c 100755 --- a/crates/model-package/src/scripts/split-model-job.sh +++ b/crates/model-package/src/scripts/split-model-job.sh @@ -6,6 +6,8 @@ set -euo pipefail # # Environment variables (set by mesh-llm model-package job spec): # SOURCE_REPO, SOURCE_FILE, SOURCE_QUANT, TARGET_REPO, MODEL_ID, SOURCE_REVISION +# SPEC_DRAFT_MODEL, SPEC_STRATEGY, SPEC_INITIAL_WINDOW, SPEC_MIN_WINDOW, +# SPEC_MAX_WINDOW — optional package-declared draft-model speculation # MESH_LLM_REF — git ref to build from (default: main) # CATALOG_CREATE_PR — "true" to open a PR for catalog updates (non-org members) # HF_TOKEN — injected as a secret by HF Jobs @@ -32,6 +34,9 @@ echo "║ Quant: ${SOURCE_QUANT}" echo "║ Target: ${TARGET_REPO}" echo "║ Model: ${MODEL_ID}" echo "║ Build: mesh-llm @ ${MESH_LLM_REF}" +if [ -n "${SPEC_DRAFT_MODEL:-}" ]; then + echo "║ Draft: ${SPEC_DRAFT_MODEL}" +fi echo "╚══════════════════════════════════════════════════════════╝" echo "" @@ -287,6 +292,16 @@ else WRITE_PACKAGE_IDENTITY_ARGS=() echo " Source mount: not available; falling back to Hugging Face cache download" fi +WRITE_PACKAGE_SPEC_ARGS=() +if [ -n "${SPEC_DRAFT_MODEL:-}" ]; then + WRITE_PACKAGE_SPEC_ARGS=( + --spec-draft-model "$SPEC_DRAFT_MODEL" + --spec-strategy "${SPEC_STRATEGY:-draft}" + --spec-initial-window "${SPEC_INITIAL_WINDOW:-16}" + --spec-min-window "${SPEC_MIN_WINDOW:-2}" + --spec-max-window "${SPEC_MAX_WINDOW:-16}" + ) +fi echo " Hugging Face cache: $HF_HUB_CACHE" echo " Package workspace: $PACKAGE_DIR" echo " Temporary workspace: $TMPDIR" @@ -309,7 +324,8 @@ set +e time "$SLICER" write-package "$WRITE_PACKAGE_INPUT" \ --out-dir "$PACKAGE_DIR" \ --after-artifact-command "$ARTIFACT_UPLOAD_HOOK" \ - "${WRITE_PACKAGE_IDENTITY_ARGS[@]}" + "${WRITE_PACKAGE_IDENTITY_ARGS[@]}" \ + "${WRITE_PACKAGE_SPEC_ARGS[@]}" WRITE_PACKAGE_STATUS=$? set -e stop_heartbeat diff --git a/crates/skippy-model-package/README.md b/crates/skippy-model-package/README.md index 5590dbcde..12bff1dde 100644 --- a/crates/skippy-model-package/README.md +++ b/crates/skippy-model-package/README.md @@ -74,6 +74,18 @@ them into `projectors/`, fingerprints them, records them as `kind: "mmproj"` in checksums and sizes. Package-backed serving uses the first declared projector when no explicit `projector_path` is supplied by the caller. +Packages can also declare a default draft-model speculative decoding strategy: + +```bash +skippy-model-package write-package org/repo:Q3_K_M \ + --out-dir model-package/ \ + --spec-draft-model unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M \ + --spec-strategy llama32-1b-q4 \ + --spec-initial-window 16 \ + --spec-min-window 2 \ + --spec-max-window 16 +``` + Local paths are only accepted for package creation when the caller supplies explicit provenance: diff --git a/crates/skippy-model-package/src/main.rs b/crates/skippy-model-package/src/main.rs index e35047093..f9e4b2048 100644 --- a/crates/skippy-model-package/src/main.rs +++ b/crates/skippy-model-package/src/main.rs @@ -75,6 +75,16 @@ enum Command { source_revision: Option, #[arg(long)] source_file: Option, + #[arg(long = "spec-draft-model")] + spec_draft_model: Option, + #[arg(long = "spec-strategy", default_value = "draft")] + spec_strategy: String, + #[arg(long = "spec-initial-window", default_value_t = 16)] + spec_initial_window: u32, + #[arg(long = "spec-min-window", default_value_t = 2)] + spec_min_window: u32, + #[arg(long = "spec-max-window", default_value_t = 16)] + spec_max_window: u32, }, Validate { full: PathBuf, @@ -180,6 +190,8 @@ struct PackageManifest { layer_count: u32, #[serde(default, skip_serializing_if = "Option::is_none")] activation_width: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + generation: Option, shared: PackageShared, #[serde(default, skip_serializing_if = "Vec::is_empty")] projectors: Vec, @@ -188,6 +200,67 @@ struct PackageManifest { created_at_unix_secs: u64, } +#[derive(Debug, Deserialize, Serialize)] +struct PackageGeneration { + #[serde(default, skip_serializing_if = "Option::is_none")] + speculative_decoding: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct PackageSpeculativeDecoding { + #[serde(default, skip_serializing_if = "Option::is_none")] + default: Option, + #[serde(default)] + strategies: BTreeMap, +} + +#[derive(Debug, Deserialize, Serialize)] +struct PackageSpeculativeStrategy { + #[serde(rename = "type")] + strategy_type: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + draft_model: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + model_family: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + window_policy: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + identity: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct PackageSpeculativeWindowPolicy { + #[serde(default, skip_serializing_if = "Option::is_none")] + default: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + fixed_default_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + initial_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + min_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + max_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + adaptive: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct PackageSpeculativeAdaptiveWindow { + supported: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + initial_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + min_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + max_window: Option, +} + +#[derive(Debug, Deserialize, Serialize)] +struct PackageSpeculativeIdentity { + #[serde(default, skip_serializing_if = "Option::is_none")] + source_model_sha256: Option, +} + #[derive(Debug, Deserialize, Serialize)] struct PackageSourceModel { path: String, @@ -313,6 +386,15 @@ struct ExplicitSourceIdentity { source_file: Option, } +#[derive(Debug)] +struct PackageSpeculativeOptions { + draft_model: Option, + strategy: String, + initial_window: u32, + min_window: u32, + max_window: u32, +} + #[derive(Debug)] struct PackageInput { model_path: PathBuf, @@ -372,6 +454,11 @@ fn main() -> Result<()> { source_repo, source_revision, source_file, + spec_draft_model, + spec_strategy, + spec_initial_window, + spec_min_window, + spec_max_window, } => write_package( model, out_dir, @@ -385,6 +472,13 @@ fn main() -> Result<()> { source_revision, source_file, }, + PackageSpeculativeOptions { + draft_model: spec_draft_model, + strategy: spec_strategy, + initial_window: spec_initial_window, + min_window: spec_min_window, + max_window: spec_max_window, + }, ), Command::Validate { full, slices } => validate(full, slices), Command::ValidatePackage { full, package } => validate_package(full, package), @@ -475,12 +569,73 @@ fn write_stages(model: PathBuf, stages: usize, out_dir: PathBuf) -> Result<()> { Ok(()) } +fn package_generation(options: PackageSpeculativeOptions) -> Result> { + let Some(draft_model) = options.draft_model else { + return Ok(None); + }; + ensure!( + !draft_model.trim().is_empty(), + "--spec-draft-model must not be empty when set" + ); + ensure!( + !options.strategy.trim().is_empty(), + "--spec-strategy must not be empty" + ); + ensure!( + options.min_window > 0, + "--spec-min-window must be greater than zero" + ); + ensure!( + options.initial_window > 0, + "--spec-initial-window must be greater than zero" + ); + ensure!( + options.max_window > 0, + "--spec-max-window must be greater than zero" + ); + ensure!( + options.min_window <= options.initial_window, + "--spec-min-window must not exceed --spec-initial-window" + ); + ensure!( + options.initial_window <= options.max_window, + "--spec-initial-window must not exceed --spec-max-window" + ); + + let mut strategies = BTreeMap::new(); + strategies.insert( + options.strategy.clone(), + PackageSpeculativeStrategy { + strategy_type: "draft-model".to_string(), + draft_model: Some(draft_model), + model_family: None, + window_policy: Some(PackageSpeculativeWindowPolicy { + default: Some("adaptive".to_string()), + fixed_default_window: None, + initial_window: Some(options.initial_window), + min_window: Some(options.min_window), + max_window: Some(options.max_window), + adaptive: None, + }), + identity: None, + }, + ); + + Ok(Some(PackageGeneration { + speculative_decoding: Some(PackageSpeculativeDecoding { + default: Some(options.strategy), + strategies, + }), + })) +} + fn write_package( model: String, out_dir: PathBuf, projectors: Vec, artifact_hook: ArtifactHook, explicit: ExplicitSourceIdentity, + speculative: PackageSpeculativeOptions, ) -> Result<()> { let input = resolve_package_input(model, explicit)?; fs::create_dir_all(&out_dir) @@ -594,6 +749,7 @@ fn write_package( format: "layer-package".to_string(), layer_count, activation_width: Some(activation_width), + generation: package_generation(speculative)?, shared: PackageShared { metadata, embeddings, @@ -1824,8 +1980,9 @@ fn hex_lower(bytes: &[u8]) -> String { #[cfg(test)] mod tests { use super::{ - ExplicitSourceIdentity, activation_width, local_artifact_files, model_distribution_id, - resolve_gguf_shard_paths, resolve_local_package_input, + ExplicitSourceIdentity, PackageSpeculativeOptions, activation_width, local_artifact_files, + model_distribution_id, package_generation, resolve_gguf_shard_paths, + resolve_local_package_input, }; use std::path::{Path, PathBuf}; @@ -1851,6 +2008,33 @@ mod tests { assert!(error.to_string().contains("requires --model-id")); } + #[test] + fn package_generation_emits_default_draft_strategy() { + let generation = package_generation(PackageSpeculativeOptions { + draft_model: Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abc123".to_string()), + strategy: "llama32-1b-q4".to_string(), + initial_window: 16, + min_window: 2, + max_window: 16, + }) + .unwrap() + .unwrap(); + + let speculative = generation.speculative_decoding.unwrap(); + assert_eq!(speculative.default.as_deref(), Some("llama32-1b-q4")); + let strategy = speculative.strategies.get("llama32-1b-q4").unwrap(); + assert_eq!(strategy.strategy_type, "draft-model"); + assert_eq!( + strategy.draft_model.as_deref(), + Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abc123") + ); + let policy = strategy.window_policy.as_ref().unwrap(); + assert_eq!(policy.default.as_deref(), Some("adaptive")); + assert_eq!(policy.initial_window, Some(16)); + assert_eq!(policy.min_window, Some(2)); + assert_eq!(policy.max_window, Some(16)); + } + #[test] fn local_package_input_uses_explicit_coordinate_identity() { let input = resolve_local_package_input( diff --git a/crates/skippy-runtime/src/package.rs b/crates/skippy-runtime/src/package.rs index dfa590708..3b8e7cf37 100644 --- a/crates/skippy-runtime/src/package.rs +++ b/crates/skippy-runtime/src/package.rs @@ -55,6 +55,63 @@ pub struct LayerPackageInfo { pub activation_width: Option, pub projectors: Vec, pub layers: Vec, + pub generation: Option, + pub speculative_decoding: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct SpeculativeDecodingConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub default: Option, + #[serde(default)] + pub strategies: BTreeMap, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct SpeculativeStrategyConfig { + #[serde(rename = "type")] + pub strategy_type: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub draft_model: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub model_family: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub window_policy: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub identity: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct SpeculativeWindowPolicyConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub default: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub fixed_default_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub initial_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub min_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub adaptive: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct SpeculativeAdaptiveWindowConfig { + pub supported: bool, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub initial_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub min_window: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub max_window: Option, +} + +#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] +pub struct SpeculativeIdentityConfig { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub source_model_sha256: Option, } #[derive(Debug, Clone)] @@ -139,6 +196,8 @@ struct PackageManifest { layer_count: u32, #[serde(default)] activation_width: Option, + #[serde(default)] + generation: Option, shared: PackageShared, #[serde(default)] projectors: Vec, @@ -442,6 +501,12 @@ pub fn inspect_layer_package(package_ref: &str) -> Result { Some(width) => Some(width), None => infer_activation_width_from_layers(&package_dir, &manifest.layers)?, }; + let generation = manifest.generation; + let speculative_decoding = generation + .as_ref() + .and_then(|value| value.get("speculative_decoding")) + .map(parse_speculative_decoding) + .transpose()?; Ok(LayerPackageInfo { package_dir, @@ -476,9 +541,110 @@ pub fn inspect_layer_package(package_ref: &str) -> Result { artifact_bytes: layer.artifact_bytes, }) .collect(), + generation, + speculative_decoding, }) } +fn parse_speculative_decoding(value: &serde_json::Value) -> Result { + let config: SpeculativeDecodingConfig = serde_json::from_value(value.clone())?; + validate_speculative_decoding(&config)?; + Ok(config) +} + +fn validate_speculative_decoding(config: &SpeculativeDecodingConfig) -> Result<()> { + if let Some(default) = config.default.as_deref() + && !config.strategies.contains_key(default) + { + bail!("speculative_decoding.default references missing strategy '{default}'"); + } + for (name, strategy) in &config.strategies { + if name.trim().is_empty() { + bail!("speculative strategy names must not be empty"); + } + match strategy.strategy_type.as_str() { + "draft-model" => { + let Some(draft_model) = strategy.draft_model.as_deref() else { + bail!("draft-model speculative strategy '{name}' must declare draft_model"); + }; + if draft_model.trim().is_empty() { + bail!( + "draft-model speculative strategy '{name}' draft_model must not be empty" + ); + } + } + "ngram" => {} + unknown => bail!("unsupported speculative strategy type '{unknown}'"), + } + validate_window_policy(name, strategy.window_policy.as_ref())?; + } + Ok(()) +} + +fn validate_window_policy( + name: &str, + policy: Option<&SpeculativeWindowPolicyConfig>, +) -> Result<()> { + let Some(policy) = policy else { + return Ok(()); + }; + if let Some(window) = policy.fixed_default_window + && window == 0 + { + bail!("speculative strategy '{name}' fixed_default_window must be greater than zero"); + } + if let Some(window) = policy.initial_window + && window == 0 + { + bail!("speculative strategy '{name}' initial_window must be greater than zero"); + } + if let Some(window) = policy.min_window + && window == 0 + { + bail!("speculative strategy '{name}' min_window must be greater than zero"); + } + if let Some(window) = policy.max_window + && window == 0 + { + bail!("speculative strategy '{name}' max_window must be greater than zero"); + } + if let (Some(min), Some(max)) = (policy.min_window, policy.max_window) + && min > max + { + bail!("speculative strategy '{name}' min_window must not exceed max_window"); + } + if let (Some(initial), Some(min)) = (policy.initial_window, policy.min_window) + && initial < min + { + bail!("speculative strategy '{name}' initial_window must not be less than min_window"); + } + if let (Some(initial), Some(max)) = (policy.initial_window, policy.max_window) + && initial > max + { + bail!("speculative strategy '{name}' initial_window must not exceed max_window"); + } + if let Some(adaptive) = &policy.adaptive { + if let Some(window) = adaptive.min_window + && window == 0 + { + bail!("speculative strategy '{name}' adaptive.min_window must be greater than zero"); + } + if let Some(window) = adaptive.initial_window + && window == 0 + { + bail!( + "speculative strategy '{name}' adaptive.initial_window must be greater than zero" + ); + } + if let Some(window) = adaptive.max_window + && window == 0 + { + bail!("speculative strategy '{name}' adaptive.max_window must be greater than zero"); + } + } + Ok(()) +} + fn infer_activation_width_from_layers( package_dir: &Path, layers: &[PackageLayer], @@ -1147,6 +1313,23 @@ mod tests { "format": "layer-package", "layer_count": 1, "activation_width": 4096, + "generation": { + "speculative_decoding": { + "default": "draft", + "strategies": { + "draft": { + "type": "draft-model", + "draft_model": "unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abcdef", + "window_policy": { + "default": "adaptive", + "initial_window": 16, + "min_window": 2, + "max_window": 16 + } + } + } + } + }, "shared": { "metadata": { "path": "metadata.gguf", @@ -1386,6 +1569,20 @@ mod tests { assert_eq!(info.model_id, "model-a"); assert_eq!(info.layer_count, 1); assert_eq!(info.activation_width, Some(4096)); + assert_eq!( + info.generation + .as_ref() + .and_then(|generation| generation.pointer("/speculative_decoding/default")) + .and_then(serde_json::Value::as_str), + Some("draft") + ); + let speculative = info.speculative_decoding.as_ref().unwrap(); + let draft = speculative.strategies.get("draft").unwrap(); + assert_eq!(draft.strategy_type, "draft-model"); + assert_eq!( + draft.draft_model.as_deref(), + Some("unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@abcdef") + ); assert_eq!(info.source_model_bytes, Some(123)); assert_eq!(info.projectors.len(), 1); assert_eq!(info.projectors[0].kind, "mmproj"); diff --git a/docs/LAYER_PACKAGE_REPOS.md b/docs/LAYER_PACKAGE_REPOS.md index 7c5afa8a7..e8a9d8596 100644 --- a/docs/LAYER_PACKAGE_REPOS.md +++ b/docs/LAYER_PACKAGE_REPOS.md @@ -75,6 +75,14 @@ Important options: - `--target `: destination Hugging Face package repo. - `--model-id `: OpenAI-facing package model id. +- `--spec-draft-model `: declare a package default draft model for + speculative decoding, using a normal model ref such as + `unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M`. +- `--spec-strategy `: name for that package-declared strategy, defaulting + to `draft`. +- `--spec-initial-window `, `--spec-min-window `, + `--spec-max-window `: adaptive speculative window defaults recorded in + `model-package.json`. - `--timeout `: HF Jobs timeout, defaulting to `1h` unless raised by size-based estimates. - `--dry-run`: print the resolved package plan and maximum cost without side effects. diff --git a/docs/MTP.md b/docs/MTP.md new file mode 100644 index 000000000..b8afa607a --- /dev/null +++ b/docs/MTP.md @@ -0,0 +1,362 @@ +# MTP and Draft Speculation + +This document records the current decision: Skippy no longer carries its own +MTP implementation. The upstream llama.cpp MTP path owns same-model MTP support. +Skippy keeps only draft-model speculation work. + +## Decision + +- Delete Skippy-owned MTP ABI, CLI flags, staged protocol, package strategy + metadata, and Rust orchestration. +- Keep draft-model speculation: + - single-stage native slot `draft-simple` through llama.cpp; + - multi-stage stage0 draft model proposals verified by downstream + `SpeculativeSpan` messages. +- Do not put benchmark results in `model-package.json`. +- Package speculative metadata may describe `draft-model` or `ngram` + strategies only. + +## Why + +The hand-rolled Skippy MTP path duplicated llama.cpp speculation machinery and +kept drifting from the optimized server slot loop. That made performance and +correctness hard to reason about. The useful Skippy work is draft-model +speculation in distributed topologies, where stage0 can run a small draft model +and ask the target pipeline to verify proposed spans. + +## Current Boundary + +Upstream llama.cpp: + +- owns same-GGUF MTP and any MTP-specific recurrent-state handling; +- can be benchmarked directly with `llama-server` when we want MTP data. + +Skippy: + +- owns split serving, stage transport, telemetry, and draft-model speculative + verification; +- does not expose `--mtp`, `draft-mtp`, `final-stage-head`, or + `DecodeReadoutMtp`; +- sends draft proposals with `SpeculativeSpan`. + +## Benchmark Baseline + +Historical MTP and draft-spec benchmark outputs remain under +`experiments/mtp/`. Treat them as experiment artifacts, not package metadata. +New Skippy speculation runs should compare: + +1. vanilla target baseline; +2. vanilla target plus draft model; +3. Skippy single-stage baseline; +4. Skippy single-stage plus draft model; +5. Skippy multi-stage baseline; +6. Skippy multi-stage plus draft model. + +Use the scripted harnesses so source sync, warmup, cooldown, labels, and metrics +collection stay repeatable. + +## Target PR Direction + +The next review-ready PR should make package-declared draft speculation the +normal path instead of an experiment-only flag set. The target scope is: + +- enable draft speculation from `model-package.json` when a package declares a + default draft strategy; +- add or publish a layer package repository for the known-winning Llama 3.3 70B + Q3 target plus Llama 3.2 1B Q4 draft pair; +- remove obsolete speculation experiments and cruft that are not part of + package-declared `draft-model` speculation; +- sync with the main repository before review; +- keep benchmark evidence in docs and experiment JSON, not in the package + manifest. + +The package manifest should stay small and declarative. For the winning pair, +the intended shape is: + +```json +{ + "generation": { + "speculative_decoding": { + "default": "llama32-1b-q4", + "strategies": { + "llama32-1b-q4": { + "type": "draft-model", + "draft_model": "unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@", + "window_policy": { + "default": "adaptive", + "initial_window": 16, + "min_window": 2, + "max_window": 16 + } + } + } + } + } +} +``` + +Important manifest decisions: + +- `draft_model` is a normal model resolver ref. It should download through the + same model cache/resolver path as other models. +- The strategy's presence and selection as `default` are the compatibility + signal. Do not add a redundant `compatible: true`. +- Adaptive is the default policy, but it starts at the proven W16 setting for + this pair. +- Do not put fallback thresholds, cost breakers, lower-prefetch knobs, or + benchmark scores in `model-package.json`; those remain runtime policy. +- If the draft model cannot be resolved or the user disables speculation, serve + the target package normally without failing model load. + +Runtime/user control should stay separate from the package manifest. The package +declares available strategies; the existing speculative runtime config decides +whether to use them: + +```toml +[defaults.speculative] +mode = "auto" # auto | disabled | draft | ngram +package_strategy = "default" +``` + +Serving behavior should stay simple: + +- `auto` may use the package default strategy when it is available and + resolvable; +- `disabled` never uses package-declared speculation; +- `package_strategy = "default"` uses the manifest default, while another + value selects a named package strategy; +- explicit `draft` mode still uses the normal manually configured draft path. + +There should be no `required` draft mode. In `auto`, missing or unresolved draft +models fall back to baseline serving. + +## Two-Stage Draft Prefetch + +The cross-host stage0 draft harness enables lower-stage prefetch by default for +stage0 draft speculation. For hybrid recurrent models, useful lower prefetch +requires enough recurrent rollback snapshots for both the current verify span +and the next prefetched span. The harness now derives that automatically: + +- baseline keeps the requested `N_RS_SEQ` value, which defaults to `0`; +- draft W2 needs `n_rs_seq >= 3`; +- draft W3 needs `n_rs_seq >= 5`; +- in general, fixed window `W` with lower prefetch needs + `n_rs_seq >= 2 * W - 1`. + +That rule exists because llama.cpp exposes a rollback horizon of +`n_rs_seq + 1` tokens. Without the larger horizon, stage0 can verify the current +span but cannot safely keep a prefetched lower-stage result for the next span. + +Smoke evidence from +`experiments/mtp/crosshost/two-stage-lower-prefetch-auto-20260524T202515Z`: + +| Condition | Stage0 `n_rs_seq` | Lower prefetch useful | Tok/s | Speedup | +| --- | ---: | ---: | ---: | ---: | +| baseline | 0 | n/a | 9.27 | 1.00x | +| draft W2 | 3 | 27 | 12.05 | 1.30x | +| draft W3 | 5 | 15 | 9.66 | 1.04x | + +The W2 run matched the baseline output hash and had no replay commits, so the +prefetch work was both useful and correctness-preserving in this smoke. + +## Draft Guard Finding + +Full-corpus W2 draft speculation needs a request-local fallback guard. The +one-prompt smoke had high acceptance, but the 9-prompt corpus includes +low-acceptance spans, especially the long code review prompt. Unguarded W2 with +lower prefetch was unstable: + +| Run | Tok/s | +| --- | ---: | +| split60 baseline | 9.29 | +| unguarded W2 attempt 1 | 6.27 | +| unguarded W2 attempt 2 | 9.75 | +| unguarded W2 attempt 3 | 7.82 | + +Skippy now disables stage0 draft speculation for the rest of a request once the +request has at least `SKIPPY_STAGE0_SPEC_FALLBACK_MIN_WINDOWS` windows and the +accepted/proposed ratio falls below +`SKIPPY_STAGE0_SPEC_FALLBACK_MIN_ACCEPT_RATE`. Defaults are `32` windows and +`0.90`. If lower prefetch already advanced stage0 when the guard trips, Skippy +forces the verified-prefix commit path before continuing with normal one-token +split decode. + +Guarded W2 confirmation from +`experiments/mtp/crosshost/two-stage-w2-guard-confirm-20260524T214529Z`: + +| Condition | Attempts tok/s | Median tok/s | Speedup vs split baseline | +| --- | --- | ---: | ---: | +| split60 baseline | 9.32, 9.29, 9.29 | 9.29 | 1.00x | +| W2 guarded lower prefetch | 8.17, 9.74, 9.50 | 9.50 | 1.02x | + +Controls: + +- `SPEC_DRAFT_P_MIN=0.90` was worse (`8.40 tok/s`). +- W3 guarded was worse (`7.78 tok/s`). +- Disabling lower prefetch was worse (`8.09 tok/s`). +- Split62 was worse for both baseline (`9.15 tok/s`) and W2 (`7.78 tok/s`). +- Disabling the lower-prefetch cooldown was better in a one-run check + (`9.82 tok/s`), so the default cooldown is `0`; the fallback guard is the + safety valve for low-acceptance prompts. +- The default lower-prefetch policy is now aggressive: when lower prefetch is + enabled and the recurrent snapshot horizon allows the next span, Skippy + schedules it instead of using the EWMA net-benefit gate. The old cost gate is + still available for experiments with + `SKIPPY_STAGE0_LOWER_PREFETCH_COST_GATE=1`. +- Aggressive lower prefetch confirmation from + `experiments/mtp/crosshost/two-stage-w2-aggressive-prefetch-confirm-20260524T221403Z` + scheduled all `611/611` lower-prefetch candidates and reduced measured + downstream wait to roughly `0.5s` on the two valid attempts. Those attempts + were `10.18 tok/s` and `10.18 tok/s`, or about `1.10x` over the split60 + baseline. The third attempt was a slow outlier at `7.20 tok/s`; its downstream + wait stayed low, but lower-verify and lower-prefetch time rose from about + `56s` to about `97s`, so the remaining instability is exposed stage0 verify + cost rather than missed prefetch. +- Skippy now keeps aggressive prefetch as the default and adds a lower-prefetch + cost breaker. It compares prefetched lower-verify cost against a warmed + stage0 verify baseline shared by requests in the same server process and + disables lower prefetch for the current request after repeated slow spans. If + the breaker fires after lower prefetch has advanced stage0, Skippy forces the + verified-prefix commit path before continuing. The draft fallback guard + remains separate and can still disable draft speculation for low-acceptance + requests. +- Process-baseline breaker confirmation from + `experiments/mtp/crosshost/two-stage-w2-process-breaker-confirm-20260524T225636Z` + produced `10.18`, `10.23`, and `9.37 tok/s`, all valid, with median + `10.18 tok/s`. That is about `1.10x` over the split60 baseline. Lower + prefetch stayed fully scheduled (`611/611`) and downstream wait stayed low + (`0.40s` to `0.54s`). The breaker did not fire in this run because the + measured lower-prefetch cost never exceeded the warmed baseline threshold; the + third run was slower because exposed lower-prefetch time rose to about + `10.5s`, not because RTT waits returned. +- Skippy also has a request-local exposed-cost budget for lower prefetch. It + tracks cumulative exposed lower-prefetch milliseconds per committed input and + throttles lower prefetch for the rest of the request once the budget is + exceeded. The default gate is enabled with + `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET=1`, + `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET_MIN_SPANS=32`, + `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET_MIN_MS=1000`, and + `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET_MS_PER_TOKEN=8.0`. Once tripped, + lower prefetch is attempted every + `SKIPPY_STAGE0_LOWER_PREFETCH_EXPOSED_BUDGET_THROTTLE_STRIDE=4` eligible + spans instead of being disabled completely. +- Exposed-budget confirmation from + `experiments/mtp/crosshost/two-stage-w2-exposed-budget-confirm-20260524T231848Z` + produced two clean attempts (`10.22`, `10.20 tok/s`) and one rejected slow + outlier (`7.79 tok/s`). The rejected attempt's short warmup was already bad + (`1.50 tok/s` vs the normal `~12 tok/s`), so this was a poisoned fresh process + before measurement rather than just a lower-prefetch policy miss. +- The cross-host harness now supports warmup health gating before measurement: + `MEASURE_ATTEMPTS` can exceed `MEASURE_RUNS`, and attempts whose short warmup + falls below `WARMUP_MIN_TOK_S` are restarted after cooldown. The first gated W2 + confirmation from + `experiments/mtp/crosshost/two-stage-w2-warmup-gated-confirm-20260524T233715Z` + used `MEASURE_RUNS=3`, `MEASURE_ATTEMPTS=5`, and `WARMUP_MIN_TOK_S=6.0`. It + accepted three attempts with healthy warmups (`12.35`, `12.41`, `11.99 tok/s`) + and measured `10.22`, `10.17`, and `9.35 tok/s`, median `10.17 tok/s`. +- Adaptive lower-prefetch throttle confirmation from + `experiments/mtp/crosshost/two-stage-w2-throttle-confirm-20260524T235913Z` + used the same warmup-gated harness. Attempt 3 was rejected before measurement + because the 32-token warmup was only `2.12 tok/s`. The three measured attempts + were `10.18`, `10.21`, and `10.16 tok/s`, median `10.18 tok/s`; no measured + request tripped the exposed-budget throttle, so the change did not hurt the + fast path. Metrics reports now include a `request_speculation` section keyed by + request/session id for per-request stage0 speculation timing. + +The remaining bottleneck is exposed draft/lower-verify cost and its variance, +not rollback correctness. W2 lower prefetch is useful, but the current two-stage +draft loop does not yet recover enough of the split penalty to approach vanilla +llama-server throughput. + +## Same-Host Two-Stage Window Finding + +For the Llama 3.3 70B Q3 target with the Llama 3.2 1B Q4 draft model on +Studio, draft speculation now wins over the two-stage baseline when stage0 +lower prefetch is enabled. W8 established the first stable win; a follow-up +window sweep found W16 to be the current best same-host setting for this pair. + +Key implementation decisions: + +- Stage0 draft sessions trim away a p-min-rejected candidate token instead of + leaving the draft KV advanced through a token that was never proposed. +- Stage0 lower prefetch is enabled by default. It can be disabled with + `SKIPPY_STAGE0_LOWER_PREFETCH=0`. +- Draft p-min defaults to `0.60` for Skippy draft speculation. The CLI flags + `--spec-draft-p-min` and `--openai-spec-draft-p-min` still override it for + other model pairs. + +Clean same-host Studio measurements, 2 prompts x 192 generated tokens: + +| Condition | Median tok/s | Median wall | Notes | +| --- | ---: | ---: | --- | +| two-stage baseline | 7.957 | 48.26s | no speculation | +| W8, p-min 0.75, lower prefetch off | 8.134 | 47.21s | +2.2% vs baseline | +| W8, p-min 0.75, lower prefetch on | 8.447 | 45.46s | +6.2% vs baseline | +| W8, p-min 0.70, lower prefetch on | 8.449 | 45.45s | effectively tied with 0.75 | +| W8, p-min 0.60, lower prefetch on | 8.708 | 44.10s | +9.4% vs baseline | +| W16, p-min 0.60, lower prefetch on | 9.835 | 39.04s | +23.6% vs baseline | + +The p-min 0.60 run accepted about `266.7` of `289.7` proposed tokens +(`~92.1%`) and committed about `3.31` inputs per speculative span. It is faster +than p-min 0.70/0.75 because the larger useful spans amortize more downstream +round trips, even though the lower-verify work is larger. + +W16 confirmation from +`experiments/mtp/studio54/skippy-2stage-w16-defaultpmin-defaultlower-192tok-prompt2-3x-20260526T214535Z` +ran three clean attempts at `9.832`, `9.835`, and `10.002 tok/s`. It accepted +about `269.7` of `290.3` proposed tokens (`~92.9%`) and committed about `3.75` +inputs per speculative span. Lower-prefetch waste was low (`0.33` spans/run), +and downstream wait fell to about `30.5 ms` per committed input. + +One-shot window sweep evidence: + +| Window | Tok/s | Wall | Accepted / proposed | Committed inputs/span | Notes | +| ---: | ---: | ---: | ---: | ---: | --- | +| W6 | 8.963 | 42.84s | 270 / 287 | 3.14 | stable but smaller spans | +| W8 | 8.822 | 43.53s | 267 / 289 | 3.42 | below confirmed W8 3-run median | +| W10 | 9.284 | 41.36s | 269 / 291 | 3.36 | improvement over W8 | +| W12 | 9.668 | 39.72s | 271 / 291 | 3.76 | near plateau | +| W14 | 9.680 | 39.67s | 269 / 290 | 3.64 | near plateau | +| W16 | 9.845 | 39.01s | 269 / 290 | 3.68 | best one-shot and confirmed by 3-run pass | +| W20 | 3.699 | 103.82s | 269 / 290 | 3.84 | pathological exposed verify/wait outlier | +| W24 | 9.235 | 41.58s | 271 / 291 | 4.11 | recovers, but slower than W16 | +| adaptive W16 | 7.604 | 50.50s | 251 / 275 | 1.96 | current adaptive ramp is worse than baseline | + +Do not treat larger windows as automatically better. W20 showed that a wider +window can preserve acceptance while still losing badly if lower verification +or downstream wait becomes exposed. For this pair, W16 is the current best +claim; future tuning should re-confirm around the plateau rather than pushing +window size blindly upward. + +The old adaptive window policy was not a safe replacement for fixed W16 on this +benchmark. It spent too much of the short request at small spans (`~1.96` +committed inputs/span) and exposed more lower verify/downstream wait per +committed input. Stage0 adaptive draft now starts at the configured max window +by default, so `--openai-adaptive-speculative-window --openai-spec-draft-window +16` begins at W16 instead of slowly ramping from W2. Set +`SKIPPY_STAGE0_ADAPTIVE_START_MAX=0` to restore the old ramp-from-small +behavior for experiments. A same-host smoke after this change produced +`10.018 tok/s` and `38.33s` wall time at adaptive W16, back in the fixed-W16 +range. + +Clean apples-to-apples rerun from +`experiments/mtp/studio54/clean-table-llama33q3-draft1b-w16-20260527` used the +same source state on Studio, the same Llama 3.3 70B Q3 target, the same Llama +3.2 1B Q4 draft, the same first 2 PR prompts, `192` generated tokens, +temperature `0`, one warmup per attempt, `120s` cooldowns, and three clean +measurement attempts per condition. + +| Runtime | Condition | Window | Median tok/s | Min-Max tok/s | Median wall | Acceptance | Valid | Speedup | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | +| llama-server | baseline, no draft | - | 8.017 | 8.015-8.018 | 47.90s | n/a | 3/3 | 1.00x | +| llama-server | draft model speculation | W16 | 11.041 | 11.041-11.044 | 34.78s | 0.870 | 3/3 | 1.38x | +| skippy-server two-stage | baseline, no draft | - | 7.961 | 7.959-7.964 | 48.23s | n/a | 3/3 | 1.00x | +| skippy-server two-stage | stage0-owned draft speculation | W16 | 10.024 | 9.853-10.025 | 38.31s | 0.899 | 3/3 | 1.26x | +| skippy-server two-stage | adaptive stage0-owned draft speculation | W16 max | 10.020 | 9.867-10.036 | 38.33s | 0.899 | 3/3 | 1.26x | + +This is a real Skippy win: fixed W16 draft speculation improves same-host +two-stage Skippy throughput by about `25.9%` and reduces median wall time by +about `20.6%` versus the matching two-stage Skippy baseline. Vanilla +llama-server remains the performance reference for this pair: Skippy two-stage +W16 is still about `9.2%` behind vanilla W16, even though Skippy baseline is +already close to vanilla baseline. diff --git a/docs/USAGE.md b/docs/USAGE.md index e4720e635..756803945 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -351,7 +351,8 @@ lifecycle_health_interval_ms = 5000 # health-check interval (ms) # --- Speculative decoding ------------------------------------------------ [defaults.speculative] -mode = "auto" # auto off draft ngram lookahead +mode = "auto" # auto disabled draft ngram +package_strategy = "default" # package manifest default or named strategy draft_selection_policy = "auto" # auto manual heuristic pairing_fault = "warn_disable" # warn_disable fail_open fail_closed draft_max_tokens = 16 diff --git a/docs/specs/layer-package-repos.md b/docs/specs/layer-package-repos.md index 726650bf4..38b64f84c 100644 --- a/docs/specs/layer-package-repos.md +++ b/docs/specs/layer-package-repos.md @@ -275,6 +275,55 @@ multimodal model: Consumers MUST NOT infer projector identity from sibling files or filename stems. If a package needs a projector, the manifest must declare it explicitly. +### Generation Capabilities + +`generation` is optional and defaults to no package-declared generation +extensions. It describes capabilities and safe runtime policy defaults. It MUST +NOT contain benchmark results. Benchmark evidence belongs in experiment +artifacts, release notes, or documentation, not in `model-package.json`. + +For speculative decoding, the recommended shape is: + +```json +{ + "generation": { + "speculative_decoding": { + "default": "llama32-1b-q4", + "strategies": { + "llama32-1b-q4": { + "type": "draft-model", + "draft_model": "unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M@", + "window_policy": { + "default": "adaptive", + "initial_window": 16, + "min_window": 2, + "max_window": 16 + } + } + } + } + } +} +``` + +The selected strategy's presence means the package declares support for that +strategy. There is no separate `compatible` boolean. `draft_model` is a normal +model resolver ref, including a pinned revision when the package publisher wants +reproducible downloads. Runtimes should resolve and cache the draft model using +the same model download path as ordinary model artifacts. + +Runtime policy should still be able to disable speculation for user choice, +unsupported hardware/topology, missing draft artifacts, or internal safety +guards. Those controls are runtime behavior and should not be encoded as +manifest fallback thresholds. + +Serving configuration should use the existing `[defaults.speculative]` and +`[models.speculative]` control surface. In `mode = "auto"`, package-declared +defaults may be used when available and resolvable. In `mode = "disabled"`, +package-declared speculation is not used. `package_strategy = "default"` uses +the manifest default, and any other value selects a named package strategy. +There is no separate `required` mode. + ## Publishing Rules Package creation SHOULD use: @@ -283,6 +332,18 @@ Package creation SHOULD use: skippy-model-package write-package org/repo:distribution --out-dir model-package/ ``` +Packages with a known-good draft pair SHOULD declare it at write time: + +```bash +skippy-model-package write-package org/repo:distribution \ + --spec-draft-model unsloth/Llama-3.2-1B-Instruct-GGUF:Q4_K_M \ + --spec-strategy llama32-1b-q4 \ + --spec-initial-window 16 \ + --spec-min-window 2 \ + --spec-max-window 16 \ + --out-dir model-package/ +``` + Multimodal packages SHOULD declare projector artifacts at write time: ```bash From f8f43150f3946d52f0f1f4cc9d0543779b62247d Mon Sep 17 00:00:00 2001 From: James Dumay Date: Wed, 27 May 2026 16:29:37 +1000 Subject: [PATCH 2/9] Retry HF artifact uploads during package jobs --- .../src/scripts/split-model-job.sh | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/crates/model-package/src/scripts/split-model-job.sh b/crates/model-package/src/scripts/split-model-job.sh index ef0e7a21c..90af3382b 100755 --- a/crates/model-package/src/scripts/split-model-job.sh +++ b/crates/model-package/src/scripts/split-model-job.sh @@ -239,19 +239,39 @@ cat > "$ARTIFACT_UPLOAD_SCRIPT" <<'PYTHON' from huggingface_hub import HfApi from pathlib import Path import os +import time path = Path(os.environ["SKIPPY_PACKAGE_ARTIFACT_PATH"]) relative = os.environ["SKIPPY_PACKAGE_ARTIFACT_RELATIVE_PATH"] target_repo = os.environ["TARGET_REPO"] +max_attempts = int(os.environ.get("ARTIFACT_UPLOAD_ATTEMPTS", "4")) api = HfApi(token=os.environ["HF_TOKEN"]) -api.upload_file( - repo_id=target_repo, - path_or_fileobj=str(path), - path_in_repo=relative, - repo_type="model", - commit_message=f"Add package artifact {relative}", -) +last_error = None +for attempt in range(1, max_attempts + 1): + try: + api.upload_file( + repo_id=target_repo, + path_or_fileobj=str(path), + path_in_repo=relative, + repo_type="model", + commit_message=f"Add package artifact {relative}", + ) + last_error = None + break + except Exception as err: + last_error = err + if attempt == max_attempts: + break + delay = min(60, 5 * attempt) + print( + f" Upload failed for {relative} on attempt {attempt}/{max_attempts}: {err}. " + f"Retrying in {delay}s...", + flush=True, + ) + time.sleep(delay) +if last_error is not None: + raise last_error size = path.stat().st_size path.unlink() print(f" Uploaded and removed {relative} ({size} bytes)") From f666e72730aed1afe89ece1d062ffecad505eefe Mon Sep 17 00:00:00 2001 From: James Dumay Date: Wed, 27 May 2026 18:13:14 +1000 Subject: [PATCH 3/9] Honor package draft disablement --- .../src/inference/skippy/mod.rs | 12 ++++++++++-- .../mesh-llm-host-runtime/src/runtime/local.rs | 17 +++++++++++++++-- crates/mesh-llm-host-runtime/src/runtime/mod.rs | 12 ++++++++++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs b/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs index c9fbb6b0c..1a571e291 100644 --- a/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs +++ b/crates/mesh-llm-host-runtime/src/inference/skippy/mod.rs @@ -783,8 +783,12 @@ pub(crate) fn single_stage_config(options: &SkippyModelLoadOptions) -> Result identity.clone(), + None if inferred_layer_package => identity_from_layer_package(&path_ref)?, None => synthetic_direct_gguf_package(&options.model_id, &options.model_path)?, }; let layer_start = options.layer_start; @@ -832,10 +836,14 @@ pub(crate) fn single_stage_config(options: &SkippyModelLoadOptions) -> Result { pub(super) n_ubatch_override: Option, pub(super) flash_attention_override: FlashAttentionType, pub(super) parallel_override: Option, + pub(super) package_speculative_defaults: bool, pub(super) openai_guardrail_policy: OpenAiGuardrailPolicyHandle, pub(super) skippy_telemetry: skippy::SkippyTelemetryOptions, pub(super) survey_telemetry: survey::SurveyTelemetry, @@ -774,6 +775,7 @@ pub(super) async fn start_runtime_split_model( n_batch_override: spec.n_batch_override, n_ubatch_override: spec.n_ubatch_override, flash_attention_override: spec.flash_attention_override, + package_speculative_defaults: spec.package_speculative_defaults, openai_guardrail_policy: spec.openai_guardrail_policy.clone(), pinned_gpu: spec.pinned_gpu, slots, @@ -804,6 +806,7 @@ pub(super) async fn start_runtime_split_model( n_batch_override: spec.n_batch_override, n_ubatch_override: spec.n_ubatch_override, flash_attention_override: spec.flash_attention_override, + package_speculative_defaults: spec.package_speculative_defaults, openai_guardrail_policy: spec.openai_guardrail_policy.clone(), pinned_gpu: spec.pinned_gpu.cloned(), slots, @@ -1162,6 +1165,7 @@ struct SplitGenerationLoadSpec<'a> { n_batch_override: Option, n_ubatch_override: Option, flash_attention_override: FlashAttentionType, + package_speculative_defaults: bool, openai_guardrail_policy: OpenAiGuardrailPolicyHandle, skippy_telemetry: skippy::SkippyTelemetryOptions, survey_telemetry: survey::SurveyTelemetry, @@ -1554,7 +1558,9 @@ async fn split_generation_load_settings<'a>( if let Some(gpu) = spec.pinned_gpu { resolved.hardware.device = Some(gpu.backend_device.clone()); } - apply_package_speculative_defaults(&mut resolved, spec.package).await?; + if spec.package_speculative_defaults { + apply_package_speculative_defaults(&mut resolved, spec.package).await?; + } let embedded_openai = resolved.to_embedded_openai_args(activation_width, true)?; let runtime_options = resolved.to_embedded_runtime_options( &spec.skippy_telemetry, @@ -1952,6 +1958,7 @@ struct SplitTopologyCoordinator { n_batch_override: Option, n_ubatch_override: Option, flash_attention_override: FlashAttentionType, + package_speculative_defaults: bool, openai_guardrail_policy: OpenAiGuardrailPolicyHandle, pinned_gpu: Option, slots: usize, @@ -2441,6 +2448,7 @@ impl SplitTopologyCoordinator { n_batch_override: self.n_batch_override, n_ubatch_override: self.n_ubatch_override, flash_attention_override: self.flash_attention_override, + package_speculative_defaults: self.package_speculative_defaults, openai_guardrail_policy: self.openai_guardrail_policy.clone(), pinned_gpu: self.pinned_gpu.as_ref(), slots: self.slots, @@ -3729,7 +3737,9 @@ async fn start_runtime_layer_package_model( plan.slots, fallback_projector_path, )?; - apply_package_speculative_defaults(&mut resolved, &package).await?; + if spec.package_speculative_defaults { + apply_package_speculative_defaults(&mut resolved, &package).await?; + } tracing::info!( model = model_name, "KV cache: {} K + {} V, {}K context", @@ -4289,6 +4299,7 @@ stop = ["END"] n_batch_override: None, n_ubatch_override: None, flash_attention_override: FlashAttentionType::Auto, + package_speculative_defaults: true, openai_guardrail_policy: openai_guardrail_policy_handle( openai_frontend::GuardrailMode::Disabled, ), @@ -4387,6 +4398,7 @@ max_tokens = 222 n_ubatch_override: None, flash_attention_override: FlashAttentionType::Auto, parallel_override: None, + package_speculative_defaults: true, openai_guardrail_policy: openai_guardrail_policy_handle( openai_frontend::GuardrailMode::Disabled, ), @@ -5551,6 +5563,7 @@ max_tokens = 222 n_batch_override: None, n_ubatch_override: None, flash_attention_override: FlashAttentionType::Auto, + package_speculative_defaults: true, openai_guardrail_policy: openai_guardrail_policy_handle( openai_frontend::GuardrailMode::Disabled, ), diff --git a/crates/mesh-llm-host-runtime/src/runtime/mod.rs b/crates/mesh-llm-host-runtime/src/runtime/mod.rs index 967fa528f..ae28756a8 100644 --- a/crates/mesh-llm-host-runtime/src/runtime/mod.rs +++ b/crates/mesh-llm-host-runtime/src/runtime/mod.rs @@ -1220,6 +1220,7 @@ struct StartupLocalModelTask { n_ubatch: Option, flash_attention: FlashAttentionType, parallel_override: Option, + package_speculative_defaults: bool, openai_guardrail_policy: OpenAiGuardrailPolicyHandle, split: bool, skippy_telemetry: skippy::SkippyTelemetryOptions, @@ -1299,6 +1300,7 @@ struct StartupLoopContext<'a> { n_ubatch: Option, flash_attention: FlashAttentionType, parallel_override: Option, + package_speculative_defaults: bool, openai_guardrail_policy: OpenAiGuardrailPolicyHandle, skippy_telemetry: &'a skippy::SkippyTelemetryOptions, survey_telemetry: &'a survey::SurveyTelemetry, @@ -1371,6 +1373,7 @@ struct StartupLaunchRuntimeContext<'a> { n_ubatch: Option, flash_attention: FlashAttentionType, parallel_override: Option, + package_speculative_defaults: bool, openai_guardrail_policy: OpenAiGuardrailPolicyHandle, skippy_telemetry: &'a skippy::SkippyTelemetryOptions, survey_telemetry: &'a survey::SurveyTelemetry, @@ -1841,6 +1844,7 @@ async fn startup_handle_local_fallback_event( n_ubatch_override: ctx.n_ubatch, flash_attention_override: ctx.flash_attention, parallel_override: ctx.parallel_override, + package_speculative_defaults: ctx.package_speculative_defaults, openai_guardrail_policy: ctx.openai_guardrail_policy.clone(), skippy_telemetry: ctx.skippy_telemetry.clone(), survey_telemetry: ctx.survey_telemetry.clone(), @@ -2173,6 +2177,7 @@ async fn startup_launch_runtime( n_ubatch, flash_attention, parallel_override, + package_speculative_defaults, openai_guardrail_policy, skippy_telemetry, survey_telemetry, @@ -2200,6 +2205,7 @@ async fn startup_launch_runtime( n_ubatch_override: n_ubatch, flash_attention_override: flash_attention, parallel_override, + package_speculative_defaults, openai_guardrail_policy: openai_guardrail_policy.clone(), skippy_telemetry: skippy_telemetry.clone(), survey_telemetry: survey_telemetry.clone(), @@ -2313,6 +2319,7 @@ async fn startup_local_model_loop(params: StartupLocalModelTask) { n_ubatch, flash_attention, parallel_override, + package_speculative_defaults, openai_guardrail_policy, split, skippy_telemetry, @@ -2372,6 +2379,7 @@ async fn startup_local_model_loop(params: StartupLocalModelTask) { n_ubatch, flash_attention, parallel_override, + package_speculative_defaults, openai_guardrail_policy: openai_guardrail_policy.clone(), skippy_telemetry: &skippy_telemetry, survey_telemetry: &survey_telemetry, @@ -2426,6 +2434,7 @@ async fn startup_local_model_loop(params: StartupLocalModelTask) { n_ubatch, flash_attention, parallel_override, + package_speculative_defaults, openai_guardrail_policy, skippy_telemetry: &skippy_telemetry, survey_telemetry: &survey_telemetry, @@ -6777,6 +6786,7 @@ async fn spawn_run_auto_startup_model_tasks( n_ubatch: primary_n_ubatch, flash_attention: primary_flash_attention, parallel_override: primary_parallel_override, + package_speculative_defaults: !cli.no_draft, openai_guardrail_policy: runtime_state.openai_guardrail_policy.clone(), split: options.split, skippy_telemetry: skippy_telemetry.clone(), @@ -7076,6 +7086,7 @@ async fn run_auto_load_runtime_model( .and_then(|m| m.flash_attention) .unwrap_or(FlashAttentionType::Auto), parallel_override, + package_speculative_defaults: !ctx.cli.no_draft, openai_guardrail_policy: ctx.openai_guardrail_policy.clone(), skippy_telemetry: skippy_telemetry_options(ctx.options), survey_telemetry: ctx.survey_telemetry.clone(), @@ -7892,6 +7903,7 @@ async fn spawn_run_auto_additional_model_tasks(ctx: RunAutoAdditionalModelsConte n_ubatch: extra_model.n_ubatch, flash_attention: extra_model.flash_attention, parallel_override: extra_model.parallel.or(ctx.config.gpu.parallel), + package_speculative_defaults: !ctx.cli.no_draft, openai_guardrail_policy: ctx.openai_guardrail_policy.clone(), split: ctx.options.split, skippy_telemetry: ctx.skippy_telemetry.clone(), From df287b8f3d438900eaa38cc8adc1117e5c552cd4 Mon Sep 17 00:00:00 2001 From: James Dumay Date: Wed, 27 May 2026 18:44:58 +1000 Subject: [PATCH 4/9] Add speculative config authoring API --- crates/mesh-llm-config/README.md | 25 ++++++ crates/mesh-llm-config/src/authoring.rs | 104 +++++++++++++++++++++++- crates/mesh-llm-config/src/lib.rs | 67 ++++++++++++++- crates/mesh-llm-host-runtime/src/sdk.rs | 6 +- 4 files changed, 197 insertions(+), 5 deletions(-) diff --git a/crates/mesh-llm-config/README.md b/crates/mesh-llm-config/README.md index ec76e7477..51cdcaa6a 100644 --- a/crates/mesh-llm-config/README.md +++ b/crates/mesh-llm-config/README.md @@ -130,6 +130,31 @@ store.update(|config| { })?; ``` +### Configure speculative decoding + +Use the editor API for speculative defaults or model-specific speculative +settings. This keeps SDKs and apps on the same validated config schema as the +CLI. + +```rust +use mesh_llm_config::ConfigStore; + +let store = ConfigStore::default_path()?; +store.update(|config| { + config + .upsert_model("meta-llama/Llama-3.3-70B-Instruct-GGUF:Q3_K_M")? + .speculative() + .mode("draft") + .draft_hf_source( + "unsloth/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-Q4_K_M.gguf", + ) + .draft_selection_policy("manual") + .draft_max_tokens(16); + Ok(()) +})?; +``` + ### Configure plugins Use plugin helpers for common cases instead of writing `[[plugin]]` tables. diff --git a/crates/mesh-llm-config/src/authoring.rs b/crates/mesh-llm-config/src/authoring.rs index e744fd747..478ea9f73 100644 --- a/crates/mesh-llm-config/src/authoring.rs +++ b/crates/mesh-llm-config/src/authoring.rs @@ -1,6 +1,7 @@ use crate::{ GpuAssignment, HardwareConfig, MeshConfig, ModelConfigDefaults, ModelConfigEntry, - ModelFitConfig, MultimodalConfig, PluginConfigEntry, RequestDefaultsConfig, ThroughputConfig, + ModelFitConfig, MultimodalConfig, PluginConfigEntry, RequestDefaultsConfig, SpeculativeConfig, + ThroughputConfig, }; use anyhow::{Result, bail}; use mesh_llm_types::runtime::ModelRuntimeKind; @@ -256,6 +257,20 @@ impl ModelDefaultsEditor<'_> { self } + pub fn speculative(&mut self) -> SpeculativeConfigEditor<'_> { + SpeculativeConfigEditor { + speculative: self + .defaults + .speculative + .get_or_insert_with(Default::default), + } + } + + pub fn clear_speculative(&mut self) -> &mut Self { + self.defaults.speculative = None; + self + } + fn hardware(&mut self) -> &mut HardwareConfig { self.defaults.hardware.get_or_insert_with(Default::default) } @@ -332,6 +347,17 @@ impl ModelConfigEditor<'_> { self } + pub fn speculative(&mut self) -> SpeculativeConfigEditor<'_> { + SpeculativeConfigEditor { + speculative: self.model.speculative.get_or_insert_with(Default::default), + } + } + + pub fn clear_speculative(&mut self) -> &mut Self { + self.model.speculative = None; + self + } + fn hardware(&mut self) -> &mut HardwareConfig { self.model.hardware.get_or_insert_with(Default::default) } @@ -355,6 +381,82 @@ impl ModelConfigEditor<'_> { } } +pub struct SpeculativeConfigEditor<'a> { + speculative: &'a mut SpeculativeConfig, +} + +impl SpeculativeConfigEditor<'_> { + pub fn mode(&mut self, mode: impl Into) -> &mut Self { + self.speculative.mode = Some(mode.into()); + self + } + + pub fn package_strategy(&mut self, strategy: impl Into) -> &mut Self { + self.speculative.package_strategy = Some(strategy.into()); + self + } + + pub fn draft_model_path(&mut self, path: impl Into) -> &mut Self { + self.speculative.draft_model_path = Some(path.into()); + self + } + + pub fn draft_hf_source( + &mut self, + repo: impl Into, + file: impl Into, + ) -> &mut Self { + self.speculative.draft_hf_repo = Some(repo.into()); + self.speculative.draft_hf_file = Some(file.into()); + self + } + + pub fn draft_selection_policy(&mut self, policy: impl Into) -> &mut Self { + self.speculative.draft_selection_policy = Some(policy.into()); + self + } + + pub fn pairing_fault(&mut self, policy: impl Into) -> &mut Self { + self.speculative.pairing_fault = Some(policy.into()); + self + } + + pub fn draft_max_tokens(&mut self, tokens: u32) -> &mut Self { + self.speculative.draft_max_tokens = Some(tokens); + self + } + + pub fn draft_min_tokens(&mut self, tokens: u32) -> &mut Self { + self.speculative.draft_min_tokens = Some(tokens); + self + } + + pub fn draft_gpu_layers(&mut self, layers: i32) -> &mut Self { + self.speculative.draft_gpu_layers = Some(layers); + self + } + + pub fn draft_device(&mut self, device: impl Into) -> &mut Self { + self.speculative.draft_device = Some(device.into()); + self + } + + pub fn draft_threads(&mut self, threads: usize) -> &mut Self { + self.speculative.draft_threads = Some(threads); + self + } + + pub fn draft_cache_types( + &mut self, + key: impl Into, + value: impl Into, + ) -> &mut Self { + self.speculative.draft_cache_type_k = Some(key.into()); + self.speculative.draft_cache_type_v = Some(value.into()); + self + } +} + pub struct PluginConfigEditor<'a> { plugin: &'a mut PluginConfigEntry, } diff --git a/crates/mesh-llm-config/src/lib.rs b/crates/mesh-llm-config/src/lib.rs index fd84d9fa2..dada57895 100644 --- a/crates/mesh-llm-config/src/lib.rs +++ b/crates/mesh-llm-config/src/lib.rs @@ -6,7 +6,7 @@ mod validate; pub use authoring::{ ConfigEditor, LocalServingNodeConfig, ModelConfigEditor, ModelDefaultsEditor, - PluginConfigEditor, + PluginConfigEditor, SpeculativeConfigEditor, }; pub use model::*; pub use store::{ConfigStore, config_path, config_to_toml, load_config, parse_config_toml}; @@ -204,6 +204,71 @@ ctx_size = 4096 assert!(fs::read_to_string(path).unwrap().contains("[[plugin]]")); } + #[test] + fn config_editor_updates_model_speculative_without_callers_writing_toml() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("config.toml"); + let store = ConfigStore::open(&path); + + let config = store + .update(|config| { + config + .upsert_model("meta-llama/Llama-3.3-70B-Instruct-GGUF:Q3_K_M")? + .speculative() + .mode("draft") + .draft_hf_source( + "unsloth/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-Q4_K_M.gguf", + ) + .draft_selection_policy("manual") + .pairing_fault("warn_disable") + .draft_max_tokens(16) + .draft_gpu_layers(-1); + Ok(()) + }) + .unwrap(); + + let speculative = config.models[0].speculative.as_ref().unwrap(); + assert_eq!(speculative.mode.as_deref(), Some("draft")); + assert_eq!(speculative.draft_max_tokens, Some(16)); + let raw = fs::read_to_string(path).unwrap(); + assert!(raw.contains("[[models]]")); + assert!(raw.contains("[models.speculative]")); + assert!(raw.contains("draft_selection_policy = \"manual\"")); + } + + #[test] + fn config_editor_updates_default_speculative_package_strategy() { + let temp_dir = TempDir::new().unwrap(); + let path = temp_dir.path().join("config.toml"); + let store = ConfigStore::open(&path); + + let config = store + .update(|config| { + config + .defaults() + .speculative() + .mode("auto") + .package_strategy("llama32-1b-q4"); + Ok(()) + }) + .unwrap(); + + let speculative = config + .defaults + .as_ref() + .and_then(|defaults| defaults.speculative.as_ref()) + .unwrap(); + assert_eq!(speculative.mode.as_deref(), Some("auto")); + assert_eq!( + speculative.package_strategy.as_deref(), + Some("llama32-1b-q4") + ); + let raw = fs::read_to_string(path).unwrap(); + assert!(raw.contains("[defaults.speculative]")); + assert!(raw.contains("package_strategy = \"llama32-1b-q4\"")); + } + #[test] fn parse_config_toml_rejects_unknown_runtime_kind() { let err = parse_config_toml( diff --git a/crates/mesh-llm-host-runtime/src/sdk.rs b/crates/mesh-llm-host-runtime/src/sdk.rs index d2b214ce8..3ec6c0e73 100644 --- a/crates/mesh-llm-host-runtime/src/sdk.rs +++ b/crates/mesh-llm-host-runtime/src/sdk.rs @@ -27,9 +27,9 @@ pub mod config { ModelConfigEntry, ModelDefaultsEditor, ModelFitConfig, ModelRuntimeKind, MultimodalConfig, OwnerControlConfig, PluginConfigEditor, PluginConfigEntry, PrefixCacheConfig, ReasoningBudget, ReasoningEnabled, RequestDefaultsConfig, ReservedObjectConfig, - SkippyConfig, SpeculativeConfig, StringOrStringList, TelemetryConfig, - TelemetryMetricsConfig, TensorSplitConfig, ThroughputConfig, config_path, config_to_toml, - load_config, parse_config_toml, validate_config, + SkippyConfig, SpeculativeConfig, SpeculativeConfigEditor, StringOrStringList, + TelemetryConfig, TelemetryMetricsConfig, TensorSplitConfig, ThroughputConfig, config_path, + config_to_toml, load_config, parse_config_toml, validate_config, }; } From ff1a70ac8a079338daa273c5a695d5229e41c3a9 Mon Sep 17 00:00:00 2001 From: James Dumay Date: Wed, 27 May 2026 18:46:26 +1000 Subject: [PATCH 5/9] Document speculative config authoring re-exports --- crates/mesh-llm-host-runtime/README.md | 32 ++++++++++++++++++++++++++ crates/mesh-llm/README.md | 6 +++++ 2 files changed, 38 insertions(+) diff --git a/crates/mesh-llm-host-runtime/README.md b/crates/mesh-llm-host-runtime/README.md index d41df951b..505102404 100644 --- a/crates/mesh-llm-host-runtime/README.md +++ b/crates/mesh-llm-host-runtime/README.md @@ -7,3 +7,35 @@ binary. This crate is being split so reusable CLI, TUI, SDK, and embeddable runtime surfaces can be published and consumed independently. + +## Config API + +The host runtime re-exports the shared config authoring surface through +`mesh_llm_host_runtime::sdk::config` and `mesh_llm_host_runtime::plugin`. Use +that API when host-runtime consumers need to read or write `config.toml`; avoid +hand-writing TOML or duplicating config structs in runtime callers. + +Speculative model settings use the same high-level editor API as other model +settings: + +```rust +use mesh_llm_host_runtime::sdk::config::ConfigStore; + +let store = ConfigStore::default_path()?; +store.update(|config| { + config + .upsert_model("meta-llama/Llama-3.3-70B-Instruct-GGUF:Q3_K_M")? + .speculative() + .mode("draft") + .draft_hf_source( + "unsloth/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-Q4_K_M.gguf", + ) + .draft_selection_policy("manual") + .draft_max_tokens(16); + Ok(()) +})?; +``` + +Schema ownership, validation, and persistence live in +[`../mesh-llm-config`](../mesh-llm-config). diff --git a/crates/mesh-llm/README.md b/crates/mesh-llm/README.md index 49ba1b855..8b6eb1f47 100644 --- a/crates/mesh-llm/README.md +++ b/crates/mesh-llm/README.md @@ -23,4 +23,10 @@ own the reusable pieces: - [`../mesh-llm-plugin`](../mesh-llm-plugin) for plugin author API and plugin protobuf schema - existing `model-*`, `openai-frontend`, and `skippy-*` crates for their domains +The compatibility re-export includes the shared config authoring API under +`mesh_llm::sdk::config`. Embedders that need to update `config.toml`, including +model-specific speculative decoding settings, should use `ConfigStore::update` +and the typed editors from [`../mesh-llm-config`](../mesh-llm-config) instead +of writing TOML directly. + For install and end-user usage, see the [project README](../../README.md). From 054b6ce844f02819488ad985be2ebee74a5b67c8 Mon Sep 17 00:00:00 2001 From: James Dumay Date: Wed, 27 May 2026 20:49:46 +1000 Subject: [PATCH 6/9] Fix package spec CI failures --- crates/mesh-llm-commands/src/model_package.rs | 45 ++++++++++--------- .../src/runtime/local.rs | 20 ++++++++- 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/crates/mesh-llm-commands/src/model_package.rs b/crates/mesh-llm-commands/src/model_package.rs index 8377cb429..3193c5d4e 100644 --- a/crates/mesh-llm-commands/src/model_package.rs +++ b/crates/mesh-llm-commands/src/model_package.rs @@ -198,26 +198,7 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> { ); if !submitting { - let redacted = redacted_spec(&job.spec); - if json { - println!( - "{}", - serde_json::to_string_pretty(&json!({ - "dryRun": true, - "confirmRequired": true, - "sourceRepo": job.source_repo, - "sourceFile": job.source_file, - "targetRepo": job.target_repo, - "modelId": job.model_id, - "jobPlan": job.job_plan, - "spec": redacted, - }))? - ); - } else { - eprintln!(); - eprintln!("🔍 Dry run — no HF Job was submitted. Add --confirm to submit."); - println!("{}", serde_json::to_string_pretty(&redacted)?); - } + print_package_job_dry_run(&job, json)?; return Ok(()); } @@ -266,6 +247,30 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> { Ok(()) } +fn print_package_job_dry_run(job: &prepare::PrepareJob, json_output: bool) -> Result<()> { + let redacted = redacted_spec(&job.spec); + if json_output { + println!( + "{}", + serde_json::to_string_pretty(&json!({ + "dryRun": true, + "confirmRequired": true, + "sourceRepo": job.source_repo, + "sourceFile": job.source_file, + "targetRepo": job.target_repo, + "modelId": job.model_id, + "jobPlan": job.job_plan, + "spec": redacted, + }))? + ); + } else { + eprintln!(); + eprintln!("🔍 Dry run — no HF Job was submitted. Add --confirm to submit."); + println!("{}", serde_json::to_string_pretty(&redacted)?); + } + Ok(()) +} + async fn run_list_quants( client: &hf_hub::HFClient, source_repo: &str, diff --git a/crates/mesh-llm-host-runtime/src/runtime/local.rs b/crates/mesh-llm-host-runtime/src/runtime/local.rs index 7cf87304e..0c667ed05 100644 --- a/crates/mesh-llm-host-runtime/src/runtime/local.rs +++ b/crates/mesh-llm-host-runtime/src/runtime/local.rs @@ -5447,8 +5447,24 @@ max_tokens = 222 ); } - #[tokio::test] - async fn load_split_runtime_generation_stops_candidate_stages_after_partial_load_failure() { + #[test] + fn load_split_runtime_generation_stops_candidate_stages_after_partial_load_failure() { + std::thread::Builder::new() + .name("split-load-partial-failure-test".to_string()) + .stack_size(16 * 1024 * 1024) + .spawn(|| { + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap() + .block_on(load_split_runtime_generation_partial_failure_case()); + }) + .unwrap() + .join() + .unwrap(); + } + + async fn load_split_runtime_generation_partial_failure_case() { let node = mesh::Node::new_for_tests(NodeRole::Host { http_port: 9337 }) .await .unwrap(); From 926b5bb8c1545efc4ecc3d3d3af342c7d133d874 Mon Sep 17 00:00:00 2001 From: James Dumay Date: Fri, 5 Jun 2026 17:30:29 +1000 Subject: [PATCH 7/9] Fix package draft branch after main refactors --- crates/mesh-llm-config/src/model.rs | 2 ++ crates/mesh-llm-config/src/validate.rs | 4 ++++ crates/mesh-llm-host-runtime/src/runtime/mod.rs | 6 +++--- crates/mesh-llm/src/commands/mod.rs | 10 ++++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/crates/mesh-llm-config/src/model.rs b/crates/mesh-llm-config/src/model.rs index 59a65f0a0..2a81ff828 100644 --- a/crates/mesh-llm-config/src/model.rs +++ b/crates/mesh-llm-config/src/model.rs @@ -404,6 +404,8 @@ pub struct SpeculativeConfig { #[serde(default)] pub mode: Option, #[serde(default)] + pub package_strategy: Option, + #[serde(default)] pub draft_model_path: Option, #[serde(default)] pub draft_hf_repo: Option, diff --git a/crates/mesh-llm-config/src/validate.rs b/crates/mesh-llm-config/src/validate.rs index a379c27e3..ce5fd75fa 100644 --- a/crates/mesh-llm-config/src/validate.rs +++ b/crates/mesh-llm-config/src/validate.rs @@ -497,6 +497,10 @@ fn validate_speculative(config: &SpeculativeConfig, base_path: &str) -> Result<( &["auto", "disabled", "draft", "ngram"], &format!("{base_path}.mode"), )?; + validate_optional_non_empty( + config.package_strategy.as_deref(), + &format!("{base_path}.package_strategy"), + )?; validate_hf_pair( config.draft_hf_repo.as_deref(), config.draft_hf_file.as_deref(), diff --git a/crates/mesh-llm-host-runtime/src/runtime/mod.rs b/crates/mesh-llm-host-runtime/src/runtime/mod.rs index ae28756a8..a363f9a32 100644 --- a/crates/mesh-llm-host-runtime/src/runtime/mod.rs +++ b/crates/mesh-llm-host-runtime/src/runtime/mod.rs @@ -6786,7 +6786,7 @@ async fn spawn_run_auto_startup_model_tasks( n_ubatch: primary_n_ubatch, flash_attention: primary_flash_attention, parallel_override: primary_parallel_override, - package_speculative_defaults: !cli.no_draft, + package_speculative_defaults: !options.no_draft, openai_guardrail_policy: runtime_state.openai_guardrail_policy.clone(), split: options.split, skippy_telemetry: skippy_telemetry.clone(), @@ -7086,7 +7086,7 @@ async fn run_auto_load_runtime_model( .and_then(|m| m.flash_attention) .unwrap_or(FlashAttentionType::Auto), parallel_override, - package_speculative_defaults: !ctx.cli.no_draft, + package_speculative_defaults: !ctx.options.no_draft, openai_guardrail_policy: ctx.openai_guardrail_policy.clone(), skippy_telemetry: skippy_telemetry_options(ctx.options), survey_telemetry: ctx.survey_telemetry.clone(), @@ -7903,7 +7903,7 @@ async fn spawn_run_auto_additional_model_tasks(ctx: RunAutoAdditionalModelsConte n_ubatch: extra_model.n_ubatch, flash_attention: extra_model.flash_attention, parallel_override: extra_model.parallel.or(ctx.config.gpu.parallel), - package_speculative_defaults: !ctx.cli.no_draft, + package_speculative_defaults: !ctx.options.no_draft, openai_guardrail_policy: ctx.openai_guardrail_policy.clone(), split: ctx.options.split, skippy_telemetry: ctx.skippy_telemetry.clone(), diff --git a/crates/mesh-llm/src/commands/mod.rs b/crates/mesh-llm/src/commands/mod.rs index 86c16438b..5c835c720 100644 --- a/crates/mesh-llm/src/commands/mod.rs +++ b/crates/mesh-llm/src/commands/mod.rs @@ -103,6 +103,11 @@ async fn dispatch_model_prepare(cmd: &Command) -> Result<()> { quant, target, model_id, + spec_draft_model, + spec_strategy, + spec_initial_window, + spec_min_window, + spec_max_window, flavor, timeout, mesh_llm_ref, @@ -126,6 +131,11 @@ async fn dispatch_model_prepare(cmd: &Command) -> Result<()> { quant: quant.as_deref(), target: target.as_deref(), model_id: model_id.as_deref(), + spec_draft_model: spec_draft_model.as_deref(), + spec_strategy, + spec_initial_window: *spec_initial_window, + spec_min_window: *spec_min_window, + spec_max_window: *spec_max_window, flavor, timeout, mesh_llm_ref, From f56a90b1e2d81bda0cb298f297af6a26de019164 Mon Sep 17 00:00:00 2001 From: James Dumay Date: Fri, 5 Jun 2026 19:48:56 +1000 Subject: [PATCH 8/9] Fix native runtime loading for draft packages --- crates/mesh-llm-native-runtime/src/cache.rs | 47 +++++++++++++++- crates/skippy-ffi/src/lib.rs | 52 +++++++++++++++++- crates/skippy-server/src/runtime_state.rs | 60 ++++++++++++++++++++- scripts/package-native-runtime.sh | 2 +- 4 files changed, 156 insertions(+), 5 deletions(-) diff --git a/crates/mesh-llm-native-runtime/src/cache.rs b/crates/mesh-llm-native-runtime/src/cache.rs index 413e08591..d920626f9 100644 --- a/crates/mesh-llm-native-runtime/src/cache.rs +++ b/crates/mesh-llm-native-runtime/src/cache.rs @@ -203,7 +203,10 @@ fn copy_dir_recursive(source: &Path, target: &Path) -> Result<()> { let entry = entry?; let source_path = entry.path(); let target_path = target.join(entry.file_name()); - if entry.file_type()?.is_dir() { + let file_type = entry.file_type()?; + if file_type.is_symlink() { + copy_symlink(&source_path, &target_path)?; + } else if file_type.is_dir() { copy_dir_recursive(&source_path, &target_path)?; } else { fs::copy(&source_path, &target_path).with_context(|| { @@ -218,6 +221,28 @@ fn copy_dir_recursive(source: &Path, target: &Path) -> Result<()> { Ok(()) } +#[cfg(unix)] +fn copy_symlink(source: &Path, target: &Path) -> Result<()> { + let link = fs::read_link(source).with_context(|| format!("read link {}", source.display()))?; + std::os::unix::fs::symlink(&link, target) + .with_context(|| format!("symlink {} to {}", link.display(), target.display())) +} + +#[cfg(windows)] +fn copy_symlink(source: &Path, target: &Path) -> Result<()> { + let link = fs::read_link(source).with_context(|| format!("read link {}", source.display()))?; + let resolved = source + .parent() + .unwrap_or_else(|| Path::new(".")) + .join(&link); + if resolved.is_dir() { + std::os::windows::fs::symlink_dir(&link, target) + } else { + std::os::windows::fs::symlink_file(&link, target) + } + .with_context(|| format!("symlink {} to {}", link.display(), target.display())) +} + #[cfg(test)] mod tests { use super::*; @@ -301,4 +326,24 @@ mod tests { ] ); } + + #[cfg(unix)] + #[test] + fn install_preserves_library_symlinks() { + let temp = tempfile::tempdir().unwrap(); + let source = temp.path().join("source"); + write_runtime(&source, "0.68.0", "meshllm-native-linux-x86_64-cpu"); + std::os::unix::fs::symlink("libmeshllm_ffi.so", source.join("lib/libmeshllm_ffi.so.0")) + .unwrap(); + + let cache = NativeRuntimeCache::new(temp.path().join("cache")); + let installed = cache.install_from_dir(&source).unwrap(); + let installed_alias = installed.path.join("lib/libmeshllm_ffi.so.0"); + + assert!(installed_alias.is_symlink()); + assert_eq!( + fs::read_link(installed_alias).unwrap(), + Path::new("libmeshllm_ffi.so") + ); + } } diff --git a/crates/skippy-ffi/src/lib.rs b/crates/skippy-ffi/src/lib.rs index 7ab529d57..91a7b9ccb 100644 --- a/crates/skippy-ffi/src/lib.rs +++ b/crates/skippy-ffi/src/lib.rs @@ -344,6 +344,8 @@ impl std::error::Error for NativeRuntimeLoadError {} mod dynamic { use super::*; use libloading::Library; + use std::collections::HashSet; + use std::path::{Path, PathBuf}; use std::sync::OnceLock; static SYMBOLS: OnceLock = OnceLock::new(); @@ -392,12 +394,28 @@ mod dynamic { .into_iter() .map(|path| path.as_ref().to_path_buf()) .collect::>(); - let symbols = unsafe { Symbols::load_paths(&collected) }?; + let symbols = unsafe { Symbols::load_paths(&unique_library_paths(&collected)) }?; SYMBOLS .set(symbols) .map_err(|_| NativeRuntimeLoadError::AlreadyLoaded) } + fn unique_library_paths

(paths: &[P]) -> Vec + where + P: AsRef, + { + let mut seen = HashSet::new(); + let mut unique = Vec::with_capacity(paths.len()); + for path in paths { + let path = path.as_ref(); + let identity = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); + if seen.insert(identity) { + unique.push(path.to_path_buf()); + } + } + unique + } + fn symbols() -> &'static Symbols { SYMBOLS .get() @@ -553,6 +571,38 @@ mod dynamic { mtmd_helper_eval_chunks(ctx: *mut MtmdContext, lctx: *mut Opaque, chunks: *const MtmdInputChunks, n_past: i32, seq_id: i32, n_batch: i32, logits_last: bool, new_n_past: *mut i32) -> c_int; mtmd_helper_eval_chunk_single(ctx: *mut MtmdContext, lctx: *mut Opaque, chunk: *const Opaque, n_past: i32, seq_id: i32, n_batch: i32, logits_last: bool, new_n_past: *mut i32) -> c_int; } + + #[cfg(test)] + mod tests { + use super::unique_library_paths; + use std::fs; + use std::time::{SystemTime, UNIX_EPOCH}; + + #[cfg(unix)] + #[test] + fn unique_library_paths_deduplicates_symlinks() { + let root = std::env::temp_dir().join(format!( + "skippy-ffi-{}-{}", + std::process::id(), + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + fs::create_dir_all(&root).unwrap(); + let real = root.join("libllama.0.0.dylib"); + let soname = root.join("libllama.0.dylib"); + let linker = root.join("libllama.dylib"); + fs::write(&real, b"not a real dylib").unwrap(); + std::os::unix::fs::symlink(&real, &soname).unwrap(); + std::os::unix::fs::symlink(&real, &linker).unwrap(); + + let unique = unique_library_paths(&[&real, &soname, &linker]); + + assert_eq!(unique, vec![real.clone()]); + fs::remove_dir_all(root).unwrap(); + } + } } #[cfg(feature = "dynamic-runtime")] diff --git a/crates/skippy-server/src/runtime_state.rs b/crates/skippy-server/src/runtime_state.rs index ab44aeeb7..af467ad3f 100644 --- a/crates/skippy-server/src/runtime_state.rs +++ b/crates/skippy-server/src/runtime_state.rs @@ -904,6 +904,12 @@ fn runtime_config_from_stage_config( .map(u32::try_from) .transpose() .with_context(|| format!("n_threads_batch exceeds u32 for {}", config.stage_id))?; + let requires_tensor_filter = matches!( + config.load_mode, + LoadMode::LayerPackage | LoadMode::ArtifactSlice + ) || config.layer_start > 0 + || config.downstream.is_some(); + Ok(RuntimeConfig { stage_index: config.stage_index, layer_start: config.layer_start, @@ -934,7 +940,7 @@ fn runtime_config_from_stage_config( projector_path: config.projector_path.clone(), include_embeddings: config.layer_start == 0, include_output: config.downstream.is_none(), - filter_tensors_on_load: config.filter_tensors_on_load, + filter_tensors_on_load: config.filter_tensors_on_load || requires_tensor_filter, }) } @@ -1105,7 +1111,7 @@ mod tests { cache_type_k: "f16".to_string(), cache_type_v: "f16".to_string(), flash_attn_type: FlashAttentionType::Enabled, - filter_tensors_on_load: true, + filter_tensors_on_load: false, selected_device: Some(StageDevice { backend_device: "Vulkan1".into(), stable_id: Some("pci:0000:65:00.0".into()), @@ -1186,6 +1192,7 @@ mod tests { assert!(!runtime_config.include_embeddings); assert!(runtime_config.include_output); + assert!(runtime_config.filter_tensors_on_load); } #[test] @@ -1231,6 +1238,55 @@ mod tests { assert_eq!(runtime_config.n_threads_batch, None); assert_eq!(runtime_config.n_batch, None); assert_eq!(runtime_config.n_ubatch, None); + assert!(!runtime_config.filter_tensors_on_load); + } + + #[test] + fn runtime_config_forces_tensor_filter_for_first_stage_with_downstream() { + let config = StageConfig { + run_id: "run-a".to_string(), + topology_id: "topology-a".to_string(), + model_id: "model-a".to_string(), + package_ref: Some("/tmp/package".to_string()), + manifest_sha256: Some("manifest".to_string()), + source_model_path: None, + source_model_sha256: None, + source_model_bytes: None, + materialized_path: None, + materialized_pinned: false, + model_path: Some("/tmp/package".to_string()), + projector_path: None, + stage_id: "stage-0".to_string(), + stage_index: 0, + layer_start: 0, + layer_end: 20, + ctx_size: 512, + lane_count: 1, + n_batch: None, + n_ubatch: None, + n_gpu_layers: -1, + cache_type_k: "f16".to_string(), + cache_type_v: "f16".to_string(), + flash_attn_type: FlashAttentionType::Auto, + filter_tensors_on_load: false, + selected_device: None, + kv_cache: None, + load_mode: LoadMode::RuntimeSlice, + bind_addr: "127.0.0.1:0".to_string(), + upstream: None, + downstream: Some(PeerConfig { + stage_id: "stage-1".to_string(), + stage_index: 1, + endpoint: "tcp://127.0.0.1:19001".to_string(), + }), + }; + + let runtime_config = + runtime_config_from_stage_config(&config, &RuntimeLaunchOverrides::default()).unwrap(); + + assert!(runtime_config.include_embeddings); + assert!(!runtime_config.include_output); + assert!(runtime_config.filter_tensors_on_load); } #[test] diff --git a/scripts/package-native-runtime.sh b/scripts/package-native-runtime.sh index 3786ae912..261865da2 100755 --- a/scripts/package-native-runtime.sh +++ b/scripts/package-native-runtime.sh @@ -288,7 +288,7 @@ mkdir -p "$stage_dir/lib" library_paths=() for library in "${runtime_libraries[@]}"; do name="$(basename "$library")" - cp "$library" "$stage_dir/lib/$name" + cp -P "$library" "$stage_dir/lib/$name" library_paths+=("lib/$name") done From 63e3ee7be29dd1adb1b576a816da1a76fa8f599a Mon Sep 17 00:00:00 2001 From: James Dumay Date: Sun, 7 Jun 2026 09:54:36 +1000 Subject: [PATCH 9/9] Tune speculative fallback for drafting --- .../src/frontend/embedded_generation.rs | 14 +++- .../skippy-server/src/frontend/speculative.rs | 68 ++++++++++++++++-- crates/skippy-server/src/frontend/tests.rs | 72 +++++++++++++++++++ 3 files changed, 148 insertions(+), 6 deletions(-) diff --git a/crates/skippy-server/src/frontend/embedded_generation.rs b/crates/skippy-server/src/frontend/embedded_generation.rs index 847153510..020768a0c 100644 --- a/crates/skippy-server/src/frontend/embedded_generation.rs +++ b/crates/skippy-server/src/frontend/embedded_generation.rs @@ -718,7 +718,7 @@ impl StageOpenAiBackend { } let max_speculative_window = request.speculative_window.max(1); let mut adaptive_window = if request.adaptive_speculative_window { - max_speculative_window.min(4) + max_speculative_window.min(2) } else { max_speculative_window }; @@ -767,6 +767,9 @@ impl StageOpenAiBackend { if fused_reached_stop { break; } + if decoded_tokens >= request.max_tokens as usize { + break; + } if request .cancellation .is_some_and(openai_frontend::CancellationToken::is_cancelled) @@ -774,6 +777,10 @@ impl StageOpenAiBackend { break; } let token_timer = PhaseTimer::start(); + if draft_guard.is_some() && speculative_stats.should_disable_for_request() { + speculative_stats.mark_disabled_for_request(); + draft_guard = None; + } if draft_guard.is_some() { let remaining = request.max_tokens as usize - decoded_tokens; if remaining == 0 { @@ -985,6 +992,11 @@ impl StageOpenAiBackend { } } + let remaining_commit_budget = + (request.max_tokens as usize).saturating_sub(decoded_tokens); + commit_tokens.truncate(remaining_commit_budget); + speculative_stats.committed_tokens += commit_tokens.len(); + let mut reached_stop = false; for token in commit_tokens { current = token; diff --git a/crates/skippy-server/src/frontend/speculative.rs b/crates/skippy-server/src/frontend/speculative.rs index b9791139f..f90b9e07c 100644 --- a/crates/skippy-server/src/frontend/speculative.rs +++ b/crates/skippy-server/src/frontend/speculative.rs @@ -1,9 +1,16 @@ use super::*; +const SPEC_FALLBACK_MIN_WINDOWS: usize = 4; +const SPEC_FALLBACK_MIN_DRAFT_TOKENS: usize = 8; +const SPEC_FALLBACK_ACCEPT_RATE: f64 = 0.75; +const SPEC_FALLBACK_REPAIR_RATE: f64 = 0.50; +const SPEC_FALLBACK_MAX_MS_PER_COMMITTED_TOKEN: f64 = 220.0; + #[derive(Default)] pub(super) struct OpenAiSpeculativeStats { pub(super) windows: usize, pub(super) draft_tokens: usize, + pub(super) committed_tokens: usize, pub(super) accepted_tokens: usize, pub(super) rejected_tokens: usize, pub(super) full_accept_windows: usize, @@ -47,9 +54,26 @@ pub(super) struct OpenAiSpeculativeStats { pub(super) adaptive_window_grows: usize, pub(super) adaptive_window_shrinks: usize, pub(super) adaptive_window_enabled: bool, + pub(super) disabled_for_request: bool, } impl OpenAiSpeculativeStats { + pub(super) fn should_disable_for_request(&self) -> bool { + if self.disabled_for_request + || self.windows < SPEC_FALLBACK_MIN_WINDOWS + || self.draft_tokens < SPEC_FALLBACK_MIN_DRAFT_TOKENS + { + return false; + } + self.accept_rate() < SPEC_FALLBACK_ACCEPT_RATE + || self.repair_rate() >= SPEC_FALLBACK_REPAIR_RATE + || self.speculative_ms_per_committed_token() > SPEC_FALLBACK_MAX_MS_PER_COMMITTED_TOKEN + } + + pub(super) fn mark_disabled_for_request(&mut self) { + self.disabled_for_request = true; + } + pub(super) fn observe_verify_decision( &mut self, decision: VerifySpanDecision, @@ -99,6 +123,32 @@ impl OpenAiSpeculativeStats { } } + fn accept_rate(&self) -> f64 { + if self.draft_tokens == 0 { + 0.0 + } else { + self.accepted_tokens as f64 / self.draft_tokens as f64 + } + } + + fn repair_rate(&self) -> f64 { + if self.windows == 0 { + 0.0 + } else { + self.repair_required_windows as f64 / self.windows as f64 + } + } + + fn speculative_ms_per_committed_token(&self) -> f64 { + if self.committed_tokens == 0 { + 0.0 + } else { + let elapsed_ms = + self.draft_propose_ms + self.primary_verify_elapsed_ms + self.recovery_ms; + elapsed_ms / self.committed_tokens as f64 + } + } + pub(super) fn observe_reject(&mut self, decision: VerifySpanDecision) { if let Some(repair_input_count) = decision.repair_input_count { self.rejected_windows += 1; @@ -155,17 +205,21 @@ impl OpenAiSpeculativeStats { "llama_stage.spec.accepted".to_string(), json!(self.accepted_tokens), ); + attrs.insert( + "llama_stage.spec.committed".to_string(), + json!(self.committed_tokens), + ); attrs.insert( "llama_stage.spec.rejected".to_string(), json!(self.rejected_tokens), ); attrs.insert( "llama_stage.spec.accept_rate".to_string(), - json!(if self.draft_tokens == 0 { - 0.0 - } else { - self.accepted_tokens as f64 / self.draft_tokens as f64 - }), + json!(self.accept_rate()), + ); + attrs.insert( + "llama_stage.spec.ms_per_committed_token".to_string(), + json!(self.speculative_ms_per_committed_token()), ); attrs.insert( "llama_stage.spec.full_accept_windows".to_string(), @@ -291,6 +345,10 @@ impl OpenAiSpeculativeStats { "llama_stage.spec.window_shrinks".to_string(), json!(self.adaptive_window_shrinks), ); + attrs.insert( + "llama_stage.spec.disabled_for_request".to_string(), + json!(self.disabled_for_request), + ); } } diff --git a/crates/skippy-server/src/frontend/tests.rs b/crates/skippy-server/src/frontend/tests.rs index b148bd4bf..418c258cd 100644 --- a/crates/skippy-server/src/frontend/tests.rs +++ b/crates/skippy-server/src/frontend/tests.rs @@ -28,6 +28,78 @@ const MM_CTX_SIZE_ENV: &str = "SKIPPY_MM_CTX_SIZE"; const MM_MAX_TOKENS_ENV: &str = "SKIPPY_MM_MAX_TOKENS"; const MM_N_GPU_LAYERS_ENV: &str = "SKIPPY_MM_N_GPU_LAYERS"; +#[test] +fn speculative_fallback_waits_for_enough_evidence() { + let stats = OpenAiSpeculativeStats { + windows: 3, + draft_tokens: 7, + accepted_tokens: 0, + repair_required_windows: 3, + ..OpenAiSpeculativeStats::default() + }; + + assert!(!stats.should_disable_for_request()); +} + +#[test] +fn speculative_fallback_disables_low_acceptance_requests() { + let mut stats = OpenAiSpeculativeStats { + windows: 4, + draft_tokens: 8, + committed_tokens: 5, + accepted_tokens: 5, + repair_required_windows: 0, + ..OpenAiSpeculativeStats::default() + }; + + assert!(stats.should_disable_for_request()); + stats.mark_disabled_for_request(); + assert!(!stats.should_disable_for_request()); +} + +#[test] +fn speculative_fallback_disables_repair_heavy_requests() { + let stats = OpenAiSpeculativeStats { + windows: 4, + draft_tokens: 8, + committed_tokens: 8, + accepted_tokens: 8, + repair_required_windows: 2, + ..OpenAiSpeculativeStats::default() + }; + + assert!(stats.should_disable_for_request()); +} + +#[test] +fn speculative_fallback_keeps_high_acceptance_requests() { + let stats = OpenAiSpeculativeStats { + windows: 4, + draft_tokens: 8, + committed_tokens: 8, + accepted_tokens: 8, + repair_required_windows: 1, + primary_verify_elapsed_ms: 1_600.0, + ..OpenAiSpeculativeStats::default() + }; + + assert!(!stats.should_disable_for_request()); +} + +#[test] +fn speculative_fallback_disables_slow_accepted_requests() { + let stats = OpenAiSpeculativeStats { + windows: 4, + draft_tokens: 8, + committed_tokens: 8, + accepted_tokens: 8, + primary_verify_elapsed_ms: 1_900.0, + ..OpenAiSpeculativeStats::default() + }; + + assert!(stats.should_disable_for_request()); +} + #[test] fn proactive_eviction_attrs_are_bounded_and_request_free() { let attrs = proactive_eviction_attrs("error", Some("inactive_session"), 1024, 2, 768);