Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions crates/mesh-llm-cli/src/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,21 @@ pub enum ModelsCommand {
/// Override model ID in the manifest.
#[arg(long)]
model_id: Option<String>,
/// Draft model ref to declare for package default speculative decoding.
#[arg(long = "spec-draft-model")]
spec_draft_model: Option<String>,
/// Package speculative strategy name.
#[arg(long = "spec-strategy", default_value = "draft")]
spec_strategy: String,
/// Initial adaptive speculative decode window for the package strategy.
#[arg(long = "spec-initial-window", default_value_t = 16)]
spec_initial_window: u32,
/// Minimum adaptive speculative decode window for the package strategy.
#[arg(long = "spec-min-window", default_value_t = 2)]
spec_min_window: u32,
/// Maximum adaptive speculative decode window for the package strategy.
#[arg(long = "spec-max-window", default_value_t = 16)]
spec_max_window: u32,
/// HF Job hardware flavor. Use auto for the default CPU splitter baseline.
#[arg(long, default_value = "auto")]
flavor: String,
Expand Down
20 changes: 20 additions & 0 deletions crates/mesh-llm-cli/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,26 @@ pub enum Command {
#[arg(long)]
model_id: Option<String>,

/// Draft model ref to declare for package default speculative decoding.
#[arg(long = "spec-draft-model")]
spec_draft_model: Option<String>,

/// Package speculative strategy name.
#[arg(long = "spec-strategy", default_value = "draft")]
spec_strategy: String,

/// Initial adaptive speculative decode window for the package strategy.
#[arg(long = "spec-initial-window", default_value_t = 16)]
spec_initial_window: u32,

/// Minimum adaptive speculative decode window for the package strategy.
#[arg(long = "spec-min-window", default_value_t = 2)]
spec_min_window: u32,

/// Maximum adaptive speculative decode window for the package strategy.
#[arg(long = "spec-max-window", default_value_t = 16)]
spec_max_window: u32,

/// HF Job hardware flavor. Use auto for the default CPU splitter baseline.
#[arg(long, default_value = "auto")]
flavor: String,
Expand Down
60 changes: 40 additions & 20 deletions crates/mesh-llm-commands/src/model_package.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ pub struct ModelPrepareArgs<'a> {
pub quant: Option<&'a str>,
pub target: Option<&'a str>,
pub model_id: Option<&'a str>,
pub spec_draft_model: Option<&'a str>,
pub spec_strategy: &'a str,
pub spec_initial_window: u32,
pub spec_min_window: u32,
pub spec_max_window: u32,
pub flavor: &'a str,
pub timeout: &'a str,
pub mesh_llm_ref: &'a str,
Expand All @@ -34,6 +39,11 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> {
quant,
target,
model_id,
spec_draft_model,
spec_strategy,
spec_initial_window,
spec_min_window,
spec_max_window,
flavor,
timeout,
mesh_llm_ref,
Expand Down Expand Up @@ -119,6 +129,11 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> {
quant: source_quant.map(|s| s.to_string()),
target: target.map(|s| s.to_string()),
model_id: model_id.map(|s| s.to_string()),
spec_draft_model: spec_draft_model.map(|s| s.to_string()),
spec_strategy: spec_strategy.to_string(),
spec_initial_window,
spec_min_window,
spec_max_window,
flavor: flavor.to_string(),
timeout_seconds,
mesh_llm_ref: mesh_llm_ref.to_string(),
Expand Down Expand Up @@ -183,26 +198,7 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> {
);

if !submitting {
let redacted = redacted_spec(&job.spec);
if json {
println!(
"{}",
serde_json::to_string_pretty(&json!({
"dryRun": true,
"confirmRequired": true,
"sourceRepo": job.source_repo,
"sourceFile": job.source_file,
"targetRepo": job.target_repo,
"modelId": job.model_id,
"jobPlan": job.job_plan,
"spec": redacted,
}))?
);
} else {
eprintln!();
eprintln!("🔍 Dry run — no HF Job was submitted. Add --confirm to submit.");
println!("{}", serde_json::to_string_pretty(&redacted)?);
}
print_package_job_dry_run(&job, json)?;
return Ok(());
}

Expand Down Expand Up @@ -251,6 +247,30 @@ pub async fn dispatch_model_package(args: ModelPrepareArgs<'_>) -> Result<()> {
Ok(())
}

/// Prints the dry-run view of a package job; nothing is submitted here.
///
/// The spec is first passed through `redacted_spec`. When `json_output` is set, a
/// pretty-printed JSON envelope (dry-run flags, source/target coordinates, model ID,
/// job plan and that spec) is written to stdout. Otherwise a short notice goes to
/// stderr and only the spec is pretty-printed to stdout.
///
/// # Errors
///
/// Propagates any error returned by `serde_json::to_string_pretty`.
fn print_package_job_dry_run(job: &prepare::PrepareJob, json_output: bool) -> Result<()> {
    let spec = redacted_spec(&job.spec);

    // Human-readable path: notice on stderr, spec on stdout.
    if !json_output {
        eprintln!();
        eprintln!("🔍 Dry run — no HF Job was submitted. Add --confirm to submit.");
        println!("{}", serde_json::to_string_pretty(&spec)?);
        return Ok(());
    }

    // Machine-readable path: wrap the spec together with the job coordinates.
    let envelope = json!({
        "dryRun": true,
        "confirmRequired": true,
        "sourceRepo": job.source_repo,
        "sourceFile": job.source_file,
        "targetRepo": job.target_repo,
        "modelId": job.model_id,
        "jobPlan": job.job_plan,
        "spec": spec,
    });
    println!("{}", serde_json::to_string_pretty(&envelope)?);
    Ok(())
}

async fn run_list_quants(
client: &hf_hub::HFClient,
source_repo: &str,
Expand Down
25 changes: 25 additions & 0 deletions crates/mesh-llm-config/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,31 @@ store.update(|config| {
})?;
```

### Configure speculative decoding

Use the editor API to set speculative decoding defaults or model-specific
speculative decoding settings. This keeps SDKs and apps on the same validated
config schema as the CLI.

```rust
use mesh_llm_config::ConfigStore;

let store = ConfigStore::default_path()?;
store.update(|config| {
config
.upsert_model("meta-llama/Llama-3.3-70B-Instruct-GGUF:Q3_K_M")?
.speculative()
.mode("draft")
.draft_hf_source(
"unsloth/Llama-3.2-1B-Instruct-GGUF",
"Llama-3.2-1B-Instruct-Q4_K_M.gguf",
)
.draft_selection_policy("manual")
.draft_max_tokens(16);
Ok(())
})?;
```

### Configure plugins

Use plugin helpers for common cases instead of writing `[[plugin]]` tables.
Expand Down
104 changes: 103 additions & 1 deletion crates/mesh-llm-config/src/authoring.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::{
GpuAssignment, HardwareConfig, MeshConfig, ModelConfigDefaults, ModelConfigEntry,
ModelFitConfig, MultimodalConfig, PluginConfigEntry, RequestDefaultsConfig, ThroughputConfig,
ModelFitConfig, MultimodalConfig, PluginConfigEntry, RequestDefaultsConfig, SpeculativeConfig,
ThroughputConfig,
};
use anyhow::{Result, bail};
use mesh_llm_types::runtime::ModelRuntimeKind;
Expand Down Expand Up @@ -256,6 +257,20 @@ impl ModelDefaultsEditor<'_> {
self
}

/// Returns an editor over the defaults' speculative section, inserting a
/// default-initialized section first when none is set.
pub fn speculative(&mut self) -> SpeculativeConfigEditor<'_> {
    let speculative = self
        .defaults
        .speculative
        .get_or_insert_with(Default::default);
    SpeculativeConfigEditor { speculative }
}

/// Resets the defaults' speculative section to `None` and returns the editor for chaining.
pub fn clear_speculative(&mut self) -> &mut Self {
    let slot = &mut self.defaults.speculative;
    *slot = None;
    self
}

/// Returns the defaults' hardware section, inserting a default-initialized one when unset.
fn hardware(&mut self) -> &mut HardwareConfig {
    let slot = &mut self.defaults.hardware;
    slot.get_or_insert_with(Default::default)
}
Expand Down Expand Up @@ -332,6 +347,17 @@ impl ModelConfigEditor<'_> {
self
}

/// Returns an editor over this model's speculative section, inserting a
/// default-initialized section first when none is set.
pub fn speculative(&mut self) -> SpeculativeConfigEditor<'_> {
    let speculative = self.model.speculative.get_or_insert_with(Default::default);
    SpeculativeConfigEditor { speculative }
}

/// Resets this model's speculative section to `None` and returns the editor for chaining.
pub fn clear_speculative(&mut self) -> &mut Self {
    let slot = &mut self.model.speculative;
    *slot = None;
    self
}

/// Returns this model's hardware section, inserting a default-initialized one when unset.
fn hardware(&mut self) -> &mut HardwareConfig {
    let slot = &mut self.model.hardware;
    slot.get_or_insert_with(Default::default)
}
Expand All @@ -355,6 +381,82 @@ impl ModelConfigEditor<'_> {
}
}

/// Chainable editor over a mutably borrowed [`SpeculativeConfig`].
///
/// Every setter stores its argument as `Some(..)` in the matching field (or pair of
/// fields) and hands back `&mut Self`, so calls can be chained. The setters perform
/// no validation of their own.
pub struct SpeculativeConfigEditor<'a> {
    speculative: &'a mut SpeculativeConfig,
}

impl SpeculativeConfigEditor<'_> {
    /// Sets the `mode` field.
    pub fn mode(&mut self, mode: impl Into<String>) -> &mut Self {
        let mode: String = mode.into();
        self.speculative.mode = Some(mode);
        self
    }

    /// Sets the `package_strategy` field.
    pub fn package_strategy(&mut self, strategy: impl Into<String>) -> &mut Self {
        let strategy: String = strategy.into();
        self.speculative.package_strategy = Some(strategy);
        self
    }

    /// Sets the `draft_model_path` field.
    pub fn draft_model_path(&mut self, path: impl Into<String>) -> &mut Self {
        let path: String = path.into();
        self.speculative.draft_model_path = Some(path);
        self
    }

    /// Sets `draft_hf_repo` and `draft_hf_file` together.
    pub fn draft_hf_source(
        &mut self,
        repo: impl Into<String>,
        file: impl Into<String>,
    ) -> &mut Self {
        let repo: String = repo.into();
        self.speculative.draft_hf_repo = Some(repo);
        let file: String = file.into();
        self.speculative.draft_hf_file = Some(file);
        self
    }

    /// Sets the `draft_selection_policy` field.
    pub fn draft_selection_policy(&mut self, policy: impl Into<String>) -> &mut Self {
        let policy: String = policy.into();
        self.speculative.draft_selection_policy = Some(policy);
        self
    }

    /// Sets the `pairing_fault` field.
    pub fn pairing_fault(&mut self, policy: impl Into<String>) -> &mut Self {
        let policy: String = policy.into();
        self.speculative.pairing_fault = Some(policy);
        self
    }

    /// Sets the `draft_max_tokens` field.
    pub fn draft_max_tokens(&mut self, tokens: u32) -> &mut Self {
        self.speculative.draft_max_tokens = Some(tokens);
        self
    }

    /// Sets the `draft_min_tokens` field.
    pub fn draft_min_tokens(&mut self, tokens: u32) -> &mut Self {
        self.speculative.draft_min_tokens = Some(tokens);
        self
    }

    /// Sets the `draft_gpu_layers` field.
    // NOTE(review): the value is signed and the crate's tests pass `-1`; the meaning
    // of negative values is not visible here — confirm against the runtime.
    pub fn draft_gpu_layers(&mut self, layers: i32) -> &mut Self {
        self.speculative.draft_gpu_layers = Some(layers);
        self
    }

    /// Sets the `draft_device` field.
    pub fn draft_device(&mut self, device: impl Into<String>) -> &mut Self {
        let device: String = device.into();
        self.speculative.draft_device = Some(device);
        self
    }

    /// Sets the `draft_threads` field.
    pub fn draft_threads(&mut self, threads: usize) -> &mut Self {
        self.speculative.draft_threads = Some(threads);
        self
    }

    /// Sets `draft_cache_type_k` (from `key`) and `draft_cache_type_v` (from `value`).
    pub fn draft_cache_types(
        &mut self,
        key: impl Into<String>,
        value: impl Into<String>,
    ) -> &mut Self {
        let key: String = key.into();
        self.speculative.draft_cache_type_k = Some(key);
        let value: String = value.into();
        self.speculative.draft_cache_type_v = Some(value);
        self
    }
}

pub struct PluginConfigEditor<'a> {
plugin: &'a mut PluginConfigEntry,
}
Expand Down
67 changes: 66 additions & 1 deletion crates/mesh-llm-config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ mod validate;

pub use authoring::{
ConfigEditor, LocalServingNodeConfig, ModelConfigEditor, ModelDefaultsEditor,
PluginConfigEditor,
PluginConfigEditor, SpeculativeConfigEditor,
};
pub use model::*;
pub use store::{ConfigStore, config_path, config_to_toml, load_config, parse_config_toml};
Expand Down Expand Up @@ -204,6 +204,71 @@ ctx_size = 4096
assert!(fs::read_to_string(path).unwrap().contains("[[plugin]]"));
}

/// Edits a model's speculative section through the editor API, then checks both the
/// returned config and the TOML text written to the store's path.
#[test]
fn config_editor_updates_model_speculative_without_callers_writing_toml() {
    let temp_dir = TempDir::new().unwrap();
    let path = temp_dir.path().join("config.toml");
    let store = ConfigStore::open(&path);

    let config = store
        .update(|config| {
            let mut model =
                config.upsert_model("meta-llama/Llama-3.3-70B-Instruct-GGUF:Q3_K_M")?;
            let mut editor = model.speculative();
            editor.mode("draft");
            editor.draft_hf_source(
                "unsloth/Llama-3.2-1B-Instruct-GGUF",
                "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
            );
            editor.draft_selection_policy("manual");
            editor.pairing_fault("warn_disable");
            editor.draft_max_tokens(16);
            editor.draft_gpu_layers(-1);
            Ok(())
        })
        .unwrap();

    // In-memory view returned by `update`.
    let model_speculative = config.models[0].speculative.as_ref().unwrap();
    assert_eq!(model_speculative.mode.as_deref(), Some("draft"));
    assert_eq!(model_speculative.draft_max_tokens, Some(16));

    // On-disk view: the file must contain the expected tables and key.
    let written = fs::read_to_string(path).unwrap();
    assert!(written.contains("[[models]]"));
    assert!(written.contains("[models.speculative]"));
    assert!(written.contains("draft_selection_policy = \"manual\""));
}

/// Sets `mode` and `package_strategy` on the defaults' speculative section through the
/// editor API, then checks both the returned config and the TOML text on disk.
#[test]
fn config_editor_updates_default_speculative_package_strategy() {
    let temp_dir = TempDir::new().unwrap();
    let path = temp_dir.path().join("config.toml");
    let store = ConfigStore::open(&path);

    let config = store
        .update(|config| {
            let mut defaults = config.defaults();
            let mut editor = defaults.speculative();
            editor.mode("auto");
            editor.package_strategy("llama32-1b-q4");
            Ok(())
        })
        .unwrap();

    // In-memory view returned by `update`.
    let default_speculative = config
        .defaults
        .as_ref()
        .and_then(|section| section.speculative.as_ref())
        .unwrap();
    assert_eq!(default_speculative.mode.as_deref(), Some("auto"));
    assert_eq!(
        default_speculative.package_strategy.as_deref(),
        Some("llama32-1b-q4")
    );

    // On-disk view: the file must contain the expected table and key.
    let written = fs::read_to_string(path).unwrap();
    assert!(written.contains("[defaults.speculative]"));
    assert!(written.contains("package_strategy = \"llama32-1b-q4\""));
}

#[test]
fn parse_config_toml_rejects_unknown_runtime_kind() {
let err = parse_config_toml(
Expand Down
2 changes: 2 additions & 0 deletions crates/mesh-llm-config/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,8 @@ pub struct SpeculativeConfig {
#[serde(default)]
pub mode: Option<String>,
#[serde(default)]
pub package_strategy: Option<String>,
#[serde(default)]
pub draft_model_path: Option<String>,
#[serde(default)]
pub draft_hf_repo: Option<String>,
Expand Down
Loading