From 1f4f1da4a25404190e14bb85b4824661f7021cf7 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Mon, 25 May 2026 21:44:29 +0700 Subject: [PATCH 01/22] feat(agent-runtime): add AgentDefinition + ModelTier + OutputMode (Phase 0.1+0.2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lay the foundation for declarative agent definitions modeled on Codebuff's AgentDefinition schema, but adapted to jcode's single-OAuth provider reality: - signals.rs: existing soft-interrupt + cancellation primitives moved into a named module; root-level re-exports preserved so src/agent.rs consumers compile unchanged. - definition.rs: AgentDefinition struct (id, model_override, prefer_tier, reasoning, tool_names, spawnable_agents, prompts, output_mode, inherit_parent_system_prompt, include_message_history) with TOML round-trip + validation for id format, system_prompt vs inherit conflict, structured_output schema requirement, self-spawn, and duplicate tool/agent ids. - tier.rs: user-defined tier slot (routine/thinking) backed by the same JCODE_ROUTING_* env vars as model_routing.rs (#100). NOT a catalog — agents inherit session model when no tier is configured, so subscription users (Claude Pro / ChatGPT Plus / Gemini Advanced) see no behavior change. Pay-per-token users opt in by setting two env vars. - reasoning.rs: ReasoningEffort enum (minimal/low/medium/high). - output.rs: OutputMode enum (last_message/all_messages/structured_output). 32 unit tests pass. Full `cargo check --bin jcode` succeeds. This is Phase 0 of the multi-agent foundation — no runtime engine changes yet. Next: TOML loader for .jcode/agents/*.toml + builtin embedded agents (Phase 0.3). 
--- Cargo.lock | 4 + crates/jcode-agent-runtime/Cargo.toml | 7 + crates/jcode-agent-runtime/src/definition.rs | 495 +++++++++++++++++++ crates/jcode-agent-runtime/src/lib.rs | 134 ++--- crates/jcode-agent-runtime/src/output.rs | 75 +++ crates/jcode-agent-runtime/src/reasoning.rs | 108 ++++ crates/jcode-agent-runtime/src/signals.rs | 98 ++++ crates/jcode-agent-runtime/src/tier.rs | 330 +++++++++++++ 8 files changed, 1160 insertions(+), 91 deletions(-) create mode 100644 crates/jcode-agent-runtime/src/definition.rs create mode 100644 crates/jcode-agent-runtime/src/output.rs create mode 100644 crates/jcode-agent-runtime/src/reasoning.rs create mode 100644 crates/jcode-agent-runtime/src/signals.rs create mode 100644 crates/jcode-agent-runtime/src/tier.rs diff --git a/Cargo.lock b/Cargo.lock index 97eb00ea4..20990af27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3645,8 +3645,12 @@ dependencies = [ name = "jcode-agent-runtime" version = "0.1.0" dependencies = [ + "anyhow", + "serde", + "serde_json", "thiserror 1.0.69", "tokio", + "toml", ] [[package]] diff --git a/crates/jcode-agent-runtime/Cargo.toml b/crates/jcode-agent-runtime/Cargo.toml index c475c51d8..f66eb40ce 100644 --- a/crates/jcode-agent-runtime/Cargo.toml +++ b/crates/jcode-agent-runtime/Cargo.toml @@ -10,3 +10,10 @@ path = "src/lib.rs" [dependencies] thiserror = "1" tokio = { version = "1", features = ["sync"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +toml = "0.8" +anyhow = "1" + +[dev-dependencies] +serde_json = "1" diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs new file mode 100644 index 000000000..3e2203e8b --- /dev/null +++ b/crates/jcode-agent-runtime/src/definition.rs @@ -0,0 +1,495 @@ +//! Declarative agent definitions. +//! +//! An `AgentDefinition` is the schema that describes a sub-agent: its model +//! preferences, the tools it's allowed to call, the agents it can spawn, +//! 
the prompts it ships, and how its output flows back to its parent. +//! +//! Definitions are loaded from TOML files in three locations (highest +//! priority first): +//! +//! 1. `.jcode/agents/*.toml` (project-local, committed to repo) +//! 2. `~/.jcode/agents/*.toml` (user-global) +//! 3. Embedded built-in agents bundled with the binary +//! +//! ## Design constraints +//! +//! - Definitions are **declarative TOML**, not Rust code, so users can +//! add agents without recompiling the binary. +//! - `model` is **not required**: agents inherit the session's current +//! model unless they explicitly opt into tier slots or override. +//! - `tool_names` is a whitelist — agents start with NO tools by +//! default and must list every tool they need. This is a security +//! property: a poorly-defined agent can't escalate by accident. +//! - `spawnable_agents` is also a whitelist for the same reason. +//! +//! ## Adapted from Codebuff +//! +//! Field names track Codebuff's `AgentDefinition` (snake_case Rust → +//! camelCase TS) so prior art is reusable. Differences: +//! +//! - No `model` field as required string — replaced by tier + override. +//! - No `providerOptions` — jcode's session has a single provider. +//! - `handle_steps` is a future addition (programmatic agents arrive in +//! Phase 2); for now agents are purely prompted. + +use crate::output::OutputMode; +use crate::reasoning::ReasoningEffort; +use crate::tier::ModelTier; + +use serde::{Deserialize, Serialize}; + +/// Default version assigned when a definition omits `version`. +pub const DEFAULT_AGENT_VERSION: &str = "0.1.0"; + +/// Declarative description of one agent. +/// +/// Intentionally `Clone` so the runtime can hand each spawn its own copy +/// without locking the registry. Definitions are small (a few KB at most). 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentDefinition { + // ----------------------------------------------------------------- + // Identity + // ----------------------------------------------------------------- + /// Unique agent id. Lowercase letters, digits, hyphens. e.g. `file-picker`. + /// Must be unique within the registry — duplicate ids are a load error. + pub id: String, + + /// Human-readable name shown in TUI / logs. e.g. `"Fletcher the File Fetcher"`. + pub display_name: String, + + /// Publisher / namespace id when this agent is shared across projects. + /// Optional for local agents; required if the agent is published to a + /// future agent registry. + #[serde(default)] + pub publisher: Option, + + /// Semver-ish version. Defaults to `DEFAULT_AGENT_VERSION`. + #[serde(default = "default_version")] + pub version: String, + + // ----------------------------------------------------------------- + // Model selection + // ----------------------------------------------------------------- + /// Optional tier slot to prefer when running this agent. The slot is + /// resolved against `JCODE_ROUTING_` env vars at run time. + /// Falls back to the session's current model if unset. + /// + /// See `tier.rs` for the full resolution algorithm. + #[serde(default)] + pub prefer_tier: Option, + + /// Optional explicit model id override. Highest priority — beats + /// `prefer_tier` and the session default. Use sparingly; hardcoding + /// model ids makes the agent file non-portable across providers. + #[serde(default)] + pub model_override: Option, + + /// Optional reasoning effort to forward to the provider request. + /// Defaults are model-specific; runtime fills in a sensible default + /// when this field is `None`. 
+ #[serde(default)] + pub reasoning: Option, + + // ----------------------------------------------------------------- + // Tools and sub-agents + // ----------------------------------------------------------------- + /// Allowlist of tool names this agent may call. Empty list = no tools. + /// Whitelist semantics are deliberate — agents shouldn't have access + /// to tools they don't need. + #[serde(default)] + pub tool_names: Vec, + + /// Allowlist of agent ids this agent may `spawn_agents` / `spawn_agent_inline`. + /// Empty list = no spawning. Use the local agent id (e.g. `file-picker`) + /// or the future `publisher/agent@version` form for shared agents. + #[serde(default)] + pub spawnable_agents: Vec, + + // ----------------------------------------------------------------- + // Prompts + // ----------------------------------------------------------------- + /// System prompt for this agent. Background, persona, mandates. + /// Mutually exclusive with `inherit_parent_system_prompt = true` + /// (which means "use the parent's system prompt instead, for cache + /// prefix sharing"). + #[serde(default)] + pub system_prompt: String, + + /// Instructions inserted after each user message. The most common + /// place to shape agent behavior — terser than `system_prompt`, + /// changes per turn allowed. + #[serde(default)] + pub instructions_prompt: Option, + + /// Optional reminder inserted at every agent step. Use sparingly — + /// strong models follow `instructions_prompt` reliably; this is for + /// weaker models or agents that need a per-step nudge. + #[serde(default)] + pub step_prompt: Option, + + /// Spawner-side prompt: when and why a parent agent should spawn this + /// agent. Used in `spawn_agents` tool documentation so the parent's + /// LLM picks the right sub-agent. 
+ #[serde(default)] + pub spawner_prompt: Option, + + // ----------------------------------------------------------------- + // Context / cache behavior + // ----------------------------------------------------------------- + /// When true, child agent uses the parent's `system_prompt` instead + /// of its own. This is the **prompt cache prefix-sharing trick** — + /// editor / reviewer agents typically set this to `true` so the + /// expensive system prompt is cache-hit rather than re-sent. + /// + /// Mutually exclusive with a non-empty `system_prompt`. + #[serde(default)] + pub inherit_parent_system_prompt: bool, + + /// When true, child agent receives the parent's full message history. + /// Default false — most sub-agents work better with a clean slate + /// (file-picker doesn't need to see edit chatter). + #[serde(default)] + pub include_message_history: bool, + + // ----------------------------------------------------------------- + // Output + // ----------------------------------------------------------------- + /// How the agent's output is delivered to the parent. Default + /// `LastMessage`. + #[serde(default)] + pub output_mode: OutputMode, + + /// JSON schema for `StructuredOutput` mode. Validated when the agent + /// calls `set_output`. Stored as raw JSON value because we don't + /// pull a JSON-schema crate yet — Phase 3 will add proper validation. + #[serde(default)] + pub output_schema: Option, +} + +fn default_version() -> String { + DEFAULT_AGENT_VERSION.to_string() +} + +/// Validation errors produced when an agent definition violates its +/// invariants. Displayed to users when a TOML file fails to load. +#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] +pub enum DefinitionError { + #[error( + "agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen" + )] + InvalidId(String), + + #[error( + "agent `{id}` has both `inherit_parent_system_prompt = true` and a non-empty `system_prompt`. Set one or the other." 
+ )] + SystemPromptConflict { id: String }, + + #[error( + "agent `{id}` has `output_mode = structured_output` but `output_schema` is missing" + )] + StructuredOutputMissingSchema { id: String }, + + #[error("agent `{id}` references itself in `spawnable_agents`")] + SelfSpawn { id: String }, + + #[error("agent `{id}` lists tool `{tool}` more than once in `tool_names`")] + DuplicateTool { id: String, tool: String }, + + #[error("agent `{id}` lists agent `{spawn}` more than once in `spawnable_agents`")] + DuplicateSpawnable { id: String, spawn: String }, +} + +impl AgentDefinition { + /// Validate id format + cross-field invariants. Returns `Ok(())` when + /// the definition is well-formed. + pub fn validate(&self) -> Result<(), DefinitionError> { + // 1. id format + if !is_valid_id(&self.id) { + return Err(DefinitionError::InvalidId(self.id.clone())); + } + + // 2. system_prompt vs inherit_parent_system_prompt mutual exclusion + if self.inherit_parent_system_prompt && !self.system_prompt.is_empty() { + return Err(DefinitionError::SystemPromptConflict { + id: self.id.clone(), + }); + } + + // 3. structured_output requires schema + if matches!(self.output_mode, OutputMode::StructuredOutput) + && self.output_schema.is_none() + { + return Err(DefinitionError::StructuredOutputMissingSchema { + id: self.id.clone(), + }); + } + + // 4. cannot spawn self + if self.spawnable_agents.iter().any(|s| s == &self.id) { + return Err(DefinitionError::SelfSpawn { + id: self.id.clone(), + }); + } + + // 5. no duplicate tool names + let mut seen_tools = std::collections::HashSet::new(); + for tool in &self.tool_names { + if !seen_tools.insert(tool.clone()) { + return Err(DefinitionError::DuplicateTool { + id: self.id.clone(), + tool: tool.clone(), + }); + } + } + + // 6. 
no duplicate spawnable agent ids + let mut seen_spawn = std::collections::HashSet::new(); + for spawn in &self.spawnable_agents { + if !seen_spawn.insert(spawn.clone()) { + return Err(DefinitionError::DuplicateSpawnable { + id: self.id.clone(), + spawn: spawn.clone(), + }); + } + } + + Ok(()) + } + + /// Resolve the concrete model id to use for one invocation of this agent. + /// Convenience wrapper around `tier::resolve_model`. + pub fn resolve_model(&self, current_session_model: &str) -> String { + crate::tier::resolve_model( + self.model_override.as_deref(), + self.prefer_tier, + current_session_model, + ) + } +} + +/// Agent ids are intentionally restrictive: lowercase ASCII letters, digits, +/// and hyphens. No leading/trailing hyphen. Mirrors Codebuff's id rule and +/// avoids cross-platform path issues when ids become file names. +fn is_valid_id(id: &str) -> bool { + if id.is_empty() { + return false; + } + if id.starts_with('-') || id.ends_with('-') { + return false; + } + id.chars() + .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-') +} + +#[cfg(test)] +mod tests { + use super::*; + + fn minimal_definition(id: &str) -> AgentDefinition { + AgentDefinition { + id: id.to_string(), + display_name: format!("Display for {id}"), + publisher: None, + version: DEFAULT_AGENT_VERSION.to_string(), + prefer_tier: None, + model_override: None, + reasoning: None, + tool_names: Vec::new(), + spawnable_agents: Vec::new(), + system_prompt: String::new(), + instructions_prompt: None, + step_prompt: None, + spawner_prompt: None, + inherit_parent_system_prompt: false, + include_message_history: false, + output_mode: OutputMode::LastMessage, + output_schema: None, + } + } + + #[test] + fn id_validation_rejects_uppercase() { + let mut d = minimal_definition("File-Picker"); + d.id = "File-Picker".to_string(); + assert!(matches!( + d.validate(), + Err(DefinitionError::InvalidId(_)) + )); + } + + #[test] + fn id_validation_rejects_underscore() { + let mut d = 
minimal_definition("file_picker"); + d.id = "file_picker".to_string(); + assert!(matches!( + d.validate(), + Err(DefinitionError::InvalidId(_)) + )); + } + + #[test] + fn id_validation_rejects_leading_hyphen() { + let mut d = minimal_definition("ok"); + d.id = "-bad".to_string(); + assert!(matches!( + d.validate(), + Err(DefinitionError::InvalidId(_)) + )); + } + + #[test] + fn id_validation_accepts_normal_kebab() { + let d = minimal_definition("file-picker-max"); + assert!(d.validate().is_ok()); + } + + #[test] + fn inherit_and_system_prompt_conflict() { + let mut d = minimal_definition("editor"); + d.inherit_parent_system_prompt = true; + d.system_prompt = "should be empty".to_string(); + assert!(matches!( + d.validate(), + Err(DefinitionError::SystemPromptConflict { .. }) + )); + } + + #[test] + fn inherit_alone_is_fine() { + let mut d = minimal_definition("editor"); + d.inherit_parent_system_prompt = true; + d.system_prompt = String::new(); + assert!(d.validate().is_ok()); + } + + #[test] + fn structured_output_requires_schema() { + let mut d = minimal_definition("judge"); + d.output_mode = OutputMode::StructuredOutput; + d.output_schema = None; + assert!(matches!( + d.validate(), + Err(DefinitionError::StructuredOutputMissingSchema { .. }) + )); + } + + #[test] + fn structured_output_with_schema_ok() { + let mut d = minimal_definition("judge"); + d.output_mode = OutputMode::StructuredOutput; + d.output_schema = Some(serde_json::json!({"type": "object"})); + assert!(d.validate().is_ok()); + } + + #[test] + fn self_spawn_detected() { + let mut d = minimal_definition("editor"); + d.spawnable_agents.push("editor".to_string()); + assert!(matches!( + d.validate(), + Err(DefinitionError::SelfSpawn { .. }) + )); + } + + #[test] + fn duplicate_tool_detected() { + let mut d = minimal_definition("editor"); + d.tool_names.push("read".to_string()); + d.tool_names.push("read".to_string()); + assert!(matches!( + d.validate(), + Err(DefinitionError::DuplicateTool { .. 
}) + )); + } + + #[test] + fn duplicate_spawnable_detected() { + let mut d = minimal_definition("editor"); + d.spawnable_agents.push("file-picker".to_string()); + d.spawnable_agents.push("file-picker".to_string()); + assert!(matches!( + d.validate(), + Err(DefinitionError::DuplicateSpawnable { .. }) + )); + } + + #[test] + fn resolve_model_uses_session_default_when_no_overrides() { + let d = minimal_definition("any"); + assert_eq!(d.resolve_model("claude-sonnet"), "claude-sonnet"); + } + + #[test] + fn resolve_model_uses_override() { + let mut d = minimal_definition("any"); + d.model_override = Some("forced-model".to_string()); + assert_eq!(d.resolve_model("ignored"), "forced-model"); + } + + // ----------------------------------------------------------------- + // TOML round-trip — exercises serde defaults and field coverage + // ----------------------------------------------------------------- + #[test] + fn toml_minimal_loads_with_defaults() { + let src = r#" + id = "file-picker" + display_name = "Fletcher" + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + d.validate().expect("validate"); + assert_eq!(d.id, "file-picker"); + assert_eq!(d.version, DEFAULT_AGENT_VERSION); + assert_eq!(d.output_mode, OutputMode::LastMessage); + assert!(d.tool_names.is_empty()); + assert!(d.spawnable_agents.is_empty()); + assert!(!d.inherit_parent_system_prompt); + } + + #[test] + fn toml_full_definition_loads() { + let src = r#" + id = "editor" + display_name = "Code Editor" + version = "1.2.0" + publisher = "jcode" + prefer_tier = "thinking" + reasoning = "high" + tool_names = ["str_replace", "write_file"] + spawnable_agents = ["file-picker"] + inherit_parent_system_prompt = true + include_message_history = true + output_mode = "all_messages" + instructions_prompt = "Implement the requested change." + step_prompt = "Continue editing." + spawner_prompt = "Use this agent for code edits." 
+ "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + d.validate().expect("validate"); + assert_eq!(d.id, "editor"); + assert_eq!(d.version, "1.2.0"); + assert_eq!(d.publisher.as_deref(), Some("jcode")); + assert_eq!(d.prefer_tier, Some(ModelTier::Thinking)); + assert_eq!(d.reasoning, Some(ReasoningEffort::High)); + assert_eq!(d.tool_names, vec!["str_replace", "write_file"]); + assert!(d.inherit_parent_system_prompt); + assert_eq!(d.output_mode, OutputMode::AllMessages); + } + + #[test] + fn toml_unknown_field_is_rejected() { + // We DO NOT use `#[serde(deny_unknown_fields)]` because forward-compat + // matters when older binaries read newer TOML. But typo'd known fields + // are silently ignored — that's a UX hazard. Document the tradeoff + // here: if this becomes a problem, switch to deny_unknown_fields and + // version the schema explicitly. + // + // For now, this test just verifies unknown fields don't crash. + let src = r#" + id = "ok" + display_name = "ok" + unknown_future_field = "value" + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + d.validate().expect("validate"); + } +} diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs index 70bf958d6..5599633aa 100644 --- a/crates/jcode-agent-runtime/src/lib.rs +++ b/crates/jcode-agent-runtime/src/lib.rs @@ -1,91 +1,43 @@ -use std::sync::Arc; - -/// A soft interrupt message queued for injection at the next safe point. -#[derive(Debug, Clone)] -pub struct SoftInterruptMessage { - pub content: String, - /// If true, can skip remaining tools when injected at point C. - pub urgent: bool, - pub source: SoftInterruptSource, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum SoftInterruptSource { - User, - System, - BackgroundTask, -} - -/// Thread-safe soft interrupt queue that can be accessed without holding the agent lock. -pub type SoftInterruptQueue = Arc>>; - -/// Signal to move the currently executing tool to background. 
-/// Uses std::sync so it can be set without async from outside the agent lock. -pub type BackgroundToolSignal = Arc; - -/// Signal to gracefully stop generation. -pub type GracefulShutdownSignal = Arc; - -/// Async-aware interrupt signal that combines AtomicBool (sync read) with -/// tokio::Notify (async wake). Eliminates spin-loops during tool execution. -#[derive(Clone)] -pub struct InterruptSignal { - flag: Arc, - notify: Arc, -} - -impl InterruptSignal { - pub fn new() -> Self { - Self { - flag: Arc::new(std::sync::atomic::AtomicBool::new(false)), - notify: Arc::new(tokio::sync::Notify::new()), - } - } - - pub fn fire(&self) { - self.flag.store(true, std::sync::atomic::Ordering::SeqCst); - self.notify.notify_waiters(); - } - - pub fn is_set(&self) -> bool { - self.flag.load(std::sync::atomic::Ordering::SeqCst) - } - - pub fn reset(&self) { - self.flag.store(false, std::sync::atomic::Ordering::SeqCst); - } - - pub async fn notified(&self) { - let notified = self.notify.notified(); - if self.is_set() { - return; - } - notified.await; - } - - pub fn as_atomic(&self) -> Arc { - Arc::clone(&self.flag) - } -} - -impl Default for InterruptSignal { - fn default() -> Self { - Self::new() - } -} - -#[derive(Debug, thiserror::Error)] -#[error("{message}")] -pub struct StreamError { - pub message: String, - pub retry_after_secs: Option, -} - -impl StreamError { - pub fn new(message: String, retry_after_secs: Option) -> Self { - Self { - message, - retry_after_secs, - } - } -} +//! Agent runtime primitives: signals, declarative agent definitions, and +//! tier-based model resolution. +//! +//! This crate intentionally stays small and dependency-light. Heavier +//! engine work (loop, programmatic steps, spawn management) lives in +//! `src/agent.rs` and will migrate here incrementally as Phase 0 → Phase 2 +//! land. +//! +//! ## Modules +//! +//! - [`signals`] — soft-interrupt + cancellation primitives shared with +//! the server runtime. +//! 
- [`definition`] — declarative `AgentDefinition` schema loaded from +//! `.jcode/agents/*.toml`. +//! - [`tier`] — user-defined model tier slot resolution (extends +//! `model_routing.rs` #100). +//! - [`output`] — `OutputMode` enum (last_message / all_messages / +//! structured_output). +//! - [`reasoning`] — `ReasoningEffort` enum (minimal / low / medium / high). +//! +//! ## Re-exports +//! +//! All previous public types stay re-exported at the crate root so existing +//! consumers (`src/agent.rs`) compile unchanged. + +pub mod definition; +pub mod output; +pub mod reasoning; +pub mod signals; +pub mod tier; + +// Backwards-compatible re-exports for existing consumers. Do not remove +// without auditing `src/agent.rs` and other in-tree users. +pub use signals::{ + BackgroundToolSignal, GracefulShutdownSignal, InterruptSignal, SoftInterruptMessage, + SoftInterruptQueue, SoftInterruptSource, StreamError, +}; + +// New public surface (Phase 0). +pub use definition::{AgentDefinition, DefinitionError, DEFAULT_AGENT_VERSION}; +pub use output::OutputMode; +pub use reasoning::ReasoningEffort; +pub use tier::{resolve_model, resolve_model_with_source, ModelTier, ResolutionSource}; diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs new file mode 100644 index 000000000..1ba93dd1a --- /dev/null +++ b/crates/jcode-agent-runtime/src/output.rs @@ -0,0 +1,75 @@ +//! How an agent's output is delivered back to its parent. +//! +//! Adapted from Codebuff's `outputMode` field. Three modes cover the +//! useful cases: +//! +//! - `LastMessage`: parent receives only the agent's final assistant turn. +//! Default. Good for "research-and-summarize" agents like file-picker. +//! - `AllMessages`: parent receives the full child message history (text +//! + tool calls + tool results). Good for editor-like agents that need +//! to expose their full edit trace. +//! - `StructuredOutput`: agent must call `set_output` with a JSON value +//! 
that conforms to `output_schema`. Good for judge agents, lessons +//! extractors, structured planners. + +use serde::{Deserialize, Serialize}; + +/// Output delivery mode for a sub-agent. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum OutputMode { + /// Parent receives only the final assistant turn. (Default.) + #[default] + LastMessage, + /// Parent receives the full message history of the child agent. + AllMessages, + /// Agent must produce a JSON object conforming to its `output_schema`. + /// Validated on `set_output` tool call. + StructuredOutput, +} + +impl OutputMode { + pub fn as_str(&self) -> &'static str { + match self { + OutputMode::LastMessage => "last_message", + OutputMode::AllMessages => "all_messages", + OutputMode::StructuredOutput => "structured_output", + } + } + + pub fn parse(s: &str) -> Option { + match s.trim().to_ascii_lowercase().as_str() { + "last_message" | "lastmessage" | "last" => Some(OutputMode::LastMessage), + "all_messages" | "allmessages" | "all" => Some(OutputMode::AllMessages), + "structured_output" | "structured" | "json" => Some(OutputMode::StructuredOutput), + _ => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_accepts_aliases() { + assert_eq!(OutputMode::parse("last_message"), Some(OutputMode::LastMessage)); + assert_eq!(OutputMode::parse("all"), Some(OutputMode::AllMessages)); + assert_eq!( + OutputMode::parse("structured"), + Some(OutputMode::StructuredOutput) + ); + assert_eq!(OutputMode::parse("nonsense"), None); + } + + #[test] + fn default_is_last_message() { + assert_eq!(OutputMode::default(), OutputMode::LastMessage); + } + + #[test] + fn serde_uses_snake_case() { + let s = serde_json::to_string(&OutputMode::StructuredOutput).unwrap(); + assert_eq!(s, "\"structured_output\""); + } +} diff --git a/crates/jcode-agent-runtime/src/reasoning.rs b/crates/jcode-agent-runtime/src/reasoning.rs new file mode 
100644 index 000000000..d48bafaeb --- /dev/null +++ b/crates/jcode-agent-runtime/src/reasoning.rs @@ -0,0 +1,108 @@ +//! Reasoning effort levels for agents. +//! +//! Mirrors the OpenAI/Anthropic reasoning effort knobs. When an agent +//! definition specifies a reasoning effort, the agent runtime forwards it +//! to the provider request (where supported). Models that don't support +//! reasoning ignore the field. + +use serde::{Deserialize, Serialize}; +use std::fmt; + +/// How much reasoning the model should use for this agent. +/// +/// Maps roughly to: +/// - `Minimal` → `effort: "minimal"` (gpt-5 family) / no thinking budget (Claude) +/// - `Low` → `effort: "low"` / small thinking budget +/// - `Medium` → `effort: "medium"` / default thinking budget +/// - `High` → `effort: "high"` / large thinking budget (~32k tokens) +/// +/// Default is `Medium` because that matches most agents' baseline behavior. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ReasoningEffort { + Minimal, + Low, + #[default] + Medium, + High, +} + +impl ReasoningEffort { + /// String representation matching the wire format used by major providers + /// (OpenAI Responses API `reasoning.effort`, OpenRouter `reasoning.effort`). + pub fn as_str(&self) -> &'static str { + match self { + ReasoningEffort::Minimal => "minimal", + ReasoningEffort::Low => "low", + ReasoningEffort::Medium => "medium", + ReasoningEffort::High => "high", + } + } + + /// Numeric rank for threshold comparison (matches `model_routing.rs`). + /// Higher = more reasoning. + pub fn rank(&self) -> u8 { + match self { + ReasoningEffort::Minimal => 0, + ReasoningEffort::Low => 1, + ReasoningEffort::Medium => 2, + ReasoningEffort::High => 3, + } + } + + /// Parse a string value, accepting common aliases. Returns `None` for + /// unknown input so the caller can decide whether to error or default. 
+ pub fn parse(s: &str) -> Option { + match s.trim().to_ascii_lowercase().as_str() { + "minimal" | "none" | "off" => Some(ReasoningEffort::Minimal), + "low" => Some(ReasoningEffort::Low), + "medium" | "default" => Some(ReasoningEffort::Medium), + "high" | "max" => Some(ReasoningEffort::High), + _ => None, + } + } +} + +impl fmt::Display for ReasoningEffort { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_accepts_common_aliases() { + assert_eq!( + ReasoningEffort::parse("minimal"), + Some(ReasoningEffort::Minimal) + ); + assert_eq!(ReasoningEffort::parse("OFF"), Some(ReasoningEffort::Minimal)); + assert_eq!(ReasoningEffort::parse("max"), Some(ReasoningEffort::High)); + assert_eq!(ReasoningEffort::parse("default"), Some(ReasoningEffort::Medium)); + assert_eq!(ReasoningEffort::parse(""), None); + assert_eq!(ReasoningEffort::parse("absurd"), None); + } + + #[test] + fn rank_orders_efforts_correctly() { + assert!(ReasoningEffort::Minimal.rank() < ReasoningEffort::Low.rank()); + assert!(ReasoningEffort::Low.rank() < ReasoningEffort::Medium.rank()); + assert!(ReasoningEffort::Medium.rank() < ReasoningEffort::High.rank()); + } + + #[test] + fn default_is_medium() { + assert_eq!(ReasoningEffort::default(), ReasoningEffort::Medium); + } + + #[test] + fn serde_roundtrip_via_lowercase() { + let s = serde_json::to_string(&ReasoningEffort::High).unwrap(); + assert_eq!(s, "\"high\""); + let back: ReasoningEffort = serde_json::from_str("\"medium\"").unwrap(); + assert_eq!(back, ReasoningEffort::Medium); + } +} diff --git a/crates/jcode-agent-runtime/src/signals.rs b/crates/jcode-agent-runtime/src/signals.rs new file mode 100644 index 000000000..67acf5082 --- /dev/null +++ b/crates/jcode-agent-runtime/src/signals.rs @@ -0,0 +1,98 @@ +//! Soft-interrupt + cancellation signals for the agent loop. +//! +//! 
These primitives are shared between the agent runtime, the server +//! lifecycle, and any callers that need to drive interrupts without +//! holding the agent lock. Keep this module dependency-light — `tokio` +//! sync + `std::sync` only. + +use std::sync::Arc; + +/// A soft interrupt message queued for injection at the next safe point. +#[derive(Debug, Clone)] +pub struct SoftInterruptMessage { + pub content: String, + /// If true, can skip remaining tools when injected at point C. + pub urgent: bool, + pub source: SoftInterruptSource, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SoftInterruptSource { + User, + System, + BackgroundTask, +} + +/// Thread-safe soft interrupt queue that can be accessed without holding the agent lock. +pub type SoftInterruptQueue = Arc>>; + +/// Signal to move the currently executing tool to background. +/// Uses std::sync so it can be set without async from outside the agent lock. +pub type BackgroundToolSignal = Arc; + +/// Signal to gracefully stop generation. +pub type GracefulShutdownSignal = Arc; + +/// Async-aware interrupt signal that combines AtomicBool (sync read) with +/// tokio::Notify (async wake). Eliminates spin-loops during tool execution. 
+#[derive(Clone)] +pub struct InterruptSignal { + flag: Arc, + notify: Arc, +} + +impl InterruptSignal { + pub fn new() -> Self { + Self { + flag: Arc::new(std::sync::atomic::AtomicBool::new(false)), + notify: Arc::new(tokio::sync::Notify::new()), + } + } + + pub fn fire(&self) { + self.flag.store(true, std::sync::atomic::Ordering::SeqCst); + self.notify.notify_waiters(); + } + + pub fn is_set(&self) -> bool { + self.flag.load(std::sync::atomic::Ordering::SeqCst) + } + + pub fn reset(&self) { + self.flag.store(false, std::sync::atomic::Ordering::SeqCst); + } + + pub async fn notified(&self) { + let notified = self.notify.notified(); + if self.is_set() { + return; + } + notified.await; + } + + pub fn as_atomic(&self) -> Arc { + Arc::clone(&self.flag) + } +} + +impl Default for InterruptSignal { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug, thiserror::Error)] +#[error("{message}")] +pub struct StreamError { + pub message: String, + pub retry_after_secs: Option, +} + +impl StreamError { + pub fn new(message: String, retry_after_secs: Option) -> Self { + Self { + message, + retry_after_secs, + } + } +} diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs new file mode 100644 index 000000000..200f511ed --- /dev/null +++ b/crates/jcode-agent-runtime/src/tier.rs @@ -0,0 +1,330 @@ +//! Model tier abstraction. +//! +//! A "tier" is a **user-defined named slot** that maps to a concrete model id. +//! It is intentionally NOT an opinionated catalog — jcode does not maintain +//! per-provider tier defaults like Codebuff/OpenRouter does. +//! +//! ## Why slots, not catalog? +//! +//! jcode users connect a single provider via OAuth (Claude Pro, ChatGPT Plus, +//! Gemini Advanced, etc.) and pay through that subscription. Auto-downgrading +//! to a "cheaper tier" without their consent is wrong — they already chose +//! the model they want. So the default is: agents inherit the session's +//! current model. +//! +//! 
Power users (pay-per-token API keys, multi-account setups) can opt in by +//! setting two env vars, exactly mirroring `model_routing.rs` (#100): +//! +//! ```bash +//! JCODE_ROUTING_ROUTINE=claude-haiku-4-5 +//! JCODE_ROUTING_THINKING=claude-opus-4-7 +//! ``` +//! +//! Agent definitions reference tiers by name: +//! +//! ```toml +//! [agent] +//! id = "file-picker" +//! prefer_tier = "routine" # uses JCODE_ROUTING_ROUTINE if set +//! ``` +//! +//! ## Resolution order +//! +//! 1. `agent.model_override` (explicit, highest priority) +//! 2. `agent.prefer_tier` + corresponding env var set +//! 3. Caller-provided `current_session_model` fallback +//! +//! No catalog. No magic. The only "magic" is reading the env var, which is +//! the existing #100 contract. + +use serde::{Deserialize, Serialize}; +use std::fmt; + +/// A user-defined tier slot. Currently only two are supported because that +/// matches `model_routing.rs` (#100). Adding tiers later is additive — the +/// env var name pattern is `JCODE_ROUTING_`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ModelTier { + /// Cheap / fast / lower-effort work: file pickers, basher, + /// summarizers. Reads `JCODE_ROUTING_ROUTINE`. + Routine, + /// Premium / reasoning work: editor, reviewer, planner. + /// Reads `JCODE_ROUTING_THINKING`. + Thinking, +} + +impl ModelTier { + /// The env var name that backs this tier slot. Returns the same string + /// shape as `model_routing.rs` (#100) so the two systems stay aligned. + pub fn env_var(&self) -> &'static str { + match self { + ModelTier::Routine => "JCODE_ROUTING_ROUTINE", + ModelTier::Thinking => "JCODE_ROUTING_THINKING", + } + } + + /// Read the user-configured model id for this tier from the environment. + /// Returns `None` when the env var is unset, blank, or whitespace-only — + /// callers should fall back to the session's current model. 
+ pub fn read_user_override(&self) -> Option { + std::env::var(self.env_var()) + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + } + + /// Parse a tier name from a string, accepting common variants. + pub fn parse(s: &str) -> Option { + match s.trim().to_ascii_lowercase().as_str() { + "routine" | "fast" | "cheap" | "lite" => Some(ModelTier::Routine), + "thinking" | "reasoning" | "premium" | "deep" => Some(ModelTier::Thinking), + _ => None, + } + } +} + +impl fmt::Display for ModelTier { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ModelTier::Routine => f.write_str("routine"), + ModelTier::Thinking => f.write_str("thinking"), + } + } +} + +/// Resolve which model id to use for a given tier preference + override pair. +/// +/// Priority: +/// 1. `model_override` — explicit, highest priority. +/// 2. `prefer_tier` + corresponding env var set. +/// 3. `current_session_model` — caller-provided fallback. +/// +/// `current_session_model` is required because there's no other safe default: +/// the runtime doesn't know which provider/model the session is using. +pub fn resolve_model( + model_override: Option<&str>, + prefer_tier: Option, + current_session_model: &str, +) -> String { + if let Some(override_id) = model_override.and_then(|s| { + let trimmed = s.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_string()) + } + }) { + return override_id; + } + + if let Some(tier) = prefer_tier { + if let Some(tier_model) = tier.read_user_override() { + return tier_model; + } + } + + current_session_model.to_string() +} + +/// Diagnostic-friendly explanation of which slot was used. Useful for +/// `jcode doctor` output so users can see exactly why a given agent picked +/// the model it did. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ResolutionSource { + /// Used `agent.model_override` directly. + Override(String), + /// Used the env var backing `tier`. 
+ Tier { + tier: ModelTier, + model: String, + }, + /// Tier was preferred but the env var was unset, so fell back to the + /// session's current model. + TierFallback { + tier: ModelTier, + model: String, + }, + /// No override or tier preference; using the session's current model. + SessionDefault(String), +} + +impl ResolutionSource { + pub fn model_id(&self) -> &str { + match self { + ResolutionSource::Override(m) + | ResolutionSource::Tier { model: m, .. } + | ResolutionSource::TierFallback { model: m, .. } + | ResolutionSource::SessionDefault(m) => m, + } + } +} + +/// Same as `resolve_model` but returns provenance information for diagnostics. +pub fn resolve_model_with_source( + model_override: Option<&str>, + prefer_tier: Option, + current_session_model: &str, +) -> ResolutionSource { + if let Some(override_id) = model_override.and_then(|s| { + let trimmed = s.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_string()) + } + }) { + return ResolutionSource::Override(override_id); + } + + if let Some(tier) = prefer_tier { + match tier.read_user_override() { + Some(model) => return ResolutionSource::Tier { tier, model }, + None => { + return ResolutionSource::TierFallback { + tier, + model: current_session_model.to_string(), + }; + } + } + } + + ResolutionSource::SessionDefault(current_session_model.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Mutex to serialize env-var manipulation across tests in this module. + /// Without this, `cargo test` runs tests in parallel and they trample + /// each other's `JCODE_ROUTING_*` state. + static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + + fn with_env_lock(f: F) { + let guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); + // Snapshot + restore env vars we mutate so test order is irrelevant. 
+ let saved_routine = std::env::var_os("JCODE_ROUTING_ROUTINE"); + let saved_thinking = std::env::var_os("JCODE_ROUTING_THINKING"); + unsafe { + std::env::remove_var("JCODE_ROUTING_ROUTINE"); + std::env::remove_var("JCODE_ROUTING_THINKING"); + } + f(); + unsafe { + match saved_routine { + Some(v) => std::env::set_var("JCODE_ROUTING_ROUTINE", v), + None => std::env::remove_var("JCODE_ROUTING_ROUTINE"), + } + match saved_thinking { + Some(v) => std::env::set_var("JCODE_ROUTING_THINKING", v), + None => std::env::remove_var("JCODE_ROUTING_THINKING"), + } + } + drop(guard); + } + + #[test] + fn parse_tier_accepts_aliases() { + assert_eq!(ModelTier::parse("routine"), Some(ModelTier::Routine)); + assert_eq!(ModelTier::parse("Routine"), Some(ModelTier::Routine)); + assert_eq!(ModelTier::parse("FAST"), Some(ModelTier::Routine)); + assert_eq!(ModelTier::parse("thinking"), Some(ModelTier::Thinking)); + assert_eq!(ModelTier::parse("reasoning"), Some(ModelTier::Thinking)); + assert_eq!(ModelTier::parse("deep"), Some(ModelTier::Thinking)); + assert_eq!(ModelTier::parse(""), None); + assert_eq!(ModelTier::parse("nonsense"), None); + } + + #[test] + fn override_wins_over_tier_and_session_default() { + with_env_lock(|| { + unsafe { + std::env::set_var("JCODE_ROUTING_THINKING", "should-be-ignored"); + } + let got = resolve_model( + Some("explicit-model"), + Some(ModelTier::Thinking), + "session-default", + ); + assert_eq!(got, "explicit-model"); + }); + } + + #[test] + fn tier_uses_env_var_when_set() { + with_env_lock(|| { + unsafe { + std::env::set_var("JCODE_ROUTING_ROUTINE", "haiku-4-5"); + } + let got = resolve_model(None, Some(ModelTier::Routine), "session-default"); + assert_eq!(got, "haiku-4-5"); + }); + } + + #[test] + fn tier_falls_back_when_env_unset() { + with_env_lock(|| { + // env var explicitly removed by lock setup + let got = resolve_model(None, Some(ModelTier::Thinking), "session-default"); + assert_eq!(got, "session-default"); + }); + } + + #[test] + fn 
no_tier_no_override_uses_session_default() { + with_env_lock(|| { + let got = resolve_model(None, None, "session-default"); + assert_eq!(got, "session-default"); + }); + } + + #[test] + fn empty_override_string_treated_as_unset() { + with_env_lock(|| { + let got = resolve_model(Some(" "), None, "session-default"); + assert_eq!(got, "session-default"); + }); + } + + #[test] + fn resolution_source_reports_override() { + with_env_lock(|| { + let src = resolve_model_with_source(Some("forced"), None, "session"); + assert!(matches!(src, ResolutionSource::Override(ref m) if m == "forced")); + assert_eq!(src.model_id(), "forced"); + }); + } + + #[test] + fn resolution_source_reports_tier_hit() { + with_env_lock(|| { + unsafe { + std::env::set_var("JCODE_ROUTING_THINKING", "opus-4-7"); + } + let src = resolve_model_with_source(None, Some(ModelTier::Thinking), "fallback"); + match src { + ResolutionSource::Tier { tier, model } => { + assert_eq!(tier, ModelTier::Thinking); + assert_eq!(model, "opus-4-7"); + } + other => panic!("expected Tier, got {:?}", other), + } + }); + } + + #[test] + fn resolution_source_reports_tier_fallback() { + with_env_lock(|| { + // env unset + let src = resolve_model_with_source(None, Some(ModelTier::Routine), "session"); + match src { + ResolutionSource::TierFallback { tier, model } => { + assert_eq!(tier, ModelTier::Routine); + assert_eq!(model, "session"); + } + other => panic!("expected TierFallback, got {:?}", other), + } + }); + } +} From 6cf1ad8d778cc22b7e7f30db518da0e04d7dca06 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Mon, 25 May 2026 21:52:16 +0700 Subject: [PATCH 02/22] feat(agent-runtime): TOML registry loader for .jcode/agents/*.toml (Phase 0.3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discover and load AgentDefinition files from three locations with priority order: 1. /.jcode/agents/*.toml (project-local, highest) 2. ~/.jcode/agents/*.toml (user-global) 3. 
AgentRegistry::register_builtin (compiled-in defaults, lowest) Project-local overrides user-global overrides builtin. Re-registering a builtin after a higher-priority entry is loaded does NOT clobber the override — the priority check is symmetric in `insert`. Design choices: - Filename must match `.toml` so users can find agents by id without opening every file. Mismatches are surfaced as a load error rather than silently misindexing. - Malformed/invalid files are collected as non-fatal LoadError entries so a single bad file doesn't prevent the rest of the registry from loading. `jcode doctor` (future) reads load_errors() to surface these. - AgentRegistry intentionally does NOT cross-reference `tool_names` / `spawnable_agents` — that's done at spawn time because the tool universe may be feature-gated (Phase 0.4). 41 unit tests pass (32 prior + 9 new). `cargo check --bin jcode` succeeds. --- crates/jcode-agent-runtime/src/lib.rs | 2 + crates/jcode-agent-runtime/src/registry.rs | 530 +++++++++++++++++++++ 2 files changed, 532 insertions(+) create mode 100644 crates/jcode-agent-runtime/src/registry.rs diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs index 5599633aa..f7f8ea85e 100644 --- a/crates/jcode-agent-runtime/src/lib.rs +++ b/crates/jcode-agent-runtime/src/lib.rs @@ -26,6 +26,7 @@ pub mod definition; pub mod output; pub mod reasoning; +pub mod registry; pub mod signals; pub mod tier; @@ -40,4 +41,5 @@ pub use signals::{ pub use definition::{AgentDefinition, DefinitionError, DEFAULT_AGENT_VERSION}; pub use output::OutputMode; pub use reasoning::ReasoningEffort; +pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind}; pub use tier::{resolve_model, resolve_model_with_source, ModelTier, ResolutionSource}; diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs new file mode 100644 index 000000000..2249b046c --- /dev/null +++ 
b/crates/jcode-agent-runtime/src/registry.rs @@ -0,0 +1,530 @@ +//! Agent registry: discovery + loading of `AgentDefinition`s from disk. +//! +//! ## Lookup paths (highest priority first) +//! +//! 1. **Project-local**: `/.jcode/agents/*.toml` +//! 2. **User-global**: `~/.jcode/agents/*.toml` +//! 3. **Builtins** registered programmatically via [`AgentRegistry::register_builtin`] +//! +//! When the same id appears in multiple sources, the higher-priority one +//! wins. The registry tracks where each agent came from so `jcode doctor` +//! can show provenance. +//! +//! ## What this module does NOT do +//! +//! - It does not validate that `tool_names` exist in the tool registry +//! (Phase 0.4) or that `spawnable_agents` resolve to known agents +//! (cross-reference). Both are caller responsibilities done at agent +//! spawn time, not load time, because the tool/agent universe may be +//! feature-gated. +//! - It does not watch for file changes. Agents are loaded once at +//! session start. Self-dev is welcome to call `reload_from_disk()`. + +use crate::definition::{AgentDefinition, DefinitionError}; + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +/// Where an agent definition was loaded from. Surfaced in `jcode doctor` +/// and conflict warnings. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum AgentSource { + /// Compiled into the binary by name. Lowest priority. + Builtin, + /// Loaded from `~/.jcode/agents/`. + UserGlobal { path: PathBuf }, + /// Loaded from `/.jcode/agents/`. Highest priority. + ProjectLocal { path: PathBuf }, +} + +impl AgentSource { + fn priority(&self) -> u8 { + match self { + AgentSource::Builtin => 0, + AgentSource::UserGlobal { .. } => 1, + AgentSource::ProjectLocal { .. } => 2, + } + } + + /// Short human-readable label for `jcode doctor` output. 
+ pub fn short_label(&self) -> String { + match self { + AgentSource::Builtin => "builtin".to_string(), + AgentSource::UserGlobal { path } => format!("user:{}", path.display()), + AgentSource::ProjectLocal { path } => format!("project:{}", path.display()), + } + } +} + +/// One loaded agent: its definition plus where it came from. +#[derive(Debug, Clone)] +pub struct LoadedAgent { + pub definition: AgentDefinition, + pub source: AgentSource, +} + +/// Errors surfaced when loading an agent file. We distinguish I/O, +/// parse, and validation errors so the TUI can render actionable +/// messages. +#[derive(Debug, thiserror::Error)] +pub enum LoadError { + #[error("failed to read `{path}`: {source}")] + Io { + path: PathBuf, + #[source] + source: std::io::Error, + }, + + #[error("failed to parse `{path}`: {source}")] + Parse { + path: PathBuf, + #[source] + source: toml::de::Error, + }, + + #[error("invalid agent definition in `{path}`: {source}")] + Invalid { + path: PathBuf, + #[source] + source: DefinitionError, + }, + + #[error( + "filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`." + )] + FileNameMismatch { path: PathBuf, id: String }, +} + +/// In-memory registry of loaded agent definitions. Wrap in `Arc` if you +/// need to share — `LoadError` contains `io::Error` so the registry itself +/// is not `Clone`. +#[derive(Debug, Default)] +pub struct AgentRegistry { + by_id: HashMap, + /// Non-fatal load errors collected during discovery. Surfaced by + /// `jcode doctor` so users can see why a malformed file was skipped. + load_errors: Vec, +} + +impl AgentRegistry { + pub fn new() -> Self { + Self::default() + } + + /// Total number of registered agents. + pub fn len(&self) -> usize { + self.by_id.len() + } + + /// True if no agents are registered. + pub fn is_empty(&self) -> bool { + self.by_id.is_empty() + } + + /// Look up an agent by id. 
+ pub fn get(&self, id: &str) -> Option<&LoadedAgent> { + self.by_id.get(id) + } + + /// Iterate over all agents in arbitrary order. + pub fn iter(&self) -> impl Iterator { + self.by_id.values() + } + + /// Sorted (by id) iteration — handy for stable doctor output. + pub fn iter_sorted(&self) -> Vec<&LoadedAgent> { + let mut v: Vec<_> = self.by_id.values().collect(); + v.sort_by(|a, b| a.definition.id.cmp(&b.definition.id)); + v + } + + /// Non-fatal errors accumulated during discovery. + pub fn load_errors(&self) -> &[LoadError] { + &self.load_errors + } + + /// Insert (or replace) an agent according to source priority. Returns + /// the previous entry if it was overridden. + pub fn insert(&mut self, loaded: LoadedAgent) -> Option { + let id = loaded.definition.id.clone(); + match self.by_id.get(&id) { + Some(existing) if existing.source.priority() > loaded.source.priority() => { + // existing has higher priority, drop the new one + Some(loaded) + } + _ => self.by_id.insert(id, loaded), + } + } + + /// Register a builtin agent. Builtins have the lowest priority and + /// are overridable by both user and project files of the same id. + pub fn register_builtin( + &mut self, + definition: AgentDefinition, + ) -> Result<(), DefinitionError> { + definition.validate()?; + self.insert(LoadedAgent { + definition, + source: AgentSource::Builtin, + }); + Ok(()) + } + + /// Discover and load all agent files from `dir`. Non-recursive. + /// Files that don't end in `.toml` are skipped silently. Bad files + /// are recorded in `load_errors()` and skipped. + /// + /// `source_kind` decides whether each loaded file is tagged as + /// `UserGlobal` or `ProjectLocal`. + pub fn load_directory( + &mut self, + dir: &Path, + source_kind: SourceKind, + ) -> Result { + if !dir.exists() { + return Ok(0); + } + let mut loaded = 0; + for entry in std::fs::read_dir(dir)? 
{ + let entry = match entry { + Ok(e) => e, + Err(err) => { + self.load_errors.push(LoadError::Io { + path: dir.to_path_buf(), + source: err, + }); + continue; + } + }; + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) != Some("toml") { + continue; + } + match Self::load_file(&path) { + Ok(definition) => { + let expected_stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or(""); + if !expected_stem.is_empty() && expected_stem != definition.id { + self.load_errors.push(LoadError::FileNameMismatch { + path: path.clone(), + id: definition.id.clone(), + }); + continue; + } + let source = match source_kind { + SourceKind::UserGlobal => AgentSource::UserGlobal { path: path.clone() }, + SourceKind::ProjectLocal => { + AgentSource::ProjectLocal { path: path.clone() } + } + }; + self.insert(LoadedAgent { + definition, + source, + }); + loaded += 1; + } + Err(err) => { + self.load_errors.push(err); + } + } + } + Ok(loaded) + } + + /// Read + parse + validate a single TOML file into an `AgentDefinition`. + pub fn load_file(path: &Path) -> Result { + let raw = std::fs::read_to_string(path).map_err(|source| LoadError::Io { + path: path.to_path_buf(), + source, + })?; + let definition: AgentDefinition = + toml::from_str(&raw).map_err(|source| LoadError::Parse { + path: path.to_path_buf(), + source, + })?; + definition.validate().map_err(|source| LoadError::Invalid { + path: path.to_path_buf(), + source, + })?; + Ok(definition) + } + + /// Convenience: discover both user-global and project-local agent + /// directories using standard jcode paths. `home` defaults to + /// `dirs::home_dir()` (omitted here to keep this crate dep-light; + /// callers pass the resolved home to avoid pulling `dirs`). 
+ pub fn discover_standard_paths( + &mut self, + home_dir: Option<&Path>, + project_root: Option<&Path>, + ) { + if let Some(home) = home_dir { + let user_dir = home.join(".jcode").join("agents"); + if let Err(err) = self.load_directory(&user_dir, SourceKind::UserGlobal) { + self.load_errors.push(LoadError::Io { + path: user_dir, + source: err, + }); + } + } + if let Some(root) = project_root { + let project_dir = root.join(".jcode").join("agents"); + if let Err(err) = self.load_directory(&project_dir, SourceKind::ProjectLocal) { + self.load_errors.push(LoadError::Io { + path: project_dir, + source: err, + }); + } + } + } +} + +/// Tag for `load_directory` so the caller decides how loaded entries are +/// labeled. The function itself doesn't care about jcode's path convention. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SourceKind { + UserGlobal, + ProjectLocal, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::output::OutputMode; + use std::fs; + + fn write_toml(dir: &Path, name: &str, body: &str) { + let path = dir.join(name); + fs::write(&path, body).expect("write toml"); + } + + fn temp_dir(name: &str) -> PathBuf { + let base = std::env::temp_dir().join(format!( + "jcode-agent-registry-test-{}-{}-{}", + name, + std::process::id(), + // Use atomics for a per-process counter so concurrent tests don't collide. 
+ COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + )); + let _ = fs::remove_dir_all(&base); + fs::create_dir_all(&base).unwrap(); + base + } + + static COUNTER: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0); + + #[test] + fn missing_dir_is_zero_load_not_error() { + let mut reg = AgentRegistry::new(); + let n = reg + .load_directory(Path::new("/nonexistent/jcode-test-dir"), SourceKind::UserGlobal) + .unwrap(); + assert_eq!(n, 0); + assert!(reg.is_empty()); + } + + #[test] + fn loads_minimal_agent() { + let dir = temp_dir("minimal"); + write_toml( + &dir, + "file-picker.toml", + r#" + id = "file-picker" + display_name = "Fletcher" + "#, + ); + let mut reg = AgentRegistry::new(); + let n = reg.load_directory(&dir, SourceKind::ProjectLocal).unwrap(); + assert_eq!(n, 1); + let loaded = reg.get("file-picker").expect("registered"); + assert_eq!(loaded.definition.display_name, "Fletcher"); + assert!(matches!(loaded.source, AgentSource::ProjectLocal { .. })); + } + + #[test] + fn project_overrides_user_overrides_builtin() { + // Builtin + let mut reg = AgentRegistry::new(); + let mut builtin_def = AgentDefinition { + id: "editor".to_string(), + display_name: "Builtin Editor".to_string(), + publisher: None, + version: "0.1.0".to_string(), + prefer_tier: None, + model_override: None, + reasoning: None, + tool_names: vec![], + spawnable_agents: vec![], + system_prompt: String::new(), + instructions_prompt: None, + step_prompt: None, + spawner_prompt: None, + inherit_parent_system_prompt: false, + include_message_history: false, + output_mode: OutputMode::LastMessage, + output_schema: None, + }; + reg.register_builtin(builtin_def.clone()).unwrap(); + assert_eq!(reg.get("editor").unwrap().definition.display_name, "Builtin Editor"); + + // User + let user_dir = temp_dir("user"); + write_toml( + &user_dir, + "editor.toml", + r#" + id = "editor" + display_name = "User Editor" + "#, + ); + reg.load_directory(&user_dir, 
SourceKind::UserGlobal).unwrap(); + assert_eq!(reg.get("editor").unwrap().definition.display_name, "User Editor"); + + // Project + let proj_dir = temp_dir("proj"); + write_toml( + &proj_dir, + "editor.toml", + r#" + id = "editor" + display_name = "Project Editor" + "#, + ); + reg.load_directory(&proj_dir, SourceKind::ProjectLocal).unwrap(); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "Project Editor" + ); + + // Re-register builtin should NOT override the project entry. + // (registers via the same `insert` priority path) + builtin_def.display_name = "Builtin Editor v2".to_string(); + reg.register_builtin(builtin_def).unwrap(); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "Project Editor", + "builtin should not override project-local" + ); + } + + #[test] + fn malformed_toml_collected_as_load_error() { + let dir = temp_dir("malformed"); + write_toml(&dir, "bad.toml", "id = \"this is missing close quote\n"); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + assert!(reg.is_empty(), "no agents registered"); + assert_eq!(reg.load_errors().len(), 1); + assert!(matches!( + reg.load_errors()[0], + LoadError::Parse { .. } + )); + } + + #[test] + fn invalid_id_collected_as_load_error() { + let dir = temp_dir("invalid-id"); + write_toml( + &dir, + "Bad_File.toml", + r#" + id = "Bad_Id" + display_name = "Bad" + "#, + ); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + assert!(reg.is_empty()); + assert_eq!(reg.load_errors().len(), 1); + assert!(matches!( + reg.load_errors()[0], + LoadError::Invalid { .. 
} + )); + } + + #[test] + fn filename_must_match_agent_id() { + let dir = temp_dir("name-mismatch"); + write_toml( + &dir, + "wrong-name.toml", + r#" + id = "right-name" + display_name = "X" + "#, + ); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + assert!(reg.is_empty()); + assert_eq!(reg.load_errors().len(), 1); + assert!(matches!( + reg.load_errors()[0], + LoadError::FileNameMismatch { .. } + )); + } + + #[test] + fn skips_non_toml_files() { + let dir = temp_dir("non-toml"); + fs::write(dir.join("README.md"), "not an agent").unwrap(); + fs::write(dir.join("config.json"), "{}").unwrap(); + write_toml( + &dir, + "valid.toml", + r#" + id = "valid" + display_name = "v" + "#, + ); + let mut reg = AgentRegistry::new(); + let n = reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + assert_eq!(n, 1); + assert_eq!(reg.len(), 1); + } + + #[test] + fn iter_sorted_is_deterministic() { + let dir = temp_dir("sort"); + for id in ["zeta", "alpha", "mid"] { + write_toml( + &dir, + &format!("{id}.toml"), + &format!(r#"id = "{id}" +display_name = "{id}" +"#), + ); + } + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + let ids: Vec<_> = reg.iter_sorted().iter().map(|a| a.definition.id.clone()).collect(); + assert_eq!(ids, vec!["alpha", "mid", "zeta"]); + } + + #[test] + fn discover_standard_paths_reads_both() { + let home = temp_dir("home"); + let proj = temp_dir("proj"); + fs::create_dir_all(home.join(".jcode/agents")).unwrap(); + fs::create_dir_all(proj.join(".jcode/agents")).unwrap(); + write_toml( + &home.join(".jcode/agents"), + "user-only.toml", + r#"id = "user-only" +display_name = "U" +"#, + ); + write_toml( + &proj.join(".jcode/agents"), + "project-only.toml", + r#"id = "project-only" +display_name = "P" +"#, + ); + let mut reg = AgentRegistry::new(); + reg.discover_standard_paths(Some(&home), Some(&proj)); + assert_eq!(reg.len(), 2); + 
assert!(reg.get("user-only").is_some()); + assert!(reg.get("project-only").is_some()); + } +} From e772853f2a80d87a1f1e583099f7c1ccf486d3d6 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Mon, 25 May 2026 22:03:10 +0700 Subject: [PATCH 03/22] feat(agent-runtime): cross-ref validation + skill MAS bridge + sample agents (Phase 0.4-0.6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0.4 — Cross-reference validation: - ReferenceError enum (UnknownTools, UnknownSpawnableAgents) kept separate from DefinitionError because the runtime tool/agent universe isn't known at TOML-load time. - AgentDefinition::validate_tool_references() and validate_spawn_references() — caller passes the available name set, gets back a sorted, comma-joined list of unknowns. - 5 new tests covering the happy path, unknowns, empty lists, and deterministic alphabetical ordering of the error message. This deliberately does NOT modify src/tool/mod.rs. The whitelist check is a pure function over the agent definition + a name set; no need to refactor tool dispatch. Phase 1 will wire the actual tool registry into the spawn path. Phase 0.5 — Skill MAS (#94) bridge: - AgentRegistry::lookup_for_skill_routing(skill_agent_id) — named alias of get() that documents the integration point with the SKILL.md field. Returns None for missing references; the skill activation site decides fallback policy. - 2 tests: hit + miss. Phase 0.6 — Sample agents + integration test: - .jcode/agents/file-picker.toml — Routine tier, no message history, leaf agent. Demonstrates file-picker pattern adapted from Codebuff. - .jcode/agents/code-reviewer.toml — Thinking tier with inherit_parent_system_prompt=true to demonstrate the prompt-cache prefix-sharing trick (~90% input-token savings on cache hits). - tests/sample_agents.rs — integration test loads both files via the public AgentRegistry API and asserts shape + behavior. 4 tests. 
Phase 0 totals: 49 unit + 4 integration = 53 tests, all passing. `cargo check --bin jcode` succeeds (full workspace, 3m13s). Phase 0 (foundation) is now complete: - Schema: AgentDefinition + ModelTier + OutputMode + ReasoningEffort - Loader: registry with priority order (project > user > builtin) - Validation: id format, internal invariants, cross-references - Sample agents demonstrating cache-hit and tier patterns - Skill MAS (#94) integration point established Phase 1 (4 builtin agents + spawn_agents tool + cache benchmark) is the next track. --- .jcode/agents/code-reviewer.toml | 76 +++++++ .jcode/agents/file-picker.toml | 65 ++++++ crates/jcode-agent-runtime/src/definition.rs | 195 ++++++++++++++++++ crates/jcode-agent-runtime/src/lib.rs | 4 +- crates/jcode-agent-runtime/src/registry.rs | 42 ++++ .../tests/sample_agents.rs | 114 ++++++++++ 6 files changed, 495 insertions(+), 1 deletion(-) create mode 100644 .jcode/agents/code-reviewer.toml create mode 100644 .jcode/agents/file-picker.toml create mode 100644 crates/jcode-agent-runtime/tests/sample_agents.rs diff --git a/.jcode/agents/code-reviewer.toml b/.jcode/agents/code-reviewer.toml new file mode 100644 index 000000000..22b7e5e38 --- /dev/null +++ b/.jcode/agents/code-reviewer.toml @@ -0,0 +1,76 @@ +# Code reviewer agent. +# +# Spawned by the orchestrator after non-trivial code changes to catch +# bugs and style regressions before the user sees them. Adapted from +# Codebuff's `code-reviewer`. +# +# Why `prefer_tier = "thinking"`: +# Review work benefits from reasoning. A pay-per-token user with +# `JCODE_ROUTING_THINKING=` gets the right model +# for the right job; subscription users inherit the session model. +# +# Why `inherit_parent_system_prompt = true`: +# This is the prompt-cache prefix-sharing trick. When parent and +# child share an identical system prompt prefix, the provider's +# prompt cache delivers a cache hit on the child invocation — +# typically ~90% input-token savings on Anthropic models. 
+# +# IMPORTANT: must leave `system_prompt` empty (validated). The +# `instructions_prompt` is the only per-agent prompt this reviewer +# adds on top of the inherited system prompt. + +id = "code-reviewer" +display_name = "Code Reviewer" +publisher = "jcode" +version = "0.1.0" + +prefer_tier = "thinking" +reasoning = "medium" + +inherit_parent_system_prompt = true +include_message_history = true +output_mode = "last_message" + +tool_names = [ + "read", + "grep", +] + +# Reviewers don't spawn other agents — they read, reason, and report. +spawnable_agents = [] + +spawner_prompt = """ +Spawn this agent after non-trivial code changes to review them. The +reviewer reads the diff, considers project conventions, and reports +strengths and weaknesses. Do not pass a custom prompt — the reviewer +inherits the conversation context and forms its own assessment. +""" + +# system_prompt MUST be empty when inherit_parent_system_prompt is true. +# The shared parent prompt covers project context, conventions, and +# tools; the reviewer's specialization is purely in instructions_prompt. + +instructions_prompt = """ +You are reviewing the code changes just made by another agent. + +Focus on: +- Correctness: does the code do what the user asked? +- Project conventions: imports, formatting, naming, error handling. +- Test coverage: are new code paths exercised? +- Edge cases: what could go wrong? What was missed? + +Format your output as: + + Strengths + - bullet (concrete reference to file/line where possible) + + Concerns + - bullet (concrete reference to file/line where possible) + + Required fixes (if any) + - bullet + +Be terse. Be specific. Do not restate code that's already in the diff. +If the change is solid and you have no concerns, write a single +sentence saying so. 
+""" diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml new file mode 100644 index 000000000..b6365a84d --- /dev/null +++ b/.jcode/agents/file-picker.toml @@ -0,0 +1,65 @@ +# File picker agent. +# +# Spawned by the orchestrator to find files in the codebase that are +# relevant to a task. Adapted from Codebuff's `file-picker` agent. +# +# Why `prefer_tier = "routine"`: +# File picking is a fuzzy-search task — a smaller/cheaper model +# handles it well. Pay-per-token users who set +# `JCODE_ROUTING_ROUTINE=` save real money here. +# Subscription users (Claude Pro, ChatGPT Plus, ...) inherit the +# session model and get correctness without any tier mapping. +# +# Why `include_message_history = false`: +# File picker doesn't need to see prior edit chatter. A clean slate +# keeps the prompt short and avoids accidentally biasing path +# selection toward already-touched files. + +id = "file-picker" +display_name = "Fletcher the File Fetcher" +publisher = "jcode" +version = "0.1.0" + +prefer_tier = "routine" +reasoning = "minimal" + +include_message_history = false +output_mode = "last_message" + +# Tools required: read project file tree + glob fallback. Whitelist is +# checked at runtime against the tool registry; unknown tools fail loudly +# rather than silently degrading. +tool_names = [ + "ls", + "glob", + "read", +] + +# This agent is a leaf — it does not spawn other agents. +spawnable_agents = [] + +spawner_prompt = """ +Spawn this agent to find relevant files in the codebase. Provide a brief +description of what you're looking for. The agent will return up to ~12 +file paths with one-line summaries. It does fuzzy semantic search; for +exact-string searches, spawn a code searcher instead. +""" + +system_prompt = """ +You are an expert at finding relevant files in a codebase. You have the +project file tree and the user's request. Return the most relevant +files, one per line, prefixed with the path. 
After the list, write a +single short paragraph explaining how the files relate to the request. + +Do not read file contents — that is the parent agent's job. +Do not propose changes — that is the editor's job. +Stay focused on path discovery. +""" + +instructions_prompt = """ +Provide an extremely concise report: +1. List of relevant file paths (one per line). +2. One paragraph (<= 4 sentences) explaining the relevance. + +Do not exceed 12 paths unless the parent explicitly asks for more. +""" diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs index 3e2203e8b..a067668c6 100644 --- a/crates/jcode-agent-runtime/src/definition.rs +++ b/crates/jcode-agent-runtime/src/definition.rs @@ -199,6 +199,35 @@ pub enum DefinitionError { DuplicateSpawnable { id: String, spawn: String }, } +/// Errors returned when cross-referencing an agent against the runtime +/// tool/agent universe (i.e. checking that `tool_names` actually exist). +/// +/// These are **separate from `DefinitionError`** because the runtime +/// universe isn't known at TOML-load time — it depends on feature flags, +/// MCP server connections, and the resolved agent registry. Callers +/// invoke `validate_tool_references` / `validate_spawn_references` at +/// agent spawn time. +#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] +pub enum ReferenceError { + #[error( + "agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}" + )] + UnknownTools { + id: String, + unknown: String, + available: String, + }, + + #[error( + "agent `{id}` references unknown sub-agent(s): {unknown}. Available agents: {available}" + )] + UnknownSpawnableAgents { + id: String, + unknown: String, + available: String, + }, +} + impl AgentDefinition { /// Validate id format + cross-field invariants. Returns `Ok(())` when /// the definition is well-formed. 
@@ -265,6 +294,88 @@ impl AgentDefinition { current_session_model, ) } + + /// Check that every entry in `tool_names` exists in the caller-provided + /// universe of tool names. Returns the list of unknown tools when any + /// fail. Caller decides whether unknown tools are fatal (likely yes + /// for production agents, no for under-development agents). + /// + /// Empty `tool_names` always validates — agents with no tools are + /// legal (e.g. pure-prompt summarizer). + pub fn validate_tool_references<I, S>(&self, available: I) -> Result<(), ReferenceError> + where + I: IntoIterator<Item = S>, + S: AsRef<str>, + { + let available: std::collections::HashSet<String> = available + .into_iter() + .map(|s| s.as_ref().to_string()) + .collect(); + let unknown: Vec<&String> = self + .tool_names + .iter() + .filter(|name| !available.contains(name.as_str())) + .collect(); + if unknown.is_empty() { + return Ok(()); + } + let mut sorted_unknown: Vec<&String> = unknown; + sorted_unknown.sort(); + let mut sorted_available: Vec<&String> = available.iter().collect(); + sorted_available.sort(); + Err(ReferenceError::UnknownTools { + id: self.id.clone(), + unknown: sorted_unknown + .iter() + .map(|s| s.as_str()) + .collect::<Vec<_>>() + .join(", "), + available: sorted_available + .iter() + .map(|s| s.as_str()) + .collect::<Vec<_>>() + .join(", "), + }) + } + + /// Check that every entry in `spawnable_agents` exists in the caller- + /// provided universe of agent ids. Returns unknown agents when any + /// fail. Same semantics as `validate_tool_references`. 
+ pub fn validate_spawn_references<I, S>(&self, available: I) -> Result<(), ReferenceError> + where + I: IntoIterator<Item = S>, + S: AsRef<str>, + { + let available: std::collections::HashSet<String> = available + .into_iter() + .map(|s| s.as_ref().to_string()) + .collect(); + let unknown: Vec<&String> = self + .spawnable_agents + .iter() + .filter(|name| !available.contains(name.as_str())) + .collect(); + if unknown.is_empty() { + return Ok(()); + } + let mut sorted_unknown: Vec<&String> = unknown; + sorted_unknown.sort(); + let mut sorted_available: Vec<&String> = available.iter().collect(); + sorted_available.sort(); + Err(ReferenceError::UnknownSpawnableAgents { + id: self.id.clone(), + unknown: sorted_unknown + .iter() + .map(|s| s.as_str()) + .collect::<Vec<_>>() + .join(", "), + available: sorted_available + .iter() + .map(|s| s.as_str()) + .collect::<Vec<_>>() + .join(", "), + }) + } } /// Agent ids are intentionally restrictive: lowercase ASCII letters, digits, @@ -492,4 +603,88 @@ mod tests { let d: AgentDefinition = toml::from_str(src).expect("parse"); d.validate().expect("validate"); } + + // ----------------------------------------------------------------- + // Cross-reference validation (Phase 0.4) + // ----------------------------------------------------------------- + #[test] + fn validate_tool_references_passes_when_all_known() { + let mut d = minimal_definition("editor"); + d.tool_names = vec!["read".to_string(), "write_file".to_string()]; + d.validate_tool_references(["read", "write_file", "str_replace"]) + .expect("all tools known"); + } + + #[test] + fn validate_tool_references_fails_with_unknown_tools() { + let mut d = minimal_definition("editor"); + d.tool_names = vec!["read".to_string(), "magic".to_string()]; + let err = d + .validate_tool_references(["read", "write_file"]) + .expect_err("magic is unknown"); + match err { + ReferenceError::UnknownTools { + id, + unknown, + available, + } => { + assert_eq!(id, "editor"); + assert_eq!(unknown, "magic"); + 
assert!(available.contains("read")); + assert!(available.contains("write_file")); + } + other => panic!("expected UnknownTools, got {:?}", other), + } + } + + #[test] + fn validate_tool_references_empty_tool_names_always_ok() { + let d = minimal_definition("ask"); + // tool_names is empty by default; supplying empty universe is also fine. + d.validate_tool_references(Vec::<String>::new()) + .expect("empty tool list always valid"); + } + + #[test] + fn validate_spawn_references_passes_when_all_known() { + let mut d = minimal_definition("base"); + d.spawnable_agents = vec!["file-picker".to_string(), "editor".to_string()]; + d.validate_spawn_references(["file-picker", "editor", "reviewer"]) + .expect("all known"); + } + + #[test] + fn validate_spawn_references_fails_with_unknown_agents() { + let mut d = minimal_definition("base"); + d.spawnable_agents = vec!["file-picker".to_string(), "ghost".to_string()]; + let err = d + .validate_spawn_references(["file-picker", "editor"]) + .expect_err("ghost unknown"); + match err { + ReferenceError::UnknownSpawnableAgents { + id, + unknown, + available: _, + } => { + assert_eq!(id, "base"); + assert_eq!(unknown, "ghost"); + } + other => panic!("expected UnknownSpawnableAgents, got {:?}", other), + } + } + + #[test] + fn validate_references_unknown_list_is_sorted_and_comma_joined() { + let mut d = minimal_definition("agent"); + d.tool_names = vec!["zeta".to_string(), "alpha".to_string(), "mid".to_string()]; + let err = d + .validate_tool_references(Vec::<&str>::new()) + .expect_err("none known"); + match err { + ReferenceError::UnknownTools { unknown, .. 
} => { + assert_eq!(unknown, "alpha, mid, zeta", "alphabetical order"); + } + _ => unreachable!(), + } + } } diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs index f7f8ea85e..b78ad983f 100644 --- a/crates/jcode-agent-runtime/src/lib.rs +++ b/crates/jcode-agent-runtime/src/lib.rs @@ -38,7 +38,9 @@ pub use signals::{ }; // New public surface (Phase 0). -pub use definition::{AgentDefinition, DefinitionError, DEFAULT_AGENT_VERSION}; +pub use definition::{ + AgentDefinition, DefinitionError, ReferenceError, DEFAULT_AGENT_VERSION, +}; pub use output::OutputMode; pub use reasoning::ReasoningEffort; pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind}; diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs index 2249b046c..71cab810d 100644 --- a/crates/jcode-agent-runtime/src/registry.rs +++ b/crates/jcode-agent-runtime/src/registry.rs @@ -139,6 +139,22 @@ impl AgentRegistry { v } + /// Look up an agent referenced by a Skill MAS field (#94). + /// + /// `SKILL.md` front-matter has an optional `agent: ` field that + /// routes skill activation to a specific sub-agent rather than the + /// main agent. The id format is identical to `AgentDefinition::id`, + /// so this is functionally `get(id)` — the named alias exists to + /// document the integration point and keep future skill-routing + /// logic discoverable. + /// + /// Returns `None` if the skill references an unknown agent. The + /// caller (skill activation site) decides whether to log a warning + /// or fall back to the main agent. + pub fn lookup_for_skill_routing(&self, skill_agent_id: &str) -> Option<&LoadedAgent> { + self.get(skill_agent_id) + } + /// Non-fatal errors accumulated during discovery. 
pub fn load_errors(&self) -> &[LoadError] { &self.load_errors @@ -501,6 +517,32 @@ display_name = "{id}" assert_eq!(ids, vec!["alpha", "mid", "zeta"]); } + #[test] + fn lookup_for_skill_routing_finds_agent() { + let dir = temp_dir("skill-mas-hit"); + write_toml( + &dir, + "code-reviewer.toml", + r#"id = "code-reviewer" +display_name = "Reviewer" +"#, + ); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal).unwrap(); + // Skill front-matter `agent: code-reviewer` → registry lookup. + let found = reg.lookup_for_skill_routing("code-reviewer"); + assert!(found.is_some()); + assert_eq!(found.unwrap().definition.id, "code-reviewer"); + } + + #[test] + fn lookup_for_skill_routing_returns_none_for_unknown_agent() { + let reg = AgentRegistry::new(); + // Caller (skill activation site) decides how to handle a missing + // routing target — we just report None. + assert!(reg.lookup_for_skill_routing("nonexistent").is_none()); + } + #[test] fn discover_standard_paths_reads_both() { let home = temp_dir("home"); diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs new file mode 100644 index 000000000..fc542fed4 --- /dev/null +++ b/crates/jcode-agent-runtime/tests/sample_agents.rs @@ -0,0 +1,114 @@ +//! Integration test: load the bundled sample agents in +//! `/.jcode/agents/` and assert the registry behaves as +//! documented. +//! +//! Lives in `tests/` so it exercises the public API the way real callers +//! will (the `jcode` binary, the future `cli/agents` module, etc.). +//! +//! If a future PR moves the sample agents elsewhere, update `SAMPLES_DIR`. + +use std::path::PathBuf; + +use jcode_agent_runtime::{ + AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind, +}; + +/// Path to the project-root sample agents directory, relative to the +/// crate manifest. 
Deliberately constructed via `CARGO_MANIFEST_DIR` so +/// `cargo test --workspace` works regardless of the cwd the runner +/// chooses. +fn samples_dir() -> PathBuf { + let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + // crates/jcode-agent-runtime → ../../ .jcode/agents + crate_dir.parent().unwrap().parent().unwrap().join(".jcode/agents") +} + +#[test] +fn loads_bundled_sample_agents() { + let dir = samples_dir(); + assert!( + dir.exists(), + "sample agents directory missing: {}", + dir.display(), + ); + + let mut reg = AgentRegistry::new(); + let n = reg + .load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + assert!(n >= 2, "expected at least 2 sample agents, got {n}"); + assert!(reg.load_errors().is_empty(), "load errors: {:?}", reg.load_errors()); +} + +#[test] +fn file_picker_sample_has_expected_shape() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + let agent = reg + .get("file-picker") + .expect("file-picker registered") + .definition + .clone(); + + assert_eq!(agent.display_name, "Fletcher the File Fetcher"); + assert_eq!(agent.prefer_tier, Some(ModelTier::Routine)); + assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal)); + assert!(!agent.include_message_history, "file picker uses clean slate"); + assert!(!agent.inherit_parent_system_prompt); + assert_eq!(agent.output_mode, OutputMode::LastMessage); + assert!(agent.tool_names.iter().any(|t| t == "read")); + assert!(agent.spawnable_agents.is_empty(), "leaf agent"); + + // Resolve model with no env vars set should fall back to the + // session's current model. 
+ let resolved = agent.resolve_model("session-model"); + assert_eq!( + resolved, "session-model", + "no JCODE_ROUTING_ROUTINE → session default" + ); +} + +#[test] +fn code_reviewer_uses_inherit_parent_system_prompt_for_cache_hit() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + let agent = &reg + .get("code-reviewer") + .expect("code-reviewer registered") + .definition; + + assert!( + agent.inherit_parent_system_prompt, + "reviewer must inherit parent system prompt for prompt-cache hits" + ); + assert!( + agent.system_prompt.is_empty(), + "system_prompt must be empty when inheriting (enforced by validation)" + ); + assert!( + agent.include_message_history, + "reviewer needs context of the change it's reviewing" + ); + assert_eq!(agent.prefer_tier, Some(ModelTier::Thinking)); +} + +#[test] +fn sample_agents_validate_cleanly() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + for loaded in reg.iter() { + loaded + .definition + .validate() + .unwrap_or_else(|err| panic!("{} failed validation: {err}", loaded.definition.id)); + } +} From 170852ff3ada1e8ddff5bdb95c926165e42e6851 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Tue, 26 May 2026 08:24:34 +0700 Subject: [PATCH 04/22] =?UTF-8?q?feat(multi-agent-foundation):=20Phase=201?= =?UTF-8?q?-5=20additions=20=E2=80=94=20jbench=20scaffold,=20prompt=20util?= =?UTF-8?q?ities,=20sample=20agents?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1: two real TOML agent definitions (basher + editor) with full schema Phase 4: `prompt_placeholders.rs` — `{{FILE_TREE}}`, `{{CURRENT_DATE}}`, etc. 
Phase 4: `wrap_as_system_reminder()` in `src/agent/prompting.rs` Phase 5: `evals/jbench/` scaffold — types, judge stub, lessons stub, agent_runner stub Phase 0.6: integration tests `basher_sample_has_expected_shape` + `editor_sample_has_expected_shape` All jcode-agent-runtime tests pass (49 unit + 6 integration). --- .jcode/agents/basher.toml | 75 +++++++ .jcode/agents/editor.toml | 87 ++++++++ Cargo.lock | 12 ++ Cargo.toml | 1 + .../tests/sample_agents.rs | 86 ++++++++ evals/jbench/Cargo.toml | 24 +++ evals/jbench/README.md | 110 ++++++++++ evals/jbench/src/agent_runner.rs | 70 ++++++ evals/jbench/src/bin/jbench.rs | 69 ++++++ evals/jbench/src/judge.rs | 60 ++++++ evals/jbench/src/lessons.rs | 65 ++++++ evals/jbench/src/lib.rs | 19 ++ evals/jbench/src/types.rs | 173 +++++++++++++++ evals/jbench/tests/types.rs | 108 ++++++++++ src/agent/prompting.rs | 43 ++++ src/lib.rs | 1 + src/prompt_placeholders.rs | 200 ++++++++++++++++++ 17 files changed, 1203 insertions(+) create mode 100644 .jcode/agents/basher.toml create mode 100644 .jcode/agents/editor.toml create mode 100644 evals/jbench/Cargo.toml create mode 100644 evals/jbench/README.md create mode 100644 evals/jbench/src/agent_runner.rs create mode 100644 evals/jbench/src/bin/jbench.rs create mode 100644 evals/jbench/src/judge.rs create mode 100644 evals/jbench/src/lessons.rs create mode 100644 evals/jbench/src/lib.rs create mode 100644 evals/jbench/src/types.rs create mode 100644 evals/jbench/tests/types.rs create mode 100644 src/prompt_placeholders.rs diff --git a/.jcode/agents/basher.toml b/.jcode/agents/basher.toml new file mode 100644 index 000000000..c726b51db --- /dev/null +++ b/.jcode/agents/basher.toml @@ -0,0 +1,75 @@ +# Basher agent. +# +# Spawned by the orchestrator to run a single terminal command and +# summarize its output. The classic "shell out for a quick fact" +# helper — git status, ls, cargo metadata, ripgrep one-liners. 
+# +# Why `prefer_tier = "routine"`: +# Running a command and paraphrasing its stdout is a cheap+fast task. +# A pay-per-token user with `JCODE_ROUTING_ROUTINE=` +# keeps the cost of these frequent leaf calls low. Subscription +# users inherit the session model and get correct behavior without +# any tier mapping. +# +# Why `include_message_history = false`: +# Each command should be evaluated on its own — feeding parent edit +# chatter into a one-shot bash invocation just wastes tokens and +# risks the agent acting on stale context. Clean slate per command. +# +# Why `inherit_parent_system_prompt = false`: +# This is a tightly scoped leaf agent. It needs its own short prompt, +# not the parent's full project/system prompt. No prompt-cache +# prefix-sharing benefit either, because the bash tool's I/O is the +# real bulk of the request. +# +# SECURITY NOTE: +# This agent will execute whatever command the parent passes in. The +# bash tool's safety/permission layer applies, but the *caller* must +# still validate that the command is what it intends. Never feed +# unsanitized user input directly into the spawn payload — quote and +# escape arguments, or build the command server-side from a whitelist. + +id = "basher" +display_name = "Basher" +publisher = "jcode" +version = "0.1.0" + +prefer_tier = "routine" +reasoning = "minimal" + +include_message_history = false +inherit_parent_system_prompt = false +output_mode = "last_message" + +# Single tool: jcode's terminal command runner. +tool_names = ["bash"] + +# Leaf agent — does not spawn other agents. +spawnable_agents = [] + +spawner_prompt = """ +Spawn this agent to run a single terminal command and get a short +summary of its output. Pass the exact command plus an optional +`what_to_summarize` hint; if you need full raw output, leave the hint +empty and the agent will return the output verbatim. +""" + +system_prompt = """ +You are an expert at running terminal commands and summarizing their +output. 
+ +Inputs you receive: +- the command to run (required). +- an optional `what_to_summarize` hint describing which parts of the + output the caller cares about. + +If `what_to_summarize` is empty, return the raw command output verbatim +without paraphrasing. +""" + +instructions_prompt = """ +Run the command using the `bash` tool exactly as provided. Then describe +the relevant information from the output, focused on what the caller +asked for. Be concise. Do not suggest follow-up commands or next steps — +the parent decides what happens next. +""" diff --git a/.jcode/agents/editor.toml b/.jcode/agents/editor.toml new file mode 100644 index 000000000..28aed4d01 --- /dev/null +++ b/.jcode/agents/editor.toml @@ -0,0 +1,87 @@ +# Code editor agent. +# +# Spawned by the orchestrator to perform precise, reasoned code edits. +# Reads files first, prefers surgical `str_replace`-style edits over +# whole-file rewrites, and matches the surrounding project's style. +# +# Why `prefer_tier = "thinking"`: +# Edits need reasoning — a wrong substitution silently breaks the +# build or, worse, changes behavior in a way tests don't catch. A +# pay-per-token user with `JCODE_ROUTING_THINKING=` +# gets the right tool for the job; subscription users inherit the +# session model. +# +# Why `inherit_parent_system_prompt = true`: +# This is the prompt-cache prefix-sharing trick — the biggest +# single-knob token-cost win in the harness. When parent and child +# share an identical system prompt prefix, the provider's prompt +# cache delivers a cache hit on the child's first turn, typically +# ~90% input-token savings on Anthropic models. The editor is one +# of the most-spawned sub-agents, so this matters. +# +# IMPORTANT: `system_prompt` MUST be empty when +# `inherit_parent_system_prompt = true`. The runtime's +# `AgentDefinition::validate` enforces this and refuses to load the +# agent otherwise. Per-agent specialization lives in +# `instructions_prompt` only. 
+# +# Why `include_message_history = true`: +# The editor needs to see what the user asked for and any prior +# discussion that shaped the requested change. Without history it +# would re-derive context the parent already has. + +id = "editor" +display_name = "Code Editor" +publisher = "jcode" +version = "0.1.0" + +prefer_tier = "thinking" +reasoning = "medium" + +inherit_parent_system_prompt = true +include_message_history = true +output_mode = "all_messages" + +# system_prompt MUST be empty when inherit_parent_system_prompt = true +# (validated at load time). Specialization is purely in +# instructions_prompt below. + +# Edit-focused tool surface: read first, then surgical edits, with +# whole-file write available as a last resort. +tool_names = [ + "read", + "str_replace", + "write", + "edit", + "multiedit", + "apply_patch", + "hashline_edit", + "patch", +] + +# Leaf agent — performs the edit itself; does not spawn helpers. +spawnable_agents = [] + +spawner_prompt = """ +Spawn this agent for precise code edits that need reasoning. The editor +reads the relevant files, makes the requested change, matches existing +project conventions, and reports what it changed. Use it when a single +substitution or small multi-file edit is well-scoped. +""" + +instructions_prompt = """ +You are an expert code editor. + +Make the requested edit: +1. Read the target file(s) first to confirm current contents. +2. Prefer `str_replace` over `write` — surgical substitutions are + safer and produce smaller diffs than whole-file rewrites. +3. Match existing project conventions (imports, formatting, naming, + error handling). Look at sibling code if unsure. +4. Do not introduce new dependencies. If the change appears to need + one, stop and report instead of adding it. + +After the edit, briefly state what was changed (file paths + a +one-sentence summary). Do not restate code already visible in the +edit's diff. 
+""" diff --git a/Cargo.lock b/Cargo.lock index 20990af27..c2ee30bdc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3782,6 +3782,18 @@ dependencies = [ "sha2 0.10.9", ] +[[package]] +name = "jcode-jbench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "jcode-agent-runtime", + "serde", + "serde_json", + "tokio", +] + [[package]] name = "jcode-memory-types" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index d47e95a80..44a9463d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,6 +58,7 @@ members = [ "crates/jcode-mobile-core", "crates/jcode-mobile-sim", "crates/jcode-desktop", + "evals/jbench", ] [lib] diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs index fc542fed4..e850495d5 100644 --- a/crates/jcode-agent-runtime/tests/sample_agents.rs +++ b/crates/jcode-agent-runtime/tests/sample_agents.rs @@ -112,3 +112,89 @@ fn sample_agents_validate_cleanly() { .unwrap_or_else(|err| panic!("{} failed validation: {err}", loaded.definition.id)); } } + +#[test] +fn basher_sample_has_expected_shape() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + let agent = reg + .get("basher") + .expect("basher registered") + .definition + .clone(); + + assert_eq!(agent.id, "basher"); + assert_eq!(agent.display_name, "Basher"); + assert_eq!(agent.prefer_tier, Some(ModelTier::Routine)); + assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal)); + assert!( + !agent.include_message_history, + "basher uses a clean slate per command" + ); + assert!( + !agent.inherit_parent_system_prompt, + "basher has its own short system prompt" + ); + assert_eq!(agent.output_mode, OutputMode::LastMessage); + assert_eq!(agent.tool_names, vec!["bash"]); + assert!(agent.spawnable_agents.is_empty(), "leaf agent"); + + // No tier env var set → resolve falls back to the session model. 
+ let resolved = agent.resolve_model("session-model"); + assert_eq!( + resolved, "session-model", + "no JCODE_ROUTING_ROUTINE → session default" + ); +} + +#[test] +fn editor_sample_has_expected_shape() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + let agent = reg + .get("editor") + .expect("editor registered") + .definition + .clone(); + + assert_eq!(agent.id, "editor"); + assert_eq!(agent.display_name, "Code Editor"); + assert_eq!(agent.prefer_tier, Some(ModelTier::Thinking)); + assert_eq!(agent.reasoning, Some(ReasoningEffort::Medium)); + assert!( + agent.include_message_history, + "editor needs to see what the user asked for" + ); + assert!( + agent.inherit_parent_system_prompt, + "editor must inherit parent system prompt for prompt-cache hits" + ); + assert!( + agent.system_prompt.is_empty(), + "system_prompt must be empty when inheriting (enforced by validation)" + ); + assert_eq!(agent.output_mode, OutputMode::AllMessages); + for expected in [ + "read", + "str_replace", + "write", + "edit", + "multiedit", + "apply_patch", + "hashline_edit", + "patch", + ] { + assert!( + agent.tool_names.iter().any(|t| t == expected), + "editor tool_names missing `{expected}`: {:?}", + agent.tool_names, + ); + } + assert!(agent.spawnable_agents.is_empty(), "leaf agent"); +} diff --git a/evals/jbench/Cargo.toml b/evals/jbench/Cargo.toml new file mode 100644 index 000000000..aad01216c --- /dev/null +++ b/evals/jbench/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "jcode-jbench" +version = "0.1.0" +edition = "2024" +description = "JBench — jcode's git-commit-reconstruction eval framework (scaffold)" + +[lib] +name = "jcode_jbench" +path = "src/lib.rs" + +[[bin]] +name = "jbench" +path = "src/bin/jbench.rs" + +[dependencies] +jcode-agent-runtime = { path = "../../crates/jcode-agent-runtime" } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +anyhow = "1" 
+tokio = { version = "1", default-features = false, features = ["sync"] } +clap = { version = "4", features = ["derive"] } + +[dev-dependencies] +serde_json = "1" diff --git a/evals/jbench/README.md b/evals/jbench/README.md new file mode 100644 index 000000000..ffd7c01a6 --- /dev/null +++ b/evals/jbench/README.md @@ -0,0 +1,110 @@ +# JBench + +JBench is jcode's evaluation framework for measuring AI coding agent +performance through real-world git commit reconstruction tasks. It is the +Rust port and adaptation of [Codebuff's BuffBench](https://github.com/codebuff/codebuff/tree/main/evals/buffbench) +to the jcode multi-agent foundation. + +> **Status: scaffolding.** This crate currently provides typed data +> models, module skeletons, and a CLI shell. The actual eval +> orchestration (cloning repos, spawning agents, calling judge models, +> running lessons extraction) is intentionally left as `unimplemented!()` +> stubs so reviewers can validate the shape of the public API before any +> end-to-end behavior lands. Real implementations will arrive in Phases +> 5.3 (`agent_runner`), 5.4 (`judge`), and 5.5 (`lessons`). + +## Why git commit reconstruction? + +The core idea, borrowed directly from BuffBench, is that real git history +contains a near-infinite stream of well-scoped, naturally-occurring tasks +with built-in ground truth: each commit is a self-contained change with a +known intent (the message / spec) and a known correct outcome (the diff). + +For each evaluation: + +1. Pick a commit `C` from a target repository. +2. Reset the working tree to `parent(C)`. +3. Hand the agent a natural-language prompt derived from `C`'s spec. +4. Let the agent edit the repo. +5. Compare the agent's diff against the ground-truth diff in `C`. + +This yields fair head-to-head comparisons across agents because every +agent works from the exact same starting state and is judged against the +same target. + +## Three-judge median + +A single LLM judge is noisy. 
JBench follows BuffBench's approach: every +agent diff is judged by **three** different frontier models in parallel +(today the planned slate is `gpt-5`, `gemini-pro`, and `claude-sonnet`), +and the median `overall_score` is reported as the canonical result. Per- +dimension averages (`completion_score`, `code_quality_score`, +`overall_score`) are reported alongside the median's qualitative +analysis. + +The three-judge pipeline lives in `src/judge.rs` (currently +`unimplemented!()`). See `/tmp/codebuff/evals/buffbench/judge.ts` for the +TypeScript original we are mirroring. + +## Lessons extractor + +After each run, the lessons extractor compares the agent's diff and +trace against the ground-truth diff and emits a small list of +`Lesson { what_went_wrong, what_should_have_been_done }` items. These +lessons are intended to be appended to per-agent lesson files that can +later be folded into the agent's system prompt or memory graph — the +classic "learn from your mistakes" loop. + +The lessons module lives in `src/lessons.rs`. + +## Reuse of `jcode-agent-runtime` + +JBench is built on top of the new agent foundation in +[`crates/jcode-agent-runtime`](../../crates/jcode-agent-runtime/), which +provides: + +- `AgentRegistry` — discovery and loading of `.jcode/agents/*.toml` + agent definitions. +- `AgentDefinition` — the declarative schema describing an agent's + model, tools, system prompt, output mode, etc. + +The agent runner (`src/agent_runner.rs`) will resolve agent IDs against +the registry, spawn a `jcode` subprocess in a clean clone of the target +repo, capture the trace, and return an `EvalRun` populated with the diff +and judging result. + +## Module map + +| Module | Purpose | +| --- | --- | +| `types` | Serializable data structures (`EvalCommit`, `FileDiff`, `EvalDataV2`, `EvalRun`, `JudgingResult`, `AgentEvalResults`). Roundtrip-tested. | +| `judge` | Three-judge median pipeline. 
**Stub.** | +| `agent_runner` | Spawn an agent in a repo, capture trace + diff. **Stub.** | +| `lessons` | Extract lessons from a failed/imperfect run. **Stub.** | +| `bin/jbench.rs` | CLI: `pick-commits`, `gen-evals`, `run`, `judge`, `meta-analyze`. Each subcommand currently prints a TODO and exits 0. | + +## Workflow (planned) + +``` +pick-commits → select high-quality commits from a repo +gen-evals → produce eval-{repo}.json with EvalDataV2 schema +run → run agents against eval data, emit EvalRun per commit +judge → re-judge an existing run with the 3-model median +meta-analyze → aggregate analysis across all tasks for an agent +``` + +## Running + +```bash +cargo check -p jcode-jbench +cargo test -p jcode-jbench +cargo run -p jcode-jbench --bin jbench -- run --help +``` + +## References + +- BuffBench source: `/tmp/codebuff/evals/buffbench/` +- BuffBench README: `/tmp/codebuff/evals/buffbench/README.md` +- Judge design: `/tmp/codebuff/evals/buffbench/judge.ts` +- Agent runner design: `/tmp/codebuff/evals/buffbench/agent-runner.ts` +- Lessons extractor design: `/tmp/codebuff/evals/buffbench/lessons-extractor.ts` diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs new file mode 100644 index 000000000..1e56308ff --- /dev/null +++ b/evals/jbench/src/agent_runner.rs @@ -0,0 +1,70 @@ +//! Spawn a jcode agent inside a freshly-prepared repo clone, run a +//! single eval task, and capture the resulting diff and trace. +//! +//! The runner resolves the configured `agent_id` through the +//! [`jcode_agent_runtime::AgentRegistry`] (loaded from +//! `.jcode/agents/*.toml`), spawns the binary as a subprocess in the +//! repo working directory, streams the trace, and finally extracts the +//! unified diff against the parent commit. +//! +//! Design source: `/tmp/codebuff/evals/buffbench/agent-runner.ts`. +//! +//! Implementation lands in Phase 5.3; for now both entry points are +//! 
`unimplemented!()` stubs whose signatures fix the contract the rest +//! of the harness will rely on. + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +use anyhow::Result; + +use crate::types::EvalRun; + +/// Configuration for a single agent evaluation run. +/// +/// `repo_path` should already contain a clean checkout of the eval +/// commit's parent SHA; the runner does not clone for the caller. +#[derive(Debug, Clone)] +pub struct AgentRunConfig { + /// ID of the agent to run, matching an entry in the + /// `jcode-agent-runtime` registry. + pub agent_id: String, + /// Natural-language prompt to send to the agent (typically + /// `EvalCommit::prompt`). + pub prompt: String, + /// Working directory containing the prepared repo at the parent + /// commit. + pub repo_path: PathBuf, + /// Hard cap on the number of agent turns before the run is + /// aborted; mirrors BuffBench's per-task turn budget. + pub max_turns: u32, + /// Extra environment variables applied to the agent subprocess on + /// top of the calling process's environment. + pub env: HashMap, +} + +/// Spawn the configured agent in `config.repo_path`, run it to +/// completion (or the turn / time budget), and return an [`EvalRun`] +/// populated with the agent's diff, judging placeholder, cost, and +/// duration. +/// +/// The runner is responsible for: +/// - Capturing the agent's full trace for later analysis. +/// - Calling [`extract_diff_from_repo`] once the agent finishes. +/// - Invoking the judging pipeline (or leaving that to the caller — +/// the final wiring is decided in Phase 5.3). +pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { + let _ = config; + unimplemented!("Phase 5.3: spawn jcode subprocess in repo, capture trace") +} + +/// Produce a unified diff describing all uncommitted changes in +/// `repo_path` against its currently-checked-out HEAD. 
+/// +/// Used after the agent finishes editing to capture the "agent's +/// changes" half of the judging input. The exact git invocation +/// (likely `git diff --no-color HEAD`) is finalized in Phase 5.3. +pub fn extract_diff_from_repo(repo_path: &Path) -> Result { + let _ = repo_path; + unimplemented!("Phase 5.3: shell out to git diff and return the unified diff") +} diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs new file mode 100644 index 000000000..f0193e831 --- /dev/null +++ b/evals/jbench/src/bin/jbench.rs @@ -0,0 +1,69 @@ +//! `jbench` CLI entry point. +//! +//! This is a scaffold: every subcommand prints a TODO line describing +//! the work it will do and exits 0. The argument shape, however, is +//! real and stable — downstream tooling (CI, scripts) can wire against +//! these subcommands today and pick up real behavior as Phases 5.3 → +//! 5.5 land. +//! +//! All real work happens through the [`jcode_jbench`] library; this +//! binary's only job is to dispatch. + +use clap::{Parser, Subcommand}; + +// Pull in the library so the binary depends on it (and fails to +// compile if its public surface regresses). +use jcode_jbench as _; + +/// Top-level `jbench` CLI. +#[derive(Debug, Parser)] +#[command( + name = "jbench", + about = "JBench — jcode's git-commit-reconstruction eval framework", + version +)] +struct Cli { + /// Subcommand to dispatch to. + #[command(subcommand)] + command: Command, +} + +/// JBench subcommands. Each is a stub today; see `README.md` for the +/// intended workflow. +#[derive(Debug, Subcommand)] +enum Command { + /// Select high-quality commits from a target repo to use as eval + /// tasks. + PickCommits, + /// Generate an `eval-{repo}.json` file (`EvalDataV2`) from a list + /// of picked commits. + GenEvals, + /// Run one or more agents against an eval data file and emit + /// per-commit `EvalRun`s. + Run, + /// Re-judge an existing run with the three-judge median pipeline. 
+ Judge, + /// Aggregate and analyze results across all tasks for an agent. + MetaAnalyze, +} + +fn main() { + let cli = Cli::parse(); + match cli.command { + Command::PickCommits => { + println!("TODO: jbench pick-commits — Phase 5.2 will implement commit selection."); + } + Command::GenEvals => { + println!("TODO: jbench gen-evals — Phase 5.2 will implement eval-data generation."); + } + Command::Run => { + println!("TODO: jbench run — Phase 5.3 will implement agent_runner orchestration."); + } + Command::Judge => { + println!("TODO: jbench judge — Phase 5.4 will implement three-judge median scoring."); + } + Command::MetaAnalyze => { + println!("TODO: jbench meta-analyze — Phase 5.6 will implement cross-task aggregation."); + } + } +} diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs new file mode 100644 index 000000000..170a28203 --- /dev/null +++ b/evals/jbench/src/judge.rs @@ -0,0 +1,60 @@ +//! Three-judge median pipeline. +//! +//! Each agent diff is graded by **three** frontier models in parallel +//! (planned slate: `gpt-5`, `gemini-pro`, `claude-sonnet`); the median +//! `overall_score` selects which judge's qualitative analysis is +//! reported, while the per-dimension scores are averaged across all +//! valid judges. This mirrors the design of BuffBench's +//! `judgeCommitResult` in `/tmp/codebuff/evals/buffbench/judge.ts`. +//! +//! The actual provider plumbing (which talks to each judge model +//! through the existing jcode provider registry) lands in Phase 5.4. +//! Until then both entry points are `unimplemented!()` stubs whose +//! signatures define the public surface the rest of the harness will +//! depend on. 
+ +use std::collections::HashMap; + +use anyhow::Result; + +use crate::types::{EvalCommit, JudgingResult}; + +/// Judge an agent's diff against the ground truth using three models in +/// parallel and return a [`JudgingResult`] whose qualitative analysis +/// comes from the median judge and whose numeric scores are averaged +/// across all judges that returned successfully. +/// +/// Why median + average? +/// - **Median analysis** picks a representative voice and avoids the +/// outlier judge dominating the prose. +/// - **Average scores** smooth out judge-specific bias so the canonical +/// overall metric tracks consensus, not whichever model happened to +/// be selected. +/// +/// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` +/// (`judgeCommitResult`). +/// +/// `context_files` is a `path -> contents` map of supplemental files +/// from the parent commit; the judges receive these inline in the +/// prompt to ground their evaluation. +pub async fn judge_with_three_models( + commit: &EvalCommit, + agent_diff: &str, + context_files: &HashMap<String, String>, +) -> Result<JudgingResult> { + let _ = (commit, agent_diff, context_files); + unimplemented!("Phase 5.4: run gpt-5 / gemini-pro / sonnet judges in parallel and return median+average") +} + +/// Invoke a single judge model with a fully-rendered prompt. +/// +/// Used internally by [`judge_with_three_models`] and exposed publicly +/// so callers can re-judge a stored run with a different model without +/// re-running the full three-judge pipeline. +/// +/// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` +/// (`runSingleJudge`). +pub async fn run_single_judge(model_id: &str, prompt: &str) -> Result<JudgingResult> { + let _ = (model_id, prompt); + unimplemented!("Phase 5.4: wire to provider registry") +} diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs new file mode 100644 index 000000000..7a919d646 --- /dev/null +++ b/evals/jbench/src/lessons.rs @@ -0,0 +1,65 @@ +//! Lessons extractor. +//! +//! 
After an eval run finishes, the lessons extractor compares the +//! agent's actual diff and trace against the ground-truth diff and +//! distills a small list of [`Lesson`]s describing what went wrong and +//! what the agent should have done instead. These can be appended to a +//! per-agent lessons file and folded back into the agent's system +//! prompt or memory graph. +//! +//! Design source: `/tmp/codebuff/evals/buffbench/lessons-extractor.ts`. +//! +//! Implementation lands in Phase 5.5. + +use std::path::Path; + +use anyhow::Result; +use serde::{Deserialize, Serialize}; + +/// One distilled lesson from a single eval run. +/// +/// Kept deliberately minimal — both fields are free-form prose. Richer +/// structure (severity, tags, links to specific commits) can be added +/// later without breaking the on-disk format because lesson files are +/// JSON arrays of this struct. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Lesson { + /// Concise description of the failure mode observed in the trace + /// or diff. One or two sentences. + pub what_went_wrong: String, + /// Concise description of the corrective behavior the agent should + /// have performed instead. One or two sentences. + pub what_should_have_been_done: String, +} + +/// Run the lessons-extractor judge over a finished eval run and return +/// zero or more [`Lesson`]s. +/// +/// The extractor receives the prompt the agent was given, the ground +/// truth diff for context, the diff the agent actually produced, and +/// the agent's full trace. It returns an empty `Vec` when the run was +/// successful enough that no corrective lesson applies. 
+pub async fn extract_lessons( + prompt: &str, + ground_truth_diff: &str, + agent_diff: &str, + agent_trace: &str, +) -> Result<Vec<Lesson>> { + let _ = (prompt, ground_truth_diff, agent_diff, agent_trace); + unimplemented!("Phase 5.5: invoke lessons-extractor judge and parse Vec<Lesson>") +} + +/// Append `lessons` to the per-agent lessons file at +/// `lessons_dir/<agent_id>.json`, creating the file (and the directory) +/// if needed. +/// +/// The on-disk format is a JSON array of [`Lesson`]; appending preserves +/// previously-extracted lessons so the file accumulates over many runs. +pub fn append_lessons_to_file( + agent_id: &str, + lessons: &[Lesson], + lessons_dir: &Path, +) -> Result<()> { + let _ = (agent_id, lessons, lessons_dir); + unimplemented!("Phase 5.5: read-modify-write JSON array at lessons_dir/<agent_id>.json") +} diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs new file mode 100644 index 000000000..57c5809f7 --- /dev/null +++ b/evals/jbench/src/lib.rs @@ -0,0 +1,19 @@ +//! JBench — jcode's git-commit-reconstruction evaluation framework. +//! +//! This crate is a scaffold: data types are real and roundtrip-tested, +//! but orchestration logic is stubbed with `unimplemented!()` so that +//! reviewers can validate the public API surface before behavior lands. +//! +//! See `README.md` for the design and the BuffBench reference at +//! `/tmp/codebuff/evals/buffbench/` for the TypeScript original. +//! +//! The crate consumes [`jcode_agent_runtime::AgentRegistry`] and +//! [`jcode_agent_runtime::AgentDefinition`] for agent discovery and +//! configuration; it does not redefine those concepts locally. + +#![forbid(unsafe_code)] + +pub mod agent_runner; +pub mod judge; +pub mod lessons; +pub mod types; diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs new file mode 100644 index 000000000..3f3a9e763 --- /dev/null +++ b/evals/jbench/src/types.rs @@ -0,0 +1,173 @@ +//! Serializable data types modeling JBench's eval inputs and outputs. +//! +//! 
These types are direct Rust analogues of BuffBench's TypeScript types +//! (`/tmp/codebuff/evals/buffbench/types.ts`) with one deliberate +//! deviation: every field uses `snake_case` in both the Rust definition +//! and the on-disk JSON form, because the rest of jcode's serialized +//! formats already follow `snake_case`. +//! +//! All public types derive `Debug`, `Clone`, `Serialize`, and +//! `Deserialize`. Numeric scores are `f64` in the `[0.0, 10.0]` range — +//! validation is not enforced at the type level so partial / in-progress +//! results round-trip cleanly. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +/// Status of a single file inside an [`EvalCommit`]'s diff. +/// +/// Mirrors BuffBench's `'modified' | 'added' | 'deleted' | 'renamed'` +/// string union; serialized as lowercase strings so generated eval JSON +/// stays compact and readable. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum FileDiffStatus { + /// File existed before and after, with content changes. + Modified, + /// File was created in this commit. + Added, + /// File was deleted in this commit. + Deleted, + /// File was renamed (and possibly modified) in this commit. + Renamed, +} + +/// Per-file diff entry for a single eval commit. +/// +/// `old_path` is populated only for `Renamed` entries; for all other +/// statuses it is `None` and skipped during serialization to keep the +/// JSON output compact. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FileDiff { + /// Current path of the file (post-commit). For renames this is the + /// new name. + pub path: String, + /// What kind of change this file underwent. + pub status: FileDiffStatus, + /// Previous path, only populated when `status == Renamed`. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub old_path: Option<String>, + /// Unified diff text for the change. May be empty for pure renames. 
+ pub diff: String, +} + +/// One eval task: a single git commit reconstructed from its parent. +/// +/// The agent under test starts from `parent_sha`, is given `prompt`, +/// and is judged against `file_diffs`. `supplemental_files` lists +/// additional context paths the harness should preload into the agent's +/// view (BuffBench picks these via a separate filter step). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EvalCommit { + /// Stable identifier for the task, typically `<short-sha>-<slug>`. + pub id: String, + /// Target commit SHA — the ground-truth state. + pub sha: String, + /// Parent commit SHA — the starting state for the agent. + pub parent_sha: String, + /// Technical specification distilled from the commit message. + pub spec: String, + /// Natural-language prompt presented to the agent under test. + pub prompt: String, + /// Extra files (relative paths) the harness should expose as + /// context, in addition to whatever the agent fetches itself. + pub supplemental_files: Vec<String>, + /// Ground-truth file diffs for this commit. + pub file_diffs: Vec<FileDiff>, +} + +/// Top-level eval data file (v2 schema), produced by `gen-evals` and +/// consumed by `run`. +/// +/// `env` and `final_check_commands` are reserved for future use by the +/// runner; they are part of the on-disk schema today so eval JSON files +/// authored against this scaffold remain forward-compatible. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EvalDataV2 { + /// Source repository to clone for each task. + pub repo_url: String, + /// Optional override for the local clone directory name. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub test_repo_name: Option<String>, + /// ISO-8601 timestamp of when this eval file was generated. + pub generation_date: String, + /// Optional one-time setup command (e.g. `npm install`). 
+ #[serde(skip_serializing_if = "Option::is_none", default)] + pub init_command: Option<String>, + /// Environment variables to apply when running agents and final + /// checks. Defaults to empty. + #[serde(default)] + pub env: HashMap<String, String>, + /// Validation commands run after the agent finishes (e.g. `cargo + /// test`). Defaults to empty. + #[serde(default)] + pub final_check_commands: Vec<String>, + /// The actual list of commits to evaluate against. + pub eval_commits: Vec<EvalCommit>, +} + +/// Output of a single judge invocation (or the median of three). +/// +/// All three score fields are on the same `[0.0, 10.0]` scale; `f64` is +/// used so we can also store the *averaged* per-dimension scores when +/// aggregating multiple judges (see `judge::judge_with_three_models`). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct JudgingResult { + /// Free-form prose comparing the agent's diff to the ground truth. + pub analysis: String, + /// Bullet-point strengths called out by the judge. + pub strengths: Vec<String>, + /// Bullet-point weaknesses called out by the judge. + pub weaknesses: Vec<String>, + /// How completely the prompt was addressed, `[0.0, 10.0]`. + pub completion_score: f64, + /// Code structure / maintainability, `[0.0, 10.0]`. + pub code_quality_score: f64, + /// Combined assessment, `[0.0, 10.0]`. JBench's canonical metric. + pub overall_score: f64, +} + +/// Outcome of running one agent on one eval commit. +/// +/// `error` is `Some` when the agent crashed, timed out, or otherwise +/// failed to produce a usable diff; in that case `judging` will +/// typically contain a zero-scored placeholder. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EvalRun { + /// SHA of the eval commit this run targeted. + pub commit_sha: String, + /// Prompt the agent was given. + pub prompt: String, + /// Unified diff produced by the agent against the parent commit. + pub diff: String, + /// Three-judge result (see [`crate::judge`]). 
+ pub judging: JudgingResult, + /// Estimated USD cost of running the agent. + pub cost_usd: f64, + /// Wall-clock duration of the run in milliseconds. + pub duration_ms: u64, + /// Populated when the run failed to complete cleanly. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub error: Option<String>, +} + +/// Aggregated results for one agent across an entire eval suite. +/// +/// `average_score` here is `overall_score`; cost and duration averages +/// are computed across **all** runs (including failures) so consumers +/// can spot agents that are cheap or fast at the price of correctness. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentEvalResults { + /// ID of the agent (matches an `AgentDefinition::id` in the + /// `jcode-agent-runtime` registry). + pub agent_id: String, + /// Per-commit runs, in evaluation order. + pub runs: Vec<EvalRun>, + /// Mean of `judging.overall_score` across runs. + pub average_score: f64, + /// Mean of `cost_usd` across runs. + pub average_cost: f64, + /// Mean of `duration_ms` across runs. + pub average_duration_ms: u64, +} diff --git a/evals/jbench/tests/types.rs b/evals/jbench/tests/types.rs new file mode 100644 index 000000000..2a8efd02e --- /dev/null +++ b/evals/jbench/tests/types.rs @@ -0,0 +1,108 @@ +//! Serde round-trip smoke tests for the public data types. +//! +//! These exercise the JSON shape that `gen-evals` and `run` will read +//! and write, and they fail loudly if anyone changes a field's +//! `snake_case` name without updating consumers. 
+ +use jcode_jbench::types::{ + EvalCommit, FileDiff, FileDiffStatus, JudgingResult, +}; + +#[test] +fn eval_commit_round_trips_through_json() { + let original = EvalCommit { + id: "abc1234-add-readme".to_string(), + sha: "abc1234deadbeef".to_string(), + parent_sha: "0011223344556677".to_string(), + spec: "Add a README describing the project.".to_string(), + prompt: "Please add a README.md at the repo root.".to_string(), + supplemental_files: vec!["Cargo.toml".to_string(), "src/lib.rs".to_string()], + file_diffs: vec![FileDiff { + path: "README.md".to_string(), + status: FileDiffStatus::Added, + old_path: None, + diff: "+++ b/README.md\n@@ -0,0 +1 @@\n+hello\n".to_string(), + }], + }; + + let json = serde_json::to_string(&original).expect("serialize EvalCommit"); + // Sanity-check the wire format is snake_case as documented. + assert!(json.contains("\"parent_sha\"")); + assert!(json.contains("\"supplemental_files\"")); + assert!(json.contains("\"file_diffs\"")); + + let decoded: EvalCommit = serde_json::from_str(&json).expect("deserialize EvalCommit"); + assert_eq!(decoded.id, original.id); + assert_eq!(decoded.sha, original.sha); + assert_eq!(decoded.parent_sha, original.parent_sha); + assert_eq!(decoded.spec, original.spec); + assert_eq!(decoded.prompt, original.prompt); + assert_eq!(decoded.supplemental_files, original.supplemental_files); + assert_eq!(decoded.file_diffs.len(), 1); + assert_eq!(decoded.file_diffs[0].path, "README.md"); + assert!(matches!( + decoded.file_diffs[0].status, + FileDiffStatus::Added + )); +} + +#[test] +fn file_diff_round_trips_renamed_with_old_path() { + let original = FileDiff { + path: "src/new_name.rs".to_string(), + status: FileDiffStatus::Renamed, + old_path: Some("src/old_name.rs".to_string()), + diff: "rename from src/old_name.rs\nrename to src/new_name.rs\n".to_string(), + }; + + let json = serde_json::to_string(&original).expect("serialize FileDiff"); + assert!(json.contains("\"status\":\"renamed\"")); + 
assert!(json.contains("\"old_path\":\"src/old_name.rs\"")); + + let decoded: FileDiff = serde_json::from_str(&json).expect("deserialize FileDiff"); + assert_eq!(decoded.path, original.path); + assert!(matches!(decoded.status, FileDiffStatus::Renamed)); + assert_eq!(decoded.old_path.as_deref(), Some("src/old_name.rs")); + assert_eq!(decoded.diff, original.diff); + + // And a Modified entry should omit `old_path` from the JSON. + let modified = FileDiff { + path: "src/lib.rs".to_string(), + status: FileDiffStatus::Modified, + old_path: None, + diff: "@@ -1 +1 @@\n-old\n+new\n".to_string(), + }; + let modified_json = serde_json::to_string(&modified).expect("serialize Modified FileDiff"); + assert!( + !modified_json.contains("old_path"), + "old_path should be skipped when None, got: {modified_json}" + ); +} + +#[test] +fn judging_result_round_trips_through_json() { + let original = JudgingResult { + analysis: "The agent addressed the prompt and produced clean code.".to_string(), + strengths: vec![ + "Followed existing module structure.".to_string(), + "Added a passing test.".to_string(), + ], + weaknesses: vec!["Missed an edge case in error handling.".to_string()], + completion_score: 8.5, + code_quality_score: 7.0, + overall_score: 7.75, + }; + + let json = serde_json::to_string(&original).expect("serialize JudgingResult"); + assert!(json.contains("\"completion_score\"")); + assert!(json.contains("\"code_quality_score\"")); + assert!(json.contains("\"overall_score\"")); + + let decoded: JudgingResult = serde_json::from_str(&json).expect("deserialize JudgingResult"); + assert_eq!(decoded.analysis, original.analysis); + assert_eq!(decoded.strengths, original.strengths); + assert_eq!(decoded.weaknesses, original.weaknesses); + assert!((decoded.completion_score - original.completion_score).abs() < f64::EPSILON); + assert!((decoded.code_quality_score - original.code_quality_score).abs() < f64::EPSILON); + assert!((decoded.overall_score - original.overall_score).abs() < 
f64::EPSILON); +} diff --git a/src/agent/prompting.rs b/src/agent/prompting.rs index d3735d65b..ba9719985 100644 --- a/src/agent/prompting.rs +++ b/src/agent/prompting.rs @@ -121,3 +121,46 @@ impl Agent { self.build_memory_prompt_nonblocking_shared(messages.to_vec().into(), _memory_event_tx) } } + + +/// Wrap a step prompt body in `<system-reminder>...</system-reminder>` tags. +/// +/// Step prompts are emitted by the harness (not typed by the user), but they +/// arrive in the conversation transcript at the same position a user message +/// would. Without disambiguation, the LLM tends to treat them as a fresh user +/// turn — re-greeting, re-asking, or otherwise breaking flow. +/// +/// Wrapping the body in `<system-reminder>` tags signals "this is harness +/// scaffolding, not the user speaking" and lets the model continue its +/// existing turn cleanly. Returns an empty string when `prompt` is empty so +/// callers don't end up emitting an empty tag pair. +/// +/// This helper is intentionally not yet wired into step-prompt emission; +/// integration will land alongside the Phase 1 `AgentDefinition.step_prompt` +/// changes. 
+pub fn wrap_as_system_reminder(prompt: &str) -> String { + if prompt.is_empty() { + String::new() + } else { + format!("<system-reminder>{}</system-reminder>", prompt) + } +} + +#[cfg(test)] +mod wrap_as_system_reminder_tests { + use super::wrap_as_system_reminder; + + #[test] + fn wrap_as_system_reminder_empty_input_returns_empty() { + assert_eq!(wrap_as_system_reminder(""), ""); + } + + #[test] + fn wrap_as_system_reminder_non_empty_input_wrapped_correctly() { + let body = "remaining steps: 3"; + assert_eq!( + wrap_as_system_reminder(body), + "<system-reminder>remaining steps: 3</system-reminder>" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 223df89ef..f9c310a64 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,7 @@ pub mod prefix_cache_stable; pub mod process_memory; pub mod process_title; pub mod prompt; +pub mod prompt_placeholders; pub mod prompt_templates; pub mod protocol; pub mod provider; diff --git a/src/prompt_placeholders.rs b/src/prompt_placeholders.rs new file mode 100644 index 000000000..68cee139a --- /dev/null +++ b/src/prompt_placeholders.rs @@ -0,0 +1,200 @@ +//! Phase 4 prompt placeholder substitution helper. +//! +//! Provides a small `String -> String` transformation that replaces a fixed +//! set of `{{PLACEHOLDER}}` tokens with values supplied through a +//! [`PlaceholderContext`]. Designed to be a pure utility: no I/O, no errors, +//! no global state. Callers are responsible for assembling the context and +//! choosing where to apply substitution (system prompt, step prompt, etc.). +//! +//! Supported tokens (case-sensitive, exact match including the surrounding +//! double curly braces): +//! +//! - `{{FILE_TREE_SMALL}}` — truncated project tree, max 2500 chars. +//! - `{{FILE_TREE}}` — fuller project tree, max 10000 chars. +//! - `{{KNOWLEDGE_FILES}}` — concatenated knowledge / context files (no limit). +//! - `{{GIT_CHANGES}}` — `git diff` / status summary, max 30000 chars. +//! - `{{CURRENT_DATE}}` — ISO `YYYY-MM-DD` date string. +//! 
- `{{REMAINING_STEPS}}` — remaining-step counter (u32, decimal). +//! - `{{SYSTEM_INFO}}` — OS / arch / shell summary. +//! +//! Empty `String` fields and `remaining_steps == 0` are replaced with an +//! empty string rather than the literal placeholder text. Tokens that are +//! not in the supported list are left untouched in the output, so this +//! function is safe to apply to text that may contain other Mustache-like +//! syntax. + +/// Maximum char count retained for [`PlaceholderContext::file_tree_small`]. +pub const FILE_TREE_SMALL_MAX_CHARS: usize = 2_500; + +/// Maximum char count retained for [`PlaceholderContext::file_tree`]. +pub const FILE_TREE_MAX_CHARS: usize = 10_000; + +/// Maximum char count retained for [`PlaceholderContext::git_changes`]. +pub const GIT_CHANGES_MAX_CHARS: usize = 30_000; + +/// Container for values that can be substituted into prompt templates. +/// +/// All `String` fields default to empty and `remaining_steps` defaults to 0. +/// Use [`PlaceholderContext::default`] and assign the fields you have data +/// for; missing fields will simply substitute as empty. +#[derive(Debug, Default, Clone)] +pub struct PlaceholderContext { + /// Compact project file tree. Truncated to [`FILE_TREE_SMALL_MAX_CHARS`] + /// chars during substitution. + pub file_tree_small: String, + /// Fuller project file tree. Truncated to [`FILE_TREE_MAX_CHARS`] chars + /// during substitution. + pub file_tree: String, + /// Concatenated knowledge/context files. No length limit is applied. + pub knowledge_files: String, + /// Git diff / status summary. Truncated to [`GIT_CHANGES_MAX_CHARS`] + /// chars during substitution. + pub git_changes: String, + /// Current date in ISO `YYYY-MM-DD` form. + pub current_date: String, + /// Remaining steps allowed for the current run/turn. Zero substitutes + /// to an empty string. + pub remaining_steps: u32, + /// Free-form system info (OS / arch / shell). 
+ pub system_info: String, +} + +/// Return at most `max_chars` characters from `s`, respecting char +/// boundaries. If `s` already fits within the limit it is returned +/// unchanged (cloned). +fn truncate_chars(s: &str, max_chars: usize) -> String { + if s.chars().count() <= max_chars { + s.to_string() + } else { + s.chars().take(max_chars).collect() + } +} + +/// Replace every supported placeholder token in `prompt` with the matching +/// value from `ctx`. Unknown `{{TOKENS}}` are preserved verbatim. Empty +/// values (and `remaining_steps == 0`) replace the placeholder with an +/// empty string. +/// +/// Length caps documented on [`PlaceholderContext`] are enforced here, so +/// callers may pass un-truncated input and trust the output to be bounded. +pub fn substitute_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String { + if prompt.is_empty() { + return String::new(); + } + + let file_tree_small = truncate_chars(&ctx.file_tree_small, FILE_TREE_SMALL_MAX_CHARS); + let file_tree = truncate_chars(&ctx.file_tree, FILE_TREE_MAX_CHARS); + let git_changes = truncate_chars(&ctx.git_changes, GIT_CHANGES_MAX_CHARS); + let remaining_steps = if ctx.remaining_steps == 0 { + String::new() + } else { + ctx.remaining_steps.to_string() + }; + + // Each entry is (token, replacement). Order is irrelevant because + // tokens never overlap, but we keep it stable for determinism. 
+ let replacements: [(&str, &str); 7] = [ + ("{{FILE_TREE_SMALL}}", file_tree_small.as_str()), + ("{{FILE_TREE}}", file_tree.as_str()), + ("{{KNOWLEDGE_FILES}}", ctx.knowledge_files.as_str()), + ("{{GIT_CHANGES}}", git_changes.as_str()), + ("{{CURRENT_DATE}}", ctx.current_date.as_str()), + ("{{REMAINING_STEPS}}", remaining_steps.as_str()), + ("{{SYSTEM_INFO}}", ctx.system_info.as_str()), + ]; + + let mut out = prompt.to_string(); + for (token, value) in replacements { + if out.contains(token) { + out = out.replace(token, value); + } + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_context_replaces_all_placeholders_with_empty() { + let ctx = PlaceholderContext::default(); + let input = "tree=[{{FILE_TREE_SMALL}}] full=[{{FILE_TREE}}] \ + k=[{{KNOWLEDGE_FILES}}] git=[{{GIT_CHANGES}}] \ + date=[{{CURRENT_DATE}}] steps=[{{REMAINING_STEPS}}] \ + sys=[{{SYSTEM_INFO}}]"; + let out = substitute_placeholders(input, &ctx); + assert_eq!( + out, + "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]" + ); + } + + #[test] + fn individual_placeholder_works() { + let ctx = PlaceholderContext { + current_date: "2026-05-25".to_string(), + ..Default::default() + }; + let out = substitute_placeholders("today is {{CURRENT_DATE}}.", &ctx); + assert_eq!(out, "today is 2026-05-25."); + + // Unrelated placeholder stays empty in the same call. 
+ let out2 = substitute_placeholders( + "date={{CURRENT_DATE}} steps={{REMAINING_STEPS}}", + &ctx, + ); + assert_eq!(out2, "date=2026-05-25 steps="); + } + + #[test] + fn multiple_placeholders_in_same_string_work() { + let ctx = PlaceholderContext { + file_tree_small: "src/\n lib.rs".to_string(), + knowledge_files: "AGENTS.md contents".to_string(), + current_date: "2026-05-25".to_string(), + remaining_steps: 7, + system_info: "linux x86_64".to_string(), + ..Default::default() + }; + let input = "## Tree\n{{FILE_TREE_SMALL}}\n\n## Knowledge\n\ + {{KNOWLEDGE_FILES}}\n\n## Meta\n\ + date={{CURRENT_DATE}} steps={{REMAINING_STEPS}} \ + sys={{SYSTEM_INFO}}"; + let out = substitute_placeholders(input, &ctx); + let expected = "## Tree\nsrc/\n lib.rs\n\n## Knowledge\n\ + AGENTS.md contents\n\n## Meta\n\ + date=2026-05-25 steps=7 sys=linux x86_64"; + assert_eq!(out, expected); + } + + #[test] + fn unknown_placeholder_text_remains_as_is() { + let ctx = PlaceholderContext { + current_date: "2026-05-25".to_string(), + ..Default::default() + }; + let input = "known={{CURRENT_DATE}} unknown={{NOT_A_REAL_TOKEN}} \ + other={{ALSO_BOGUS}}"; + let out = substitute_placeholders(input, &ctx); + assert_eq!( + out, + "known=2026-05-25 unknown={{NOT_A_REAL_TOKEN}} other={{ALSO_BOGUS}}" + ); + } + + #[test] + fn truncation_caps_long_inputs() { + // Build a string longer than the file-tree-small cap. + let big: String = "x".repeat(FILE_TREE_SMALL_MAX_CHARS + 1234); + let ctx = PlaceholderContext { + file_tree_small: big.clone(), + ..Default::default() + }; + let out = substitute_placeholders("[{{FILE_TREE_SMALL}}]", &ctx); + // Two bracket characters plus the cap. 
+ assert_eq!(out.chars().count(), FILE_TREE_SMALL_MAX_CHARS + 2); + assert!(out.starts_with('[')); + assert!(out.ends_with(']')); + } +} From 8a1963d97cbaf52263d89f942910428c122981d0 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Tue, 26 May 2026 11:29:45 +0700 Subject: [PATCH 05/22] =?UTF-8?q?feat(jbench):=20implement=20Phase=205.3-5?= =?UTF-8?q?.5=20stubs=20=E2=80=94=20judge=20pipeline,=20lessons,=20CLI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 5.3 (agent_runner): `run_agent_in_repo()` spawns jcode subprocess with prompt on stdin, streams stdout, captures trace + diff via `git diff HEAD`. Uses `timeout()` for per-run deadline. Phase 5.4 (judge): `judge_with_three_models()` runs GPT + Gemini + Claude judges in parallel via OpenAI Responses API + Anthropic Messages API. Median analysis, averaged scores. `run_single_judge()` exposes per-judge entry point. Phase 5.5 (lessons): `extract_lessons()` calls lessons extractor model via Responses API. `append_lessons_to_file()` accumulates lessons in per-agent JSON files with read-modify-write. Phase 5.6 (CLI): Full `jbench run` implemented (loads eval JSON, iterates commits, calls `run_agent_in_repo`, writes `.run.json` files). `jbench meta-analyze` aggregates results. Other subcommands print Phase stubs and exit 0. 
Bug fixes: - `JudgingResult: Default` impl added (needed for EvalRun init) - `OnceLock` for lazy reqwest static client (fixes const-eval restrictions) - `context` method from `anyhow::Context` imported in bin --- Cargo.lock | 3 + evals/jbench/Cargo.toml | 7 +- evals/jbench/src/agent_runner.rs | 137 +++++++-- evals/jbench/src/bin/jbench.rs | 248 +++++++++++++++-- evals/jbench/src/judge.rs | 458 ++++++++++++++++++++++++++++--- evals/jbench/src/lessons.rs | 306 +++++++++++++++++++-- evals/jbench/src/lib.rs | 5 + evals/jbench/src/types.rs | 13 + 8 files changed, 1072 insertions(+), 105 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2ee30bdc..102c8eb23 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3788,9 +3788,12 @@ version = "0.1.0" dependencies = [ "anyhow", "clap", + "futures", "jcode-agent-runtime", + "reqwest", "serde", "serde_json", + "tempfile", "tokio", ] diff --git a/evals/jbench/Cargo.toml b/evals/jbench/Cargo.toml index aad01216c..b9db6899a 100644 --- a/evals/jbench/Cargo.toml +++ b/evals/jbench/Cargo.toml @@ -17,8 +17,11 @@ jcode-agent-runtime = { path = "../../crates/jcode-agent-runtime" } serde = { version = "1", features = ["derive"] } serde_json = "1" anyhow = "1" -tokio = { version = "1", default-features = false, features = ["sync"] } -clap = { version = "4", features = ["derive"] } +tokio = { version = "1", default-features = false, features = ["rt-multi-thread", "macros", "io-util", "process", "time", "sync"] } +futures = "0.3" +reqwest = { version = "0.12", features = ["json"] } +clap = { version = "4", features = ["derive", "env"] } [dev-dependencies] serde_json = "1" +tempfile = "3" diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs index 1e56308ff..de922e71d 100644 --- a/evals/jbench/src/agent_runner.rs +++ b/evals/jbench/src/agent_runner.rs @@ -8,15 +8,16 @@ //! unified diff against the parent commit. //! //! Design source: `/tmp/codebuff/evals/buffbench/agent-runner.ts`. -//! -//! 
Implementation lands in Phase 5.3; for now both entry points are -//! `unimplemented!()` stubs whose signatures fix the contract the rest -//! of the harness will rely on. use std::collections::HashMap; use std::path::{Path, PathBuf}; +use std::process::Stdio; +use std::time::{Duration, Instant}; -use anyhow::Result; +use anyhow::{Context, Result}; +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::Command; +use tokio::time::timeout; use crate::types::EvalRun; @@ -38,33 +39,131 @@ pub struct AgentRunConfig { /// Hard cap on the number of agent turns before the run is /// aborted; mirrors BuffBench's per-task turn budget. pub max_turns: u32, + /// Timeout for the entire run in seconds (defaults to 60 minutes). + pub timeout_secs: u64, /// Extra environment variables applied to the agent subprocess on /// top of the calling process's environment. pub env: HashMap, + /// Path to the `jcode` binary. Defaults to searching $PATH. + pub jcode_binary: Option, +} + +impl Default for AgentRunConfig { + fn default() -> Self { + Self { + agent_id: String::new(), + prompt: String::new(), + repo_path: PathBuf::new(), + max_turns: 100, + timeout_secs: 60 * 60, + env: HashMap::new(), + jcode_binary: None, + } + } } /// Spawn the configured agent in `config.repo_path`, run it to /// completion (or the turn / time budget), and return an [`EvalRun`] /// populated with the agent's diff, judging placeholder, cost, and /// duration. -/// -/// The runner is responsible for: -/// - Capturing the agent's full trace for later analysis. -/// - Calling [`extract_diff_from_repo`] once the agent finishes. -/// - Invoking the judging pipeline (or leaving that to the caller — -/// the final wiring is decided in Phase 5.3). 
pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { - let _ = config; - unimplemented!("Phase 5.3: spawn jcode subprocess in repo, capture trace") + let start = Instant::now(); + let timeout_duration = Duration::from_secs(config.timeout_secs); + + let jcode_bin = config + .jcode_binary + .clone() + .unwrap_or_else(|| PathBuf::from("jcode")); + + let mut env_vars: HashMap = std::env::vars().collect(); + env_vars.extend(config.env); + env_vars.insert("JCODE_AGENT_ID".to_owned(), config.agent_id.clone()); + + let mut child = Command::new(&jcode_bin) + .current_dir(&config.repo_path) + .envs(&env_vars) + .args([ + "agent", "run", + "--agent", &config.agent_id, + "--output-mode", "stream", + "--no-interactive", + ]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .with_context(|| format!("failed to spawn jcode binary at {:?}", jcode_bin))?; + + let mut child_stdin = child.stdin.take().expect("stdin captured"); + let stdout = child.stdout.take().expect("stdout captured"); + + // Write the prompt to stdin + { + use tokio::io::AsyncWriteExt; + let mut stdin = tokio::io::BufWriter::new(&mut child_stdin); + stdin.write_all(config.prompt.as_bytes()).await?; + stdin.flush().await?; + drop(stdin); + } + + let mut trace_lines = Vec::new(); + let reader = BufReader::new(stdout); + let mut lines_stream = reader.lines(); + loop { + let line = timeout(timeout_duration, lines_stream.next_line()).await; + match line { + Ok(Ok(Some(l))) => trace_lines.push(l), + _ => break, + } + } + + let status = child + .wait() + .await + .context("failed to wait for jcode subprocess")?; + + let diff = extract_diff_from_repo(&config.repo_path)?; + let error = if !status.success() { + Some(format!("jcode exited with status {:?}", status)) + } else { + None + }; + + Ok(EvalRun { + commit_sha: String::new(), + prompt: config.prompt, + diff, + judging: Default::default(), + cost_usd: 0.0, + duration_ms: start.elapsed().as_millis() as u64, + error, 
+ }) } /// Produce a unified diff describing all uncommitted changes in /// `repo_path` against its currently-checked-out HEAD. -/// -/// Used after the agent finishes editing to capture the "agent's -/// changes" half of the judging input. The exact git invocation -/// (likely `git diff --no-color HEAD`) is finalized in Phase 5.3. pub fn extract_diff_from_repo(repo_path: &Path) -> Result { - let _ = repo_path; - unimplemented!("Phase 5.3: shell out to git diff and return the unified diff") + let output = std::process::Command::new("git") + .args(["diff", "--no-color", "HEAD"]) + .current_dir(repo_path) + .output() + .context("git diff failed")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("git diff exited with error: {stderr}"); + } + + Ok(String::from_utf8_lossy(&output.stdout).to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn extract_diff_from_repo_nonexistent() { + let result = extract_diff_from_repo(Path::new("/tmp/does-not-exist")); + assert!(result.is_err()); + } } diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs index f0193e831..90808dc60 100644 --- a/evals/jbench/src/bin/jbench.rs +++ b/evals/jbench/src/bin/jbench.rs @@ -1,19 +1,18 @@ //! `jbench` CLI entry point. //! -//! This is a scaffold: every subcommand prints a TODO line describing -//! the work it will do and exits 0. The argument shape, however, is -//! real and stable — downstream tooling (CI, scripts) can wire against -//! these subcommands today and pick up real behavior as Phases 5.3 → -//! 5.5 land. -//! -//! All real work happens through the [`jcode_jbench`] library; this -//! binary's only job is to dispatch. +//! Dispatches to the [`jcode_jbench`] library for real work. 
+ +use std::path::PathBuf; +use anyhow::{Context, Result}; use clap::{Parser, Subcommand}; -// Pull in the library so the binary depends on it (and fails to -// compile if its public surface regresses). -use jcode_jbench as _; +use jcode_jbench::{ + agent_runner::AgentRunConfig, + judge::{judge_with_three_models, JudgeConfig}, + lessons::{append_lessons_to_file, extract_lessons, LessonsConfig}, + types::{AgentEvalResults, EvalDataV2, EvalRun}, +}; /// Top-level `jbench` CLI. #[derive(Debug, Parser)] @@ -28,42 +27,233 @@ struct Cli { command: Command, } -/// JBench subcommands. Each is a stub today; see `README.md` for the -/// intended workflow. +/// JBench subcommands. #[derive(Debug, Subcommand)] enum Command { /// Select high-quality commits from a target repo to use as eval /// tasks. - PickCommits, + PickCommits { + /// URL of the repository to pick commits from. + repo_url: String, + /// Minimum commit message length. + #[arg(long, default_value = "10")] + min_msg_len: usize, + /// Maximum number of commits to pick. + #[arg(long, default_value = "50")] + max_picks: usize, + /// Output file (default: stdout). + #[arg(short, long)] + output: Option, + }, /// Generate an `eval-{repo}.json` file (`EvalDataV2`) from a list /// of picked commits. - GenEvals, + GenEvals { + /// Input commit list (from pick-commits). + input: PathBuf, + /// Output eval JSON file. + #[arg(short, long)] + output: PathBuf, + }, /// Run one or more agents against an eval data file and emit /// per-commit `EvalRun`s. - Run, + Run { + /// Path to eval data JSON file. + eval_file: PathBuf, + /// Agent ID to run (must be registered in jcode registry). + #[arg(short, long)] + agent_id: String, + /// Output directory for EvalRun JSON files. + #[arg(short, long)] + output_dir: PathBuf, + /// Path to jcode binary (auto-detected if not set). + #[arg(long)] + jcode_binary: Option, + /// Maximum turns per run. + #[arg(long, default_value = "100")] + max_turns: u32, + /// Timeout per run in seconds. 
+ #[arg(long, default_value = "3600")] + timeout_secs: u64, + }, /// Re-judge an existing run with the three-judge median pipeline. - Judge, + Judge { + /// Directory containing EvalRun JSON files. + runs_dir: PathBuf, + /// API base URL. + #[arg(long, env = "JBENCH_API_BASE")] + api_base: Option, + /// API key. + #[arg(long, env = "JBENCH_API_KEY")] + api_key: Option, + }, /// Aggregate and analyze results across all tasks for an agent. - MetaAnalyze, + MetaAnalyze { + /// Directory containing EvalRun JSON files. + runs_dir: PathBuf, + /// Output file for aggregated results. + #[arg(short, long)] + output: Option, + }, } -fn main() { +#[tokio::main] +async fn main() -> Result<()> { let cli = Cli::parse(); match cli.command { - Command::PickCommits => { - println!("TODO: jbench pick-commits — Phase 5.2 will implement commit selection."); + Command::PickCommits { repo_url, min_msg_len, max_picks, output } => { + pick_commits_impl(&repo_url, min_msg_len, max_picks, output).await?; + } + Command::GenEvals { input, output } => { + gen_evals_impl(&input, &output).await?; } - Command::GenEvals => { - println!("TODO: jbench gen-evals — Phase 5.2 will implement eval-data generation."); + Command::Run { eval_file, agent_id, output_dir, jcode_binary, max_turns, timeout_secs } => { + run_impl(&eval_file, &agent_id, &output_dir, jcode_binary.as_ref(), max_turns, timeout_secs).await?; } - Command::Run => { - println!("TODO: jbench run — Phase 5.3 will implement agent_runner orchestration."); + Command::Judge { runs_dir, api_base, api_key } => { + judge_impl(&runs_dir, api_base.as_deref(), api_key.as_deref()).await?; } - Command::Judge => { - println!("TODO: jbench judge — Phase 5.4 will implement three-judge median scoring."); + Command::MetaAnalyze { runs_dir, output } => { + meta_analyze_impl(&runs_dir, output.as_ref()).await?; } - Command::MetaAnalyze => { - println!("TODO: jbench meta-analyze — Phase 5.6 will implement cross-task aggregation."); + } + Ok(()) +} + +async fn 
pick_commits_impl( + _repo_url: &str, + _min_msg_len: usize, + _max_picks: usize, + _output: Option, +) -> Result<()> { + todo_step("Phase 5.2: commit selection via git log heuristics + message quality filter") +} + +async fn gen_evals_impl(_input: &PathBuf, _output: &PathBuf) -> Result<()> { + todo_step("Phase 5.2: read commit list, fetch each SHA, render EvalDataV2 JSON") +} + +async fn run_impl( + eval_file: &PathBuf, + agent_id: &str, + output_dir: &PathBuf, + jcode_binary: Option<&PathBuf>, + max_turns: u32, + timeout_secs: u64, +) -> Result<()> { + use std::fs; + use tokio::time::timeout as tk_timeout; + use std::time::Duration; + + // Load eval data + let eval_data: EvalDataV2 = { + let text = fs::read_to_string(eval_file)?; + serde_json::from_str(&text).context("failed to parse eval JSON")? + }; + + if !output_dir.exists() { + fs::create_dir_all(output_dir)?; + } + + for commit in &eval_data.eval_commits { + let config = AgentRunConfig { + agent_id: agent_id.to_owned(), + prompt: commit.prompt.clone(), + repo_path: output_dir.join(&commit.id), // per-commit working dir + max_turns, + timeout_secs, + env: eval_data.env.clone(), + jcode_binary: jcode_binary.cloned(), + ..Default::default() + }; + + let result = tk_timeout( + Duration::from_secs(timeout_secs), + jcode_jbench::agent_runner::run_agent_in_repo(config), + ) + .await + .into_iter() + .next() + .unwrap_or_else(|| { + Ok(jcode_jbench::types::EvalRun { + commit_sha: commit.sha.clone(), + prompt: commit.prompt.clone(), + diff: String::new(), + judging: Default::default(), + cost_usd: 0.0, + duration_ms: 0, + error: Some("Timed out waiting for run_agent_in_repo".to_owned()), + }) + })?; + + let run_file = output_dir.join(format!("{}.run.json", commit.id)); + let json = serde_json::to_string_pretty(&result).context("failed to serialize EvalRun")?; + fs::write(&run_file, json)?; + println!("Wrote {}", run_file.display()); + } + + Ok(()) +} + +async fn judge_impl( + _runs_dir: &PathBuf, + _api_base: 
Option<&str>, + _api_key: Option<&str>, +) -> Result<()> { + todo_step("Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields") +} + +async fn meta_analyze_impl( + runs_dir: &PathBuf, + output: Option<&PathBuf>, +) -> Result<()> { + use std::fs; + use jcode_jbench::types::AgentEvalResults; + + let mut all_runs = Vec::new(); + + for entry in fs::read_dir(runs_dir)? { + let entry = entry?; + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) == Some("run.json") { + let text = fs::read_to_string(&path)?; + if let Ok(run) = serde_json::from_str::(&text) { + all_runs.push(run); + } } } + + if all_runs.is_empty() { + anyhow::bail!("No .run.json files found in {}", runs_dir.display()); + } + + let avg_score = all_runs.iter().map(|r| r.judging.overall_score).sum::() + / all_runs.len() as f64; + let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::() + / all_runs.len() as f64; + let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::() + / all_runs.len() as u64; + + let summary = AgentEvalResults { + agent_id: "unknown".to_owned(), + runs: all_runs, + average_score: (avg_score * 10.0).round() / 10.0, + average_cost: (avg_cost * 100.0).round() / 100.0, + average_duration_ms: avg_duration, + }; + + let json = serde_json::to_string_pretty(&summary).context("failed to serialize summary")?; + + if let Some(out) = output { + fs::write(out, &json)?; + println!("Wrote {}", out.display()); + } else { + println!("{json}"); + } + + Ok(()) +} + +fn todo_step(phase: &str) -> Result<()> { + eprintln!("{phase}"); + std::process::exit(0); } diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs index 170a28203..589e5749d 100644 --- a/evals/jbench/src/judge.rs +++ b/evals/jbench/src/judge.rs @@ -7,54 +7,450 @@ //! valid judges. This mirrors the design of BuffBench's //! `judgeCommitResult` in `/tmp/codebuff/evals/buffbench/judge.ts`. //! -//! The actual provider plumbing (which talks to each judge model -//! 
through the existing jcode provider registry) lands in Phase 5.4. -//! Until then both entry points are `unimplemented!()` stubs whose -//! signatures define the public surface the rest of the harness will -//! depend on. +//! Judge prompts are rendered from fixed templates (deduced from the TS +//! original); the judge agent definitions are embedded here so the +//! pipeline stays self-contained and does not depend on the full jcode +//! agent runtime at evaluation time. use std::collections::HashMap; +use std::sync::OnceLock; +use std::time::Duration; -use anyhow::Result; +use anyhow::{Context, Result}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; +use tokio::time::timeout; -use crate::types::{EvalCommit, JudgingResult}; +// Re-export JudgingResult so callers get it from the public types. +pub use crate::types::JudgingResult; + +use crate::types::{EvalCommit, JudgingResult as Scorecard}; + +/// Timeout for a single judge call. +const JUDGE_TIMEOUT_SECS: u64 = 20 * 60; + +/// How many judges must succeed for the pipeline to produce a result. +/// If fewer succeed, we return a zero-score error result. +const MIN_JUDGE_SUCCESS_COUNT: usize = 2; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum JudgeProviderKind { + OpenAI, // OpenAI Responses API + output_schema + Anthropic, // Anthropic Messages API + structured_outputs +} + +impl JudgeProviderKind { + pub fn for_model(model: &str) -> Self { + if model.contains("claude") || model.contains("anthropic") { + Self::Anthropic + } else { + Self::OpenAI + } + } +} + +/// Configuration for the judging pipeline. +#[derive(Debug, Clone)] +pub struct JudgeConfig { + /// API base URL for the judge backend (e.g. OpenAI-compatible). + pub api_base: String, + /// API key secret. + pub api_key: String, + /// Model IDs for the three judges. Order determines the median + /// computation. + pub models: [String; 3], + /// Optional override for judge timeout per call. 
+ pub timeout_secs: Option, + /// Custom HTTP client (uses shared client if None). + pub http_client: Option, +} + +impl Default for JudgeConfig { + fn default() -> Self { + Self { + // Sensible defaults — override before use in production + api_base: std::env::var("JBENCH_API_BASE") + .unwrap_or_else(|_| "https://api.openai.com".to_owned()), + api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(), + models: [ + "gpt-5-2026-05".to_owned(), + "google/gemini-3.1-pro".to_owned(), + "anthropic/claude-sonnet-4-2026-05".to_owned(), + ], + timeout_secs: None, + http_client: None, + } + } +} + +/// Render the full judge prompt from commit + diff + context. +fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &HashMap) -> String { + let ground_truth_diffs = commit + .file_diffs + .iter() + .map(|fd| { + format!( + "### {}\n```diff\n{}\n```", + fd.path, + fd.diff + ) + }) + .collect::>() + .join("\n\n"); + + let context_content = context_files + .iter() + .map(|(path, content)| format!("### {path}\n```\n{content}\n```")) + .collect::>() + .join("\n\n"); + + format!( + "## User Prompt (What the agent was asked to do)\n{}\n\n## Context Files (from parent commit)\n{}\n\n## Ground Truth Changes (One valid implementation)\n{}\n\n## Agent's Changes (What the agent actually did)\n```diff\n{}\n```", + commit.prompt, + context_content, + ground_truth_diffs, + agent_diff + ) +} + +/// System prompt for the judge agent (mirrors the TS `judgeAgentBase.systemPrompt`). +fn judge_system_prompt() -> &'static str { + r#"You are an expert software engineer evaluating AI-generated code changes with empathy for the task given. + +## Your Role + +You will receive: +1. The user prompt that the coding agent was given +2. Context files from the codebase +3. The ground truth changes (expected outcome) +4. 
The agent's actual changes + +## Evaluation Philosophy + +**Judge based on what the agent was asked to do, not on perfection.** + +- If the prompt is vague or high-level (e.g., "add authentication"), be lenient and accept any reasonable implementation that achieves the goal +- If the prompt is specific and detailed, expect the implementation to match those details more closely +- Focus on whether the agent understood and addressed the user's intent +- Consider that there are often multiple valid ways to implement the same feature + +## Evaluation Criteria + +- **Completion** (0-10): How well did the agent address what was asked in the prompt? Consider the specificity of the prompt. +- **Code Quality** (0-10): How well-structured and maintainable is the code? +- **Overall** (0-10): Combined assessment of whether the agent successfully completed the task as requested + +## Ground Truth + +The ground truth shows ONE valid implementation, but it's not the only correct answer. The agent's implementation should be judged on: +- Does it achieve the same functional outcome? +- Is it a reasonable approach given the prompt? +- Does it maintain code quality? + +Provide detailed analysis, strengths, weaknesses, and numerical scores."# +} + +#[derive(Serialize)] +struct JudgeRequest<'a> { + model: &'a str, + input: &'a str, + tools: &'a [serde_json::Value], + #[serde(skip_serializing_if = "Option::is_none")] + output_schema: Option<&'a serde_json::Value>, +} + +#[derive(Deserialize)] +struct JudgeResponse { + output: Option, + #[serde(default)] + choices: Vec, +} + +/// Invoke a single judge model with a fully-rendered prompt. +/// +/// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` (`runSingleJudge`). 
+pub async fn run_single_judge( + model: &str, + prompt: &str, + api_base: &str, + api_key: &str, + http_client: &Client, +) -> Result { + let kind = JudgeProviderKind::for_model(model); + let system = judge_system_prompt(); + + if kind == JudgeProviderKind::OpenAI { + run_openai_judge(model, prompt, system, api_base, api_key, http_client).await + } else { + run_anthropic_judge(model, prompt, system, api_base, api_key, http_client).await + } +} + +async fn run_openai_judge( + model: &str, + prompt: &str, + system: &str, + api_base: &str, + api_key: &str, + http_client: &Client, +) -> Result { + let output_schema = serde_json::json!({ + "type": "object", + "properties": { + "analysis": { "type": "string", "description": "Detailed analysis comparing agent changes to ground truth" }, + "strengths": { "type": "array", "items": { "type": "string" }, "description": "Key strengths of the implementation" }, + "weaknesses": { "type": "array", "items": { "type": "string" }, "description": "Key weaknesses or issues found" }, + "completionScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "How completely the prompt was addressed" }, + "codeQualityScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "Code structure and maintainability" }, + "overallScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "Combined assessment" } + }, + "required": ["analysis", "strengths", "weaknesses", "completionScore", "codeQualityScore", "overallScore"] + }); + + let request_body = serde_json::json!({ + "model": model, + "input": [ + { "role": "system", "content": system }, + { "role": "user", "content": prompt } + ], + "tools": [ + { + "type": "function", + "name": "set_output", + "description": "Submit the evaluation result", + "parameters": output_schema.clone() + } + ], + "tool_choice": { "type": "function", "name": "set_output" }, + "output_schema": output_schema, + }); + + let url = format!("{api_base}/v1/responses"); + let 
response = http_client + .post(&url) + .header("Authorization", format!("Bearer {api_key}")) + .header("Content-Type", "application/json") + .json(&request_body) + .timeout(Duration::from_secs(JUDGE_TIMEOUT_SECS)) + .send() + .await + .context("judge HTTP request failed")?; + + let status = response.status(); + let body: serde_json::Value = response + .json() + .await + .context("failed to parse judge response")?; + + if !status.is_success() { + anyhow::bail!("judge API returned {status}: {body}"); + } + + let output = body + .get("output") + .and_then(|o| o.as_array()) + .and_then(|arr| arr.first()) + .and_then(|item| item.get("content")) + .and_then(|c| c.as_array()) + .and_then(|arr| arr.first()) + .and_then(|item| item.get("text")) + .and_then(|t| t.as_str()); + + let output_value = output + .and_then(|t| serde_json::from_str::(t).ok()) + .or_else(|| body.get("output").cloned()) + .unwrap_or(serde_json::json!({ + "analysis": "No structured output received", + "strengths": [], + "weaknesses": ["Judge failed to return structured output"], + "completionScore": 0, + "codeQualityScore": 0, + "overallScore": 0 + })); + + parse_scorecard(output_value) +} + +async fn run_anthropic_judge( + model: &str, + prompt: &str, + system: &str, + api_base: &str, + api_key: &str, + http_client: &Client, +) -> Result { + let request_body = serde_json::json!({ + "model": model, + "messages": [ + { "role": "user", "content": prompt } + ], + "system": system, + "max_tokens": 4096, + "thinking": { + "type": "enabled", + "budget_tokens": 1024 + }, + }); + + let url = format!("{api_base}/v1/messages"); + let response = http_client + .post(&url) + .header("Authorization", format!("Bearer {api_key}")) + .header("Content-Type", "application/json") + .header("anthropic-version", "2023-06-01") + .json(&request_body) + .timeout(Duration::from_secs(JUDGE_TIMEOUT_SECS)) + .send() + .await + .context("judge HTTP request failed")?; + + let body: serde_json::Value = response + .json() + .await + 
.context("failed to parse anthropic judge response")?; + + // Anthropic returns content blocks — try to parse the final text block as JSON + let text = body + .get("content") + .and_then(|c| c.as_array()) + .and_then(|arr| arr.last()) + .and_then(|item| item.get("text")) + .and_then(|t| t.as_str()) + .unwrap_or_default(); + + let parsed = serde_json::from_str::(text) + .unwrap_or(serde_json::json!({ + "analysis": text.to_owned(), + "strengths": [], + "weaknesses": ["Could not parse structured output from Anthropic judge"], + "completionScore": 0, + "codeQualityScore": 0, + "overallScore": 0 + })); + + parse_scorecard(parsed) +} + +fn parse_scorecard(value: serde_json::Value) -> Result { + serde_json::from_value(value).context("failed to parse JudgingResult from judge output") +} /// Judge an agent's diff against the ground truth using three models in /// parallel and return a [`JudgingResult`] whose qualitative analysis /// comes from the median judge and whose numeric scores are averaged /// across all judges that returned successfully. /// -/// Why median + average? -/// - **Median analysis** picks a representative voice and avoids the -/// outlier judge dominating the prose. -/// - **Average scores** smooth out judge-specific bias so the canonical -/// overall metric tracks consensus, not whichever model happened to -/// be selected. -/// /// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` /// (`judgeCommitResult`). -/// -/// `context_files` is a `path -> contents` map of supplemental files -/// from the parent commit; the judges receive these inline in the -/// prompt to ground their evaluation. 
pub async fn judge_with_three_models( commit: &EvalCommit, agent_diff: &str, context_files: &HashMap, + config: &JudgeConfig, ) -> Result { - let _ = (commit, agent_diff, context_files); - unimplemented!("Phase 5.4: run gpt-5 / gemini-pro / sonnet judges in parallel and return median+average") + let prompt = render_judge_prompt(commit, agent_diff, context_files); + let http: &reqwest::Client = match &config.http_client { + Some(c) => c, + None => shared_client(), + }; + + let timeout_duration = Duration::from_secs(config.timeout_secs.unwrap_or(JUDGE_TIMEOUT_SECS)); + + let judge_futures: Vec<_> = config + .models + .iter() + .map(|model| { + run_single_judge( + model, + &prompt, + &config.api_base, + &config.api_key, + http, + ) + }) + .collect(); + + // Run all three judges in parallel with an overall timeout + let valid: Vec = timeout( + timeout_duration, + futures::future::join_all(judge_futures), + ) + .await + .ok() + .into_iter() // IntoIterator>> + .flatten() // Iterator> + .filter_map(|r| r.ok()) + .collect(); + + if valid.len() < MIN_JUDGE_SUCCESS_COUNT { + return Ok(Scorecard { + analysis: format!( + "Error running judge agent — only {}/{} judges succeeded", + valid.len(), + 3 + ), + strengths: vec![], + weaknesses: vec![format!( + "Only {}/{} judges succeeded", + valid.len(), + 3 + )], + completion_score: 0.0, + code_quality_score: 0.0, + overall_score: 0.0, + }); + } + + // Median analysis — sort by overall_score and pick the middle + let mut sorted = valid.clone(); + sorted.sort_by(|a, b| a.overall_score.partial_cmp(&b.overall_score).unwrap()); + let median_idx = sorted.len() / 2; + let median = &sorted[median_idx]; + + let avg_completion = valid.iter().map(|r| r.completion_score).sum::() / valid.len() as f64; + let avg_quality = valid.iter().map(|r| r.code_quality_score).sum::() / valid.len() as f64; + let avg_overall = valid.iter().map(|r| r.overall_score).sum::() / valid.len() as f64; + + Ok(Scorecard { + analysis: median.analysis.clone(), + 
strengths: median.strengths.clone(), + weaknesses: median.weaknesses.clone(), + completion_score: (avg_completion * 10.0).round() / 10.0, + code_quality_score: (avg_quality * 10.0).round() / 10.0, + overall_score: (avg_overall * 10.0).round() / 10.0, + }) } -/// Invoke a single judge model with a fully-rendered prompt. -/// -/// Used internally by [`judge_with_three_models`] and exposed publicly -/// so callers can re-judge a stored run with a different model without -/// re-running the full three-judge pipeline. -/// -/// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` -/// (`runSingleJudge`). -pub async fn run_single_judge(model_id: &str, prompt: &str) -> Result { - let _ = (model_id, prompt); - unimplemented!("Phase 5.4: wire to provider registry") +static SHARED_CLIENT: OnceLock = OnceLock::new(); + +fn shared_client() -> &'static Client { + SHARED_CLIENT.get_or_init(|| { + reqwest::Client::builder() + .connect_timeout(Duration::from_secs(15)) + .tcp_keepalive(Duration::from_secs(30)) + .pool_idle_timeout(Duration::from_secs(90)) + .build() + .expect("reqwest client must build") + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn judge_provider_kind_for_model() { + assert_eq!( + JudgeProviderKind::for_model("gpt-5"), + JudgeProviderKind::OpenAI + ); + assert_eq!( + JudgeProviderKind::for_model("claude-sonnet-4"), + JudgeProviderKind::Anthropic + ); + assert_eq!( + JudgeProviderKind::for_model("anthropic/claude-opus-4"), + JudgeProviderKind::Anthropic + ); + } } diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs index 7a919d646..31bf1b661 100644 --- a/evals/jbench/src/lessons.rs +++ b/evals/jbench/src/lessons.rs @@ -8,58 +8,316 @@ //! prompt or memory graph. //! //! Design source: `/tmp/codebuff/evals/buffbench/lessons-extractor.ts`. -//! -//! Implementation lands in Phase 5.5. 
+use std::fs; use std::path::Path; +use std::sync::OnceLock; +use std::time::Duration; -use anyhow::Result; +use anyhow::{Context, Result}; +use reqwest::Client; use serde::{Deserialize, Serialize}; +use tokio::time::Duration as TokioDuration; + +/// Timeout for a lessons extraction call. +const LESSONS_TIMEOUT_SECS: u64 = 20 * 60; /// One distilled lesson from a single eval run. -/// -/// Kept deliberately minimal — both fields are free-form prose. Richer -/// structure (severity, tags, links to specific commits) can be added -/// later without breaking the on-disk format because lesson files are -/// JSON arrays of this struct. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Lesson { - /// Concise description of the failure mode observed in the trace - /// or diff. One or two sentences. pub what_went_wrong: String, - /// Concise description of the corrective behavior the agent should - /// have performed instead. One or two sentences. pub what_should_have_been_done: String, } +/// Configuration for lessons extraction. 
+#[derive(Debug, Clone)] +pub struct LessonsConfig { + pub api_base: String, + pub api_key: String, + pub model: String, + pub http_client: Option, +} + +impl Default for LessonsConfig { + fn default() -> Self { + Self { + api_base: std::env::var("JBENCH_API_BASE") + .unwrap_or_else(|_| "https://api.openai.com".to_owned()), + api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(), + model: "gpt-5-2026-05".to_owned(), + http_client: None, + } + } +} + +fn render_lessons_prompt( + prompt: &str, + ground_truth_diff: &str, + agent_diff: &str, + agent_trace: &str, + judge_summary: Option<&str>, + error: Option<&str>, +) -> String { + let judge_section = judge_summary + .map(|s| format!("\n## Judge Summary\n{s}")) + .unwrap_or_default(); + let error_section = error + .map(|e| format!("\n## Agent Error\n{e}")) + .unwrap_or_default(); + format!( + "## User Prompt\n{prompt}\n\n\ + ## Ground Truth Changes (One valid implementation)\n\ + ```diff\n{ground_truth_diff}\n```\n\n\ + ## Agent's Changes\n\ + ```diff\n{agent_diff}\n```\n\n\ + ## Agent Trace\n\ + ```json\n{agent_trace}\n```\ + {judge_section}{error_section}\n\n\ + Task: Analyze what went wrong and what should have been done.", + prompt = prompt, + ground_truth_diff = ground_truth_diff, + agent_diff = agent_diff, + agent_trace = agent_trace, + judge_section = judge_section, + error_section = error_section + ) +} + +fn lessons_system_prompt() -> &'static str { + r#"You are a Lesson Extractor. Your job: analyze agent performance and extract actionable lessons. + +Context you receive: +- User prompt (what the coding agent was asked) +- Ground truth diffs (one valid solution path) +- The agent's diffs (what they actually changed) +- A truncated agent trace showing HOW they worked +- Optional judge summary (scores, weaknesses) + +You must output an array of lessons. Each lesson has two parts: + +1. **whatWentWrong**: What the agent did incorrectly, misunderstood, or failed to do +2. 
**whatShouldHaveBeenDone**: The correct approach the agent should have taken + +Rules: +- Each lesson should be a complete learning unit (problem + solution) +- Keep lessons terse but precise (~140 chars per field) +- Do not include things the agent already did correctly +- Focus on gaps that, if filled, would have improved the outcome"# +} + /// Run the lessons-extractor judge over a finished eval run and return /// zero or more [`Lesson`]s. -/// -/// The extractor receives the prompt the agent was given, the ground -/// truth diff for context, the diff the agent actually produced, and -/// the agent's full trace. It returns an empty `Vec` when the run was -/// successful enough that no corrective lesson applies. pub async fn extract_lessons( prompt: &str, ground_truth_diff: &str, agent_diff: &str, agent_trace: &str, + config: &LessonsConfig, + judge_summary: Option<&str>, + error: Option<&str>, ) -> Result> { - let _ = (prompt, ground_truth_diff, agent_diff, agent_trace); - unimplemented!("Phase 5.5: invoke lessons-extractor judge and parse Vec") + let prompt_text = render_lessons_prompt( + prompt, + ground_truth_diff, + agent_diff, + agent_trace, + judge_summary, + error, + ); + + let http = match &config.http_client { + Some(c) => c, + None => { + static CLIENT: OnceLock = OnceLock::new(); + CLIENT.get_or_init(|| { + reqwest::Client::builder() + .connect_timeout(Duration::from_secs(15)) + .tcp_keepalive(Duration::from_secs(30)) + .pool_idle_timeout(Duration::from_secs(90)) + .build() + .expect("reqwest client must build") + }) + } + }; + + let request_body = serde_json::json!({ + "model": &config.model, + "input": [ + { "role": "system", "content": lessons_system_prompt() }, + { "role": "user", "content": prompt_text } + ], + "tools": [ + { + "type": "function", + "name": "set_output", + "description": "Submit lessons derived from this evaluation", + "parameters": { + "type": "object", + "properties": { + "lessons": { + "type": "array", + "items": { + "type": 
"object", + "properties": { + "whatWentWrong": { "type": "string" }, + "whatShouldHaveBeenDone": { "type": "string" } + }, + "required": ["whatWentWrong", "whatShouldHaveBeenDone"] + } + } + }, + "required": ["lessons"] + } + } + ], + "tool_choice": { "type": "function", "name": "set_output" }, + "output_schema": { + "type": "object", + "properties": { + "lessons": { + "type": "array", + "items": { + "type": "object", + "properties": { + "whatWentWrong": { "type": "string" }, + "whatShouldHaveBeenDone": { "type": "string" } + }, + "required": ["whatWentWrong", "whatShouldHaveBeenDone"] + } + } + }, + "required": ["lessons"] + }, + }); + + let url = format!("{}/v1/responses", config.api_base); + let response = http + .post(&url) + .header("Authorization", format!("Bearer {}", config.api_key)) + .header("Content-Type", "application/json") + .json(&request_body) + .timeout(TokioDuration::from_secs(LESSONS_TIMEOUT_SECS)) + .send() + .await + .context("lessons extraction HTTP request failed")?; + + let body: serde_json::Value = response + .json() + .await + .context("failed to parse lessons extractor response")?; + + let lessons_json = body + .get("output") + .and_then(|o| o.as_array()) + .and_then(|arr| arr.first()) + .and_then(|item| item.get("content")) + .and_then(|c| c.as_array()) + .and_then(|arr| arr.first()) + .and_then(|item| item.get("text")) + .and_then(|t| t.as_str()) + .and_then(|t| serde_json::from_str::(t).ok()) + .or_else(|| body.get("output").cloned()) + .unwrap_or(serde_json::json!({ "lessons": [] })); + + let lessons: Vec = lessons_json + .get("lessons") + .and_then(|l| l.as_array()) + .map(|arr| { + arr.iter() + .filter_map(|v| serde_json::from_value(v.clone()).ok()) + .collect() + }) + .unwrap_or_default(); + + Ok(lessons) } /// Append `lessons` to the per-agent lessons file at /// `lessons_dir/.json`, creating the file (and the directory) /// if needed. 
-/// -/// The on-disk format is a JSON array of [`Lesson`]; appending preserves -/// previously-extracted lessons so the file accumulates over many runs. pub fn append_lessons_to_file( agent_id: &str, lessons: &[Lesson], lessons_dir: &Path, ) -> Result<()> { - let _ = (agent_id, lessons, lessons_dir); - unimplemented!("Phase 5.5: read-modify-write JSON array at lessons_dir/.json") + if lessons.is_empty() { + return Ok(()); + } + + if !lessons_dir.exists() { + fs::create_dir_all(lessons_dir) + .context("failed to create lessons directory")?; + } + + let safe_id = agent_id.replace(|c: char| !c.is_alphanumeric() && c != '-' && c != '_', "_"); + let file_path = lessons_dir.join(format!("{safe_id}.json")); + + let existing: Vec = if file_path.exists() { + let contents = fs::read_to_string(&file_path) + .context("failed to read existing lessons file")?; + serde_json::from_str(&contents).unwrap_or_default() + } else { + Vec::new() + }; + + let all_lessons: Vec = existing + .into_iter() + .chain(lessons.iter().cloned()) + .collect(); + + let json = serde_json::to_string_pretty(&all_lessons) + .context("failed to serialize lessons")?; + + fs::write(&file_path, json) + .context("failed to write lessons file")?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn append_lessons_to_empty_dir() { + let tmp = TempDir::new().unwrap(); + let result = append_lessons_to_file( + "test-agent", + &[Lesson { + what_went_wrong: "forgot null check".to_owned(), + what_should_have_been_done: "add null guard".to_owned(), + }], + tmp.path(), + ); + assert!(result.is_ok()); + let contents = fs::read_to_string(tmp.path().join("test-agent.json")).unwrap(); + let lessons: Vec = serde_json::from_str(&contents).unwrap(); + assert_eq!(lessons.len(), 1); + } + + #[test] + fn append_lessons_accumulates() { + let tmp = TempDir::new().unwrap(); + let agent = "clone-agent"; + + fs::create_dir_all(tmp.path()).unwrap(); + let file_path = 
tmp.path().join("clone-agent.json"); + let first = vec![Lesson { + what_went_wrong: "first mistake".to_owned(), + what_should_have_been_done: "first fix".to_owned(), + }]; + let json = serde_json::to_string_pretty(&first).unwrap(); + fs::write(&file_path, json).unwrap(); + + let second = vec![Lesson { + what_went_wrong: "second mistake".to_owned(), + what_should_have_been_done: "second fix".to_owned(), + }]; + append_lessons_to_file(agent, &second, tmp.path()).unwrap(); + + let contents = fs::read_to_string(tmp.path().join("clone-agent.json")).unwrap(); + let lessons: Vec = serde_json::from_str(&contents).unwrap(); + assert_eq!(lessons.len(), 2); + } } diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs index 57c5809f7..97d0eb7c1 100644 --- a/evals/jbench/src/lib.rs +++ b/evals/jbench/src/lib.rs @@ -17,3 +17,8 @@ pub mod agent_runner; pub mod judge; pub mod lessons; pub mod types; + +pub use types::{EvalCommit, EvalDataV2, EvalRun, JudgingResult, AgentEvalResults}; +pub use agent_runner::AgentRunConfig; +pub use judge::JudgeConfig; +pub use lessons::LessonsConfig; diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs index 3f3a9e763..1cb51e17e 100644 --- a/evals/jbench/src/types.rs +++ b/evals/jbench/src/types.rs @@ -128,6 +128,19 @@ pub struct JudgingResult { pub overall_score: f64, } +impl Default for JudgingResult { + fn default() -> Self { + Self { + analysis: String::new(), + strengths: Vec::new(), + weaknesses: Vec::new(), + completion_score: 0.0, + code_quality_score: 0.0, + overall_score: 0.0, + } + } +} + /// Outcome of running one agent on one eval commit. /// /// `error` is `Some` when the agent crashed, timed out, or otherwise From c0bcaca4c280934f911c6a54cf3021ec8f20101a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tr=E1=BA=A7n=20Quang=20=C4=90=C3=A3ng?= Date: Thu, 28 May 2026 01:50:55 +0000 Subject: [PATCH 06/22] fix(agent-runtime): address PR #313 review issues Bugs fixed: 1. 
JudgingResult deserialization (jbench/types.rs) The judge prompt schema asks for camelCase fields (completionScore, codeQualityScore, overallScore) but the Rust struct used snake_case without serde rename. parse_scorecard would fail on every real judge response. Fix: add #[serde(alias = ...)] on each score field so on-disk JSON stays snake_case while LLM-returned camelCase still deserializes cleanly. 2. Anthropic judge authentication (jbench/judge.rs) run_anthropic_judge used Authorization: Bearer which always 401s on the Anthropic Messages API. Fix: switch to x-api-key header (Anthropic standard). Also split JudgeConfig::api_base / api_key from new anthropic_api_base / anthropic_api_key so the Anthropic branch can target api.anthropic.com without breaking the OpenAI-compatible path. Plumbed through run_single_judge. 3. Duplicate substitute_placeholders (src/prompt_placeholders.rs) Conflicts with the existing prompt_templates::substitute_placeholders. Different semantics (fixed context vs HashMap bindings) but same name made grep / jump-to-def ambiguous. Fix: rename the new one to substitute_context_placeholders and document the relationship in the doc comment. 4. meta_analyze .run.json filter (jbench/bin/jbench.rs) path.extension() returns only the final extension ('json'), so matching against "run.json" never fired. meta-analyze would always report zero runs. Fix: match against file_name().ends_with(".run.json"). Plus: - Run cargo fmt --all to clear the Format CI job that PR #313 was failing. - Add tests parse_scorecard_accepts_camelcase_from_llm and parse_scorecard_accepts_snake_case_from_disk to lock in the wire-format contract. 
--- crates/jcode-agent-runtime/src/definition.rs | 30 +--- crates/jcode-agent-runtime/src/lib.rs | 6 +- crates/jcode-agent-runtime/src/output.rs | 5 +- crates/jcode-agent-runtime/src/reasoning.rs | 10 +- crates/jcode-agent-runtime/src/registry.rs | 57 ++++---- crates/jcode-agent-runtime/src/tier.rs | 10 +- .../tests/sample_agents.rs | 22 ++- evals/jbench/src/agent_runner.rs | 9 +- evals/jbench/src/bin/jbench.rs | 69 +++++++--- evals/jbench/src/judge.rs | 130 ++++++++++++------ evals/jbench/src/lessons.rs | 13 +- evals/jbench/src/lib.rs | 2 +- evals/jbench/src/types.rs | 9 ++ evals/jbench/tests/types.rs | 4 +- src/agent/prompting.rs | 1 - src/prompt_placeholders.rs | 24 ++-- src/tui/app/commands.rs | 3 +- tests/tool_fixtures.rs | 7 +- 18 files changed, 250 insertions(+), 161 deletions(-) diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs index a067668c6..4adeeabbd 100644 --- a/crates/jcode-agent-runtime/src/definition.rs +++ b/crates/jcode-agent-runtime/src/definition.rs @@ -174,9 +174,7 @@ fn default_version() -> String { /// invariants. Displayed to users when a TOML file fails to load. #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum DefinitionError { - #[error( - "agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen" - )] + #[error("agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen")] InvalidId(String), #[error( @@ -184,9 +182,7 @@ pub enum DefinitionError { )] SystemPromptConflict { id: String }, - #[error( - "agent `{id}` has `output_mode = structured_output` but `output_schema` is missing" - )] + #[error("agent `{id}` has `output_mode = structured_output` but `output_schema` is missing")] StructuredOutputMissingSchema { id: String }, #[error("agent `{id}` references itself in `spawnable_agents`")] @@ -209,9 +205,7 @@ pub enum DefinitionError { /// agent spawn time. 
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum ReferenceError { - #[error( - "agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}" - )] + #[error("agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}")] UnknownTools { id: String, unknown: String, @@ -245,8 +239,7 @@ impl AgentDefinition { } // 3. structured_output requires schema - if matches!(self.output_mode, OutputMode::StructuredOutput) - && self.output_schema.is_none() + if matches!(self.output_mode, OutputMode::StructuredOutput) && self.output_schema.is_none() { return Err(DefinitionError::StructuredOutputMissingSchema { id: self.id.clone(), @@ -422,30 +415,21 @@ mod tests { fn id_validation_rejects_uppercase() { let mut d = minimal_definition("File-Picker"); d.id = "File-Picker".to_string(); - assert!(matches!( - d.validate(), - Err(DefinitionError::InvalidId(_)) - )); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); } #[test] fn id_validation_rejects_underscore() { let mut d = minimal_definition("file_picker"); d.id = "file_picker".to_string(); - assert!(matches!( - d.validate(), - Err(DefinitionError::InvalidId(_)) - )); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); } #[test] fn id_validation_rejects_leading_hyphen() { let mut d = minimal_definition("ok"); d.id = "-bad".to_string(); - assert!(matches!( - d.validate(), - Err(DefinitionError::InvalidId(_)) - )); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); } #[test] diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs index b78ad983f..80979a845 100644 --- a/crates/jcode-agent-runtime/src/lib.rs +++ b/crates/jcode-agent-runtime/src/lib.rs @@ -38,10 +38,8 @@ pub use signals::{ }; // New public surface (Phase 0). 
-pub use definition::{ - AgentDefinition, DefinitionError, ReferenceError, DEFAULT_AGENT_VERSION, -}; +pub use definition::{AgentDefinition, DEFAULT_AGENT_VERSION, DefinitionError, ReferenceError}; pub use output::OutputMode; pub use reasoning::ReasoningEffort; pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind}; -pub use tier::{resolve_model, resolve_model_with_source, ModelTier, ResolutionSource}; +pub use tier::{ModelTier, ResolutionSource, resolve_model, resolve_model_with_source}; diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs index 1ba93dd1a..93dc60a93 100644 --- a/crates/jcode-agent-runtime/src/output.rs +++ b/crates/jcode-agent-runtime/src/output.rs @@ -53,7 +53,10 @@ mod tests { #[test] fn parse_accepts_aliases() { - assert_eq!(OutputMode::parse("last_message"), Some(OutputMode::LastMessage)); + assert_eq!( + OutputMode::parse("last_message"), + Some(OutputMode::LastMessage) + ); assert_eq!(OutputMode::parse("all"), Some(OutputMode::AllMessages)); assert_eq!( OutputMode::parse("structured"), diff --git a/crates/jcode-agent-runtime/src/reasoning.rs b/crates/jcode-agent-runtime/src/reasoning.rs index d48bafaeb..7cdf8d010 100644 --- a/crates/jcode-agent-runtime/src/reasoning.rs +++ b/crates/jcode-agent-runtime/src/reasoning.rs @@ -79,9 +79,15 @@ mod tests { ReasoningEffort::parse("minimal"), Some(ReasoningEffort::Minimal) ); - assert_eq!(ReasoningEffort::parse("OFF"), Some(ReasoningEffort::Minimal)); + assert_eq!( + ReasoningEffort::parse("OFF"), + Some(ReasoningEffort::Minimal) + ); assert_eq!(ReasoningEffort::parse("max"), Some(ReasoningEffort::High)); - assert_eq!(ReasoningEffort::parse("default"), Some(ReasoningEffort::Medium)); + assert_eq!( + ReasoningEffort::parse("default"), + Some(ReasoningEffort::Medium) + ); assert_eq!(ReasoningEffort::parse(""), None); assert_eq!(ReasoningEffort::parse("absurd"), None); } diff --git a/crates/jcode-agent-runtime/src/registry.rs 
b/crates/jcode-agent-runtime/src/registry.rs index 71cab810d..82f182b2d 100644 --- a/crates/jcode-agent-runtime/src/registry.rs +++ b/crates/jcode-agent-runtime/src/registry.rs @@ -90,9 +90,7 @@ pub enum LoadError { source: DefinitionError, }, - #[error( - "filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`." - )] + #[error("filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`.")] FileNameMismatch { path: PathBuf, id: String }, } @@ -175,10 +173,7 @@ impl AgentRegistry { /// Register a builtin agent. Builtins have the lowest priority and /// are overridable by both user and project files of the same id. - pub fn register_builtin( - &mut self, - definition: AgentDefinition, - ) -> Result<(), DefinitionError> { + pub fn register_builtin(&mut self, definition: AgentDefinition) -> Result<(), DefinitionError> { definition.validate()?; self.insert(LoadedAgent { definition, @@ -233,10 +228,7 @@ impl AgentRegistry { AgentSource::ProjectLocal { path: path.clone() } } }; - self.insert(LoadedAgent { - definition, - source, - }); + self.insert(LoadedAgent { definition, source }); loaded += 1; } Err(err) => { @@ -333,7 +325,10 @@ mod tests { fn missing_dir_is_zero_load_not_error() { let mut reg = AgentRegistry::new(); let n = reg - .load_directory(Path::new("/nonexistent/jcode-test-dir"), SourceKind::UserGlobal) + .load_directory( + Path::new("/nonexistent/jcode-test-dir"), + SourceKind::UserGlobal, + ) .unwrap(); assert_eq!(n, 0); assert!(reg.is_empty()); @@ -382,7 +377,10 @@ mod tests { output_schema: None, }; reg.register_builtin(builtin_def.clone()).unwrap(); - assert_eq!(reg.get("editor").unwrap().definition.display_name, "Builtin Editor"); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "Builtin Editor" + ); // User let user_dir = temp_dir("user"); @@ -394,8 +392,12 @@ mod tests { display_name = "User Editor" "#, ); - reg.load_directory(&user_dir, SourceKind::UserGlobal).unwrap(); - 
assert_eq!(reg.get("editor").unwrap().definition.display_name, "User Editor"); + reg.load_directory(&user_dir, SourceKind::UserGlobal) + .unwrap(); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "User Editor" + ); // Project let proj_dir = temp_dir("proj"); @@ -407,7 +409,8 @@ mod tests { display_name = "Project Editor" "#, ); - reg.load_directory(&proj_dir, SourceKind::ProjectLocal).unwrap(); + reg.load_directory(&proj_dir, SourceKind::ProjectLocal) + .unwrap(); assert_eq!( reg.get("editor").unwrap().definition.display_name, "Project Editor" @@ -432,10 +435,7 @@ mod tests { reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); assert!(reg.is_empty(), "no agents registered"); assert_eq!(reg.load_errors().len(), 1); - assert!(matches!( - reg.load_errors()[0], - LoadError::Parse { .. } - )); + assert!(matches!(reg.load_errors()[0], LoadError::Parse { .. })); } #[test] @@ -453,10 +453,7 @@ mod tests { reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); assert!(reg.is_empty()); assert_eq!(reg.load_errors().len(), 1); - assert!(matches!( - reg.load_errors()[0], - LoadError::Invalid { .. } - )); + assert!(matches!(reg.load_errors()[0], LoadError::Invalid { .. 
})); } #[test] @@ -506,14 +503,20 @@ mod tests { write_toml( &dir, &format!("{id}.toml"), - &format!(r#"id = "{id}" + &format!( + r#"id = "{id}" display_name = "{id}" -"#), +"# + ), ); } let mut reg = AgentRegistry::new(); reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); - let ids: Vec<_> = reg.iter_sorted().iter().map(|a| a.definition.id.clone()).collect(); + let ids: Vec<_> = reg + .iter_sorted() + .iter() + .map(|a| a.definition.id.clone()) + .collect(); assert_eq!(ids, vec!["alpha", "mid", "zeta"]); } diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs index 200f511ed..33ee6288b 100644 --- a/crates/jcode-agent-runtime/src/tier.rs +++ b/crates/jcode-agent-runtime/src/tier.rs @@ -135,16 +135,10 @@ pub enum ResolutionSource { /// Used `agent.model_override` directly. Override(String), /// Used the env var backing `tier`. - Tier { - tier: ModelTier, - model: String, - }, + Tier { tier: ModelTier, model: String }, /// Tier was preferred but the env var was unset, so fell back to the /// session's current model. - TierFallback { - tier: ModelTier, - model: String, - }, + TierFallback { tier: ModelTier, model: String }, /// No override or tier preference; using the session's current model. SessionDefault(String), } diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs index e850495d5..ee6ee7034 100644 --- a/crates/jcode-agent-runtime/tests/sample_agents.rs +++ b/crates/jcode-agent-runtime/tests/sample_agents.rs @@ -9,9 +9,7 @@ use std::path::PathBuf; -use jcode_agent_runtime::{ - AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind, -}; +use jcode_agent_runtime::{AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind}; /// Path to the project-root sample agents directory, relative to the /// crate manifest. 
Deliberately constructed via `CARGO_MANIFEST_DIR` so @@ -20,7 +18,12 @@ use jcode_agent_runtime::{ fn samples_dir() -> PathBuf { let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); // crates/jcode-agent-runtime → ../../ .jcode/agents - crate_dir.parent().unwrap().parent().unwrap().join(".jcode/agents") + crate_dir + .parent() + .unwrap() + .parent() + .unwrap() + .join(".jcode/agents") } #[test] @@ -37,7 +40,11 @@ fn loads_bundled_sample_agents() { .load_directory(&dir, SourceKind::ProjectLocal) .expect("load_directory"); assert!(n >= 2, "expected at least 2 sample agents, got {n}"); - assert!(reg.load_errors().is_empty(), "load errors: {:?}", reg.load_errors()); + assert!( + reg.load_errors().is_empty(), + "load errors: {:?}", + reg.load_errors() + ); } #[test] @@ -56,7 +63,10 @@ fn file_picker_sample_has_expected_shape() { assert_eq!(agent.display_name, "Fletcher the File Fetcher"); assert_eq!(agent.prefer_tier, Some(ModelTier::Routine)); assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal)); - assert!(!agent.include_message_history, "file picker uses clean slate"); + assert!( + !agent.include_message_history, + "file picker uses clean slate" + ); assert!(!agent.inherit_parent_system_prompt); assert_eq!(agent.output_mode, OutputMode::LastMessage); assert!(agent.tool_names.iter().any(|t| t == "read")); diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs index de922e71d..3763ee4c2 100644 --- a/evals/jbench/src/agent_runner.rs +++ b/evals/jbench/src/agent_runner.rs @@ -83,9 +83,12 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { .current_dir(&config.repo_path) .envs(&env_vars) .args([ - "agent", "run", - "--agent", &config.agent_id, - "--output-mode", "stream", + "agent", + "run", + "--agent", + &config.agent_id, + "--output-mode", + "stream", "--no-interactive", ]) .stdin(Stdio::piped()) diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs index 90808dc60..2e54b50e7 100644 --- 
a/evals/jbench/src/bin/jbench.rs +++ b/evals/jbench/src/bin/jbench.rs @@ -9,8 +9,8 @@ use clap::{Parser, Subcommand}; use jcode_jbench::{ agent_runner::AgentRunConfig, - judge::{judge_with_three_models, JudgeConfig}, - lessons::{append_lessons_to_file, extract_lessons, LessonsConfig}, + judge::{JudgeConfig, judge_with_three_models}, + lessons::{LessonsConfig, append_lessons_to_file, extract_lessons}, types::{AgentEvalResults, EvalDataV2, EvalRun}, }; @@ -100,16 +100,40 @@ enum Command { async fn main() -> Result<()> { let cli = Cli::parse(); match cli.command { - Command::PickCommits { repo_url, min_msg_len, max_picks, output } => { + Command::PickCommits { + repo_url, + min_msg_len, + max_picks, + output, + } => { pick_commits_impl(&repo_url, min_msg_len, max_picks, output).await?; } Command::GenEvals { input, output } => { gen_evals_impl(&input, &output).await?; } - Command::Run { eval_file, agent_id, output_dir, jcode_binary, max_turns, timeout_secs } => { - run_impl(&eval_file, &agent_id, &output_dir, jcode_binary.as_ref(), max_turns, timeout_secs).await?; + Command::Run { + eval_file, + agent_id, + output_dir, + jcode_binary, + max_turns, + timeout_secs, + } => { + run_impl( + &eval_file, + &agent_id, + &output_dir, + jcode_binary.as_ref(), + max_turns, + timeout_secs, + ) + .await?; } - Command::Judge { runs_dir, api_base, api_key } => { + Command::Judge { + runs_dir, + api_base, + api_key, + } => { judge_impl(&runs_dir, api_base.as_deref(), api_key.as_deref()).await?; } Command::MetaAnalyze { runs_dir, output } => { @@ -141,8 +165,8 @@ async fn run_impl( timeout_secs: u64, ) -> Result<()> { use std::fs; - use tokio::time::timeout as tk_timeout; use std::time::Duration; + use tokio::time::timeout as tk_timeout; // Load eval data let eval_data: EvalDataV2 = { @@ -199,22 +223,28 @@ async fn judge_impl( _api_base: Option<&str>, _api_key: Option<&str>, ) -> Result<()> { - todo_step("Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging 
fields") + todo_step( + "Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields", + ) } -async fn meta_analyze_impl( - runs_dir: &PathBuf, - output: Option<&PathBuf>, -) -> Result<()> { - use std::fs; +async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Result<()> { use jcode_jbench::types::AgentEvalResults; + use std::fs; let mut all_runs = Vec::new(); for entry in fs::read_dir(runs_dir)? { let entry = entry?; let path = entry.path(); - if path.extension().and_then(|s| s.to_str()) == Some("run.json") { + // `Path::extension` returns only the trailing component (`json`), + // so matching against `"run.json"` never fires. Match on the full + // file name suffix instead. + let is_run_file = path + .file_name() + .and_then(|s| s.to_str()) + .is_some_and(|s| s.ends_with(".run.json")); + if is_run_file { let text = fs::read_to_string(&path)?; if let Ok(run) = serde_json::from_str::(&text) { all_runs.push(run); @@ -226,12 +256,13 @@ async fn meta_analyze_impl( anyhow::bail!("No .run.json files found in {}", runs_dir.display()); } - let avg_score = all_runs.iter().map(|r| r.judging.overall_score).sum::() - / all_runs.len() as f64; - let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::() + let avg_score = all_runs + .iter() + .map(|r| r.judging.overall_score) + .sum::() / all_runs.len() as f64; - let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::() - / all_runs.len() as u64; + let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::() / all_runs.len() as f64; + let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::() / all_runs.len() as u64; let summary = AgentEvalResults { agent_id: "unknown".to_owned(), diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs index 589e5749d..8f9437f47 100644 --- a/evals/jbench/src/judge.rs +++ b/evals/jbench/src/judge.rs @@ -52,10 +52,18 @@ impl JudgeProviderKind { /// Configuration for the judging pipeline. 
#[derive(Debug, Clone)] pub struct JudgeConfig { - /// API base URL for the judge backend (e.g. OpenAI-compatible). + /// API base URL for the OpenAI-compatible judge backend. pub api_base: String, - /// API key secret. + /// API key for the OpenAI-compatible judge backend. pub api_key: String, + /// Optional separate base URL for Anthropic-routed judges (e.g. + /// `https://api.anthropic.com`). Falls back to `api_base` when + /// `None`, which only makes sense if the OpenAI-compatible host + /// proxies the Anthropic Messages API too. + pub anthropic_api_base: Option, + /// Optional separate API key for Anthropic-routed judges. Falls + /// back to `api_key` when `None`. + pub anthropic_api_key: Option, /// Model IDs for the three judges. Order determines the median /// computation. pub models: [String; 3], @@ -72,6 +80,8 @@ impl Default for JudgeConfig { api_base: std::env::var("JBENCH_API_BASE") .unwrap_or_else(|_| "https://api.openai.com".to_owned()), api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(), + anthropic_api_base: std::env::var("JBENCH_ANTHROPIC_API_BASE").ok(), + anthropic_api_key: std::env::var("JBENCH_ANTHROPIC_API_KEY").ok(), models: [ "gpt-5-2026-05".to_owned(), "google/gemini-3.1-pro".to_owned(), @@ -84,17 +94,15 @@ impl Default for JudgeConfig { } /// Render the full judge prompt from commit + diff + context. 
-fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &HashMap) -> String { +fn render_judge_prompt( + commit: &EvalCommit, + agent_diff: &str, + context_files: &HashMap, +) -> String { let ground_truth_diffs = commit .file_diffs .iter() - .map(|fd| { - format!( - "### {}\n```diff\n{}\n```", - fd.path, - fd.diff - ) - }) + .map(|fd| format!("### {}\n```diff\n{}\n```", fd.path, fd.diff)) .collect::>() .join("\n\n"); @@ -106,10 +114,7 @@ fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &Ha format!( "## User Prompt (What the agent was asked to do)\n{}\n\n## Context Files (from parent commit)\n{}\n\n## Ground Truth Changes (One valid implementation)\n{}\n\n## Agent's Changes (What the agent actually did)\n```diff\n{}\n```", - commit.prompt, - context_content, - ground_truth_diffs, - agent_diff + commit.prompt, context_content, ground_truth_diffs, agent_diff ) } @@ -168,12 +173,18 @@ struct JudgeResponse { /// Invoke a single judge model with a fully-rendered prompt. /// +/// `anthropic_api_base` / `anthropic_api_key` are only consulted when +/// the model routes through `JudgeProviderKind::Anthropic`; OpenAI-bound +/// requests always use the primary `api_base` / `api_key`. +/// /// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` (`runSingleJudge`). pub async fn run_single_judge( model: &str, prompt: &str, api_base: &str, api_key: &str, + anthropic_api_base: Option<&str>, + anthropic_api_key: Option<&str>, http_client: &Client, ) -> Result { let kind = JudgeProviderKind::for_model(model); @@ -182,7 +193,12 @@ pub async fn run_single_judge( if kind == JudgeProviderKind::OpenAI { run_openai_judge(model, prompt, system, api_base, api_key, http_client).await } else { - run_anthropic_judge(model, prompt, system, api_base, api_key, http_client).await + // Fall back to the primary host/key only if no Anthropic-specific + // overrides were configured. 
The caller is expected to set both + // overrides when targeting `api.anthropic.com` directly. + let base = anthropic_api_base.unwrap_or(api_base); + let key = anthropic_api_key.unwrap_or(api_key); + run_anthropic_judge(model, prompt, system, base, key, http_client).await } } @@ -292,10 +308,14 @@ async fn run_anthropic_judge( }, }); + // Anthropic Messages API authenticates via `x-api-key`, not + // `Authorization: Bearer ...`. Using the wrong header returns 401 + // even with a valid key, which previously made this branch + // permanently dead. let url = format!("{api_base}/v1/messages"); let response = http_client .post(&url) - .header("Authorization", format!("Bearer {api_key}")) + .header("x-api-key", api_key) .header("Content-Type", "application/json") .header("anthropic-version", "2023-06-01") .json(&request_body) @@ -318,15 +338,14 @@ async fn run_anthropic_judge( .and_then(|t| t.as_str()) .unwrap_or_default(); - let parsed = serde_json::from_str::(text) - .unwrap_or(serde_json::json!({ - "analysis": text.to_owned(), - "strengths": [], - "weaknesses": ["Could not parse structured output from Anthropic judge"], - "completionScore": 0, - "codeQualityScore": 0, - "overallScore": 0 - })); + let parsed = serde_json::from_str::(text).unwrap_or(serde_json::json!({ + "analysis": text.to_owned(), + "strengths": [], + "weaknesses": ["Could not parse structured output from Anthropic judge"], + "completionScore": 0, + "codeQualityScore": 0, + "overallScore": 0 + })); parse_scorecard(parsed) } @@ -365,22 +384,21 @@ pub async fn judge_with_three_models( &prompt, &config.api_base, &config.api_key, + config.anthropic_api_base.as_deref(), + config.anthropic_api_key.as_deref(), http, ) }) .collect(); // Run all three judges in parallel with an overall timeout - let valid: Vec = timeout( - timeout_duration, - futures::future::join_all(judge_futures), - ) - .await - .ok() - .into_iter() // IntoIterator>> - .flatten() // Iterator> - .filter_map(|r| r.ok()) - .collect(); + let 
valid: Vec = timeout(timeout_duration, futures::future::join_all(judge_futures)) + .await + .ok() + .into_iter() // IntoIterator>> + .flatten() // Iterator> + .filter_map(|r| r.ok()) + .collect(); if valid.len() < MIN_JUDGE_SUCCESS_COUNT { return Ok(Scorecard { @@ -390,11 +408,7 @@ pub async fn judge_with_three_models( 3 ), strengths: vec![], - weaknesses: vec![format!( - "Only {}/{} judges succeeded", - valid.len(), - 3 - )], + weaknesses: vec![format!("Only {}/{} judges succeeded", valid.len(), 3)], completion_score: 0.0, code_quality_score: 0.0, overall_score: 0.0, @@ -453,4 +467,40 @@ mod tests { JudgeProviderKind::Anthropic ); } + + /// Locks the wire-format contract: the LLM judge returns camelCase + /// (`completionScore`, etc.) per the request schema. Deserialization + /// must accept that even though the on-disk JSON form is snake_case. + #[test] + fn parse_scorecard_accepts_camelcase_from_llm() { + let camel = serde_json::json!({ + "analysis": "looks good", + "strengths": ["clean diff"], + "weaknesses": [], + "completionScore": 8.5, + "codeQualityScore": 7.0, + "overallScore": 7.8 + }); + let parsed = parse_scorecard(camel).expect("camelCase must deserialize"); + assert_eq!(parsed.completion_score, 8.5); + assert_eq!(parsed.code_quality_score, 7.0); + assert_eq!(parsed.overall_score, 7.8); + } + + /// snake_case (on-disk eval JSON) must round-trip as well. 
+ #[test] + fn parse_scorecard_accepts_snake_case_from_disk() { + let snake = serde_json::json!({ + "analysis": "", + "strengths": [], + "weaknesses": [], + "completion_score": 1.0, + "code_quality_score": 2.0, + "overall_score": 3.0 + }); + let parsed = parse_scorecard(snake).expect("snake_case must deserialize"); + assert_eq!(parsed.completion_score, 1.0); + assert_eq!(parsed.code_quality_score, 2.0); + assert_eq!(parsed.overall_score, 3.0); + } } diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs index 31bf1b661..f9cc09d06 100644 --- a/evals/jbench/src/lessons.rs +++ b/evals/jbench/src/lessons.rs @@ -245,16 +245,15 @@ pub fn append_lessons_to_file( } if !lessons_dir.exists() { - fs::create_dir_all(lessons_dir) - .context("failed to create lessons directory")?; + fs::create_dir_all(lessons_dir).context("failed to create lessons directory")?; } let safe_id = agent_id.replace(|c: char| !c.is_alphanumeric() && c != '-' && c != '_', "_"); let file_path = lessons_dir.join(format!("{safe_id}.json")); let existing: Vec = if file_path.exists() { - let contents = fs::read_to_string(&file_path) - .context("failed to read existing lessons file")?; + let contents = + fs::read_to_string(&file_path).context("failed to read existing lessons file")?; serde_json::from_str(&contents).unwrap_or_default() } else { Vec::new() @@ -265,11 +264,9 @@ pub fn append_lessons_to_file( .chain(lessons.iter().cloned()) .collect(); - let json = serde_json::to_string_pretty(&all_lessons) - .context("failed to serialize lessons")?; + let json = serde_json::to_string_pretty(&all_lessons).context("failed to serialize lessons")?; - fs::write(&file_path, json) - .context("failed to write lessons file")?; + fs::write(&file_path, json).context("failed to write lessons file")?; Ok(()) } diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs index 97d0eb7c1..36363dc5b 100644 --- a/evals/jbench/src/lib.rs +++ b/evals/jbench/src/lib.rs @@ -18,7 +18,7 @@ pub mod judge; pub mod 
lessons; pub mod types; -pub use types::{EvalCommit, EvalDataV2, EvalRun, JudgingResult, AgentEvalResults}; pub use agent_runner::AgentRunConfig; pub use judge::JudgeConfig; pub use lessons::LessonsConfig; +pub use types::{AgentEvalResults, EvalCommit, EvalDataV2, EvalRun, JudgingResult}; diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs index 1cb51e17e..39d4645c5 100644 --- a/evals/jbench/src/types.rs +++ b/evals/jbench/src/types.rs @@ -112,6 +112,12 @@ pub struct EvalDataV2 { /// All three score fields are on the same `[0.0, 10.0]` scale; `f64` is /// used so we can also store the *averaged* per-dimension scores when /// aggregating multiple judges (see `judge::judge_with_three_models`). +/// +/// On-disk JSON stays `snake_case` to match the rest of jcode's eval +/// outputs, but each score field also accepts the `camelCase` spelling +/// (`completionScore`, etc.) via `serde(alias = ...)` so we can +/// deserialize LLM judge responses directly without an intermediate +/// wire-format struct. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct JudgingResult { /// Free-form prose comparing the agent's diff to the ground truth. @@ -121,10 +127,13 @@ pub struct JudgingResult { /// Bullet-point weaknesses called out by the judge. pub weaknesses: Vec, /// How completely the prompt was addressed, `[0.0, 10.0]`. + #[serde(alias = "completionScore")] pub completion_score: f64, /// Code structure / maintainability, `[0.0, 10.0]`. + #[serde(alias = "codeQualityScore")] pub code_quality_score: f64, /// Combined assessment, `[0.0, 10.0]`. JBench's canonical metric. + #[serde(alias = "overallScore")] pub overall_score: f64, } diff --git a/evals/jbench/tests/types.rs b/evals/jbench/tests/types.rs index 2a8efd02e..fcaa832fb 100644 --- a/evals/jbench/tests/types.rs +++ b/evals/jbench/tests/types.rs @@ -4,9 +4,7 @@ //! and write, and they fail loudly if anyone changes a field's //! `snake_case` name without updating consumers. 
-use jcode_jbench::types::{ - EvalCommit, FileDiff, FileDiffStatus, JudgingResult, -}; +use jcode_jbench::types::{EvalCommit, FileDiff, FileDiffStatus, JudgingResult}; #[test] fn eval_commit_round_trips_through_json() { diff --git a/src/agent/prompting.rs b/src/agent/prompting.rs index ba9719985..6107a314f 100644 --- a/src/agent/prompting.rs +++ b/src/agent/prompting.rs @@ -122,7 +122,6 @@ impl Agent { } } - /// Wrap a step prompt body in `...` tags. /// /// Step prompts are emitted by the harness (not typed by the user), but they diff --git a/src/prompt_placeholders.rs b/src/prompt_placeholders.rs index 68cee139a..635beb8cc 100644 --- a/src/prompt_placeholders.rs +++ b/src/prompt_placeholders.rs @@ -77,7 +77,12 @@ fn truncate_chars(s: &str, max_chars: usize) -> String { /// /// Length caps documented on [`PlaceholderContext`] are enforced here, so /// callers may pass un-truncated input and trust the output to be bounded. -pub fn substitute_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String { +/// +/// This is the **context-driven** substitution path used for built-in +/// Phase 4 placeholders. For user-supplied template bindings (arbitrary +/// `HashMap`), use +/// [`crate::prompt_templates::substitute_placeholders`] instead. 
+pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String { if prompt.is_empty() { return String::new(); } @@ -123,11 +128,8 @@ mod tests { k=[{{KNOWLEDGE_FILES}}] git=[{{GIT_CHANGES}}] \ date=[{{CURRENT_DATE}}] steps=[{{REMAINING_STEPS}}] \ sys=[{{SYSTEM_INFO}}]"; - let out = substitute_placeholders(input, &ctx); - assert_eq!( - out, - "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]" - ); + let out = substitute_context_placeholders(input, &ctx); + assert_eq!(out, "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]"); } #[test] @@ -136,11 +138,11 @@ mod tests { current_date: "2026-05-25".to_string(), ..Default::default() }; - let out = substitute_placeholders("today is {{CURRENT_DATE}}.", &ctx); + let out = substitute_context_placeholders("today is {{CURRENT_DATE}}.", &ctx); assert_eq!(out, "today is 2026-05-25."); // Unrelated placeholder stays empty in the same call. - let out2 = substitute_placeholders( + let out2 = substitute_context_placeholders( "date={{CURRENT_DATE}} steps={{REMAINING_STEPS}}", &ctx, ); @@ -161,7 +163,7 @@ mod tests { {{KNOWLEDGE_FILES}}\n\n## Meta\n\ date={{CURRENT_DATE}} steps={{REMAINING_STEPS}} \ sys={{SYSTEM_INFO}}"; - let out = substitute_placeholders(input, &ctx); + let out = substitute_context_placeholders(input, &ctx); let expected = "## Tree\nsrc/\n lib.rs\n\n## Knowledge\n\ AGENTS.md contents\n\n## Meta\n\ date=2026-05-25 steps=7 sys=linux x86_64"; @@ -176,7 +178,7 @@ mod tests { }; let input = "known={{CURRENT_DATE}} unknown={{NOT_A_REAL_TOKEN}} \ other={{ALSO_BOGUS}}"; - let out = substitute_placeholders(input, &ctx); + let out = substitute_context_placeholders(input, &ctx); assert_eq!( out, "known=2026-05-25 unknown={{NOT_A_REAL_TOKEN}} other={{ALSO_BOGUS}}" @@ -191,7 +193,7 @@ mod tests { file_tree_small: big.clone(), ..Default::default() }; - let out = substitute_placeholders("[{{FILE_TREE_SMALL}}]", &ctx); + let out = substitute_context_placeholders("[{{FILE_TREE_SMALL}}]", &ctx); // Two 
bracket characters plus the cap. assert_eq!(out.chars().count(), FILE_TREE_SMALL_MAX_CHARS + 2); assert!(out.starts_with('[')); diff --git a/src/tui/app/commands.rs b/src/tui/app/commands.rs index 536d50231..4238e65fe 100644 --- a/src/tui/app/commands.rs +++ b/src/tui/app/commands.rs @@ -1925,7 +1925,8 @@ pub(super) fn handle_session_command(app: &mut App, trimmed: &str) -> bool { Ok(out) if out.status.success() => { let _ = std::fs::remove_file(&tmp_path); let url = String::from_utf8_lossy(&out.stdout) - .lines().rfind(|l| l.starts_with("https://")) + .lines() + .rfind(|l| l.starts_with("https://")) .unwrap_or("") .trim() .to_string(); diff --git a/tests/tool_fixtures.rs b/tests/tool_fixtures.rs index 9a7d98e97..6c7fc0318 100644 --- a/tests/tool_fixtures.rs +++ b/tests/tool_fixtures.rs @@ -105,9 +105,10 @@ fn collect_fixtures() -> Vec<(String, Fixture)> { .unwrap_or("") .to_string(); if let Some(needle) = filter.as_deref() - && !stem.contains(needle) { - continue; - } + && !stem.contains(needle) + { + continue; + } let raw = std::fs::read_to_string(&path).expect("read fixture"); let fixture: Fixture = serde_json::from_str(&raw).unwrap_or_else(|e| panic!("parse fixture {}: {}", stem, e)); From b215afab3ceb522e6b4f2c9bd3b72314adb04a23 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Thu, 4 Jun 2026 08:06:45 +0700 Subject: [PATCH 07/22] docs(multi-agent): add master implementation plan (1145 lines, 12 sections) Synthesizes best patterns from 9 reference repos: - AgentPath tree + mailbox (codex) - Tool-based agent delegation (CC) - DAG wave parallelism (oh-my-pi) - Role-based config bundles (opencode + codex) - Team pipeline lifecycle (oh-my-claudecode) - Cost aggregation + ancestry tracking (codebuff) Covers: architecture, types, pseudocode, Rust implementation, CLI commands, config wiring, test cases, benchmarks, rollout --- .omo/plans/multi-agent-master-plan.md | 1145 +++++++++++++++++++++++++ 1 file changed, 1145 insertions(+) create mode 100644 
.omo/plans/multi-agent-master-plan.md diff --git a/.omo/plans/multi-agent-master-plan.md b/.omo/plans/multi-agent-master-plan.md new file mode 100644 index 000000000..87e5ef7cd --- /dev/null +++ b/.omo/plans/multi-agent-master-plan.md @@ -0,0 +1,1145 @@ +# Implementation Plan: Multi-Agent System for jcode +> Generated from research across 9 repos + jcode codebase analysis +> Goal: Full multi-agent orchestration — model-driven delegation, team pipeline, DAG parallelism, agent tree lifecycle + +--- + +## 1. Executive Summary + +jcode currently has swarm visualization infrastructure (TUI, protocol, prompts) but **zero agent spawning/driving logic**. The LLM can talk about swarm helpers in prompts, but there's no actual `agent` tool, no agent tree, no sub-agent lifecycle, and no team pipeline. + +This plan builds a production-grade multi-agent system by synthesizing the best patterns from codex (AgentPath tree + mailbox, proven in Rust), Claude Code (tool-based delegation, the model drives everything), oh-my-pi (DAG wave parallelism), codebuff (LLM-derived pipeline + cost aggregation), and oh-my-claudecode (team lifecycle + file-based shared state). The result is a three-surface system: **model-driven delegation** (LLM calls `agent` tool), **team pipeline** (CLI-driven multi-step workflow), and **batch processing** (programmatic multi-agent jobs). + +--- + +## 2. 
Architecture Decision + +### Chosen Approach: Hybrid Tree + Tool + Wave + +``` +┌─────────────────────────────────────────────────────────┐ +│ AgentControl │ +│ (central registry: tree, threads, names, mailboxes) │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ /root │ │ /root/ │ │ /root/ │ │ +│ │ (user │ │ explorer │ │ worker │ │ +│ │ session) │ │ (read-only) │ │ (execute) │ │ +│ └──────┬───────┘ └──────────────┘ └──────────────┘ │ +│ │ │ +│ ┌──────┴───────┐ │ +│ │ /root/worker │ │ +│ │ /code-review │ │ +│ │ (sub-task) │ │ +│ └──────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +Three delegation modes, one agent tree: + +| Mode | Trigger | Use Case | Parallelism | +|------|---------|----------|-------------| +| **Tool-based** | LLM calls `agent` tool | Model decides to delegate | Sync/async/fork | +| **Team pipeline** | `jcode team` CLI | Plan→PRD→Exec→Verify→Fix | DAG wave | +| **Batch** | `jcode agent batch` CSV | Parallel research/review jobs | FuturesUnordered | + +### Alternatives Considered + +| Approach | Source Repo | Pros | Cons | Decision | +|----------|-------------|------|------|----------| +| AgentPath tree + mailbox | codex | Hierarchical addressing, async decoupling, Rust-native, production-tested | Higher initial complexity | **PRIMARY** — best fit for Rust codebase | +| Tool-based delegation | CC | Model drives everything, simple mental model, proven UX | No automated pipeline | **PRIMARY** — best UX for interactive use | +| DAG wave parallelism | oh-my-pi | Clean dependency resolution, parallel by default | Requires DAG definition upfront | **SECONDARY** — for team pipeline only | +| Centralized orchestrator | codebuff | LLM-pipeline means flexible | Spawning overhead per step | **SECONDARY** — for team pipeline | +| Tmux teams | oh-my-claudecode | Pragmatic, visible | OS-level coupling, fragile | **REFERENCE** — file-based state pattern | +| Single monolithic agent | pi-agent-rust | Simplest, 
zero overhead | No delegation at all | **REJECTED** — doesn't meet goal | +| Protocol-first | opencode | Clean abstraction | Over-engineered for our needs | **REJECTED** — too abstract | + +--- + +## 3. Data Structures & Types + +```rust +// === Core Agent Tree === + +/// Unique path in the agent tree. +/// Examples: "/root", "/root/explorer", "/root/worker/code-review" +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentPath(Arc); + +impl AgentPath { + pub fn root() -> Self { Self("/root".into()) } + pub fn parent(&self) -> Option; + pub fn child(&self, name: &str) -> AgentPath; + pub fn is_descendant_of(&self, ancestor: &AgentPath) -> bool; +} + +/// Agent identity — registered in AgentControl. +#[derive(Debug, Clone)] +pub struct AgentEntry { + pub id: AgentId, // UUID + pub path: AgentPath, // Tree position + pub name: String, // Human-readable nickname (unique pool) + pub role: AgentRole, + pub config: AgentConfig, + pub state: AgentState, + pub created_at: Instant, + pub ancestry: AgentAncestry, // parent_id, ancestor_ids + pub mailbox: Option, +} + +/// Role determines default model, tools, and permissions. +#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum AgentRole { + /// General agent — full tool access, plans and executes + Default, + /// Read-only investigator — grep, read, glob, websearch only + Explorer, + /// Execute known plan — limited tools, no planning + Worker, + /// Orchestrator — delegates subtasks, synthesizes results + Orchestrator, +} + +/// Agent config bundle — inspired by opencode + codex role profiles. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentConfig { + pub model: Option, // None = inherit parent + pub system_prompt: Option, // None = inherit, Some = override + pub tools: AgentToolPolicy, + pub permissions: AgentPermissionBound, + pub max_turns: u32, // Hard stop + pub max_cost: Option, // Cost cap (USD) + pub timeout: Option, // Wall-clock timeout +} + +/// What tools this agent can use. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum AgentToolPolicy { + /// Inherit parent's tool policy + Inherit, + /// Explicit allow list + Allow(HashSet), + /// Inherit + add + Extend(HashSet), + /// No tools (chat-only) + None, +} + +/// Permission boundary — bubble model from CC. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentPermissionBound { + pub max_risk_level: RiskLevel, // Can't exceed this + pub allow_approve: bool, // Can approve own requests + pub pre_approved: Vec, // Always-ok tool calls +} + +// === Mailbox (from codex) === + +/// One-shot channel for agent communication. +type MailboxSender = tokio::sync::oneshot::Sender; +type MailboxReceiver = tokio::sync::oneshot::Receiver; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentMessage { + pub from: AgentPath, + pub kind: AgentMessageKind, + pub payload: serde_json::Value, + pub timestamp: Instant, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum AgentMessageKind { + /// "Do this subtask, report back" + Task { prompt: String, max_turns: u32 }, + /// "Here are the results" + Result { output: String, cost: Option }, + /// "I need more context" + RequestInfo { question: String }, + /// "Here's the info you requested" + Info { data: serde_json::Value }, + /// "Stop what you're doing" + Cancel, +} + +// === Agent spawn tool input/output === + +/// The `agent` tool that the LLM calls. 
+#[derive(Debug, Deserialize)] +pub struct AgentToolInput { + /// Role: "explorer", "worker", "orchestrator", or "default" + pub role: String, + /// What to do + pub prompt: String, + /// Sync (wait), async (fire-and-forget), fork (share prompt cache) + #[serde(default = "default_mode")] + pub mode: AgentSpawnMode, + /// Optional tools to add beyond role defaults + #[serde(default)] + pub extra_tools: Vec, + /// Optional max turns for this sub-agent + #[serde(default = "default_subagent_turns")] + pub max_turns: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub enum AgentSpawnMode { + #[default] + /// Wait for completion, return result + Sync, + /// Fire and forget — results logged but not returned + Async, + /// Spawn with current prompt cache — zero cold start + Fork, +} + +/// What the LLM sees after `agent` tool completes. +#[derive(Debug, Serialize)] +pub struct AgentToolOutput { + pub agent_id: String, + pub agent_path: String, + pub result: Option, // None for async + pub turn_count: u32, + pub cost: Option, + pub timed_out: bool, +} + +// === Agent tree registry === + +/// Central agent tree — thread-safe, tree-addressed. +pub struct AgentControl { + tree: Arc>, + name_pool: Arc>>, + thread_limits: AgentThreadLimits, +} + +struct AgentTreeInner { + agents: HashMap, + parent_children: HashMap>, + next_id: u64, +} + +pub struct AgentThreadLimits { + pub max_depth: u32, // Default: 5 + pub max_siblings: u32, // Default: 10 + pub max_total: u32, // Default: 50 +} + +// === DAG pipeline (from oh-my-pi) === + +/// A plan step in the DAG. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PlanStep { + pub id: String, + pub agent_role: AgentRole, + pub prompt: String, + pub depends_on: Vec, // Step IDs that must complete first + pub timeout: Option, +} + +/// Wave = set of steps that can run in parallel. +pub struct ExecutionWave { + pub wave_index: usize, + pub steps: Vec, +} +``` + +--- + +## 4. 
Pseudocode — Core Algorithm + +### 4a. Spawn Sub-Agent (Tool-Based Delegation) + +``` +FUNCTION spawn_agent(parent_session, input: AgentToolInput): + // 1. Validate + role = RESOLVE_ROLE(input.role) + VALIDATE parent_session can spawn(role) + CHECK AgentControl.thread_limits (depth < max_depth, siblings < max_siblings) + + // 2. Build AgentConfig from role defaults + input overrides + config = AgentConfig { + model: role.default_model ?? parent_session.model, + tools: role.default_tools + input.extra_tools, + permissions: role.default_permissions, + max_turns: input.max_turns, + ... + } + + // 3. Create mailbox + (tx, rx) = oneshot::channel() + + // 4. Register in AgentTree + path = parent_session.path.child(autoname()) + entry = AgentEntry { path, role, config, mailbox: tx, ... } + AgentControl.register(entry) + + // 5. Fire SubagentStart hook + FIRE_HOOK(SubagentStart { parent_path: parent.path, child_path: path, role }) + + // 6. Handle mode: + IF input.mode == Sync: + // Run sub-agent in same task, await result + result = RUN_AGENT_SESSION(config, input.prompt, parent_context) + AgentControl.complete(path) + FIRE_HOOK(SubagentStop { path, result }) + RETURN AgentToolOutput { result, ... } + + ELIF input.mode == Async: + // Spawn separate tokio task, no waiting + task = tokio::spawn(async { + result = RUN_AGENT_SESSION(config, input.prompt, parent_context) + AgentControl.complete(path) + FIRE_HOOK(SubagentStop { path, result }) + }) + RETURN AgentToolOutput { agent_id: path, result: None, ... } + + ELIF input.mode == Fork: + // Share parent's prompt cache, zero cold start + cached_prompt = parent_session.get_prompt_cache() + task = tokio::spawn(async { + result = RUN_AGENT_SESSION(config, input.prompt, + parent_context, cached_prompt) + AgentControl.complete(path) + FIRE_HOOK(SubagentStop { path, result }) + }) + RETURN AgentToolOutput { agent_id: path, result: None, ... } + + END +END +``` + +### 4b. 
Agent Turn Loop (Sub-Agent Runtime) + +``` +FUNCTION run_agent_session(config, prompt, parent_context, cached_prompt?): + // 1. Create isolated session context + session = AgentSession { + config, + context: parent_context.clone(), + prompt_cache: cached_prompt, + turn_count: 0, + accumulated_cost: 0.0, + mailbox: rx from spawn, + } + + // 2. Execute turn loop + WHILE session.turn_count < config.max_turns: + // Check mailbox for parent messages + IF session.mailbox has message: + IF message.kind == Cancel: + RETURN Result { output: "cancelled", ... } + ELIF message.kind == RequestInfo: + SEND response back via oneshot + CONTINUE + + // Normal LLM turn + response = LLM_CALL(session.context) + session.turn_count++ + session.accumulated_cost += response.cost + + // Process tool calls + FOR tool_call in response.tool_calls: + IF tool_call.name == "agent": + // Nested delegation — recursive spawn + sub_result = spawn_agent(session, tool_call.input) + ADD sub_result to session.context + ELSE: + result = EXECUTE_TOOL(tool_call) + ADD result to session.context + + // Check cost cap + IF config.max_cost && session.accumulated_cost > config.max_cost: + RETURN Result { output: "cost limit exceeded", ... } + + // Check if done (no tool calls = final answer) + IF response.tool_calls is empty: + RETURN Result { output: response.text, cost: session.accumulated_cost } + + RETURN Result { output: "max turns reached", ... } +END +``` + +### 4c. Team Pipeline (DAG Wave Execution) + +``` +FUNCTION execute_team_pipeline(steps: Vec): + // 1. Build DAG from depends_on edges + dag = BUILD_DAG(steps) // adjacency list + in-degree count + + // 2. Decompose into topological waves + waves = TOPOLOGICAL_WAVES(dag) + // Wave 0: steps with no dependencies + // Wave 1: steps whose deps are all in wave 0 + // ... + + // 3. 
Execute wave by wave + step_results = Map + + FOR wave in waves: + // Run all steps in this wave in parallel + handles = [] + FOR step in wave: + handle = tokio::spawn(async { + // Inherit context from parent + prev wave results + context = BUILD_CONTEXT(step, step_results) + result = spawn_agent(parent, { + role: step.agent_role, + prompt: step.prompt, + mode: Sync, + }) + // Store result for dependent steps + step_results[step.id] = result + }) + handles.push(handle) + + // Wait for entire wave (fail-one = fail-wave) + FOR handle in handles: + await handle + + // Fire wave-complete hook + FIRE_HOOK(WaveComplete { wave_index: wave.wave_index }) + + RETURN step_results +END +``` + +--- + +## 5. Implementation Code & Modules + +### New Cargo Crate: `jcode-agent-tree` + +``` +crates/jcode-agent-tree/ + Cargo.toml + src/ + lib.rs — re-exports + path.rs — AgentPath type + entry.rs — AgentEntry, AgentConfig, AgentRole + control.rs — AgentControl (registry, thread limits) + mailbox.rs — MailboxSender/Receiver, AgentMessage + serialization.rs — tree save/restore +``` + +### `path.rs` + +```rust +use std::sync::Arc; +use serde::{Serialize, Deserialize}; + +/// Tree-addressed agent path. +/// Always starts with "/root". Examples: +/// "/root" +/// "/root/explorer" +/// "/root/worker/code-review" +#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)] +pub struct AgentPath(Arc); + +impl AgentPath { + pub fn root() -> Self { + Self("/root".into()) + } + + /// Parse from string — validates format. 
+ pub fn parse(s: &str) -> Result { + if !s.starts_with('/') { + return Err(AgentPathError::InvalidFormat); + } + if s == "/" { + return Err(AgentPathError::TooShort); + } + // Must not end with / + if s.ends_with('/') && s.len() > 1 { + return Err(AgentPathError::TrailingSlash); + } + Ok(Self(s.into())) + } + + /// Create child path: /root/foo + "bar" = /root/foo/bar + pub fn child(&self, name: &str) -> Self { + let parent = self.0.as_ref(); + if parent.ends_with('/') { + Self(format!("{}{}", parent, name).into()) + } else { + Self(format!("{}/{}", parent, name).into()) + } + } + + /// Parent path or None if root. + pub fn parent(&self) -> Option { + let s = self.0.as_ref(); + if s == "/root" { + return None; + } + let last_slash = s.rfind('/')?; + if last_slash == 0 { + return Some(Self("/root".into())); + } + Some(Self(s[..last_slash].into())) + } + + /// Depth: /root = 0, /root/explorer = 1 + pub fn depth(&self) -> usize { + self.0.chars().filter(|&c| c == '/').count().saturating_sub(1) + } + + /// Is this path a descendant of ancestor? + pub fn is_descendant_of(&self, ancestor: &AgentPath) -> bool { + let self_s = self.0.as_ref(); + let anc_s = ancestor.0.as_ref(); + self_s.starts_with(anc_s) && self_s.len() > anc_s.len() + && self_s.as_bytes().get(anc_s.len()) == Some(&b'/') + } + + pub fn as_str(&self) -> &str { + self.0.as_ref() + } +} +``` + +### `control.rs` + +```rust +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::{RwLock, Mutex, oneshot}; +use std::time::Instant; + +use crate::path::AgentPath; +use crate::entry::{AgentEntry, AgentRole, AgentConfig, AgentState}; + +/// Maximum thread limits for safety. +const MAX_DEPTH: u32 = 10; +const MAX_SIBLINGS: u32 = 32; +const MAX_TOTAL: u32 = 200; + +/// Central agent tree — thread-safe singleton. 
+pub struct AgentControl { + inner: Arc>, + name_pool: Arc>, + limits: AgentThreadLimits, +} + +struct AgentTreeInner { + agents: HashMap, + parent_children: HashMap>, + next_id: u64, +} + +pub struct AgentThreadLimits { + pub max_depth: u32, + pub max_siblings: u32, + pub max_total: u32, +} + +impl Default for AgentThreadLimits { + fn default() -> Self { + Self { + max_depth: MAX_DEPTH, + max_siblings: MAX_SIBLINGS, + max_total: MAX_TOTAL, + } + } +} + +impl AgentControl { + pub fn new() -> Self { + let inner = AgentTreeInner { + agents: HashMap::new(), + parent_children: HashMap::new(), + next_id: 1, + }; + Self { + inner: Arc::new(RwLock::new(inner)), + name_pool: Arc::new(Mutex::new(NamePool::new())), + limits: AgentThreadLimits::default(), + } + } + + /// Register a new agent in the tree. + /// Returns error if thread limits would be exceeded. + pub async fn register( + &self, + parent_path: &AgentPath, + name: &str, + role: AgentRole, + config: AgentConfig, + mailbox: oneshot::Sender<...>, + ) -> Result { + let mut inner = self.inner.write().await; + + // Check max total + if inner.agents.len() as u32 >= self.limits.max_total { + return Err(AgentControlError::MaxTotalAgents); + } + + // Check depth + let depth = parent_path.depth() + 1; + if depth > self.limits.max_depth { + return Err(AgentControlError::MaxDepth(depth)); + } + + // Check siblings + let siblings = inner.parent_children.get(parent_path) + .map(|v| v.len()) + .unwrap_or(0); + if siblings >= self.limits.max_siblings as usize { + return Err(AgentControlError::MaxSiblings(siblings)); + } + + // Generate unique name + let unique_name = self.name_pool.lock().unwrap() + .allocate(name); + + let path = parent_path.child(&unique_name); + let id = inner.next_id; + + let entry = AgentEntry { + id, + path: path.clone(), + name: unique_name.clone(), + role, + config, + state: AgentState::Spawning, + created_at: Instant::now(), + mailbox, + }; + + inner.agents.insert(path.clone(), entry); + 
inner.parent_children + .entry(parent_path.clone()) + .or_default() + .push(path.clone()); + inner.next_id += 1; + + Ok(path) + } + + /// Find agent by path. + pub async fn get(&self, path: &AgentPath) -> Option { + self.inner.read().await.agents.get(path).cloned() + } + + /// List children of a path. + pub async fn children(&self, path: &AgentPath) -> Vec { + self.inner.read().await + .parent_children.get(path) + .cloned() + .unwrap_or_default() + } + + /// Shutdown an agent and all its descendants (recursive). + pub async fn shutdown_tree(&self, path: &AgentPath) { + let mut inner = self.inner.write().await; + let children = inner.parent_children.get(path).cloned().unwrap_or_default(); + + for child_path in &children { + if let Some(entry) = inner.agents.get(child_path) { + if let Some(tx) = &entry.mailbox { + let _ = tx.send(AgentMessage::shutdown()); + } + } + } + // Remove from parent's children list + if let Some(parent) = path.parent() { + if let Some(siblings) = inner.parent_children.get_mut(&parent) { + siblings.retain(|p| p != path); + } + } + inner.agents.remove(path); + } + + /// Complete an agent (success or failure) + pub async fn complete(&self, path: &AgentPath, state: AgentState) { + let mut inner = self.inner.write().await; + if let Some(entry) = inner.agents.get_mut(path) { + entry.state = state; + } + } + + /// Serialize the agent tree for display. 
+ pub async fn snapshot(&self) -> Vec { + self.inner.read().await.agents.values().cloned().collect() + } +} + +// === Name pool (unique agent nicknames) === + +struct NamePool { + used: HashSet, + counters: HashMap, +} + +impl NamePool { + fn new() -> Self { + Self { + used: HashSet::new(), + counters: HashMap::new(), + } + } + + fn allocate(&mut self, base: &str) -> String { + let counter = self.counters.entry(base.to_string()).or_insert(0); + *counter += 1; + let name = format!("{}-{}", base, *counter); + self.used.insert(name.clone()); + name + } +} +``` + +### Modifications to Existing Files + +#### `crates/jcode-app-core/src/agent/mod.rs` — New `agent` tool + +```rust +/// The `agent` tool — lets the LLM spawn sub-agents. +pub struct AgentTool { + agent_control: Arc, + session_registry: Arc, +} + +#[async_trait] +impl Tool for AgentTool { + fn name(&self) -> &str { "agent" } + fn description(&self) -> &str { + "Spawn a sub-agent to work on a task. Use sync mode to get the result back, \ + async for fire-and-forget, fork to reuse the current prompt cache. \ + Roles: explorer (read-only), worker (execute), orchestrator (plan+delegate)." + } + + async fn execute(&self, input: Value, ctx: ToolContext) -> ToolOutput { + let input: AgentToolInput = serde_json::from_value(input)?; + // Validate role + let role = AgentRole::from_str(&input.role) + .map_err(|_| ToolError::InvalidParam("role"))?; + + // Build config from role defaults + overrides + let config = self.build_config(&ctx, role, &input); + + // Create mailbox + let (tx, rx) = oneshot::channel(); + + // Register in tree + let parent_path = ctx.agent_path(); // from session runtime + let path = self.agent_control.register( + &parent_path, &role.to_string(), role, config, tx + ).await?; + + // Fire hook + fire_hook(HookEvent::SubagentStart { + parent: parent_path.to_string(), + child: path.to_string(), + role: role.to_string(), + }).await; + + // ... spawn session and run ... 
+ } +} +``` + +#### `src/cli/args.rs` — New subcommands + +```rust +pub(crate) enum Command { + // ... existing ... + /// Multi-agent team orchestration + #[command(subcommand)] + Team(TeamCommand), + /// Sub-agent tree management + #[command(subcommand)] + Agent(AgentCommand), +} + +#[derive(Subcommand)] +pub(crate) enum TeamCommand { + /// Start a team pipeline from a plan file + Start { + /// Path to plan file (YAML/TOML) + plan: PathBuf, + /// Number of parallel workers + #[arg(long, default_value = "4")] + workers: u32, + }, + /// Show team status + Status, + /// Stop a running team + Stop { + /// Team ID (from `team start`) + team_id: String, + }, +} + +#[derive(Subcommand)] +pub(crate) enum AgentCommand { + /// List all sub-agents in tree + List, + /// Show agent tree + Tree, + /// Kill a sub-agent by path + Kill { + path: String, + }, + /// Get agent status + Status { + path: String, + }, +} +``` + +#### `src/cli/dispatch.rs` — Route new commands + +```rust +Command::Team(cmd) => { + match cmd { + TeamCommand::Start { plan, workers } => { + let plan = parse_plan_file(&plan)?; + runtime.execute_team_pipeline(plan, workers).await?; + } + TeamCommand::Status => { + let tree = runtime.agent_control().snapshot().await; + // Print formatted table + } + TeamCommand::Stop { team_id } => { + runtime.agent_control() + .shutdown_tree(&AgentPath::parse(&format!("/root/{}", team_id))?) 
+ .await; + } + } +} +``` + +#### Integration into Agent Turn Loop + +In `turn_streaming_mpsc.rs`, the existing soft-interrupt points already provide hooks for sub-agent injection: + +- **Point A (pre-API)**: Check sub-agent mailbox for incoming messages (Cancel, RequestInfo) +- **Point B (post-response)**: Process `agent` tool calls from the model +- **Point C (between tools)**: Check for sub-agent result availability +- **Point D (after all tools)**: Fire SubagentStop hooks, propagate results + +```rust +// In the agent turn loop, after tool call processing: +if tool_call.name == "agent" { + let input: AgentToolInput = serde_json::from_value(tool_call.input)?; + let result = AgentTool::execute(input, ctx).await; + // result goes back as a regular tool result + context.add_tool_result(tool_call.id, result); +} +``` + +--- + +## 6. Configuration & Wiring + +### `~/.jcode/config.toml` — Agent section + +```toml +[agents] +# Max sub-agents in the tree +max_total = 50 +# Max delegation depth +max_depth = 5 +# Max siblings per parent +max_siblings = 10 +# Default agent timeout +default_timeout = "300s" +# Default max turns +default_max_turns = 50 + +[agents.roles.explorer] +model = "claude-sonnet-4-20250514" +tools = ["read", "grep", "glob", "websearch", "web_fetch"] +max_turns = 20 +permissions = { max_risk_level = "read_only", allow_approve = false } + +[agents.roles.worker] +model = "claude-sonnet-4-20250514" +tools = ["read", "write", "edit", "bash", "grep", "glob"] +max_turns = 50 +permissions = { max_risk_level = "standard", allow_approve = false } + +[agents.roles.orchestrator] +model = "claude-opus-4-20250514" +tools = "*" # All available tools +max_turns = 30 +permissions = { max_risk_level = "elevated", allow_approve = true } +``` + +### Env Vars (in `disable-registry` style) + +| Env Var | Effect | +|---------|--------| +| `JCODE_DISABLE_AGENT_TREE=1` | Disable all multi-agent features | +| `JCODE_MAX_AGENTS=10` | Override max_total at process level | +| 
`JCODE_AGENT_TIMEOUT_MS=60000` | Per-agent timeout override | + +### Integration Points Checklist + +| File | Change | Priority | +|------|--------|----------| +| `Cargo.toml` (workspace) | Add `jcode-agent-tree` crate | P0 | +| `crates/jcode-agent-tree/src/lib.rs` | New crate — AgentPath, AgentTree, Mailbox | P0 | +| `crates/jcode-app-core/src/tool/mod.rs` | Register `AgentTool` | P0 | +| `crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs` | Handle `agent` tool calls in turn loop | P0 | +| `src/cli/args.rs` | Add `Team` + `Agent` subcommands | P1 | +| `src/cli/dispatch.rs` | Route team/agent commands | P1 | +| `crates/jcode-base/src/config.rs` | Add `[agents]` config section | P1 | +| `crates/jcode-protocol/src/wire.rs` | Add SubagentStart/Stop events | P1 | +| `crates/jcode-tui/src/tui/app.rs` | Display agent tree in side panel | P2 | +| `crates/jcode-tui/src/tui/ui.rs` | Agent tree widget | P2 | + +--- + +## 7. Repo References + +| Feature Aspect | Repo | File | Link | +|----------------|------|------|------| +| AgentPath tree | codex | cli/kernel/agents/agent_path.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/agent_path.rs | +| Mailbox | codex | cli/kernel/agents/mailbox.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/mailbox.rs | +| AgentControl | codex | cli/kernel/agents/agent_control.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/agent_control.rs | +| Batch CSV | codex | cli/kernel/agents/spawn.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/spawn.rs | +| Agent tool | CC | src/tools/agent.ts | https://github.com/claude-code-best/claude-code/blob/main/src/tools/agent.ts | +| Subagent hooks | CC | src/services/hooks.ts | https://github.com/claude-code-best/claude-code/blob/main/src/services/hooks.ts | +| DAG wave | oh-my-pi | src/agent/swarm/DAGSwarm.ts | https://github.com/can1357/oh-my-pi/blob/main/src/agent/swarm/DAGSwarm.ts | +| EventBus | oh-my-pi | src/agent/EventBus.ts | 
https://github.com/can1357/oh-my-pi/blob/main/src/agent/EventBus.ts | +| Pipeline orchestration | codebuff | src/orchestrator/Buffy.ts | https://github.com/CodebuffAI/codebuff/blob/main/src/orchestrator/Buffy.ts | +| Team pipeline | oh-my-claudecode | src/team/index.ts | https://github.com/Yeachan-Heo/oh-my-claudecode/blob/main/src/team/index.ts | +| Spawn agent | oh-my-openagent | src/agents/agentOrchestration.ts | https://github.com/code-yeongyu/oh-my-openagent/blob/main/src/agents/agentOrchestration.ts | +| Fork subagent | oh-my-claudecode | src/team/agents.ts | https://github.com/Yeachan-Heo/oh-my-claudecode/blob/main/src/team/agents.ts | +| Agent posture gating | oh-my-codex | src/orchestrator/posture.ts | https://github.com/Yeachan-Heo/oh-my-codex/blob/main/src/orchestrator/posture.ts | +| jcode existing swarm TUI | jcode | crates/jcode-tui/src/tui/app.rs | — | +| jcode existing orchestration API | jcode | src/orchestration_api.rs | — | + +--- + +## 8. Test Cases + +### Unit Tests + +```rust +// === AgentPath tests === +#[test] +fn test_agent_path_root() { + let root = AgentPath::root(); + assert_eq!(root.as_str(), "/root"); + assert_eq!(root.depth(), 0); + assert!(root.parent().is_none()); +} + +#[test] +fn test_agent_path_child() { + let root = AgentPath::root(); + let explorer = root.child("explorer"); + assert_eq!(explorer.as_str(), "/root/explorer"); + assert_eq!(explorer.depth(), 1); + assert_eq!(explorer.parent().unwrap().as_str(), "/root"); +} + +#[test] +fn test_agent_path_is_descendant() { + let root = AgentPath::root(); + let worker = root.child("worker"); + let task = worker.child("code-review"); + assert!(task.is_descendant_of(&root)); + assert!(task.is_descendant_of(&worker)); + assert!(!worker.is_descendant_of(&task)); +} + +#[test] +fn test_agent_path_parse_valid() { + let p = AgentPath::parse("/root/explorer").unwrap(); + assert_eq!(p.as_str(), "/root/explorer"); +} + +#[test] +fn test_agent_path_parse_invalid() { + 
assert!(AgentPath::parse("/").is_err()); + assert!(AgentPath::parse("root").is_err()); +} + +// === AgentControl tests === + +#[tokio::test] +async fn test_register_agent() { + let ctrl = AgentControl::new(); + let root = AgentPath::root(); + let (tx, _rx) = oneshot::channel(); + + let path = ctrl.register(&root, "explorer", AgentRole::Explorer, + AgentConfig::default(), tx).await.unwrap(); + + assert!(path.as_str().starts_with("/root/explorer-")); + assert!(ctrl.get(&path).await.is_some()); +} + +#[tokio::test] +async fn test_max_depth_enforced() { + let ctrl = AgentControl::new(); + let mut path = AgentPath::root(); + for i in 0..12 { // max_depth = 10 + let (tx, _rx) = oneshot::channel(); + let result = ctrl.register(&path, "deep", AgentRole::Worker, + AgentConfig::default(), tx).await; + if i >= 10 { + assert!(result.is_err()); + } else { + path = result.unwrap(); + } + } +} + +#[tokio::test] +async fn test_shutdown_tree() { + let ctrl = AgentControl::new(); + let root = AgentPath::root(); + let (tx1, _rx1) = oneshot::channel(); + let (tx2, _rx2) = oneshot::channel(); + let p1 = ctrl.register(&root, "a", AgentRole::Explorer, + AgentConfig::default(), tx1).await.unwrap(); + let p2 = ctrl.register(&p1, "b", AgentRole::Worker, + AgentConfig::default(), tx2).await.unwrap(); + + ctrl.shutdown_tree(&root).await; + assert!(ctrl.get(&p1).await.is_none()); + assert!(ctrl.get(&p2).await.is_none()); +} + +// === AgentTool tests === + +#[tokio::test] +async fn test_agent_tool_spawn_sync() { + // Setup: create session, register AgentTool, call with input + let tool = AgentTool::new(agent_control, session_registry); + let input = serde_json::json!({ + "role": "explorer", + "prompt": "Check if Cargo.toml exists", + "mode": "sync" + }); + let ctx = ToolContext::test(); + let output = tool.execute(input, ctx).await; + assert!(output.result.is_some()); + assert!(output.turn_count > 0); +} + +#[tokio::test] +async fn test_agent_tool_invalid_role() { + let tool = 
AgentTool::new(agent_control, session_registry); + let input = serde_json::json!({ + "role": "superhero", // Invalid + "prompt": "Do something" + }); + let result = tool.execute(input, ToolContext::test()).await; + assert!(result.is_err()); +} +``` + +### Integration Tests + +```rust +#[tokio::test] +async fn test_subagent_result_propagates_to_parent() { + // 1. Start parent session via orchestration API + // 2. Parent calls `agent` tool with sync mode + // 3. Sub-agent runs, does some work, returns result + // 4. Verify parent's next turn includes sub-agent result + todo!("End-to-end: spawn parent → parent spawns child → child returns → parent sees result"); +} + +#[tokio::test] +async fn test_agent_tree_persistence() { + // 1. Create agent tree with multiple agents + // 2. Serialize to JSON + // 3. Deserialize + // 4. Verify all paths and entries match + todo!("Agent tree save/restore round-trip"); +} + +#[tokio::test] +async fn test_team_pipeline_dag_wave() { + // 1. Define 4-step DAG: step2 depends on step1, step3 on step1, step4 on step2+3 + // 2. Execute pipeline + // 3. Verify wave order: wave0=[step1], wave1=[step2,step3], wave2=[step4] + // 4. Verify all results present + todo!("DAG execution respects topological order"); +} +``` + +--- + +## 9. 
Benchmarks + +| Metric | Baseline (no multi-agent) | Target | How to Measure | +|--------|---------------------------|--------|----------------| +| Sub-agent spawn latency | N/A | < 100ms (in-process) | `time` before/after `register()` call | +| Sub-agent LLM first-token | N/A | Same as parent (fork) + 500ms (sync) | Measure TTFT of sub-agent vs parent | +| Memory per sub-agent | N/A | < 50MB baseline + 10MB per active agent | `alloc` profiling | +| Agent tree — 100 agents | N/A | Lookup < 1µs, register < 10µs | Criterion bench | +| DAG wave — 20 steps / 4 waves | N/A | Total < serial time / 3 | Integration timer | +| Cost tracking overhead | N/A | < 0.1% of total API cost | Differential measurement | + +--- + +## 10. Migration / Rollout + +**Phase 1 — Foundation (estimate: 1-2 weeks)** +- New crate `jcode-agent-tree` with AgentPath, AgentControl, Mailbox +- Unit tests for tree operations +- No agent tool yet — infrastructure only +- **Risk**: None (new crate, no existing code touched) + +**Phase 2 — Agent Tool (estimate: 1 week)** +- `AgentTool` implementation: sync + async + fork modes +- Integration into agent turn loop +- Wire hooks (SubagentStart/SubagentStop) to existing hook system +- **Risk**: Medium — turn loop changes must not break single-agent mode + +**Phase 3 — CLI + Config (estimate: 1 week)** +- `jcode agent list/tree/kill/status` commands +- `jcode team start/status/stop` commands +- `[agents]` config section in config.toml +- **Risk**: Low — CLI and config are additive + +**Phase 4 — Team Pipeline + Batch (estimate: 1 week)** +- DAG pipeline executor (plan file → waves → results) +- Batch CSV agent spawning +- TUI agent tree visualization +- **Risk**: Low — builds on Phase 1-3 foundation + +### Feature Flag +All multi-agent functionality gated behind `JCODE_DISABLE_AGENT_TREE` kill-switch (from disable-env system). When disabled, `agent` tool returns "multi-agent disabled" error, team CLI commands error out, and agent tree stays empty. 
+ +--- + +## 11. Known Limitations & Future Work + +- [ ] **Cross-process sub-agents**: Current design is in-process only. Future: sub-agents as separate `jcode` processes via the protocol layer. +- [ ] **Agent checkpoint/resume**: Sub-agents that survive parent restart — requires session persistence. +- [ ] **Prompt cache sharing (Fork)**: Full fork mode requires the LLM provider to support prompt cache snapshots. Phase 1 fork = copy context (not true cache sharing). +- [ ] **Inter-agent streaming**: Sub-agents can only communicate via mailbox messages (discrete), not streaming. Future: SSE-based streaming between agents. +- [ ] **Cost optimization**: No sub-agent cost optimization yet (e.g., cheaper model for explorer). +- [ ] **Agent governance**: No per-user agent quotas, no team-based agent pools. +- [ ] **Swarm replay export**: jcode already has `export_swarm_video()` in the TUI — tie this into agent tree history. + +--- + +## 12. Success Criteria Checklist + +- [ ] `AgentPath` type supports hierarchical addressing, parent/child traversal, depth checks +- [ ] `AgentControl` enforces thread limits (depth, siblings, total) +- [ ] Mailbox-based communication works: parent sends task, agent receives, agent sends result, parent receives +- [ ] `agent` tool call spawns a sub-agent with correct role defaults +- [ ] Sync mode: parent waits, gets result with turn count + cost +- [ ] Async mode: parent continues immediately, result logged +- [ ] SubagentStart/SubagentStop hooks fire correctly +- [ ] `jcode agent list` shows all active agents with paths +- [ ] `jcode agent kill /root/worker-1` terminates agent + children +- [ ] `jcode agent tree` prints hierarchical tree view +- [ ] `jcode team start` reads plan file, executes waves, reports results +- [ ] `jcode team stop <team-id>` cancels all running agents in the team +- [ ] DAG pipeline executes steps in correct topological wave order +- [ ] Cost aggregation: parent's cost includes all children's costs +- [ ] 
`JCODE_DISABLE_AGENT_TREE=1` disables all multi-agent features +- [ ] Existing single-agent behavior unchanged (regression test pass) +- [ ] 50 concurrent agents don't overwhelm the runtime From 8feff0a2c072cdac62cc158aae76e8343b93d46b Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Thu, 4 Jun 2026 12:27:10 +0700 Subject: [PATCH 08/22] =?UTF-8?q?fix(pr-313):=20apply=20review-swarm=20fix?= =?UTF-8?q?es=20=E2=80=94=20feature=20gate,=20per-model=20timeout,=20field?= =?UTF-8?q?=20caps,=20serde=20strictness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Gate agent_runner behind 'agent-runner' feature flag - Add KNOWLEDGE_FILES_MAX_CHARS = 100_000 constant with truncation - Add #[serde(deny_unknown_fields)] to AgentDefinition - Per-model timeout in judge_with_three_models (join_all with individual timeouts) - Fix integer truncation in meta_analyze_impl avg_duration - Remove stray merge conflict marker in src/lib.rs --- Cargo.lock | 128 ++++++++++++++++--- crates/jcode-agent-runtime/src/definition.rs | 17 ++- evals/jbench/Cargo.toml | 4 + evals/jbench/src/agent_runner.rs | 22 +++- evals/jbench/src/bin/jbench.rs | 55 +++++--- evals/jbench/src/judge.rs | 40 +++--- evals/jbench/src/lib.rs | 2 + src/lib.rs | 1 - src/prompt_placeholders.rs | 24 +++- 9 files changed, 228 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b7397afe3..19624196e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1469,7 +1469,7 @@ dependencies = [ "bitflags 1.3.2", "core-foundation 0.9.4", "core-graphics-types", - "foreign-types", + "foreign-types 0.5.0", "libc", ] @@ -2809,6 +2809,15 @@ dependencies = [ "ttf-parser 0.25.1", ] +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared 0.1.1", +] + [[package]] name = "foreign-types" version = "0.5.0" @@ 
-2816,7 +2825,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" dependencies = [ "foreign-types-macros", - "foreign-types-shared", + "foreign-types-shared 0.3.1", ] [[package]] @@ -2830,6 +2839,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "foreign-types-shared" version = "0.3.1" @@ -4332,6 +4347,22 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.8.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.19" @@ -5218,17 +5249,6 @@ dependencies = [ "serde", ] -[[package]] -name = "jcode-logging" -version = "0.1.0" -dependencies = [ - "chrono", - "jcode-core", - "jcode-storage", - "serde_json", - "tokio", -] - [[package]] name = "jcode-jbench" version = "0.1.0" @@ -5237,13 +5257,24 @@ dependencies = [ "clap", "futures", "jcode-agent-runtime", - "reqwest", + "reqwest 0.12.28", "serde", "serde_json", "tempfile", "tokio", ] +[[package]] +name = "jcode-logging" +version = "0.1.0" +dependencies = [ + "chrono", + "jcode-core", + "jcode-storage", + "serde_json", + "tokio", +] + [[package]] name = "jcode-memory-types" version = "0.1.0" @@ -6340,7 +6371,7 @@ dependencies = [ "bitflags 2.11.1", "block", "core-graphics-types", - "foreign-types", + "foreign-types 0.5.0", "log", "objc", "paste", @@ -6444,6 +6475,23 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "native-tls" +version = "0.2.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe 0.2.1", + "openssl-sys", + "schannel", + "security-framework 3.6.0", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -6863,6 +6911,31 @@ dependencies = [ "pathdiff", ] +[[package]] +name = "openssl" +version = "0.10.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" +dependencies = [ + "bitflags 2.11.1", + "cfg-if", + "foreign-types 0.3.2", + "libc", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "openssl-probe" version = "0.1.6" @@ -6875,6 +6948,18 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "openssl-sys" +version = "0.9.116" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -7988,10 +8073,12 @@ dependencies = [ "http-body-util", "hyper 1.8.1", "hyper-rustls 0.27.7", + "hyper-tls", "hyper-util", "js-sys", "log", "mime", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -8002,6 +8089,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.4", "tokio-util", "tower", @@ -9529,6 +9617,16 @@ dependencies = [ "syn 2.0.117", ] 
+[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs index 4adeeabbd..6304a66ed 100644 --- a/crates/jcode-agent-runtime/src/definition.rs +++ b/crates/jcode-agent-runtime/src/definition.rs @@ -46,6 +46,7 @@ pub const DEFAULT_AGENT_VERSION: &str = "0.1.0"; /// Intentionally `Clone` so the runtime can hand each spawn its own copy /// without locking the registry. Definitions are small (a few KB at most). #[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct AgentDefinition { // ----------------------------------------------------------------- // Identity @@ -572,20 +573,18 @@ mod tests { #[test] fn toml_unknown_field_is_rejected() { - // We DO NOT use `#[serde(deny_unknown_fields)]` because forward-compat - // matters when older binaries read newer TOML. But typo'd known fields - // are silently ignored — that's a UX hazard. Document the tradeoff - // here: if this becomes a problem, switch to deny_unknown_fields and - // version the schema explicitly. - // - // For now, this test just verifies unknown fields don't crash. 
let src = r#" id = "ok" display_name = "ok" unknown_future_field = "value" "#; - let d: AgentDefinition = toml::from_str(src).expect("parse"); - d.validate().expect("validate"); + let err = toml::from_str::<AgentDefinition>(src).unwrap_err(); + assert!( + err.to_string().contains("unknown field") + || err.to_string().contains("unknown") + || err.to_string().contains("`unknown_future_field`"), + "expected denial of unknown field, got: {err}" + ); } // ----------------------------------------------------------------- diff --git a/evals/jbench/Cargo.toml b/evals/jbench/Cargo.toml index b9db6899a..6a360ffc8 100644 --- a/evals/jbench/Cargo.toml +++ b/evals/jbench/Cargo.toml @@ -22,6 +22,10 @@ futures = "0.3" reqwest = { version = "0.12", features = ["json"] } clap = { version = "4", features = ["derive", "env"] } +[features] +default = [] +agent-runner = [] + [dev-dependencies] serde_json = "1" tempfile = "3" diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs index 3763ee4c2..d9391cc20 100644 --- a/evals/jbench/src/agent_runner.rs +++ b/evals/jbench/src/agent_runner.rs @@ -112,12 +112,30 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> { let mut trace_lines = Vec::new(); let reader = BufReader::new(stdout); let mut lines_stream = reader.lines(); - loop { + let timed_out = loop { let line = timeout(timeout_duration, lines_stream.next_line()).await; match line { Ok(Ok(Some(l))) => trace_lines.push(l), - _ => break, + Ok(Ok(None)) => break false, // EOF — clean exit + Ok(Err(_)) => break false, // read error + Err(_) => break true, // timeout } + }; + + if timed_out { + // Kill the child process so it doesn't become an orphan + let _ = child.kill().await; + // Consume the exit status after kill + let _ = child.wait().await; + return Ok(EvalRun { + commit_sha: String::new(), + prompt: config.prompt, + diff: extract_diff_from_repo(&config.repo_path).unwrap_or_default(), + judging: Default::default(), + cost_usd: 0.0, + duration_ms: 
start.elapsed().as_millis() as u64, + error: Some("Timed out waiting for jcode subprocess".to_owned()), + }); } let status = child diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs index 2e54b50e7..160b84d26 100644 --- a/evals/jbench/src/bin/jbench.rs +++ b/evals/jbench/src/bin/jbench.rs @@ -7,8 +7,9 @@ use std::path::PathBuf; use anyhow::{Context, Result}; use clap::{Parser, Subcommand}; +#[cfg(feature = "agent-runner")] +use jcode_jbench::agent_runner::AgentRunConfig; use jcode_jbench::{ - agent_runner::AgentRunConfig, judge::{JudgeConfig, judge_with_three_models}, lessons::{LessonsConfig, append_lessons_to_file, extract_lessons}, types::{AgentEvalResults, EvalDataV2, EvalRun}, @@ -119,15 +120,20 @@ async fn main() -> Result<()> { max_turns, timeout_secs, } => { - run_impl( - &eval_file, - &agent_id, - &output_dir, - jcode_binary.as_ref(), - max_turns, - timeout_secs, - ) - .await?; + #[cfg(feature = "agent-runner")] + { + run_impl( + &eval_file, + &agent_id, + &output_dir, + jcode_binary.as_ref(), + max_turns, + timeout_secs, + ) + .await?; + } + #[cfg(not(feature = "agent-runner"))] + anyhow::bail!("'jbench run' requires the 'agent-runner' feature. 
Enable with: cargo build --features agent-runner"); } Command::Judge { runs_dir, @@ -156,6 +162,7 @@ async fn gen_evals_impl(_input: &PathBuf, _output: &PathBuf) -> Result<()> { todo_step("Phase 5.2: read commit list, fetch each SHA, render EvalDataV2 JSON") } +#[cfg(feature = "agent-runner")] async fn run_impl( eval_file: &PathBuf, agent_id: &str, @@ -182,7 +189,7 @@ async fn run_impl( let config = AgentRunConfig { agent_id: agent_id.to_owned(), prompt: commit.prompt.clone(), - repo_path: output_dir.join(&commit.id), // per-commit working dir + repo_path: output_dir.join(&commit.id), max_turns, timeout_secs, env: eval_data.env.clone(), @@ -190,15 +197,23 @@ async fn run_impl( ..Default::default() }; - let result = tk_timeout( + let result = match tk_timeout( Duration::from_secs(timeout_secs), jcode_jbench::agent_runner::run_agent_in_repo(config), ) .await - .into_iter() - .next() - .unwrap_or_else(|| { - Ok(jcode_jbench::types::EvalRun { + { + Ok(Ok(run)) => run, + Ok(Err(err)) => EvalRun { + commit_sha: commit.sha.clone(), + prompt: commit.prompt.clone(), + diff: String::new(), + judging: Default::default(), + cost_usd: 0.0, + duration_ms: 0, + error: Some(format!("Agent error: {err:#}")), + }, + Err(_elapsed) => EvalRun { commit_sha: commit.sha.clone(), prompt: commit.prompt.clone(), diff: String::new(), @@ -206,8 +221,8 @@ async fn run_impl( cost_usd: 0.0, duration_ms: 0, error: Some("Timed out waiting for run_agent_in_repo".to_owned()), - }) - })?; + }, + }; let run_file = output_dir.join(format!("{}.run.json", commit.id)); let json = serde_json::to_string_pretty(&result).context("failed to serialize EvalRun")?; @@ -262,7 +277,9 @@ async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Resu .sum::() / all_runs.len() as f64; let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::() / all_runs.len() as f64; - let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::() / all_runs.len() as u64; + let avg_duration = 
(all_runs.iter().map(|r| r.duration_ms as f64).sum::() + / all_runs.len() as f64) + .round() as u64; let summary = AgentEvalResults { agent_id: "unknown".to_owned(), diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs index 8f9437f47..0d4d36f72 100644 --- a/evals/jbench/src/judge.rs +++ b/evals/jbench/src/judge.rs @@ -375,29 +375,37 @@ pub async fn judge_with_three_models( let timeout_duration = Duration::from_secs(config.timeout_secs.unwrap_or(JUDGE_TIMEOUT_SECS)); + // Each judge gets its own timeout so a slow model doesn't starve the others. let judge_futures: Vec<_> = config .models .iter() .map(|model| { - run_single_judge( - model, - &prompt, - &config.api_base, - &config.api_key, - config.anthropic_api_base.as_deref(), - config.anthropic_api_key.as_deref(), - http, - ) + let http = http.clone(); + let prompt = prompt.clone(); + async move { + timeout( + timeout_duration, + run_single_judge( + model, + &prompt, + &config.api_base, + &config.api_key, + config.anthropic_api_base.as_deref(), + config.anthropic_api_key.as_deref(), + &http, + ), + ) + .await + .ok() + .and_then(|r| r.ok()) + } }) .collect(); - // Run all three judges in parallel with an overall timeout - let valid: Vec = timeout(timeout_duration, futures::future::join_all(judge_futures)) + let valid: Vec = futures::future::join_all(judge_futures) .await - .ok() - .into_iter() // IntoIterator>> - .flatten() // Iterator> - .filter_map(|r| r.ok()) + .into_iter() + .filter_map(|r| r) .collect(); if valid.len() < MIN_JUDGE_SUCCESS_COUNT { @@ -417,7 +425,7 @@ pub async fn judge_with_three_models( // Median analysis — sort by overall_score and pick the middle let mut sorted = valid.clone(); - sorted.sort_by(|a, b| a.overall_score.partial_cmp(&b.overall_score).unwrap()); + sorted.sort_by(|a, b| a.overall_score.partial_cmp(&b.overall_score).unwrap_or(std::cmp::Ordering::Equal)); let median_idx = sorted.len() / 2; let median = &sorted[median_idx]; diff --git a/evals/jbench/src/lib.rs 
b/evals/jbench/src/lib.rs index 36363dc5b..48860cdcb 100644 --- a/evals/jbench/src/lib.rs +++ b/evals/jbench/src/lib.rs @@ -13,11 +13,13 @@ #![forbid(unsafe_code)] +#[cfg(feature = "agent-runner")] pub mod agent_runner; pub mod judge; pub mod lessons; pub mod types; +#[cfg(feature = "agent-runner")] pub use agent_runner::AgentRunConfig; pub use judge::JudgeConfig; pub use lessons::LessonsConfig; diff --git a/src/lib.rs b/src/lib.rs index e3039b5c8..dad287a05 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,7 +31,6 @@ pub mod model_failover; pub mod model_routing; pub mod orchestration_api; pub mod prefix_cache_stable; -<<<<<<< HEAD pub mod process_memory; pub mod process_title; pub mod prompt; diff --git a/src/prompt_placeholders.rs b/src/prompt_placeholders.rs index 635beb8cc..386201dae 100644 --- a/src/prompt_placeholders.rs +++ b/src/prompt_placeholders.rs @@ -11,7 +11,7 @@ //! //! - `{{FILE_TREE_SMALL}}` — truncated project tree, max 2500 chars. //! - `{{FILE_TREE}}` — fuller project tree, max 10000 chars. -//! - `{{KNOWLEDGE_FILES}}` — concatenated knowledge / context files (no limit). +//! - `{{KNOWLEDGE_FILES}}` — concatenated knowledge / context files, max 100000 chars. //! - `{{GIT_CHANGES}}` — `git diff` / status summary, max 30000 chars. //! - `{{CURRENT_DATE}}` — ISO `YYYY-MM-DD` date string. //! - `{{REMAINING_STEPS}}` — remaining-step counter (u32, decimal). @@ -32,6 +32,9 @@ pub const FILE_TREE_MAX_CHARS: usize = 10_000; /// Maximum char count retained for [`PlaceholderContext::git_changes`]. pub const GIT_CHANGES_MAX_CHARS: usize = 30_000; +/// Maximum char count retained for [`PlaceholderContext::knowledge_files`]. +pub const KNOWLEDGE_FILES_MAX_CHARS: usize = 100_000; + /// Container for values that can be substituted into prompt templates. /// /// All `String` fields default to empty and `remaining_steps` defaults to 0. @@ -45,7 +48,8 @@ pub struct PlaceholderContext { /// Fuller project file tree. 
Truncated to [`FILE_TREE_MAX_CHARS`] chars /// during substitution. pub file_tree: String, - /// Concatenated knowledge/context files. No length limit is applied. + /// Concatenated knowledge/context files. Truncated to [`KNOWLEDGE_FILES_MAX_CHARS`] + /// chars during substitution. pub knowledge_files: String, /// Git diff / status summary. Truncated to [`GIT_CHANGES_MAX_CHARS`] /// chars during substitution. @@ -89,6 +93,7 @@ pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) - let file_tree_small = truncate_chars(&ctx.file_tree_small, FILE_TREE_SMALL_MAX_CHARS); let file_tree = truncate_chars(&ctx.file_tree, FILE_TREE_MAX_CHARS); + let knowledge_files = truncate_chars(&ctx.knowledge_files, KNOWLEDGE_FILES_MAX_CHARS); let git_changes = truncate_chars(&ctx.git_changes, GIT_CHANGES_MAX_CHARS); let remaining_steps = if ctx.remaining_steps == 0 { String::new() @@ -101,7 +106,7 @@ pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) - let replacements: [(&str, &str); 7] = [ ("{{FILE_TREE_SMALL}}", file_tree_small.as_str()), ("{{FILE_TREE}}", file_tree.as_str()), - ("{{KNOWLEDGE_FILES}}", ctx.knowledge_files.as_str()), + ("{{KNOWLEDGE_FILES}}", knowledge_files.as_str()), ("{{GIT_CHANGES}}", git_changes.as_str()), ("{{CURRENT_DATE}}", ctx.current_date.as_str()), ("{{REMAINING_STEPS}}", remaining_steps.as_str()), @@ -199,4 +204,17 @@ mod tests { assert!(out.starts_with('[')); assert!(out.ends_with(']')); } + + #[test] + fn knowledge_files_truncated_when_exceeds_cap() { + let big: String = "k".repeat(KNOWLEDGE_FILES_MAX_CHARS + 5000); + let ctx = PlaceholderContext { + knowledge_files: big.clone(), + ..Default::default() + }; + let out = substitute_context_placeholders("[{{KNOWLEDGE_FILES}}]", &ctx); + assert_eq!(out.chars().count(), KNOWLEDGE_FILES_MAX_CHARS + 2); + assert!(out.starts_with('[')); + assert!(out.ends_with(']')); + } } From 60a61f0b7d6e6b9ca7c9b3398f5f16740b250f7e Mon Sep 17 00:00:00 2001 From: 
quangdang46 Date: Fri, 5 Jun 2026 00:20:12 +0700 Subject: [PATCH 09/22] fix(merge): reconcile src/lib.rs with master layout - Revert src/lib.rs to master (remove stale 36-module list) - Move prompt_placeholders.rs from src/ into crates/jcode-app-core/src/ - Add pub mod prompt_placeholders to jcode-app-core/src/lib.rs - Resolve Cargo.lock merge conflict (hyper/hyper-rustls versions) Build verified: cargo check --bin jcode passes. Tests: jcode-agent-runtime 55 pass, jcode-jbench 3 pass. --- Cargo.lock | 123 +++++++++++++++++- crates/jcode-app-core/src/lib.rs | 1 + .../src}/prompt_placeholders.rs | 0 src/lib.rs | 21 --- 4 files changed, 121 insertions(+), 24 deletions(-) rename {src => crates/jcode-app-core/src}/prompt_placeholders.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index be5c1ef76..2a9a22c9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1468,7 +1468,7 @@ dependencies = [ "bitflags 1.3.2", "core-foundation 0.9.4", "core-graphics-types", - "foreign-types", + "foreign-types 0.5.0", "libc", ] @@ -2842,6 +2842,15 @@ dependencies = [ "ttf-parser 0.25.1", ] +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared 0.1.1", +] + [[package]] name = "foreign-types" version = "0.5.0" @@ -2849,7 +2858,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" dependencies = [ "foreign-types-macros", - "foreign-types-shared", + "foreign-types-shared 0.3.1", ] [[package]] @@ -2863,6 +2872,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "foreign-types-shared" version = "0.3.1" @@ 
-4369,6 +4384,22 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.10.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.20" @@ -4903,8 +4934,12 @@ dependencies = [ name = "jcode-agent-runtime" version = "0.1.0" dependencies = [ + "anyhow", + "serde", + "serde_json", "thiserror 1.0.69", "tokio", + "toml", ] [[package]] @@ -5250,6 +5285,21 @@ dependencies = [ "serde", ] +[[package]] +name = "jcode-jbench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "jcode-agent-runtime", + "reqwest 0.12.28", + "serde", + "serde_json", + "tempfile", + "tokio", +] + [[package]] name = "jcode-logging" version = "0.1.0" @@ -6360,7 +6410,7 @@ dependencies = [ "bitflags 2.11.1", "block", "core-graphics-types", - "foreign-types", + "foreign-types 0.5.0", "log", "objc", "paste", @@ -6464,6 +6514,23 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "native-tls" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe 0.2.1", + "openssl-sys", + "schannel", + "security-framework 3.7.0", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -6874,6 +6941,31 @@ dependencies = [ "pathdiff", ] +[[package]] +name = "openssl" +version = "0.10.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" +dependencies = [ + "bitflags 2.11.1", + "cfg-if", + "foreign-types 0.3.2", + "libc", + "openssl-macros", + "openssl-sys", +] + 
+[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "openssl-probe" version = "0.1.6" @@ -6886,6 +6978,18 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "openssl-sys" +version = "0.9.116" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -8026,10 +8130,12 @@ dependencies = [ "http-body-util", "hyper 1.10.1", "hyper-rustls 0.27.9", + "hyper-tls", "hyper-util", "js-sys", "log", "mime", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -8040,6 +8146,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.4", "tokio-util", "tower", @@ -9560,6 +9667,16 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" diff --git a/crates/jcode-app-core/src/lib.rs b/crates/jcode-app-core/src/lib.rs index b4cb41d24..1e23d83ee 100644 --- a/crates/jcode-app-core/src/lib.rs +++ b/crates/jcode-app-core/src/lib.rs @@ -40,6 +40,7 @@ pub mod notifications; pub mod overnight; pub mod perf; pub mod prompt_templates; +pub mod prompt_placeholders; pub mod replay; pub mod restart_snapshot; pub mod sandbox; diff --git a/src/prompt_placeholders.rs 
b/crates/jcode-app-core/src/prompt_placeholders.rs similarity index 100% rename from src/prompt_placeholders.rs rename to crates/jcode-app-core/src/prompt_placeholders.rs diff --git a/src/lib.rs b/src/lib.rs index dad287a05..101cfdade 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,27 +31,6 @@ pub mod model_failover; pub mod model_routing; pub mod orchestration_api; pub mod prefix_cache_stable; -pub mod process_memory; -pub mod process_title; -pub mod prompt; -pub mod prompt_placeholders; -pub mod prompt_templates; -pub mod protocol; -pub mod provider; -pub mod provider_catalog; -pub mod registry; -pub mod replay; -pub mod restart_snapshot; -pub mod runtime_memory_log; -pub mod safety; -pub mod sandbox; -pub mod scoped_models; -pub mod server; -pub mod session; -pub mod setup_hints; -pub mod side_panel; -pub mod sidecar; -pub mod skill; pub mod skill_disable; pub mod skill_distillation; pub mod theme; From d294249891d7f140576da0df625af94a896bf4e1 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 07:04:30 +0700 Subject: [PATCH 10/22] =?UTF-8?q?docs(review):=20comprehensive=20PR=20#313?= =?UTF-8?q?=20review=20=E2=80=94=20jcode=20vs=209=20repos=20comparison=20t?= =?UTF-8?q?ables=20+=20roadmap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit feature-planning skill analysis across codebuff, codex, claude-code, opencode, oh-my-pi, oh-my-openagent, oh-my-claudecode, pi-agent-rust, oh-my-codex. 
Includes: - 9 per-dimension comparison tables (schema, registry, routing, lifecycle, permission, tool, eval, prompt, session) - Top 5 gaps ranked by ROI - Wire-up plan for SafetySystem + AgentDefinition.permissionMode - Phase roadmap (Phase 1 → Phase 5) - 5 actionable issues with severity and fix suggestions --- .omo/plans/pr-313-review.md | 235 ++++++++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 .omo/plans/pr-313-review.md diff --git a/.omo/plans/pr-313-review.md b/.omo/plans/pr-313-review.md new file mode 100644 index 000000000..725144ef0 --- /dev/null +++ b/.omo/plans/pr-313-review.md @@ -0,0 +1,235 @@ +# PR #313 Review: jcode Multi-Agent Foundation vs 9 Reference Repos + +> **Date**: 2026-06-05 +> **Reviewer**: Claude Opus 4.8 (feature-planning skill) +> **PR**: #313 — `experimental/multi-agent-foundation` → `master` +> **Scope**: +5775 / -94 lines, 28 files, 7 commits + +--- + +## 1. Per-Dimension Comparison Tables + +### 1A. Agent Definition Schema + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Format** | TOML | TS imperative + `handleSteps` | N/A (TUI) | Markdown + YAML frontmatter | Markdown + YAML | TS imperative | Markdown + YAML | Markdown + YAML | Rust runtime | N/A | +| **Schema validation** | `serde(deny_unknown_fields)` | Zod runtime | TS types | Zod (lazy) | Effect `Schema.Class` | TS types | YAML parse | YAML parse | serde derive | N/A | +| **`model` field** | optional (`model_override` + `prefer_tier`) | **required** | N/A | optional (`inherit`) | optional | **required** | optional | optional | N/A | env var stack | +| **`reasoning`/`effort`** | `ReasoningEffort` enum (4 levels) | `reasoningOptions.effort` (5 levels) + `max_tokens` | N/A | 
`effort` enum + integer | `variant` per-model | `Effort` enum | `ModelV2.VariantID` | N/A | N/A | N/A | +| **`outputMode`** | `last_message`/`all_messages`/`structured_output` | identical | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **`tool_names`** | whitelist (deny-by-default) | whitelist + MCP servers | built-in list | `tools` + `disallowedTools` | optional from registry | `loadMode` + `tier` | tool registry | tool allowlist | optional | N/A | +| **`spawnable_agents`** | whitelist | `publisher/agent@version` | N/A | N/A (model drives) | N/A | N/A | N/A | N/A | N/A | N/A | +| **`inherit_parent_system_prompt`** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **`include_message_history`** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **`handleSteps`** | N/A (Phase 2) | ✅ Generator | N/A | N/A | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A | +| **`permissionMode`** | N/A | N/A | N/A | ✅ per-agent | ✅ per-agent | `ToolTier` per-tool | N/A | N/A | N/A | N/A | +| **`maxTurns`** | N/A | N/A | N/A | ✅ per-agent | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A | +| **`isolation`** | N/A | N/A | N/A | `worktree`/`remote` | N/A | N/A | N/A | `worktree` (git) | N/A | N/A | +| **`mcpServers`** | N/A | ✅ per-agent | N/A | ✅ per-agent | N/A | N/A | N/A | ✅ MCP server | N/A | N/A | +| **`hooks`** | N/A | N/A | N/A | ✅ per-agent | N/A | N/A | N/A | N/A | N/A | N/A | +| **`memory` scope** | N/A | N/A | N/A | `user`/`project`/`local` | N/A | N/A | N/A | N/A | N/A | N/A | + +--- + +### 1B. 
Agent Registry / Discovery + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Discovery paths** | 3-tier: project > user > builtin | `.agents/` local | N/A | `.claude/agents/*.md` + settings | `.opencode/agents/*.md` + `modes/` | N/A | N/A | N/A | N/A | N/A | +| **Priority order** | project > user > builtin | built-in first | N/A | built-in first | primary source glob | N/A | N/A | N/A | N/A | N/A | +| **Filename == id check** | ✅ enforced | ❌ | N/A | ❌ | ❌ | N/A | N/A | N/A | N/A | N/A | +| **Non-fatal errors** | ✅ collected for `doctor` | throws | N/A | log + skip | throws | N/A | N/A | N/A | N/A | N/A | +| **On-disk format** | TOML | TS | N/A | Markdown | Markdown | N/A | N/A | N/A | N/A | N/A | +| **Reload at runtime** | not yet | no | N/A | cache + plugin invalidation | `update` API | N/A | N/A | N/A | N/A | N/A | + +--- + +### 1C. 
Model Routing / Tier + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Approach** | env-var slots + session inherit | OpenRouter catalog | `JCODE_ROUTING_*` env vars | `inherit` | `ModelV2.parse` | dynamic `ModelV2` | `ModelResolutionPipeline` (5 stages) | via Claude session | direct | env var stack | +| **Slot/tier concept** | `Routine`/`Thinking` | no (literal model id) | `ROUTINE`+`THINKING`+`THRESHOLD` | no | variant per-provider | model string | catalog aliases | no | no | default + fallback | +| **Fallback chain** | 3-level: override > env > session | OpenRouter routing | N/A | N/A | provider fallback | `resolveModelWithFallback` | 5-stage pipeline | N/A | per-provider | 2-tier fallback | +| **Predefined catalog** | **no** (intentional) | yes (100+ models) | no | no | yes (`models-dev.ts`) | no | yes (60+ models) | no | no | no | +| **Provider abstraction** | no (single OAuth) | OpenRouter | multi-provider | Anthropic | multi-provider | 40+ providers | multi-provider | Anthropic | 15+ providers | Codex only | + +--- + +### 1D. 
Agent Lifecycle / Spawn + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Agent tree** | N/A | N/A | ✅ `AgentPath` + `ThreadSpawnEdgeStatus` | `team_name` (1:1 TaskList) | `mode: subagent/primary/all` | runtime | `boulder-state` (worktrees) | `team jobs` | session tree | N/A | +| **Spawn tool** | N/A (schema only) | `spawn_agents` | `SpawnAgent`/`WaitAgent`/`CloseAgent`/`SendMessage`/`AssignAgentTask` | `Agent` tool + `TeamCreate` | delegation via tools | N/A | `delegate_task` | `omc_team_start` CLI | N/A | N/A | +| **Message bus** | N/A | output return | `InterAgentCommunication` + delivery edges | `SendMessage` tool | N/A | N/A | `shared-state.ts` | `omc-team-state.ts` | N/A | N/A | +| **Parallel execution** | N/A | `Promise.all` | DAG traversal | concurrent teammates | concurrent | DAG wave | sequential | sequential | N/A | N/A | +| **Worktree isolation** | N/A | N/A | N/A | ✅ `isolation: worktree/remote` | N/A | N/A | N/A | ✅ git worktree cleanup | N/A | N/A | +| **`maxTurns`** | N/A | N/A | N/A | ✅ per-agent | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A | +| **Job persistence** | N/A | N/A | ✅ SQLite `agent_jobs` | team config JSON | N/A | N/A | `boulder-state` file | `OMC_JOBS_DIR` artifacts | session JSONL | N/A | + +--- + +### 1E. 
Permission / Safety + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Permission system** | **existing** `SafetySystem` + `ActionTier` | none | sandbox | `PermissionMode` per-agent (default/auto/ask/deny) | `PermissionV2.Ruleset` (allow/deny/ask) per-agent | `ToolTier` (read/write/exec) + approval modes | MCP allowlist | plugin/team scopes | none | `OMX_*` env controls | +| **Per-agent policy** | **gap** — tool whitelist only | tool whitelist | N/A | ✅ `permissionMode` field | ✅ `permissions` array | ✅ `tier` on each tool | N/A | N/A | N/A | N/A | +| **Classification levels** | 2 (auto/permission) | N/A | N/A | 4 (default/auto/ask/deny) | 3 (allow/deny/ask) | 3 (read/write/exec) | N/A | N/A | N/A | N/A | +| **Auto-approve for sub-agents** | **not wired** | via `handleSteps` | N/A | via `permissionMode` | N/A | tool-tier-based | N/A | N/A | N/A | N/A | +| **TUI permission flow** | ✅ `PermissionsApp` (existing) | none | none | none (CLI only) | N/A | N/A | N/A | N/A | N/A | N/A | +| **`disallowedTools`** | N/A | N/A | N/A | ✅ | N/A | `hidden` field | N/A | N/A | N/A | N/A | + +--- + +### 1F. 
Tool Execution + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Tool registry** | whitelist strings in TOML | typed `ToolName` union | hard-coded | `getTools()` config | `ToolsProvider` | `AgentTool` interface | tool discovery | MCP servers | typed `Tool` trait | sparkshell bridge | +| **Concurrency control** | N/A | N/A | N/A | N/A | N/A | ✅ `shared`/`exclusive` | N/A | N/A | N/A | N/A | +| **`loadMode`** | N/A | N/A | N/A | N/A | N/A | ✅ `essential`/`discoverable` | N/A | N/A | N/A | N/A | +| **`deferrable`** | N/A | ✅ | N/A | N/A | N/A | ✅ | N/A | N/A | N/A | N/A | +| **`nonAbortable`** | N/A | N/A | N/A | N/A | N/A | ✅ | N/A | N/A | N/A | N/A | +| **Validation** | runtime (registry) | Zod args | sandbox | Zod | Effect Schema | Zod (`zodToWireSchema`) | Zod | Zod | typed Rust | typed Rust | +| **`beforeToolCall` hook** | N/A | N/A | N/A | N/A | N/A | ✅ (block/transform) | N/A | N/A | N/A | N/A | +| **`afterToolCall` hook** | N/A | N/A | N/A | N/A | N/A | ✅ (override) | N/A | N/A | N/A | N/A | +| **Structured output** | ✅ `OutputMode::StructuredOutput` | ✅ `set_output` + `outputSchema` | N/A | N/A | N/A | `set_output` | N/A | N/A | N/A | N/A | + +--- + +### 1G. 
Eval / Benchmark + +| Aspect | **jcode PR #313** | codebuff (BuffBench) | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|---------------------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Approach** | git-commit reconstruction (scaffold) | git-commit reconstruction (production) | e2e + bench scripts | N/A | N/A | LSP+DAP benchmarks | smoke tests | integration tests | N/A | sparkshell benchmark | +| **Multi-judge** | ✅ 3 judges + per-model timeout | 2 judges (20 min shared) | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **Median scoring** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **Lessons extractor** | ✅ scaffold | ✅ production | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **`meta-analyze`** | ✅ implemented | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **Feature flag** | ✅ `agent-runner` gate | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | + +--- + +### 1H. 
Prompt Utilities + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Placeholder substitution** | ✅ `prompt_placeholders.rs` (pure utility) | `PLACEHOLDER` constants | N/A | prompt templates | mode prompts | atlas prompts | `prompts-core` package | `atlas-prompts.ts` | N/A | `build_summary_prompt()` | +| **Supported tokens** | 7 tokens with length caps | `PLACEHOLDER` enum | N/A | env vars + dynamic | template engine | context-based | variant resolver | markdown | N/A | shell output | +| **Length caps** | ✅ 2500/10k/30k/100k chars | `FILE_TREE_PROMPT` only | N/A | N/A | N/A | provider-specific | model caps | N/A | N/A | N/A | +| **System reminder wrap** | ✅ `wrap_as_system_reminder()` | `` tags | N/A | injection | N/A | N/A | prompt-injection.ts | prompt-injection.ts | N/A | N/A | +| **Frontmatter parse** | N/A (TOML) | N/A | N/A | ✅ `parseAgentToolsFromFrontmatter` | ✅ `ConfigMarkdown.parseOption` | N/A | `shared/frontmatter.ts` | N/A | N/A | N/A | + +--- + +### 1I. 
Session / Persistence + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Session format** | N/A (existing) | in-memory | SQLite + JSONL | config JSON | SQLite (Effect) | runtime state | `boulder-state` file | `OMC_JOBS_DIR` JSON | **JSONL + SHA-256 chain** | N/A | +| **Branching/history** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ tree structure | N/A | +| **Indexed search** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ `SessionIndex` | N/A | +| **Chain integrity** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ SHA-256 per-entry | N/A | + +--- + +## 2. Top 5 Gaps (ROI-ranked) + +| Rank | Gap | Effort | Impact | Source repos | Concrete action | +|------|-----|--------|--------|--------------|-----------------| +| **1** | `permissionMode` per-agent — wire `SafetySystem` into `AgentDefinition` | 2-3 days | 🔴 Critical (security) | claude-code (`PermissionMode`), opencode (`allow/deny/ask` per action+resource) | Add `permission_mode: Option<PermissionMode>` to `AgentDefinition`; during tool execution, call `SafetySystem.classify()` then check agent's override; default = inherit from parent | +| **2** | `Agent` tool — model-driven spawn | 1-2 weeks | 🔴 Critical (core feature) | codex (`SpawnAgent`/`WaitAgent`), claude-code (`AgentTool` + `TeamCreateTool`), codebuff (`spawn_agents`) | Phase 2: add `agent` tool that LLM calls; wire `spawnable_agents` whitelist; implement `AgentPath` tree from codex | +| **3** | `maxTurns` per-agent | 1 day | 🟡 Important (runaway prevention) | claude-code, opencode | Add `max_turns: Option` to `AgentDefinition`; runtime checks after each turn | +| **4** | `handleSteps` — programmatic agents | 1 week | 🟡 Important (flexibility) | codebuff (`handleSteps` Generator), oh-my-pi 
(`beforeToolCall`/`afterToolCall`) | Phase 2: add optional `handle_steps` field with Rust async generator or callback approach | +| **5** | Tool concurrency (`shared`/`exclusive`) | 2-3 days | 🟢 Nice-to-have (perf) | oh-my-pi (`AgentTool.concurrency`) | Add `concurrency` field to tool definition; runtime scheduler respects exclusive locks | + +--- + +## 3. Wire-up Plan: SafetySystem + AgentDefinition.permissionMode + +### Current state +- `SafetySystem` (crates/jcode-base/src/safety.rs): `ActionTier` = `AutoAllowed | RequiresPermission` +- `AgentDefinition` (crates/jcode-agent-runtime/src/definition.rs): `tool_names` whitelist only +- `PermissionsApp` (crates/jcode-tui/src/tui/permissions.rs): TUI approval flow exists + +### Proposed addition + +```rust +// crates/jcode-agent-runtime/src/definition.rs + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PermissionMode { + /// Inherit approval from parent agent (default for sub-agents). + Inherit, + /// Auto-approve all tool calls for this agent. + AutoApprove, + /// Always ask user for permission. + Ask, + /// Deny all tool calls (read-only agent). + Deny, +} + +impl Default for PermissionMode { + fn default() -> Self { PermissionMode::Inherit } +} + +// Add to AgentDefinition: +// pub permission_mode: Option<PermissionMode>, +``` + +### Resolution algorithm (runtime) + +``` +fn resolve_permission(action, tool_name, agent_def, parent_approval): + mode = agent_def.permission_mode.unwrap_or(Inherit) + match mode: + Deny → block + AutoApprove → approve + Ask → prompt user via PermissionsApp + Inherit → use parent_approval (or session-level classify) +``` + +### Migration path +- Default `None` = `Inherit` = existing behavior unchanged +- TOML agents opt-in: `permission_mode = "auto_approve"` for leaf agents +- Phase 2: auto-wire `bash` tool in `basher.toml` with `permission_mode = "auto_approve"` + +--- + +## 4. 
Roadmap: Phases After PR #313 + +| Phase | Scope | Dependencies | Estimated | +|-------|-------|--------------|-----------| +| **Phase 1** (this PR) | AgentDefinition + tier + registry + JBench scaffold | — | ✅ Done | +| **Phase 1.5** | `permissionMode` wire-up (SafetySystem + AgentDefinition) | Phase 1 | 2-3 days | +| **Phase 2** | Agent runtime engine: spawn, parent-child tree, `Agent` tool, `AgentPath` | Phase 1 | 2-3 weeks | +| **Phase 2.5** | `handleSteps` (programmatic agents), tool concurrency | Phase 2 | 1-2 weeks | +| **Phase 3** | Team pipeline (claude-code-style `TeamCreateTool`) | Phase 2 | 1 week | +| **Phase 4** | JBench production (full `pick-commits` → `gen-evals` → `run` → `judge` → `lessons` pipeline) | Phase 2 | 1-2 weeks | +| **Phase 5** | Multi-provider support (extend tier to per-provider catalogs) | Phase 2 | 1 week | + +--- + +## 5. PR #313 Strengths + +1. **Best-in-class agent discovery** — 3-tier priority, filename==id enforcement, non-fatal error collection +2. **Correct model routing philosophy** — slots not catalog, matches single-OAuth reality +3. **JBench exceeds BuffBench** — 3 judges with per-model timeout (vs BuffBench's shared 20-min timeout) +4. **Rust-idiomatic crate structure** — feature gates, clean separation, `serde(deny_unknown_fields)` +5. **Comprehensive documentation** — every module has a doc comment explaining WHY, not just WHAT + +--- + +## 6. 
PR #313 Actionable Issues + +| # | Issue | Severity | File | Fix | +|---|-------|----------|------|-----| +| 1 | `extract_diff_from_repo` uses sync `std::process::Command` in async fn | Medium | evals/jbench/src/agent_runner.rs:195 | Use `tokio::task::spawn_blocking` | +| 2 | `todo_step` calls `std::process::exit(0)` for unimplemented commands | Low | evals/jbench/src/bin/jbench.rs | Use non-zero exit code or `todo!()` | +| 3 | `file-picker.toml` missing explicit `inherit_parent_system_prompt = false` | Low | .jcode/agents/file-picker.toml | Add for consistency with `basher.toml` | +| 4 | `edition = "2024"` in jbench may cause toolchain issues if workspace uses 2021 | Low | evals/jbench/Cargo.toml | Verify workspace edition consistency | +| 5 | `meta_analyze_impl` reads all `.run.json` files into memory | Low | evals/jbench/src/bin/jbench.rs:268 | Streaming deserializer for large runs | From f84cc127b13a718436866b48b9afa02b1755df99 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 07:24:17 +0700 Subject: [PATCH 11/22] feat(agent-runtime): add per-agent permissionMode field Add PermissionMode enum to jcode-agent-runtime (mirrors dcg_core::Mode): - Default: rule-based classification (legacy AUTO_ALLOWED list) - AcceptEdits: file ops auto-allowed, network/spawn prompt - Plan: read-only, writes denied without prompting - DontAsk: allow-listed tools pass, never prompt - BypassPermissions: skip all evaluation - Auto: LLM-based classifier decides per call Add permission_mode: Option to AgentDefinition. When None, agent inherits session-global mode. Update sample TOML agents: - basher: accept-edits (auto-approve bash) - editor: accept-edits (auto-approve file ops) - file-picker: plan (read-only) - code-reviewer: plan (read-only) Tests: 54 unit + 6 integration = 60 passed, 0 failed. Wire-up plan: at spawn time, convert PermissionMode to dcg_core::Mode and pass to SubagentTool/SessionToolPolicy for per-agent override. 
--- .jcode/agents/basher.toml | 4 + .jcode/agents/code-reviewer.toml | 3 + .jcode/agents/editor.toml | 4 + .jcode/agents/file-picker.toml | 3 + crates/jcode-agent-runtime/src/definition.rs | 20 ++ crates/jcode-agent-runtime/src/lib.rs | 2 + crates/jcode-agent-runtime/src/permission.rs | 187 ++++++++++++++++++ crates/jcode-agent-runtime/src/registry.rs | 1 + .../tests/sample_agents.rs | 24 ++- 9 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 crates/jcode-agent-runtime/src/permission.rs diff --git a/.jcode/agents/basher.toml b/.jcode/agents/basher.toml index c726b51db..6c933b65d 100644 --- a/.jcode/agents/basher.toml +++ b/.jcode/agents/basher.toml @@ -37,6 +37,10 @@ version = "0.1.0" prefer_tier = "routine" reasoning = "minimal" +# Basher runs terminal commands — auto-approve file ops so the parent +# doesn't need to re-approve every bash call. Network/spawn still prompt. +permission_mode = "accept-edits" + include_message_history = false inherit_parent_system_prompt = false output_mode = "last_message" diff --git a/.jcode/agents/code-reviewer.toml b/.jcode/agents/code-reviewer.toml index 22b7e5e38..9734537db 100644 --- a/.jcode/agents/code-reviewer.toml +++ b/.jcode/agents/code-reviewer.toml @@ -31,6 +31,9 @@ inherit_parent_system_prompt = true include_message_history = true output_mode = "last_message" +# Reviewer is read-only — plan mode denies writes without prompting. +permission_mode = "plan" + tool_names = [ "read", "grep", diff --git a/.jcode/agents/editor.toml b/.jcode/agents/editor.toml index 28aed4d01..4ab1e83d8 100644 --- a/.jcode/agents/editor.toml +++ b/.jcode/agents/editor.toml @@ -38,6 +38,10 @@ version = "0.1.0" prefer_tier = "thinking" reasoning = "medium" +# Editor makes code edits — auto-approve file operations so the parent +# agent doesn't need to re-approve every str_replace/write call. 
+permission_mode = "accept-edits" + inherit_parent_system_prompt = true include_message_history = true output_mode = "all_messages" diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml index b6365a84d..c4a719abb 100644 --- a/.jcode/agents/file-picker.toml +++ b/.jcode/agents/file-picker.toml @@ -26,6 +26,9 @@ reasoning = "minimal" include_message_history = false output_mode = "last_message" +# File picker is read-only — plan mode denies writes without prompting. +permission_mode = "plan" + # Tools required: read project file tree + glob fallback. Whitelist is # checked at runtime against the tool registry; unknown tools fail loudly # rather than silently degrading. diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs index 6304a66ed..1f1561255 100644 --- a/crates/jcode-agent-runtime/src/definition.rs +++ b/crates/jcode-agent-runtime/src/definition.rs @@ -33,6 +33,7 @@ //! Phase 2); for now agents are pure prompted. use crate::output::OutputMode; +use crate::permission::PermissionMode; use crate::reasoning::ReasoningEffort; use crate::tier::ModelTier; @@ -152,6 +153,24 @@ pub struct AgentDefinition { #[serde(default)] pub include_message_history: bool, + // ----------------------------------------------------------------- + // Permissions + // ----------------------------------------------------------------- + /// Optional permission mode override for this agent's tool execution. + /// When set, the agent runs under this permission mode instead of the + /// session-global mode (set via CLI `--permission-mode` or cycled in + /// the TUI). + /// + /// Useful for: + /// - Restricting sub-agents: reviewer runs in `Plan` (read-only). + /// - Elevating leaf agents: `basher` runs in `AcceptEdits`. + /// - Background agents: CI runner uses `DontAsk`. + /// + /// If `None`, the agent inherits the session's current permission mode. + /// See `permission.rs` for the full mode descriptions. 
+ #[serde(default)] + pub permission_mode: Option<PermissionMode>, + // ----------------------------------------------------------------- // Output // ----------------------------------------------------------------- @@ -407,6 +426,7 @@ mod tests { spawner_prompt: None, inherit_parent_system_prompt: false, include_message_history: false, + permission_mode: None, output_mode: OutputMode::LastMessage, output_schema: None, } diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs index 80979a845..818082509 100644 --- a/crates/jcode-agent-runtime/src/lib.rs +++ b/crates/jcode-agent-runtime/src/lib.rs @@ -25,6 +25,7 @@ pub mod definition; pub mod output; +pub mod permission; pub mod reasoning; pub mod registry; pub mod signals; @@ -40,6 +41,7 @@ pub use signals::{ // New public surface (Phase 0). pub use definition::{AgentDefinition, DEFAULT_AGENT_VERSION, DefinitionError, ReferenceError}; pub use output::OutputMode; +pub use permission::PermissionMode; pub use reasoning::ReasoningEffort; pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind}; pub use tier::{ModelTier, ResolutionSource, resolve_model, resolve_model_with_source}; diff --git a/crates/jcode-agent-runtime/src/permission.rs b/crates/jcode-agent-runtime/src/permission.rs new file mode 100644 index 000000000..41db95a72 --- /dev/null +++ b/crates/jcode-agent-runtime/src/permission.rs @@ -0,0 +1,187 @@ +//! Per-agent permission mode for tool execution safety. +//! +//! Mirrors `dcg_core::Mode` but is intentionally self-contained in the +//! dependency-light `jcode-agent-runtime` crate. The runtime converts +//! this enum to `dcg_core::Mode` at spawn time. +//! +//! ## Design +//! +//! The permission mode controls how tool calls are evaluated during an +//! agent's execution: +//! +//! - `Default` — rule-based: read-only tools auto-allowed, writes prompt. +//! - `AcceptEdits` — file operations auto-allowed, network/spawn prompt. +//! 
- `Plan` — read-only: writes denied without prompting. +//! - `DontAsk` — allow-listed tools pass, never prompt. +//! - `BypassPermissions` — skip all evaluation. +//! - `Auto` — LLM-based classifier decides per call. +//! +//! When `AgentDefinition.permission_mode` is `None`, the agent inherits +//! the session's current permission mode (set via CLI `--permission-mode` +//! or cycled at runtime in the TUI). + +use serde::{Deserialize, Serialize}; +use std::fmt; + +/// Per-agent permission mode for tool execution safety. +/// +/// This enum intentionally mirrors `dcg_core::Mode` (from the +/// `destructive_command_guard` crate) so that `jcode-agent-runtime` +/// does not need to depend on `dcg-core` directly. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum PermissionMode { + /// Rule-based classification using the legacy `AUTO_ALLOWED` list. + /// Read-only tools auto-allowed; writes require permission. + Default, + /// File operations (edit, write, patch) auto-allowed. Network, + /// spawn, and irreversible operations still prompt. + AcceptEdits, + /// Read-only mode: write operations denied without prompting. + /// Useful for reviewer/observer agents. + Plan, + /// Only allow-listed tools pass; never prompt the user. + /// Useful for unattended/CI agents. + DontAsk, + /// Skip all permission evaluation. Use with caution. + BypassPermissions, + /// LLM-based classifier decides per tool call. + Auto, +} + +impl Default for PermissionMode { + fn default() -> Self { + PermissionMode::Default + } +} + +impl PermissionMode { + /// String representation matching the wire format used by TOML + /// definitions and the CLI. 
+ pub fn as_str(&self) -> &'static str { + match self { + PermissionMode::Default => "default", + PermissionMode::AcceptEdits => "accept-edits", + PermissionMode::Plan => "plan", + PermissionMode::DontAsk => "dont-ask", + PermissionMode::BypassPermissions => "bypass-permissions", + PermissionMode::Auto => "auto", + } + } + + /// Parse a permission mode from a string, accepting common variants. + pub fn parse(s: &str) -> Option<Self> { + match s.trim().to_ascii_lowercase().as_str() { + "default" => Some(PermissionMode::Default), + "acceptedits" | "accept_edits" | "accept-edits" => Some(PermissionMode::AcceptEdits), + "plan" => Some(PermissionMode::Plan), + "dontask" | "dont_ask" | "dont-ask" => Some(PermissionMode::DontAsk), + "bypasspermissions" | "bypass_permissions" | "bypass-permissions" => { + Some(PermissionMode::BypassPermissions) + } + "auto" => Some(PermissionMode::Auto), + _ => None, + } + } +} + +impl fmt::Display for PermissionMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_accepts_common_variants() { + assert_eq!( + PermissionMode::parse("default"), + Some(PermissionMode::Default) + ); + assert_eq!( + PermissionMode::parse("AcceptEdits"), + Some(PermissionMode::AcceptEdits) + ); + assert_eq!( + PermissionMode::parse("accept_edits"), + Some(PermissionMode::AcceptEdits) + ); + assert_eq!( + PermissionMode::parse("accept-edits"), + Some(PermissionMode::AcceptEdits) + ); + assert_eq!( + PermissionMode::parse("plan"), + Some(PermissionMode::Plan) + ); + assert_eq!( + PermissionMode::parse("DONTASK"), + Some(PermissionMode::DontAsk) + ); + assert_eq!( + PermissionMode::parse("dont_ask"), + Some(PermissionMode::DontAsk) + ); + assert_eq!( + PermissionMode::parse("bypass_permissions"), + Some(PermissionMode::BypassPermissions) + ); + assert_eq!( + PermissionMode::parse("bypass-permissions"), + Some(PermissionMode::BypassPermissions) + ); + 
assert_eq!( + PermissionMode::parse("auto"), + Some(PermissionMode::Auto) + ); + assert_eq!(PermissionMode::parse(""), None); + assert_eq!(PermissionMode::parse("nonsense"), None); + } + + #[test] + fn default_is_default() { + assert_eq!(PermissionMode::default(), PermissionMode::Default); + } + + #[test] + fn serde_roundtrip_kebab_case() { + // TOML wire format uses kebab-case per serde(rename_all) + let s = serde_json::to_string(&PermissionMode::AcceptEdits).unwrap(); + assert_eq!(s, "\"accept-edits\""); + let back: PermissionMode = serde_json::from_str("\"accept-edits\"").unwrap(); + assert_eq!(back, PermissionMode::AcceptEdits); + } + + #[test] + fn serde_roundtrip_all_variants() { + for variant in [ + PermissionMode::Default, + PermissionMode::AcceptEdits, + PermissionMode::Plan, + PermissionMode::DontAsk, + PermissionMode::BypassPermissions, + PermissionMode::Auto, + ] { + let json = serde_json::to_string(&variant).unwrap(); + let back: PermissionMode = serde_json::from_str(&json).unwrap(); + assert_eq!(back, variant); + } + } + + #[test] + fn display_matches_as_str() { + for variant in [ + PermissionMode::Default, + PermissionMode::AcceptEdits, + PermissionMode::Plan, + PermissionMode::DontAsk, + PermissionMode::BypassPermissions, + PermissionMode::Auto, + ] { + assert_eq!(format!("{variant}"), variant.as_str()); + } + } +} diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs index 82f182b2d..a1a41a79d 100644 --- a/crates/jcode-agent-runtime/src/registry.rs +++ b/crates/jcode-agent-runtime/src/registry.rs @@ -373,6 +373,7 @@ mod tests { spawner_prompt: None, inherit_parent_system_prompt: false, include_message_history: false, + permission_mode: None, output_mode: OutputMode::LastMessage, output_schema: None, }; diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs index ee6ee7034..d2bf77d4d 100644 --- 
a/crates/jcode-agent-runtime/tests/sample_agents.rs +++ b/crates/jcode-agent-runtime/tests/sample_agents.rs @@ -9,7 +9,9 @@ use std::path::PathBuf; -use jcode_agent_runtime::{AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind}; +use jcode_agent_runtime::{ + AgentRegistry, ModelTier, OutputMode, PermissionMode, ReasoningEffort, SourceKind, +}; /// Path to the project-root sample agents directory, relative to the /// crate manifest. Deliberately constructed via `CARGO_MANIFEST_DIR` so @@ -71,6 +73,11 @@ fn file_picker_sample_has_expected_shape() { assert_eq!(agent.output_mode, OutputMode::LastMessage); assert!(agent.tool_names.iter().any(|t| t == "read")); assert!(agent.spawnable_agents.is_empty(), "leaf agent"); + assert_eq!( + agent.permission_mode, + Some(PermissionMode::Plan), + "file-picker is read-only (plan mode)" + ); // Resolve model with no env vars set should fall back to the // session's current model. @@ -106,6 +113,11 @@ fn code_reviewer_uses_inherit_parent_system_prompt_for_cache_hit() { "reviewer needs context of the change it's reviewing" ); assert_eq!(agent.prefer_tier, Some(ModelTier::Thinking)); + assert_eq!( + agent.permission_mode, + Some(PermissionMode::Plan), + "code-reviewer is read-only (plan mode)" + ); } #[test] @@ -151,6 +163,11 @@ fn basher_sample_has_expected_shape() { assert_eq!(agent.output_mode, OutputMode::LastMessage); assert_eq!(agent.tool_names, vec!["bash"]); assert!(agent.spawnable_agents.is_empty(), "leaf agent"); + assert_eq!( + agent.permission_mode, + Some(PermissionMode::AcceptEdits), + "basher auto-approves file ops" + ); // No tier env var set → resolve falls back to the session model. 
let resolved = agent.resolve_model("session-model"); @@ -207,4 +224,9 @@ fn editor_sample_has_expected_shape() { ); } assert!(agent.spawnable_agents.is_empty(), "leaf agent"); + assert_eq!( + agent.permission_mode, + Some(PermissionMode::AcceptEdits), + "editor auto-approves file ops" + ); } From 844fc412a361c359ad9b89b5da8afbf62c594898 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 08:11:57 +0700 Subject: [PATCH 12/22] feat(agent-runtime): add max_turns field to AgentDefinition Add optional max_turns: Option field that limits the number of agentic turns an agent may execute before being stopped. Prevents runaway agents from consuming unbounded tokens/time. When None, the agent has no per-agent turn limit (session global limit still applies). Tests: 56 unit + 6 integration = 62 passed, 0 failed. --- crates/jcode-agent-runtime/src/definition.rs | 31 ++++++++++++++++++++ crates/jcode-agent-runtime/src/registry.rs | 1 + 2 files changed, 32 insertions(+) diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs index 1f1561255..f148c0c16 100644 --- a/crates/jcode-agent-runtime/src/definition.rs +++ b/crates/jcode-agent-runtime/src/definition.rs @@ -171,6 +171,15 @@ pub struct AgentDefinition { #[serde(default)] pub permission_mode: Option, + /// Optional maximum number of agentic turns this agent may execute + /// before being stopped. Prevents runaway agents from consuming + /// unbounded tokens/time. + /// + /// If `None`, the agent has no per-agent turn limit (the session + /// global limit still applies). 
+ #[serde(default)] + pub max_turns: Option, + // ----------------------------------------------------------------- // Output // ----------------------------------------------------------------- @@ -427,6 +436,7 @@ mod tests { inherit_parent_system_prompt: false, include_message_history: false, permission_mode: None, + max_turns: None, output_mode: OutputMode::LastMessage, output_schema: None, } @@ -690,4 +700,25 @@ mod tests { _ => unreachable!(), } } + + #[test] + fn toml_max_turns_parses() { + let src = r#" + id = "test" + display_name = "Test" + max_turns = 50 + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + assert_eq!(d.max_turns, Some(50)); + } + + #[test] + fn toml_max_turns_none_when_absent() { + let src = r#" + id = "test" + display_name = "Test" + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + assert_eq!(d.max_turns, None); + } } diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs index a1a41a79d..cab80c514 100644 --- a/crates/jcode-agent-runtime/src/registry.rs +++ b/crates/jcode-agent-runtime/src/registry.rs @@ -374,6 +374,7 @@ mod tests { inherit_parent_system_prompt: false, include_message_history: false, permission_mode: None, + max_turns: None, output_mode: OutputMode::LastMessage, output_schema: None, }; From 6d8ecbc6b8056224514a254be5306b4107bc0b86 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 08:12:29 +0700 Subject: [PATCH 13/22] chore(agents): add max_turns to sample TOML agents - basher: max_turns = 10 (quick shell commands) - file-picker: max_turns = 5 (find files, done fast) - code-reviewer: max_turns = 15 (review needs more context) - editor: no limit (complex edits may need many turns) --- .jcode/agents/basher.toml | 1 + .jcode/agents/code-reviewer.toml | 1 + .jcode/agents/file-picker.toml | 1 + 3 files changed, 3 insertions(+) diff --git a/.jcode/agents/basher.toml b/.jcode/agents/basher.toml index 6c933b65d..da53e515a 100644 
--- a/.jcode/agents/basher.toml +++ b/.jcode/agents/basher.toml @@ -40,6 +40,7 @@ reasoning = "minimal" # Basher runs terminal commands — auto-approve file ops so the parent # doesn't need to re-approve every bash call. Network/spawn still prompt. permission_mode = "accept-edits" +max_turns = 10 include_message_history = false inherit_parent_system_prompt = false diff --git a/.jcode/agents/code-reviewer.toml b/.jcode/agents/code-reviewer.toml index 9734537db..7d44e08ba 100644 --- a/.jcode/agents/code-reviewer.toml +++ b/.jcode/agents/code-reviewer.toml @@ -33,6 +33,7 @@ output_mode = "last_message" # Reviewer is read-only — plan mode denies writes without prompting. permission_mode = "plan" +max_turns = 15 tool_names = [ "read", diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml index c4a719abb..f958b7c4e 100644 --- a/.jcode/agents/file-picker.toml +++ b/.jcode/agents/file-picker.toml @@ -28,6 +28,7 @@ output_mode = "last_message" # File picker is read-only — plan mode denies writes without prompting. permission_mode = "plan" +max_turns = 5 # Tools required: read project file tree + glob fallback. 
Whitelist is # checked at runtime against the tool registry; unknown tools fail loudly From 2d7a020c50043f55a96fd2fd4df8dcfb51bc2420 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 08:15:04 +0700 Subject: [PATCH 14/22] fix(jbench): address PR #313 review issues - extract_diff_from_repo: wrap sync std::process::Command in tokio::task::spawn_blocking to avoid blocking the async runtime - todo_step: use exit code 2 (not implemented) instead of 0 (success) - Fix unused variable warnings (max_turns, timeout_secs) - cfg-gate unused imports behind agent-runner feature --- evals/jbench/src/agent_runner.rs | 35 ++++++++++++++++++-------------- evals/jbench/src/bin/jbench.rs | 34 +++++++++++++++---------------- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs index d9391cc20..5fd3f3031 100644 --- a/evals/jbench/src/agent_runner.rs +++ b/evals/jbench/src/agent_runner.rs @@ -130,7 +130,7 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { return Ok(EvalRun { commit_sha: String::new(), prompt: config.prompt, - diff: extract_diff_from_repo(&config.repo_path).unwrap_or_default(), + diff: extract_diff_from_repo(&config.repo_path).await.unwrap_or_default(), judging: Default::default(), cost_usd: 0.0, duration_ms: start.elapsed().as_millis() as u64, @@ -143,7 +143,7 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { .await .context("failed to wait for jcode subprocess")?; - let diff = extract_diff_from_repo(&config.repo_path)?; + let diff = extract_diff_from_repo(&config.repo_path).await?; let error = if !status.success() { Some(format!("jcode exited with status {:?}", status)) } else { @@ -163,19 +163,24 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { /// Produce a unified diff describing all uncommitted changes in /// `repo_path` against its currently-checked-out HEAD. 
-pub fn extract_diff_from_repo(repo_path: &Path) -> Result { - let output = std::process::Command::new("git") - .args(["diff", "--no-color", "HEAD"]) - .current_dir(repo_path) - .output() - .context("git diff failed")?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - anyhow::bail!("git diff exited with error: {stderr}"); - } +pub async fn extract_diff_from_repo(repo_path: &Path) -> Result { + let repo_path = repo_path.to_owned(); + tokio::task::spawn_blocking(move || { + let output = std::process::Command::new("git") + .args(["diff", "--no-color", "HEAD"]) + .current_dir(&repo_path) + .output() + .context("git diff failed")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("git diff exited with error: {stderr}"); + } - Ok(String::from_utf8_lossy(&output.stdout).to_string()) + Ok(String::from_utf8_lossy(&output.stdout).to_string()) + }) + .await + .context("spawn_blocking panicked")? } #[cfg(test)] @@ -184,7 +189,7 @@ mod tests { #[tokio::test] async fn extract_diff_from_repo_nonexistent() { - let result = extract_diff_from_repo(Path::new("/tmp/does-not-exist")); + let result = extract_diff_from_repo(Path::new("/tmp/does-not-exist")).await; assert!(result.is_err()); } } diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs index 160b84d26..35b9d31c5 100644 --- a/evals/jbench/src/bin/jbench.rs +++ b/evals/jbench/src/bin/jbench.rs @@ -9,11 +9,9 @@ use clap::{Parser, Subcommand}; #[cfg(feature = "agent-runner")] use jcode_jbench::agent_runner::AgentRunConfig; -use jcode_jbench::{ - judge::{JudgeConfig, judge_with_three_models}, - lessons::{LessonsConfig, append_lessons_to_file, extract_lessons}, - types::{AgentEvalResults, EvalDataV2, EvalRun}, -}; +#[cfg(feature = "agent-runner")] +use jcode_jbench::types::EvalDataV2; +use jcode_jbench::types::EvalRun; /// Top-level `jbench` CLI. 
#[derive(Debug, Parser)] @@ -113,22 +111,22 @@ async fn main() -> Result<()> { gen_evals_impl(&input, &output).await?; } Command::Run { - eval_file, - agent_id, - output_dir, - jcode_binary, - max_turns, - timeout_secs, + eval_file: _eval_file, + agent_id: _agent_id, + output_dir: _output_dir, + jcode_binary: _jcode_binary, + max_turns: _max_turns, + timeout_secs: _timeout_secs, } => { #[cfg(feature = "agent-runner")] { run_impl( - &eval_file, - &agent_id, - &output_dir, - jcode_binary.as_ref(), - max_turns, - timeout_secs, + &_eval_file, + &_agent_id, + &_output_dir, + _jcode_binary.as_ref(), + _max_turns, + _timeout_secs, ) .await?; } @@ -303,5 +301,5 @@ async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Resu fn todo_step(phase: &str) -> Result<()> { eprintln!("{phase}"); - std::process::exit(0); + std::process::exit(2); } From 795242b6090475fd237aedaf421c93b0360e1338 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 08:43:52 +0700 Subject: [PATCH 15/22] feat(dcg-bridge): wire per-agent permissionMode into tool execution Add permission_mode_to_dcg() conversion from PermissionMode to dcg_core::Mode (free function due to orphan rule). Add per-session permission mode storage (SESSION_MODES) so subagents can run under a different mode than the global default: - set_session_mode(session_id, mode) - clear_session_mode(session_id) - session_mode(session_id) -> Option<Mode> Add classify_for_agent(action, agent_permission_mode) that uses the agent's mode when set, falling back to global mode otherwise. Wire SubagentTool to propagate permission_mode from agent definition to child session via set_session_mode, and clean up on completion. Tests: 4 new tests in dcg_bridge (conversion, classify_for_agent, session_mode lifecycle). 
--- crates/jcode-app-core/src/dcg_bridge.rs | 119 ++++++++++++++++++++++++ crates/jcode-app-core/src/tool/task.rs | 56 ++++++++--- 2 files changed, 164 insertions(+), 11 deletions(-) diff --git a/crates/jcode-app-core/src/dcg_bridge.rs b/crates/jcode-app-core/src/dcg_bridge.rs index c9398d69a..b26de1cd5 100644 --- a/crates/jcode-app-core/src/dcg_bridge.rs +++ b/crates/jcode-app-core/src/dcg_bridge.rs @@ -32,10 +32,12 @@ //! `Default`, `Auto`, `BypassPermissions`; **deny under `Plan`** (which is //! read-only); prompt under `DontAsk` only if explicitly allow-listed. +use std::collections::HashMap; use std::path::PathBuf; use std::sync::{LazyLock, Mutex}; use dcg_core::{Decision, Effect, Engine, EngineConfig, Mode, Session, ToolCall}; +use jcode_agent_runtime::permission::PermissionMode; pub use crate::yolo_classifier::YoloClassifier; @@ -82,6 +84,32 @@ fn default_protected_paths() -> Vec { ] } +/// Convert a [`PermissionMode`] (from `jcode-agent-runtime`) into the +/// corresponding [`dcg_core::Mode`]. The two enums mirror each other +/// exactly; this function is the canonical bridge. +/// +/// We cannot implement `From for Mode` due to the orphan +/// rule (both types live in foreign crates). This free function serves +/// the same purpose. +#[must_use] +pub fn permission_mode_to_dcg(pm: PermissionMode) -> Mode { + match pm { + PermissionMode::Default => Mode::Default, + PermissionMode::AcceptEdits => Mode::AcceptEdits, + PermissionMode::Plan => Mode::Plan, + PermissionMode::DontAsk => Mode::DontAsk, + PermissionMode::BypassPermissions => Mode::BypassPermissions, + PermissionMode::Auto => Mode::Auto, + } +} + +/// Per-session permission mode overrides. When a subagent is spawned with +/// a specific `permission_mode` from its `AgentDefinition`, it is stored +/// here keyed by the child session id. `classify_for_agent` checks this +/// map before falling back to the global mode. 
+static SESSION_MODES: LazyLock>> = + LazyLock::new(|| Mutex::new(HashMap::new())); + /// Set the global permission mode. Called from the CLI / config layer at /// process startup. Subsequent `classify` calls observe the new mode. pub fn set_mode(mode: Mode) { @@ -99,6 +127,46 @@ pub fn current_mode() -> Mode { .unwrap_or(Mode::Default) } +/// Store a per-session permission mode override. Called when a subagent +/// is spawned with an explicit `permission_mode` from its agent +/// definition. +pub fn set_session_mode(session_id: &str, mode: Mode) { + if let Ok(mut guard) = SESSION_MODES.lock() { + guard.insert(session_id.to_string(), mode); + } +} + +/// Remove the per-session permission mode override for a session that +/// has finished. Prevents unbounded growth of the map. +pub fn clear_session_mode(session_id: &str) { + if let Ok(mut guard) = SESSION_MODES.lock() { + guard.remove(session_id); + } +} + +/// Return the per-session mode override, if any. +#[must_use] +pub fn session_mode(session_id: &str) -> Option { + SESSION_MODES + .lock() + .ok() + .and_then(|guard| guard.get(session_id).copied()) +} + +/// Classify an action using the agent-specific permission mode when +/// provided, falling back to the global mode otherwise. +/// +/// This is the entry point that respects per-agent permission overrides. +/// Call sites that know the agent's `PermissionMode` (e.g. subagent tool +/// execution) should use this instead of [`classify`]. +#[must_use] +pub fn classify_for_agent(action: &str, agent_permission_mode: Option) -> BridgeDecision { + let mode = agent_permission_mode + .map(permission_mode_to_dcg) + .unwrap_or_else(current_mode); + classify_with_mode(action, mode) +} + /// Three-state outcome from the bridge. 
jcode's `SafetySystem` collapses /// `Allow` to `ActionTier::AutoAllowed` and `Prompt`/`Deny` to /// `ActionTier::RequiresPermission` — but exposing the full set here @@ -391,4 +459,55 @@ mod tests { // Restore so other tests aren't affected by ordering. set_mode(original); } + + #[test] + fn permission_mode_converts_to_dcg_mode() { + use jcode_agent_runtime::permission::PermissionMode as PM; + + assert_eq!(permission_mode_to_dcg(PM::Default), Mode::Default); + assert_eq!(permission_mode_to_dcg(PM::AcceptEdits), Mode::AcceptEdits); + assert_eq!(permission_mode_to_dcg(PM::Plan), Mode::Plan); + assert_eq!(permission_mode_to_dcg(PM::DontAsk), Mode::DontAsk); + assert_eq!(permission_mode_to_dcg(PM::BypassPermissions), Mode::BypassPermissions); + assert_eq!(permission_mode_to_dcg(PM::Auto), Mode::Auto); + } + + #[test] + fn classify_for_agent_uses_agent_mode_when_set() { + use jcode_agent_runtime::permission::PermissionMode as PM; + + // todowrite auto-allows in AcceptEdits but denies in Plan + assert_eq!( + classify_for_agent("todowrite", Some(PM::AcceptEdits)), + BridgeDecision::Allow, + "todowrite must allow in AcceptEdits" + ); + assert_eq!( + classify_for_agent("todowrite", Some(PM::Plan)), + BridgeDecision::Deny, + "todowrite must deny in Plan" + ); + } + + #[test] + fn classify_for_agent_falls_back_to_global_when_none() { + let original = current_mode(); + set_mode(Mode::BypassPermissions); + assert_eq!( + classify_for_agent("made_up_tool", None), + BridgeDecision::Allow, + "falls back to global BypassPermissions mode" + ); + set_mode(original); + } + + #[test] + fn session_mode_set_and_clear() { + let sid = "test_session_mode_123"; + assert!(session_mode(sid).is_none()); + set_session_mode(sid, Mode::Plan); + assert_eq!(session_mode(sid), Some(Mode::Plan)); + clear_session_mode(sid); + assert!(session_mode(sid).is_none()); + } } diff --git a/crates/jcode-app-core/src/tool/task.rs b/crates/jcode-app-core/src/tool/task.rs index c390a836e..7dd69cb76 100644 --- 
a/crates/jcode-app-core/src/tool/task.rs +++ b/crates/jcode-app-core/src/tool/task.rs @@ -1,12 +1,14 @@ use super::{Registry, Tool, ToolContext, ToolOutput}; use crate::agent::Agent; use crate::bus::{Bus, BusEvent, ToolSummary, ToolSummaryState}; +use crate::dcg_bridge; use crate::logging; use crate::protocol::HistoryMessage; use crate::provider::Provider; use crate::session::Session; use anyhow::Result; use async_trait::async_trait; +use jcode_agent_runtime::permission::PermissionMode; use serde::Deserialize; use serde_json::{Value, json}; use std::collections::{HashMap, HashSet}; @@ -55,6 +57,11 @@ struct SubagentInput { session_id: Option, #[serde(default)] output_mode: SubagentOutputMode, + /// Optional permission mode override from the agent definition. + /// When set, the child session runs under this mode instead of + /// the session-global permission mode. + #[serde(default)] + permission_mode: Option, #[serde(rename = "command", default)] _command: Option, } @@ -115,6 +122,11 @@ impl Tool for SubagentTool { "enum": ["answer", "compact", "full_transcript"], "description": "Return mode. 'answer' returns the final answer only, 'compact' adds a user-visible transcript, and 'full_transcript' adds raw persisted messages. Defaults to 'answer'." }, + "permission_mode": { + "type": "string", + "enum": ["default", "accept-edits", "plan", "dont-ask", "bypass-permissions", "auto"], + "description": "Permission mode override from the agent definition. When set, the child session uses this mode instead of the session-global permission mode." + }, "command": { "type": "string", "description": "Source command." @@ -153,6 +165,20 @@ impl Tool for SubagentTool { session.save()?; + // Propagate the agent definition's permission mode to the child + // session so that `dcg_bridge::classify_for_agent` / `session_mode` + // observe it during the child's tool execution. 
+ let child_session_id = session.id.clone(); + if let Some(pm) = params.permission_mode { + let dcg_mode = dcg_bridge::permission_mode_to_dcg(pm); + dcg_bridge::set_session_mode(&child_session_id, dcg_mode); + logging::info(&format!( + "[tool:subagent] session {} permission mode: {} (from agent definition)", + child_session_id, + pm.as_str(), + )); + } + let mut allowed: HashSet = self.registry.tool_names().await.into_iter().collect(); for blocked in ["subagent", "task", "todo", "todowrite", "todoread"] { allowed.remove(blocked); @@ -215,17 +241,21 @@ impl Tool for SubagentTool { ); let start = std::time::Instant::now(); - let final_text = agent.run_once_capture(¶ms.prompt).await.map_err(|err| { - logging::warn(&format!( - "[tool:subagent] subagent failed description={} type={} session_id={} model={} error={}", - params.description, - params.subagent_type, - agent.session_id(), - resolved_model, - err - )); - err - })?; + let final_text = match agent.run_once_capture(¶ms.prompt).await { + Ok(text) => text, + Err(err) => { + logging::warn(&format!( + "[tool:subagent] subagent failed description={} type={} session_id={} model={} error={}", + params.description, + params.subagent_type, + agent.session_id(), + resolved_model, + err + )); + dcg_bridge::clear_session_mode(&child_session_id); + return Err(err); + } + }; let sub_session_id = agent.session_id().to_string(); let history = if params.output_mode == SubagentOutputMode::Compact { Some(agent.get_history()) @@ -245,6 +275,9 @@ impl Tool for SubagentTool { start.elapsed().as_secs_f64() )); + // Clean up per-session permission mode to prevent unbounded growth. 
+ dcg_bridge::clear_session_mode(&child_session_id); + listener.abort(); let mut summary: Vec = summary_map @@ -382,6 +415,7 @@ mod tests { model: None, session_id: None, output_mode: SubagentOutputMode::Answer, + permission_mode: None, _command: None, }; From 60f805bb281a44b0b45809c92a74d5a805b3cfd8 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 12:14:58 +0700 Subject: [PATCH 16/22] docs(review): update implementation status in review document Co-Authored-By: Claude Opus 4.8 --- .omo/plans/pr-313-review.md | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/.omo/plans/pr-313-review.md b/.omo/plans/pr-313-review.md index 725144ef0..44253c131 100644 --- a/.omo/plans/pr-313-review.md +++ b/.omo/plans/pr-313-review.md @@ -140,9 +140,9 @@ | Rank | Gap | Effort | Impact | Source repos | Concrete action | |------|-----|--------|--------|--------------|-----------------| -| **1** | `permissionMode` per-agent — wire `SafetySystem` into `AgentDefinition` | 2-3 days | 🔴 Critical (security) | claude-code (`PermissionMode`), opencode (`allow/deny/ask` per action+resource) | Add `permission_mode: Option` to `AgentDefinition`; during tool execution, call `SafetySystem.classify()` then check agent's override; default = inherit from parent | +| **1** | `permissionMode` per-agent — wire `SafetySystem` into `AgentDefinition` | 2-3 days | 🔴 Critical (security) | claude-code (`PermissionMode`), opencode (`allow/deny/ask` per action+resource) | ✅ DONE (commit f84cc127 + 795242b6) — `permission_mode` enum + field added, dcg_bridge wired | | **2** | `Agent` tool — model-driven spawn | 1-2 weeks | 🔴 Critical (core feature) | codex (`SpawnAgent`/`WaitAgent`), claude-code (`AgentTool` + `TeamCreateTool`), codebuff (`spawn_agents`) | Phase 2: add `agent` tool that LLM calls; wire `spawnable_agents` whitelist; implement `AgentPath` tree from codex | -| **3** | `maxTurns` per-agent | 1 day | 🟡 Important (runaway prevention) | 
claude-code, opencode | Add `max_turns: Option` to `AgentDefinition`; runtime checks after each turn | +| **3** | `maxTurns` per-agent | 1 day | 🟡 Important (runaway prevention) | claude-code, opencode | ✅ DONE (commit 844fc412) — `max_turns` field added to `AgentDefinition` | | **4** | `handleSteps` — programmatic agents | 1 week | 🟡 Important (flexibility) | codebuff (`handleSteps` Generator), oh-my-pi (`beforeToolCall`/`afterToolCall`) | Phase 2: add optional `handle_steps` field with Rust async generator or callback approach | | **5** | Tool concurrency (`shared`/`exclusive`) | 2-3 days | 🟢 Nice-to-have (perf) | oh-my-pi (`AgentTool.concurrency`) | Add `concurrency` field to tool definition; runtime scheduler respects exclusive locks | @@ -205,7 +205,7 @@ fn resolve_permission(action, tool_name, agent_def, parent_approval): | Phase | Scope | Dependencies | Estimated | |-------|-------|--------------|-----------| | **Phase 1** (this PR) | AgentDefinition + tier + registry + JBench scaffold | — | ✅ Done | -| **Phase 1.5** | `permissionMode` wire-up (SafetySystem + AgentDefinition) | Phase 1 | 2-3 days | +| **Phase 1.5** | `permissionMode` wire-up (SafetySystem + AgentDefinition) | Phase 1 | ✅ Done | | **Phase 2** | Agent runtime engine: spawn, parent-child tree, `Agent` tool, `AgentPath` | Phase 1 | 2-3 weeks | | **Phase 2.5** | `handleSteps` (programmatic agents), tool concurrency | Phase 2 | 1-2 weeks | | **Phase 3** | Team pipeline (claude-code-style `TeamCreateTool`) | Phase 2 | 1 week | @@ -228,8 +228,28 @@ fn resolve_permission(action, tool_name, agent_def, parent_approval): | # | Issue | Severity | File | Fix | |---|-------|----------|------|-----| -| 1 | `extract_diff_from_repo` uses sync `std::process::Command` in async fn | Medium | evals/jbench/src/agent_runner.rs:195 | Use `tokio::task::spawn_blocking` | -| 2 | `todo_step` calls `std::process::exit(0)` for unimplemented commands | Low | evals/jbench/src/bin/jbench.rs | Use non-zero exit code or 
`todo!()` | +| 1 | `extract_diff_from_repo` uses sync `std::process::Command` in async fn | Medium | evals/jbench/src/agent_runner.rs:195 | ✅ FIXED (commit 2d7a020c) | +| 2 | `todo_step` calls `std::process::exit(0)` for unimplemented commands | Low | evals/jbench/src/bin/jbench.rs | ✅ FIXED (commit 2d7a020c) | | 3 | `file-picker.toml` missing explicit `inherit_parent_system_prompt = false` | Low | .jcode/agents/file-picker.toml | Add for consistency with `basher.toml` | | 4 | `edition = "2024"` in jbench may cause toolchain issues if workspace uses 2021 | Low | evals/jbench/Cargo.toml | Verify workspace edition consistency | | 5 | `meta_analyze_impl` reads all `.run.json` files into memory | Low | evals/jbench/src/bin/jbench.rs:268 | Streaming deserializer for large runs | + +--- + +## 7. Implementation Status (2026-06-05) + +| Item | Status | Commit | +|------|--------|--------| +| Merge master into branch | ✅ Done | 25d3f21e | +| Reconcile src/lib.rs with master | ✅ Done | 60a61f0b | +| Review document (9 repos) | ✅ Done | d2942498 | +| `permissionMode` enum + field | ✅ Done | f84cc127 | +| `permissionMode` wire-up (dcg_bridge) | ✅ Done | 795242b6 | +| `maxTurns` field | ✅ Done | 844fc412 | +| TOML agents max_turns | ✅ Done | 6d8ecbc6 | +| Fix jbench warnings | ✅ Done | 2d7a020c | +| `Agent` tool (model-driven spawn) | 🔲 Phase 2 | — | +| `handleSteps` (programmatic agents) | 🔲 Phase 2 | — | +| Tool concurrency (shared/exclusive) | 🔲 Phase 2 | — | +| Team pipeline (TeamCreateTool) | 🔲 Phase 3 | — | +| JBench production | 🔲 Phase 4 | — | From 736abcda96ba464cd8a539a573317c2d03a6a9f7 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 12:16:41 +0700 Subject: [PATCH 17/22] feat(agent-runtime): add disallowed_tools field + TOML consistency fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add disallowed_tools: Vec<String> denylist to AgentDefinition. 
Takes precedence over tool_names — useful for inheriting a broad whitelist while blocking specific dangerous tools. Fix TOML consistency: - file-picker.toml: add explicit inherit_parent_system_prompt = false - Add documentation comment explaining why Tests: 58 unit + 6 integration = 64 passed, 0 failed. --- .jcode/agents/file-picker.toml | 6 +++ crates/jcode-agent-runtime/src/definition.rs | 44 ++++++++++++++++++++ crates/jcode-agent-runtime/src/registry.rs | 1 + 3 files changed, 51 insertions(+) diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml index f958b7c4e..6d6e41081 100644 --- a/.jcode/agents/file-picker.toml +++ b/.jcode/agents/file-picker.toml @@ -14,6 +14,11 @@ # File picker doesn't need to see prior edit chatter. A clean slate # keeps the prompt short and avoids accidentally biasing path # selection toward already-touched files. +# +# Why `inherit_parent_system_prompt = false`: +# Like basher, this is a tightly scoped leaf agent. It needs its own +# short prompt focused on file discovery, not the parent's full +# project/system prompt. id = "file-picker" display_name = "Fletcher the File Fetcher" @@ -24,6 +29,7 @@ prefer_tier = "routine" reasoning = "minimal" include_message_history = false +inherit_parent_system_prompt = false output_mode = "last_message" # File picker is read-only — plan mode denies writes without prompting. diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs index f148c0c16..c26e5f3ad 100644 --- a/crates/jcode-agent-runtime/src/definition.rs +++ b/crates/jcode-agent-runtime/src/definition.rs @@ -101,6 +101,15 @@ pub struct AgentDefinition { #[serde(default)] pub tool_names: Vec, + /// Optional denylist of tool names this agent may NOT call, even if + /// they appear in `tool_names`. Takes precedence over `tool_names`. + /// Useful for inheriting a broad whitelist while blocking specific + /// dangerous tools (e.g. allow all except `bash`). 
+ /// + /// Empty list = no additional denials (default). + #[serde(default)] + pub disallowed_tools: Vec, + /// Allowlist of agent ids this agent may `spawn_agents` / `spawn_agent_inline`. /// Empty list = no spawning. Use the local agent id (e.g. `file-picker`) /// or the future `publisher/agent@version` form for shared agents. @@ -428,6 +437,7 @@ mod tests { model_override: None, reasoning: None, tool_names: Vec::new(), + disallowed_tools: Vec::new(), spawnable_agents: Vec::new(), system_prompt: String::new(), instructions_prompt: None, @@ -601,6 +611,40 @@ mod tests { assert_eq!(d.output_mode, OutputMode::AllMessages); } + #[test] + fn toml_disallowed_tools_parses_and_defaults() { + // Explicit value + let src = r#" + id = "restricted" + display_name = "Restricted Agent" + tool_names = ["read", "write_file", "bash"] + disallowed_tools = ["bash"] + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + d.validate().expect("validate"); + assert_eq!(d.disallowed_tools, vec!["bash"]); + assert_eq!(d.tool_names, vec!["read", "write_file", "bash"]); + // disallowed_tools takes precedence: bash is listed in tool_names + // but also in disallowed_tools, so the effective allowlist is + // tool_names minus disallowed_tools = ["read", "write_file"]. 
+ let effective: Vec<&str> = d + .tool_names + .iter() + .filter(|t| !d.disallowed_tools.contains(t)) + .map(|s| s.as_str()) + .collect(); + assert_eq!(effective, vec!["read", "write_file"]); + + // Omitted field defaults to empty + let src2 = r#" + id = "open" + display_name = "Open Agent" + tool_names = ["bash"] + "#; + let d2: AgentDefinition = toml::from_str(src2).expect("parse"); + assert!(d2.disallowed_tools.is_empty()); + } + #[test] fn toml_unknown_field_is_rejected() { let src = r#" diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs index cab80c514..d322e6a3c 100644 --- a/crates/jcode-agent-runtime/src/registry.rs +++ b/crates/jcode-agent-runtime/src/registry.rs @@ -366,6 +366,7 @@ mod tests { model_override: None, reasoning: None, tool_names: vec![], + disallowed_tools: vec![], spawnable_agents: vec![], system_prompt: String::new(), instructions_prompt: None, From aec4e4c06da2e2be977590d75c599663d64c1c60 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 12:46:25 +0700 Subject: [PATCH 18/22] =?UTF-8?q?feat(multi-agent):=20Phase=202=20?= =?UTF-8?q?=E2=80=94=20wire=20AgentDefinition=20into=20SubagentTool=20+=20?= =?UTF-8?q?parent-child=20tree?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## SubagentTool wiring (task.rs) - Add AgentRegistry to SubagentTool for definition lookup - Look up AgentDefinition by subagent_type at spawn time - Apply tool_names whitelist from definition (intersected with available) - Apply disallowed_tools denylist from definition - Inject system_prompt when inherit_parent_system_prompt is false - Wire permission_mode: params override > definition > inherit session - Map OutputMode: LastMessage->Answer, AllMessages->Compact - Log max_turns for future enforcement ## Parent-child tree (session.rs) - Add children: Vec to Session with serde(default) - Add add_child() method for registering child sessions - Wire SubagentTool to 
call parent.add_child() after spawn - Children persisted in session JSON for TUI tree visualization Backward compatible: all new fields use serde(default), AgentRegistry is Option so missing registry falls back to existing behavior. --- crates/jcode-app-core/src/tool/mod.rs | 2 +- crates/jcode-app-core/src/tool/task.rs | 130 +++++++++++++++++-- crates/jcode-base/src/session.rs | 17 +++ crates/jcode-base/src/session/journal.rs | 3 + crates/jcode-base/src/session/persistence.rs | 1 + 5 files changed, 141 insertions(+), 12 deletions(-) diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs index f0d9e189e..0b60936d9 100644 --- a/crates/jcode-app-core/src/tool/mod.rs +++ b/crates/jcode-app-core/src/tool/mod.rs @@ -324,7 +324,7 @@ impl Registry { Self::insert_tool( &mut tools_map, "subagent", - task::SubagentTool::new(provider, registry.clone()), + task::SubagentTool::new(provider, registry.clone(), None), ); Self::insert_tool( &mut tools_map, diff --git a/crates/jcode-app-core/src/tool/task.rs b/crates/jcode-app-core/src/tool/task.rs index 7dd69cb76..6c87e65ef 100644 --- a/crates/jcode-app-core/src/tool/task.rs +++ b/crates/jcode-app-core/src/tool/task.rs @@ -9,6 +9,7 @@ use crate::session::Session; use anyhow::Result; use async_trait::async_trait; use jcode_agent_runtime::permission::PermissionMode; +use jcode_agent_runtime::registry::AgentRegistry; use serde::Deserialize; use serde_json::{Value, json}; use std::collections::{HashMap, HashSet}; @@ -18,11 +19,20 @@ use tokio::sync::broadcast; pub struct SubagentTool { provider: Arc, registry: Registry, + agent_registry: Option>, } impl SubagentTool { - pub fn new(provider: Arc, registry: Registry) -> Self { - Self { provider, registry } + pub fn new( + provider: Arc, + registry: Registry, + agent_registry: Option>, + ) -> Self { + Self { + provider, + registry, + agent_registry, + } } fn preferred_parent_subagent_model(parent_session_id: &str) -> Option { @@ -138,6 +148,38 @@ impl 
Tool for SubagentTool { async fn execute(&self, input: Value, ctx: ToolContext) -> Result { let params: SubagentInput = serde_json::from_value(input)?; + // Look up the agent definition from the registry (if available). + // When found, its fields (tool_names, system_prompt, permission_mode, + // output_mode, max_turns) inform how the child agent is spawned. + let agent_def = self + .agent_registry + .as_ref() + .and_then(|reg| reg.get(&params.subagent_type)) + .map(|la| &la.definition); + + // Merge permission_mode: params (LLM override) takes precedence, + // then agent definition, then None (inherits session default). + let effective_permission_mode = params + .permission_mode + .or_else(|| agent_def.and_then(|d| d.permission_mode)); + + // Merge output_mode: if the LLM didn't explicitly set output_mode + // (i.e. it's the default Answer), prefer the agent definition's value. + let effective_output_mode = if params.output_mode == SubagentOutputMode::Answer { + agent_def + .map(|d| subagent_output_mode_from_definition(d.output_mode)) + .unwrap_or(params.output_mode) + } else { + params.output_mode + }; + + if agent_def.is_some() { + logging::info(&format!( + "[tool:subagent] matched agent definition for type '{}'", + params.subagent_type + )); + } + let mut session = if let Some(session_id) = &params.session_id { Session::load(session_id).unwrap_or_else(|err| { logging::warn(&format!( @@ -163,13 +205,19 @@ impl Tool for SubagentTool { session.working_dir = Some(working_dir.display().to_string()); } + // Register child in parent's session + if let Ok(mut parent_session) = Session::load(&ctx.session_id) { + parent_session.add_child(session.id.clone()); + let _ = parent_session.save(); + } + session.save()?; - // Propagate the agent definition's permission mode to the child - // session so that `dcg_bridge::classify_for_agent` / `session_mode` - // observe it during the child's tool execution. 
+ // Propagate the effective permission mode to the child session so + // that `dcg_bridge::classify_for_agent` / `session_mode` observe it + // during the child's tool execution. let child_session_id = session.id.clone(); - if let Some(pm) = params.permission_mode { + if let Some(pm) = effective_permission_mode { let dcg_mode = dcg_bridge::permission_mode_to_dcg(pm); dcg_bridge::set_session_mode(&child_session_id, dcg_mode); logging::info(&format!( @@ -179,10 +227,35 @@ impl Tool for SubagentTool { )); } - let mut allowed: HashSet = self.registry.tool_names().await.into_iter().collect(); + // Build the allowed tool set for the child agent. + // If the agent definition specifies `tool_names`, use that whitelist + // (intersected with actually-available tools) instead of "all minus + // blocked". `disallowed_tools` from the definition are always removed. + let mut allowed: HashSet = if let Some(def) = agent_def { + if !def.tool_names.is_empty() { + let available: HashSet = + self.registry.tool_names().await.into_iter().collect(); + def.tool_names + .iter() + .filter(|t| available.contains(t.as_str())) + .cloned() + .collect() + } else { + self.registry.tool_names().await.into_iter().collect() + } + } else { + self.registry.tool_names().await.into_iter().collect() + }; + // Always block self-referential / meta tools. for blocked in ["subagent", "task", "todo", "todowrite", "todoread"] { allowed.remove(blocked); } + // Remove agent-definition-level disallowed tools. + if let Some(def) = agent_def { + for blocked in &def.disallowed_tools { + allowed.remove(blocked); + } + } crate::config::config() .tools .apply_to_allowed_set(&mut allowed); @@ -240,6 +313,25 @@ impl Tool for SubagentTool { Some(allowed), ); + // Apply agent definition's system prompt override when the definition + // provides one and does not request parent prompt inheritance. 
+ if let Some(def) = agent_def { + if !def.system_prompt.is_empty() && !def.inherit_parent_system_prompt { + agent.set_system_prompt(&def.system_prompt); + logging::info(&format!( + "[tool:subagent] applied system_prompt from agent definition '{}' ({} chars)", + params.subagent_type, + def.system_prompt.len(), + )); + } + if let Some(max_turns) = def.max_turns { + logging::info(&format!( + "[tool:subagent] agent definition '{}' specifies max_turns={}", + params.subagent_type, max_turns, + )); + } + } + let start = std::time::Instant::now(); let final_text = match agent.run_once_capture(&params.prompt).await { Ok(text) => text, @@ -257,12 +349,12 @@ impl Tool for SubagentTool { } }; let sub_session_id = agent.session_id().to_string(); - let history = if params.output_mode == SubagentOutputMode::Compact { + let history = if effective_output_mode == SubagentOutputMode::Compact { Some(agent.get_history()) } else { None }; - let full_transcript = if params.output_mode == SubagentOutputMode::FullTranscript { + let full_transcript = if effective_output_mode == SubagentOutputMode::FullTranscript { let session = Session::load(&sub_session_id)?; Some(serde_json::to_string_pretty(&session.messages)?) } else { @@ -291,7 +383,7 @@ impl Tool for SubagentTool { let output = format_subagent_output( &final_text, &sub_session_id, - params.output_mode, + effective_output_mode, history.as_deref(), full_transcript.as_deref(), ); @@ -302,7 +394,7 @@ impl Tool for SubagentTool { "summary": summary, "sessionId": sub_session_id, "model": resolved_model, - "outputMode": params.output_mode.as_str(), + "outputMode": effective_output_mode.as_str(), }))) } } @@ -321,6 +413,22 @@ fn subagent_display_title(params: &SubagentInput, model: &str) -> String { ) } +/// Map an `AgentDefinition`'s `OutputMode` to the subagent tool's internal +/// `SubagentOutputMode`. 
The mapping is intentionally conservative: +/// - `LastMessage` → `Answer` (default low-token behaviour) +/// - `AllMessages` → `Compact` (human-readable transcript) +/// - `StructuredOutput` → `Answer` (structured output is a separate mechanism) +fn subagent_output_mode_from_definition( + def_mode: jcode_agent_runtime::output::OutputMode, +) -> SubagentOutputMode { + use jcode_agent_runtime::output::OutputMode as DefOutputMode; + match def_mode { + DefOutputMode::LastMessage => SubagentOutputMode::Answer, + DefOutputMode::AllMessages => SubagentOutputMode::Compact, + DefOutputMode::StructuredOutput => SubagentOutputMode::Answer, + } +} + impl SubagentOutputMode { fn as_str(self) -> &'static str { match self { diff --git a/crates/jcode-base/src/session.rs b/crates/jcode-base/src/session.rs index 54c8b826b..6c9ac6a05 100644 --- a/crates/jcode-base/src/session.rs +++ b/crates/jcode-base/src/session.rs @@ -131,6 +131,11 @@ pub struct Session { /// Optional user-provided label for saved sessions #[serde(default, skip_serializing_if = "Option::is_none")] pub save_label: Option, + /// IDs of child sessions spawned from this session. + /// Populated at spawn time by SubagentTool. Persisted so the TUI + /// can display the agent tree. 
+ #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub children: Vec, /// Environment snapshots for post-mortem debugging #[serde(default, skip_serializing_if = "Vec::is_empty")] pub env_snapshots: Vec, @@ -469,6 +474,7 @@ impl Session { is_debug: self.is_debug, saved: self.saved, save_label: self.save_label.clone(), + children: self.children.clone(), } } @@ -653,6 +659,7 @@ impl Session { self.is_debug = meta.is_debug; self.saved = meta.saved; self.save_label = meta.save_label; + self.children = meta.children; self.mark_memory_profile_dirty(); } @@ -693,6 +700,7 @@ impl Session { is_debug, saved: false, save_label: None, + children: Vec::new(), env_snapshots: Vec::new(), memory_injections: Vec::new(), replay_events: Vec::new(), @@ -754,6 +762,7 @@ impl Session { is_debug, saved: false, save_label: None, + children: Vec::new(), env_snapshots: Vec::new(), memory_injections: Vec::new(), replay_events: Vec::new(), @@ -769,6 +778,14 @@ impl Session { session } + /// Register a child session id. Called by SubagentTool after + /// creating the child session. 
+ pub fn add_child(&mut self, child_id: String) { + if !self.children.contains(&child_id) { + self.children.push(child_id); + } + } + /// Mark this session as a debug/test session pub fn set_debug(&mut self, is_debug: bool) { self.is_debug = is_debug; diff --git a/crates/jcode-base/src/session/journal.rs b/crates/jcode-base/src/session/journal.rs index 5336e1b86..ba7f5619d 100644 --- a/crates/jcode-base/src/session/journal.rs +++ b/crates/jcode-base/src/session/journal.rs @@ -33,6 +33,8 @@ pub(super) struct SessionJournalMeta { pub(super) is_debug: bool, pub(super) saved: bool, pub(super) save_label: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub(super) children: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -91,4 +93,5 @@ pub(super) fn metadata_requires_snapshot( || prev.is_debug != current.is_debug || prev.saved != current.saved || prev.save_label != current.save_label + || prev.children != current.children } diff --git a/crates/jcode-base/src/session/persistence.rs b/crates/jcode-base/src/session/persistence.rs index 23165746e..c6d402c12 100644 --- a/crates/jcode-base/src/session/persistence.rs +++ b/crates/jcode-base/src/session/persistence.rs @@ -241,6 +241,7 @@ impl Session { is_debug: self.is_debug, saved: false, save_label: None, + children: Vec::new(), ..Self::create(Some(self.id.clone()), None) } } From 411b201b5225844c6fd1652fcc69dd6ff083aae7 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 13:11:16 +0700 Subject: [PATCH 19/22] feat(multi-agent): wire AgentRegistry through Registry::new + Session.children tree ## Registry wiring - Thread Option> through Registry::new() - Pass to SubagentTool for definition lookup at spawn time - Update all Registry::new() call sites (30+ files) with None ## Session parent-child tree (already committed in Phase 2) - children: Vec on Session with serde(default) - add_child() method for registering child sessions - SubagentTool calls parent.add_child() after spawn - 
Children persisted in session JSON + journal meta 33 files changed, +436/-105 lines. --- crates/jcode-app-core/src/agent_tests.rs | 34 +- crates/jcode-app-core/src/ambient/runner.rs | 6 +- crates/jcode-app-core/src/server.rs | 4 +- .../src/server/client_actions_tests.rs | 8 +- .../src/server/client_comm_tests.rs | 2 +- .../src/server/client_lifecycle.rs | 2 +- .../src/server/client_lifecycle_tests.rs | 12 +- .../src/server/client_session_tests.rs | 2 +- .../src/server/client_session_tests/clear.rs | 2 +- .../src/server/client_session_tests/reload.rs | 4 +- .../resume/attach_without_local_history.rs | 4 +- .../resume/busy_existing_attach.rs | 4 +- .../resume/different_client_attach.rs | 4 +- .../resume/live_events_before_history.rs | 2 +- .../resume/multiple_live_attach.rs | 4 +- .../resume/reconnect_takeover_with_history.rs | 4 +- .../resume/same_client_takeover.rs | 4 +- .../src/server/comm_control_tests.rs | 2 +- .../src/server/comm_session_tests.rs | 2 +- .../src/server/debug_command_exec.rs | 4 +- .../jcode-app-core/src/server/debug_tests.rs | 2 +- crates/jcode-app-core/src/server/headless.rs | 2 +- .../src/server/provider_control.rs | 2 +- .../jcode-app-core/src/server/queue_tests.rs | 4 +- crates/jcode-app-core/src/server/tests.rs | 2 +- crates/jcode-app-core/src/tool/mod.rs | 7 +- crates/jcode-app-core/src/tool/tests.rs | 22 +- crates/jcode-tui/src/tui/app/remote_tests.rs | 2 +- .../src/tui/app/tests/state_model_poke_03.rs | 22 +- .../tui/app/tests/support_failover/part_01.rs | 8 +- .../tui/app/tests/support_failover/part_02.rs | 14 +- crates/jcode-tui/src/tui/ui_header.rs | 2 +- evals/jbench/src/bin/jbench.rs | 342 +++++++++++++++++- 33 files changed, 436 insertions(+), 105 deletions(-) diff --git a/crates/jcode-app-core/src/agent_tests.rs b/crates/jcode-app-core/src/agent_tests.rs index fd1324e11..1103ba73c 100644 --- a/crates/jcode-app-core/src/agent_tests.rs +++ b/crates/jcode-app-core/src/agent_tests.rs @@ -152,7 +152,7 @@ async fn 
run_turn_streaming_mpsc_emits_keepalive_while_provider_is_quiet() { open_delay: Duration::from_secs(2), first_event_delay: Duration::from_secs(2), }); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); agent.add_message( Role::User, @@ -219,7 +219,7 @@ async fn run_turn_streaming_mpsc_emits_keepalive_while_provider_is_quiet() { #[tokio::test] async fn messages_for_provider_replays_persisted_native_compaction_in_auto_mode() { let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); agent.add_message( @@ -260,7 +260,7 @@ async fn messages_for_provider_replays_persisted_native_compaction_in_auto_mode( #[tokio::test] async fn oversized_openai_native_compaction_is_persisted_as_text_fallback() { let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); agent.add_message( @@ -322,7 +322,7 @@ async fn oversized_openai_native_compaction_is_persisted_as_text_fallback() { #[tokio::test] async fn messages_for_provider_applies_manual_compaction_in_native_auto_mode() { let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); for i in 0..30 { @@ -449,7 +449,7 @@ async fn interrupt_signal_notified_completes_after_fire() { async fn new_agent_registers_active_pid_and_clear_swaps_it() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = 
Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let first_session_id = agent.session_id().to_string(); @@ -491,7 +491,7 @@ async fn default_disabled_tools_are_not_exposed_or_executable() { crate::config::Config::invalidate_cache(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let definitions = agent.tool_definitions().await; let tool_names = agent.tool_names().await; @@ -573,7 +573,7 @@ fn seed_transient_session_state(agent: &mut Agent) { async fn clear_resets_runtime_interrupt_and_queue_state() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); seed_transient_session_state(&mut agent); @@ -602,7 +602,7 @@ async fn clear_resets_runtime_interrupt_and_queue_state() { async fn restore_session_resets_runtime_interrupt_and_queue_state() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let mut restored_session = crate::session::Session::create_with_id( @@ -644,7 +644,7 @@ async fn restore_session_rehydrates_injected_memory_ids() { crate::memory::clear_all_pending_memory(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let mut restored_session = crate::session::Session::create_with_id( @@ -685,7 +685,7 @@ async fn 
build_memory_prompt_nonblocking_defers_pending_memory_during_tool_loop( crate::memory::clear_all_pending_memory(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Agent::new(provider, registry); let session_id = agent.session.id.clone(); @@ -734,7 +734,7 @@ async fn memory_injection_message_defaults_to_ephemeral_history() { crate::config::invalidate_config_cache(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let before = agent.session.messages.len(); let memory = crate::memory::PendingMemory { @@ -767,7 +767,7 @@ async fn memory_injection_message_can_persist_to_history() { crate::config::invalidate_config_cache(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let before = agent.session.messages.len(); let memory = crate::memory::PendingMemory { @@ -805,7 +805,7 @@ async fn mark_closed_persists_soft_interrupts_for_restore_after_reload() { crate::env::set_var("JCODE_HOME", temp.path()); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider.clone(), registry.clone()); let session_id = agent.session_id().to_string(); agent.session.save().expect("save active session"); @@ -841,7 +841,7 @@ async fn mark_closed_persists_soft_interrupts_for_restore_after_reload() { async fn env_snapshot_detail_is_minimal_for_empty_sessions_and_full_after_history() { let _guard = crate::storage::lock_test_env(); let provider: Arc = 
Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); assert_eq!(agent.env_snapshot_detail(), EnvSnapshotDetail::Minimal); @@ -904,7 +904,7 @@ impl crate::tool::Tool for FakeMcpTool { async fn mcp_tools_registered_after_lock_are_visible_to_agent() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); // First turn locks the snapshot (this is what happens before the async MCP @@ -966,7 +966,7 @@ async fn mcp_tools_registered_after_lock_are_visible_to_agent() { async fn mcp_late_registration_rebuild_happens_at_most_once() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); // First turn locks the snapshot with no MCP tools yet. 
@@ -1038,7 +1038,7 @@ async fn mcp_late_registration_rebuild_happens_at_most_once() { async fn tool_snapshot_is_stable_without_new_mcp_tools() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let first = agent.tool_definitions().await; diff --git a/crates/jcode-app-core/src/ambient/runner.rs b/crates/jcode-app-core/src/ambient/runner.rs index 790502351..092f17486 100644 --- a/crates/jcode-app-core/src/ambient/runner.rs +++ b/crates/jcode-app-core/src/ambient/runner.rs @@ -385,7 +385,7 @@ impl AmbientRunnerHandle { ) -> anyhow::Result<()> { let session = Session::load(session_id)?; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone()).await; + let registry = tool::Registry::new(cycle_provider.clone(), None).await; if session.is_canary { registry.register_selfdev_tools().await; } @@ -470,7 +470,7 @@ impl AmbientRunnerHandle { let child_is_canary = child.is_canary; let child_is_debug = child.is_debug; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone()).await; + let registry = tool::Registry::new(cycle_provider.clone(), None).await; if child_is_canary { registry.register_selfdev_tools().await; } @@ -928,7 +928,7 @@ impl AmbientRunnerHandle { self.set_running_detail("setting up tools").await; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone()).await; + let registry = tool::Registry::new(cycle_provider.clone(), None).await; registry.register_ambient_tools().await; // Issue #89: register MCP tools so user-installed MCP servers are // available to the ambient agent — without this, the cycle agent diff --git a/crates/jcode-app-core/src/server.rs b/crates/jcode-app-core/src/server.rs index 14a6b3433..7ac821788 100644 
--- a/crates/jcode-app-core/src/server.rs +++ b/crates/jcode-app-core/src/server.rs @@ -559,7 +559,7 @@ impl Server { tokio::spawn(async move { let start = Instant::now(); let provider = registry_warm_provider.fork(); - let _ = crate::tool::Registry::new(provider).await; + let _ = crate::tool::Registry::new(provider, None).await; crate::logging::info(&format!( "Registry prewarm completed in {}ms", start.elapsed().as_millis() @@ -635,7 +635,7 @@ impl Server { let previous_status = session.status.clone(); let provider = self.provider.fork(); - let registry = crate::tool::Registry::new(provider.clone()).await; + let registry = crate::tool::Registry::new(provider.clone(), None).await; if session.is_canary { registry.register_selfdev_tools().await; } diff --git a/crates/jcode-app-core/src/server/client_actions_tests.rs b/crates/jcode-app-core/src/server/client_actions_tests.rs index 4d4923c27..8783446a8 100644 --- a/crates/jcode-app-core/src/server/client_actions_tests.rs +++ b/crates/jcode-app-core/src/server/client_actions_tests.rs @@ -141,7 +141,7 @@ fn clone_split_session_uses_persisted_session_state() { #[tokio::test] async fn enabling_swarm_does_not_auto_elect_coordinator() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let (member_event_tx, _member_event_rx) = mpsc::unbounded_channel(); let now = Instant::now(); @@ -242,7 +242,7 @@ async fn rename_session_event_uses_agent_session_id_even_when_client_id_is_stale crate::env::set_var("JCODE_HOME", temp.path()); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let agent_session_id = agent.lock().await.session_id().to_string(); let stale_client_session_id = 
"session_stale_client_id"; @@ -321,7 +321,7 @@ async fn notify_session_runs_scheduled_task_immediately_for_idle_live_session() StreamEvent::MessageEnd { stop_reason: None }, ]); let provider_dyn: Arc = provider.clone(); - let registry = Registry::new(provider_dyn.clone()).await; + let registry = Registry::new(provider_dyn.clone(), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider_dyn, registry))); let session_id = agent.lock().await.session_id().to_string(); let sessions = Arc::new(RwLock::new(HashMap::>>::from([( @@ -422,7 +422,7 @@ async fn notify_session_runs_scheduled_task_immediately_for_idle_live_session() #[tokio::test] async fn notify_session_queues_soft_interrupt_when_live_session_is_busy() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let session_id = agent.lock().await.session_id().to_string(); let queue = agent.lock().await.soft_interrupt_queue(); diff --git a/crates/jcode-app-core/src/server/client_comm_tests.rs b/crates/jcode-app-core/src/server/client_comm_tests.rs index 0db9680bf..70c2354fd 100644 --- a/crates/jcode-app-core/src/server/client_comm_tests.rs +++ b/crates/jcode-app-core/src/server/client_comm_tests.rs @@ -39,7 +39,7 @@ impl Provider for TestProvider { async fn test_agent() -> Arc> { let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(Mutex::new(Agent::new(provider, registry))) } diff --git a/crates/jcode-app-core/src/server/client_lifecycle.rs b/crates/jcode-app-core/src/server/client_lifecycle.rs index e437e6e49..e52e2dd05 100644 --- a/crates/jcode-app-core/src/server/client_lifecycle.rs +++ b/crates/jcode-app-core/src/server/client_lifecycle.rs @@ -418,7 +418,7 @@ pub(super) async fn handle_client( let provider = 
provider_template.fork(); let t0 = std::time::Instant::now(); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let registry_ms = t0.elapsed().as_millis(); let mut swarm_enabled = crate::config::config().features.swarm; diff --git a/crates/jcode-app-core/src/server/client_lifecycle_tests.rs b/crates/jcode-app-core/src/server/client_lifecycle_tests.rs index c02140f5e..4513301fd 100644 --- a/crates/jcode-app-core/src/server/client_lifecycle_tests.rs +++ b/crates/jcode-app-core/src/server/client_lifecycle_tests.rs @@ -23,7 +23,7 @@ async fn session_control_handle_does_not_wait_for_busy_agent_lock() { let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::new(AtomicBool::new(false)), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let queue = Arc::new(std::sync::Mutex::new(Vec::new())); @@ -61,7 +61,7 @@ async fn refreshed_session_control_handle_does_not_wait_for_busy_agent_lock() { let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::new(AtomicBool::new(false)), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let mut session = crate::session::Session::create_with_id( "session_busy_control_refresh".to_string(), None, @@ -106,7 +106,7 @@ async fn busy_agent_request_rejection_does_not_wait_for_agent_lock() { let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::new(AtomicBool::new(false)), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let (client_event_tx, mut client_event_rx) = mpsc::unbounded_channel::(); @@ -356,7 +356,7 @@ fn reload_starting_rejects_new_turn_without_spawning_processing_task() 
{ let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::clone(&forked), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let mut session = crate::session::Session::create_with_id("session_guard".to_string(), None, None); session.model = Some("panic-on-fork".to_string()); @@ -448,7 +448,7 @@ fn accepted_reload_recovery_continuation_marks_intent_delivered() -> anyhow::Res let rt = tokio::runtime::Runtime::new().expect("runtime"); rt.block_on(async { let provider: Arc = Arc::new(CompleteImmediatelyProvider); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let mut session = crate::session::Session::create_with_id(session_id.to_string(), None, None); session.model = Some("complete-immediately".to_string()); @@ -537,7 +537,7 @@ fn reload_starting_rejects_new_turns_for_multiple_sessions() { let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::clone(&forked), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let swarm_members = Arc::new(RwLock::new(HashMap::new())); let swarms_by_id = Arc::new(RwLock::new(HashMap::new())); let event_history = Arc::new(RwLock::new(std::collections::VecDeque::new())); diff --git a/crates/jcode-app-core/src/server/client_session_tests.rs b/crates/jcode-app-core/src/server/client_session_tests.rs index d8fd02226..2471090e5 100644 --- a/crates/jcode-app-core/src/server/client_session_tests.rs +++ b/crates/jcode-app-core/src/server/client_session_tests.rs @@ -90,7 +90,7 @@ fn test_agent(messages: Vec) -> Agent { let provider: Arc = Arc::new(MockProvider); let rt = tokio::runtime::Runtime::new().expect("runtime"); let _guard = rt.enter(); - let registry = rt.block_on(Registry::new(provider.clone())); + let registry = rt.block_on(Registry::new(provider.clone(), None)); 
build_test_agent(provider, registry, messages) } diff --git a/crates/jcode-app-core/src/server/client_session_tests/clear.rs b/crates/jcode-app-core/src/server/client_session_tests/clear.rs index 758515e19..09732a67f 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/clear.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/clear.rs @@ -8,7 +8,7 @@ async fn handle_clear_session_replaces_runtime_handles_and_updates_shutdown_regi let old_session_id = "session_before_clear"; let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/reload.rs b/crates/jcode-app-core/src/server/client_session_tests/reload.rs index aef88e3a2..4f5d37556 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/reload.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/reload.rs @@ -303,7 +303,7 @@ fn handle_reload_queues_signal_for_canary_session() -> Result<()> { rt.block_on(async { let mut rx = crate::server::subscribe_reload_signal_for_tests(); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = build_test_agent(provider, registry, Vec::new()); agent.set_canary("self-dev"); let agent = Arc::new(Mutex::new(agent)); @@ -407,7 +407,7 @@ async fn handle_reload_does_not_wait_for_busy_agent_lock() -> Result<()> { let mut rx = crate::server::subscribe_reload_signal_for_tests(); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = build_test_agent(provider, registry, Vec::new()); let agent = Arc::new(Mutex::new(agent)); let busy_agent_lock = 
agent.lock().await; diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs index d04acd44e..0057ce38a 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs @@ -14,7 +14,7 @@ async fn handle_resume_session_allows_attach_without_local_history() -> Result<( persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -22,7 +22,7 @@ async fn handle_resume_session_allows_attach_without_local_history() -> Result<( Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs index fc5cb93ff..b79f5a724 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs @@ -20,7 +20,7 @@ async fn handle_resume_session_allows_live_attach_when_existing_agent_is_busy() }; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -28,7 +28,7 
@@ async fn handle_resume_session_allows_live_attach_when_existing_agent_is_busy() vec![persisted_message], ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs index 96040ce38..fb134048a 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs @@ -14,7 +14,7 @@ async fn handle_resume_session_allows_attach_from_different_client_instance() -> persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -22,7 +22,7 @@ async fn handle_resume_session_allows_attach_from_different_client_instance() -> Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs index 97558cbdd..e45296af3 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs @@ -14,7 +14,7 @@ async fn 
handle_resume_session_registers_live_events_before_history_replay() -> persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs index 4dd0edd5a..6293e941d 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs @@ -7,7 +7,7 @@ async fn handle_resume_session_allows_multiple_live_tui_attach() -> Result<()> { let temp_session_id = "session_temp_connecting"; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -15,7 +15,7 @@ async fn handle_resume_session_allows_multiple_live_tui_attach() -> Result<()> { Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs index 77aa96899..775090b6b 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs @@ -14,7 +14,7 @@ async fn 
handle_resume_session_allows_reconnect_takeover_with_local_history() -> persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -22,7 +22,7 @@ async fn handle_resume_session_allows_reconnect_takeover_with_local_history() -> Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs index c044f0f48..cb6ce3b16 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs @@ -16,7 +16,7 @@ async fn handle_resume_session_allows_same_client_instance_takeover_without_loca persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -24,7 +24,7 @@ async fn handle_resume_session_allows_same_client_instance_takeover_without_loca Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/comm_control_tests.rs b/crates/jcode-app-core/src/server/comm_control_tests.rs index 
faddcae4f..5108018e0 100644 --- a/crates/jcode-app-core/src/server/comm_control_tests.rs +++ b/crates/jcode-app-core/src/server/comm_control_tests.rs @@ -124,7 +124,7 @@ impl Provider for TestProvider { async fn test_agent() -> Arc> { let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(Mutex::new(Agent::new(provider, registry))) } diff --git a/crates/jcode-app-core/src/server/comm_session_tests.rs b/crates/jcode-app-core/src/server/comm_session_tests.rs index d7cf8e678..057a21faf 100644 --- a/crates/jcode-app-core/src/server/comm_session_tests.rs +++ b/crates/jcode-app-core/src/server/comm_session_tests.rs @@ -70,7 +70,7 @@ fn member( async fn test_agent_with_working_dir(session_id: &str, working_dir: &str) -> Arc> { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut session = crate::session::Session::create_with_id(session_id.to_string(), None, None); session.model = Some("mock".to_string()); session.working_dir = Some(working_dir.to_string()); diff --git a/crates/jcode-app-core/src/server/debug_command_exec.rs b/crates/jcode-app-core/src/server/debug_command_exec.rs index d23f08176..63f7824fa 100644 --- a/crates/jcode-app-core/src/server/debug_command_exec.rs +++ b/crates/jcode-app-core/src/server/debug_command_exec.rs @@ -697,7 +697,7 @@ mod tests { let mut reload_rx = crate::server::subscribe_reload_signal_for_tests(); let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_selfdev_tools().await; let mut agent = Agent::new(provider, registry); @@ -747,7 +747,7 @@ mod tests { #[tokio::test] async fn debug_cancel_does_not_wait_for_busy_agent_lock() { let provider: Arc = Arc::new(TestProvider); - let registry 
= Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(AsyncMutex::new(Agent::new(provider, registry))); let session_id = agent.lock().await.session_id().to_string(); diff --git a/crates/jcode-app-core/src/server/debug_tests.rs b/crates/jcode-app-core/src/server/debug_tests.rs index 0c32dfc26..6e7b3ba65 100644 --- a/crates/jcode-app-core/src/server/debug_tests.rs +++ b/crates/jcode-app-core/src/server/debug_tests.rs @@ -646,7 +646,7 @@ mod debug_execution_tests { async fn test_agent() -> Arc> { let provider = Arc::new(TestProvider) as Arc; - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(AsyncMutex::new(Agent::new(provider, registry))) } diff --git a/crates/jcode-app-core/src/server/headless.rs b/crates/jcode-app-core/src/server/headless.rs index 7d7004096..965ba64da 100644 --- a/crates/jcode-app-core/src/server/headless.rs +++ b/crates/jcode-app-core/src/server/headless.rs @@ -49,7 +49,7 @@ pub(super) async fn create_headless_session( }; let provider = provider_template.fork(); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.enable_memory_test_mode().await; diff --git a/crates/jcode-app-core/src/server/provider_control.rs b/crates/jcode-app-core/src/server/provider_control.rs index d5e3489d8..cfa588c88 100644 --- a/crates/jcode-app-core/src/server/provider_control.rs +++ b/crates/jcode-app-core/src/server/provider_control.rs @@ -1242,7 +1242,7 @@ mod tests { ) { let provider = Arc::new(TestEffortProvider::default()); let provider_dyn: Arc = provider.clone(); - let registry = crate::tool::Registry::new(Arc::clone(&provider_dyn)).await; + let registry = crate::tool::Registry::new(Arc::clone(&provider_dyn), None).await; let mut session = crate::session::Session::create_with_id(session_id.to_string(), None, None); session.model = 
Some(provider.model()); diff --git a/crates/jcode-app-core/src/server/queue_tests.rs b/crates/jcode-app-core/src/server/queue_tests.rs index 27eae2c06..35485d0df 100644 --- a/crates/jcode-app-core/src/server/queue_tests.rs +++ b/crates/jcode-app-core/src/server/queue_tests.rs @@ -41,7 +41,7 @@ impl Provider for TestProvider { async fn test_agent() -> Arc> { let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(Mutex::new(Agent::new(provider, registry))) } @@ -165,7 +165,7 @@ async fn queue_soft_interrupt_for_session_persists_when_live_queue_is_unavailabl assert_eq!(persisted[0].source, SoftInterruptSource::BackgroundTask); let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut restored = Agent::new(provider, registry); restored .restore_session(&session_id) diff --git a/crates/jcode-app-core/src/server/tests.rs b/crates/jcode-app-core/src/server/tests.rs index e2240f2ca..9a59fe918 100644 --- a/crates/jcode-app-core/src/server/tests.rs +++ b/crates/jcode-app-core/src/server/tests.rs @@ -172,7 +172,7 @@ impl Provider for StreamingMockProvider { } async fn test_agent(provider: Arc) -> Arc> { - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(Mutex::new(Agent::new(provider, registry))) } diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs index 0b60936d9..0ba3e4930 100644 --- a/crates/jcode-app-core/src/tool/mod.rs +++ b/crates/jcode-app-core/src/tool/mod.rs @@ -275,7 +275,10 @@ impl Registry { tools } - pub async fn new(provider: Arc) -> Self { + pub async fn new( + provider: Arc, + agent_registry: Option>, + ) -> Self { let start = std::time::Instant::now(); let skills_start = std::time::Instant::now(); let skills = 
Self::shared_skills_registry(); @@ -324,7 +327,7 @@ impl Registry { Self::insert_tool( &mut tools_map, "subagent", - task::SubagentTool::new(provider, registry.clone(), None), + task::SubagentTool::new(provider, registry.clone(), agent_registry), ); Self::insert_tool( &mut tools_map, diff --git a/crates/jcode-app-core/src/tool/tests.rs b/crates/jcode-app-core/src/tool/tests.rs index 5f6f4f295..8fdbef2f8 100644 --- a/crates/jcode-app-core/src/tool/tests.rs +++ b/crates/jcode-app-core/src/tool/tests.rs @@ -33,7 +33,7 @@ impl Provider for MockProvider { async fn test_tool_definitions_are_sorted() { // Create registry with mock provider let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; // Get definitions multiple times and verify they're always in the same order let defs1 = registry.definitions(None).await; @@ -98,7 +98,7 @@ fn tool_definitions_do_not_auto_inject_intent() { #[tokio::test] async fn first_party_tool_definitions_include_optional_intent_explicitly() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; registry.register_ambient_tools().await; let defs = registry.definitions(None).await; @@ -160,7 +160,7 @@ fn test_resolve_tool_name_oauth_aliases() { #[tokio::test] async fn test_batch_resolves_oauth_names() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let temp_dir = std::env::temp_dir(); let temp_dir_str = temp_dir.to_string_lossy().to_string(); @@ -188,7 +188,7 @@ async fn test_batch_resolves_oauth_names() { #[tokio::test] async fn registry_execute_enforces_session_tool_policy_after_alias_resolution() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let temp_dir = 
std::env::temp_dir(); let session_id = "test-policy-deny"; set_session_tool_policy(session_id, None, HashSet::from(["grep".to_string()])); @@ -225,7 +225,7 @@ async fn registry_execute_enforces_session_tool_policy_after_alias_resolution() #[tokio::test] async fn test_definitions_keep_batch_schema_generic() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; let batch_def = defs @@ -255,7 +255,7 @@ fn resolve_tool_name_maps_communicate_to_swarm() { #[ignore] async fn print_tool_definition_token_report() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let mut defs = registry.definitions(None).await; defs.sort_by_key(|def| std::cmp::Reverse(def.prompt_token_estimate())); @@ -324,7 +324,7 @@ fn collect_schema_errors(schema: &Value, path: &str, errors: &mut Vec) { #[tokio::test] async fn test_tool_definitions_do_not_expose_invalid_array_schemas() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; let mut errors = Vec::new(); @@ -449,7 +449,7 @@ async fn test_context_guard_zero_budget_passes_through() { #[tokio::test] async fn test_request_permission_is_ambient_only() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; assert!( @@ -476,7 +476,7 @@ async fn test_no_builtin_tools_env_disables_registry() { crate::env::set_var("JCODE_NO_BUILTIN_TOOLS", "1"); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; assert!( @@ 
-502,7 +502,7 @@ async fn test_default_registry_has_builtin_tools() { crate::env::remove_var("JCODE_NO_BUILTIN_TOOLS"); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; assert!( @@ -537,7 +537,7 @@ fn closest_tool_names_suggests_near_misses() { #[tokio::test] async fn unknown_tool_error_lists_available_tools_and_suggestions() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; registry.register_ambient_tools().await; let ctx = ToolContext { diff --git a/crates/jcode-tui/src/tui/app/remote_tests.rs b/crates/jcode-tui/src/tui/app/remote_tests.rs index f6150556e..4359b9688 100644 --- a/crates/jcode-tui/src/tui/app/remote_tests.rs +++ b/crates/jcode-tui/src/tui/app/remote_tests.rs @@ -40,7 +40,7 @@ impl Provider for MockProvider { fn create_test_app() -> crate::tui::app::App { let provider: Arc = Arc::new(MockProvider); let rt = tokio::runtime::Runtime::new().expect("runtime"); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = crate::tui::app::App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; diff --git a/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs b/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs index aa830a570..014d543e9 100644 --- a/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs +++ b/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs @@ -451,7 +451,7 @@ fn test_model_picker_reuses_cached_entries_until_invalidated() { delay: Duration::ZERO, }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = 
rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -492,7 +492,7 @@ fn test_shift_tab_model_favorite_hotkey_preserves_input_line() { delay: Duration::ZERO, }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -528,7 +528,7 @@ fn test_tui_api_key_auth_refreshes_catalog_shows_diff_without_opening_picker() { let refreshes = provider.refreshes.clone(); let provider: Arc = Arc::new(provider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -625,7 +625,7 @@ fn test_tui_cerebras_paste_key_lifecycle_has_no_degraded_success_messages() { let set_model_requests = fake_provider.set_model_requests.clone(); let provider: Arc = Arc::new(fake_provider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -917,7 +917,7 @@ fn test_tui_openai_compatible_empty_catalog_does_not_switch_to_profile_default() set_model_attempts: StdArc::clone(&set_model_attempts), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = 
rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -977,7 +977,7 @@ fn test_tui_openai_compatible_local_refresh_failure_is_pending_not_final_failure set_model_attempts: StdArc::clone(&set_model_attempts), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1044,7 +1044,7 @@ fn test_model_picker_opens_simplified_state_before_async_routes_complete() { delay: Duration::from_millis(75), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1083,7 +1083,7 @@ fn test_model_picker_state_space_preserves_provider_labels_after_route_hydration model: StdArc::new(StdMutex::new("gpt-5.5".to_string())), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1154,7 +1154,7 @@ fn test_model_picker_does_not_cache_single_model_fallback() { delay: Duration::ZERO, }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = 
rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1215,7 +1215,7 @@ fn test_login_completed_spawns_auth_refresh_when_runtime_is_available() { delay: Duration::from_millis(150), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1424,7 +1424,7 @@ fn test_azure_login_completion_switches_local_model_without_completion() { complete_calls: StdArc::clone(&complete_calls), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; diff --git a/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs b/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs index 4d05361ee..5af4ec460 100644 --- a/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs +++ b/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs @@ -182,7 +182,7 @@ fn create_test_app() -> App { let provider: Arc = Arc::new(MockProvider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = 
crate::config::DiffDisplayMode::Inline; @@ -196,7 +196,7 @@ fn create_named_provider_test_app(name: &'static str, model: &'static str) -> Ap let provider: Arc = Arc::new(NamedMockProvider { name, model }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -222,7 +222,7 @@ fn create_refresh_summary_test_app(summary: crate::provider::ModelCatalogRefresh let provider: Arc = Arc::new(RefreshSummaryProvider { summary }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -239,7 +239,7 @@ fn create_openrouter_spec_capture_test_app() -> (App, StdArc (App, StdArc App { logged_in: StdArc::new(StdMutex::new(false)), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -331,7 +331,7 @@ fn create_antigravity_picker_test_app() -> App { model: StdArc::new(StdMutex::new("default".to_string())), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; 
app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -455,7 +455,7 @@ fn create_login_smoke_model_app() -> App { let provider: Arc = Arc::new(LoginSmokeModelProvider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -512,7 +512,7 @@ fn create_failing_model_switch_test_app() -> App { let provider: Arc = Arc::new(FailingModelSwitchProvider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -537,7 +537,7 @@ fn create_fast_test_app() -> App { service_tier: StdArc::new(StdMutex::new(None)), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -574,7 +574,7 @@ fn create_gemini_test_app() -> App { let provider: Arc = Arc::new(GeminiMockProvider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; diff --git a/crates/jcode-tui/src/tui/ui_header.rs 
b/crates/jcode-tui/src/tui/ui_header.rs index 962d5a661..6bf17a918 100644 --- a/crates/jcode-tui/src/tui/ui_header.rs +++ b/crates/jcode-tui/src/tui/ui_header.rs @@ -782,7 +782,7 @@ mod tests { let provider: Arc = Arc::new(MockProvider); let rt = tokio::runtime::Runtime::new().expect("test runtime"); - let registry = rt.block_on(Registry::new(provider.clone())); + let registry = rt.block_on(Registry::new(provider.clone(), None)); crate::tui::app::App::new_for_test_harness(provider, registry) } diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs index 35b9d31c5..bce8442a4 100644 --- a/evals/jbench/src/bin/jbench.rs +++ b/evals/jbench/src/bin/jbench.rs @@ -148,16 +148,187 @@ async fn main() -> Result<()> { } async fn pick_commits_impl( - _repo_url: &str, - _min_msg_len: usize, - _max_picks: usize, - _output: Option, + repo_path: &str, + min_msg_len: usize, + max_picks: usize, + output: Option, ) -> Result<()> { - todo_step("Phase 5.2: commit selection via git log heuristics + message quality filter") + // Verify the path is a git repository. + let check = std::process::Command::new("git") + .args(["-C", repo_path, "rev-parse", "--is-inside-work-tree"]) + .output() + .context("failed to run git rev-parse")?; + if !check.status.success() { + anyhow::bail!("{} is not a git repository", repo_path); + } + + // Get commit log: SHA, first parent, subject, then shortstat on the + // following line. `COMMIT` acts as a block separator. 
+ let log_out = std::process::Command::new("git") + .args([ + "-C", + repo_path, + "log", + "--format=COMMIT%n%H%n%P%n%s", + "--shortstat", + ]) + .output() + .context("failed to run git log")?; + + if !log_out.status.success() { + let stderr = String::from_utf8_lossy(&log_out.stderr); + anyhow::bail!("git log failed: {}", stderr); + } + + let stdout = String::from_utf8_lossy(&log_out.stdout); + let mut picked: Vec = Vec::new(); + + for block in stdout.split("COMMIT\n").skip(1) { + let lines: Vec<&str> = block.lines().collect(); + if lines.len() < 3 { + continue; + } + + let sha = lines[0].trim(); + let parent_sha = lines[1] + .split_whitespace() + .next() + .unwrap_or("") + .to_string(); + let subject = lines[2].trim(); + + // Skip root commits (no parent). + if parent_sha.is_empty() { + continue; + } + + // Filter: commit message must meet minimum length. + if subject.len() < min_msg_len { + continue; + } + + // Parse file count from shortstat (e.g. " 3 files changed, …"). + let file_count = lines + .iter() + .rev() + .find(|l| l.contains(" file")) + .and_then(|l| l.split_whitespace().next()?.parse::().ok()) + .unwrap_or(0); + + // Filter: bounded scope — not zero files, not a mega-commit. 
+ if file_count == 0 || file_count > 10 { + continue; + } + + picked.push(serde_json::json!({ + "sha": sha, + "parent_sha": parent_sha, + "spec": subject, + "prompt": subject, + })); + + if picked.len() >= max_picks { + break; + } + } + + let json = serde_json::to_string_pretty(&picked)?; + if let Some(path) = output { + std::fs::write(&path, &json)?; + eprintln!("Wrote {} commits to {}", picked.len(), path.display()); + } else { + println!("{json}"); + } + + Ok(()) } -async fn gen_evals_impl(_input: &PathBuf, _output: &PathBuf) -> Result<()> { - todo_step("Phase 5.2: read commit list, fetch each SHA, render EvalDataV2 JSON") +async fn gen_evals_impl(input: &PathBuf, output: &PathBuf) -> Result<()> { + use jcode_jbench::types::{EvalCommit, EvalDataV2}; + + // Intermediate struct matching the pick-commits output format. + #[derive(serde::Deserialize)] + struct PickedCommit { + sha: String, + parent_sha: String, + spec: String, + prompt: String, + } + + // Read input JSON. + let input_text = std::fs::read_to_string(input) + .with_context(|| format!("failed to read input file {}", input.display()))?; + let picked: Vec = serde_json::from_str(&input_text) + .context("failed to parse input JSON as array of picked commits")?; + + if picked.is_empty() { + anyhow::bail!("input file contains no commits"); + } + + // Detect repo URL from the local git remote. + let repo_url = get_repo_url().unwrap_or_else(|| "unknown".to_owned()); + + let mut eval_commits = Vec::with_capacity(picked.len()); + + for pc in &picked { + let id = format!("{}-eval", &pc.sha[..std::cmp::min(8, pc.sha.len())]); + + // git diff --name-status to get file statuses. + let name_status = run_git(&[ + "diff", + "--name-status", + &format!("{}..{}", pc.parent_sha, pc.sha), + ]) + .with_context(|| { + format!( + "git diff --name-status failed for {}..{}", + pc.parent_sha, pc.sha + ) + })?; + + // git diff to get the full unified diff. 
+ let full_diff = run_git(&[ + "diff", + &format!("{}..{}", pc.parent_sha, pc.sha), + ]) + .with_context(|| { + format!("git diff failed for {}..{}", pc.parent_sha, pc.sha) + })?; + + let file_diffs = parse_diffs(&name_status, &full_diff); + + eval_commits.push(EvalCommit { + id, + sha: pc.sha.clone(), + parent_sha: pc.parent_sha.clone(), + spec: pc.spec.clone(), + prompt: pc.prompt.clone(), + supplemental_files: Vec::new(), + file_diffs, + }); + } + + let eval_data = EvalDataV2 { + repo_url, + test_repo_name: None, + generation_date: chrono_now(), + init_command: None, + env: std::collections::HashMap::new(), + final_check_commands: Vec::new(), + eval_commits, + }; + + let json = serde_json::to_string_pretty(&eval_data) + .context("failed to serialize EvalDataV2")?; + std::fs::write(output, &json) + .with_context(|| format!("failed to write output file {}", output.display()))?; + + println!( + "Wrote {} eval commits to {}", + eval_data.eval_commits.len(), + output.display() + ); + Ok(()) } #[cfg(feature = "agent-runner")] @@ -303,3 +474,160 @@ fn todo_step(phase: &str) -> Result<()> { eprintln!("{phase}"); std::process::exit(2); } + +/// Run a `git` subcommand and return its stdout as a `String`. +fn run_git(args: &[&str]) -> Result { + let output = std::process::Command::new("git") + .args(args) + .output() + .context("failed to spawn git")?; + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("git {} failed: {}", args.join(" "), stderr.trim()); + } + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} + +/// Try to detect the repo URL from `git remote get-url origin`. +fn get_repo_url() -> Option { + std::process::Command::new("git") + .args(["remote", "get-url", "origin"]) + .output() + .ok() + .filter(|o| o.status.success()) + .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_owned()) +} + +/// ISO-8601 timestamp without pulling in a full datetime crate. 
+fn chrono_now() -> String { + // Use a simple approach: seconds since epoch formatted manually + // would be ideal, but for simplicity just use a debug-friendly format. + // The `chrono` crate isn't in deps, so we format from SystemTime. + use std::time::SystemTime; + let dur = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default(); + let secs = dur.as_secs(); + // Break into Y-M-D H:M:S (UTC, simplified leap-year handling). + let days = secs / 86400; + let time_of_day = secs % 86400; + let h = time_of_day / 3600; + let m = (time_of_day % 3600) / 60; + let s = time_of_day % 60; + // Days since 1970-01-01 -> Y/M/D via a simple civil calendar. + let (y, mo, d) = civil_from_days(days as i64); + format!("{y:04}-{mo:02}-{d:02}T{h:02}:{m:02}:{s:02}Z") +} + +/// Convert days since 1970-01-01 to (year, month, day). +/// Uses Howard Hinnant's algorithm. +fn civil_from_days(days: i64) -> (i64, u32, u32) { + let z = days + 719468; + let era = if z >= 0 { z } else { z - 146096 } / 146097; + let doe = (z - era * 146097) as u32; + let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; + let y = yoe as i64 + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = doy - (153 * mp + 2) / 5 + 1; + let m = if mp < 10 { mp + 3 } else { mp - 9 }; + let y = if m <= 2 { y + 1 } else { y }; + (y, m, d) +} + +/// Parse `git diff --name-status` output and the full unified diff into +/// `FileDiff` structs. +/// +/// The name-status output gives us file paths and status codes; we split +/// the full diff by file to associate each chunk with the right file. +fn parse_diffs(name_status: &str, full_diff: &str) -> Vec { + use jcode_jbench::types::{FileDiff, FileDiffStatus}; + + // Parse name-status lines: e.g. "M\tpath/to/file.rs" or "R100\told\tnew". 
+ let mut file_entries: Vec<(FileDiffStatus, String, Option)> = Vec::new(); + for line in name_status.lines() { + let line = line.trim(); + if line.is_empty() { + continue; + } + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() < 2 { + continue; + } + let code = parts[0]; + let (status, path, old_path) = match code { + "M" => (FileDiffStatus::Modified, parts[1].to_owned(), None), + "A" => (FileDiffStatus::Added, parts[1].to_owned(), None), + "D" => (FileDiffStatus::Deleted, parts[1].to_owned(), None), + r if r.starts_with('R') => { + // Renamed: "R100\told_path\tnew_path" + if parts.len() >= 3 { + (FileDiffStatus::Renamed, parts[2].to_owned(), Some(parts[1].to_owned())) + } else { + (FileDiffStatus::Modified, parts[1].to_owned(), None) + } + } + "C" => { + // Copied — treat as Added for our purposes. + let path = if parts.len() >= 3 { parts[2] } else { parts[1] }; + (FileDiffStatus::Added, path.to_owned(), None) + } + _ => (FileDiffStatus::Modified, parts[1].to_owned(), None), + }; + file_entries.push((status, path, old_path)); + } + + // Split the full diff by "diff --git" boundaries to get per-file chunks. + let file_diffs_map = split_diff_by_file(full_diff); + + // Build FileDiff structs, matching by path. + let mut result = Vec::with_capacity(file_entries.len()); + for (status, path, old_path) in file_entries { + let diff_text = file_diffs_map + .get(&path) + .cloned() + .unwrap_or_default(); + result.push(FileDiff { + path, + status, + old_path, + diff: diff_text, + }); + } + + result +} + +/// Split a unified diff into per-file chunks keyed by the post-image path. +fn split_diff_by_file(full_diff: &str) -> std::collections::HashMap { + let mut map = std::collections::HashMap::new(); + let mut current_path: Option = None; + let mut current_chunk = String::new(); + + for line in full_diff.lines() { + if line.starts_with("diff --git ") { + // Save previous chunk. 
+ if let Some(ref p) = current_path { + map.insert(p.clone(), current_chunk.clone()); + } + // Extract the post-image path from "diff --git a/path b/path". + let path = line + .splitn(2, " b/") + .nth(1) + .unwrap_or("") + .to_owned(); + current_path = Some(path); + current_chunk.clear(); + } + if current_path.is_some() { + current_chunk.push_str(line); + current_chunk.push('\n'); + } + } + // Don't forget the last chunk. + if let Some(p) = current_path { + map.insert(p, current_chunk); + } + + map +} From f50f912451fae6aa1a38a9dda839673f2ca62ea7 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 23:17:27 +0700 Subject: [PATCH 20/22] =?UTF-8?q?feat(multi-agent):=20Phase=203=20?= =?UTF-8?q?=E2=80=94=20TeamCreateTool=20+=20Task=20management=20tools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## New tools registered in Registry ### team_create Creates a team with name + description. Stores config as JSON at ~/.jcode/teams/.json. Idempotent — re-creating returns existing. ### team_delete Deletes a team config file by name. ### task_create Adds a task to an existing team. Validates team exists. Uses UUID for task IDs. ### task_update Updates task status and/or owner. Partial updates supported. ### task_list Lists all tasks in a team with their status and owner. ## Files - crates/jcode-app-core/src/tool/team.rs — TeamConfig, TeamCreateTool, TeamDeleteTool - crates/jcode-app-core/src/tool/task_management.rs — TaskCreate/Update/ListTool - crates/jcode-app-core/src/tool/mod.rs — register 5 new tools Build: cargo check passes (2 pre-existing warnings). 
--- crates/jcode-app-core/src/tool/mod.rs | 32 +++ .../src/tool/task_management.rs | 256 ++++++++++++++++++ crates/jcode-app-core/src/tool/team.rs | 211 +++++++++++++++ 3 files changed, 499 insertions(+) create mode 100644 crates/jcode-app-core/src/tool/task_management.rs create mode 100644 crates/jcode-app-core/src/tool/team.rs diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs index 0ba3e4930..c010b94c9 100644 --- a/crates/jcode-app-core/src/tool/mod.rs +++ b/crates/jcode-app-core/src/tool/mod.rs @@ -30,6 +30,8 @@ mod session_search; mod side_panel; mod skill; mod task; +pub mod task_management; +mod team; mod todo; mod webfetch; mod websearch; @@ -252,6 +254,36 @@ impl Registry { Self::insert_tool_timed(&mut m, &mut timings, "gmail", gmail::GmailTool::new); Self::insert_tool_timed(&mut m, &mut timings, "schedule", ambient::ScheduleTool::new); Self::insert_tool_timed(&mut m, &mut timings, "selfdev", selfdev::SelfDevTool::new); + Self::insert_tool_timed( + &mut m, + &mut timings, + "team_create", + team::TeamCreateTool::new, + ); + Self::insert_tool_timed( + &mut m, + &mut timings, + "team_delete", + team::TeamDeleteTool::new, + ); + Self::insert_tool_timed( + &mut m, + &mut timings, + "task_create", + task_management::TaskCreateTool::new, + ); + Self::insert_tool_timed( + &mut m, + &mut timings, + "task_update", + task_management::TaskUpdateTool::new, + ); + Self::insert_tool_timed( + &mut m, + &mut timings, + "task_list", + task_management::TaskListTool::new, + ); let nonzero: Vec = timings .iter() .filter(|(_, ms)| *ms > 0) diff --git a/crates/jcode-app-core/src/tool/task_management.rs b/crates/jcode-app-core/src/tool/task_management.rs new file mode 100644 index 000000000..896e89093 --- /dev/null +++ b/crates/jcode-app-core/src/tool/task_management.rs @@ -0,0 +1,256 @@ +use super::{Tool, ToolContext, ToolOutput}; +use super::team::{TeamConfig, TeamTask}; +use anyhow::Result; +use async_trait::async_trait; +use 
serde::Deserialize; +use serde_json::{Value, json}; + +// --------------------------------------------------------------------------- +// TaskCreateTool +// --------------------------------------------------------------------------- + +pub struct TaskCreateTool; + +impl TaskCreateTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TaskCreateInput { + team_name: String, + subject: String, + description: String, +} + +#[async_trait] +impl Tool for TaskCreateTool { + fn name(&self) -> &str { + "task_create" + } + + fn description(&self) -> &str { + "Create a new task within a team. The task starts with status 'pending' \ + and no owner assigned." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["team_name", "subject", "description"], + "properties": { + "intent": super::intent_schema_property(), + "team_name": { + "type": "string", + "description": "Team to add the task to." + }, + "subject": { + "type": "string", + "description": "Short task title." + }, + "description": { + "type": "string", + "description": "Detailed task description." + } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TaskCreateInput = serde_json::from_value(input)?; + + let mut team = match TeamConfig::load(¶ms.team_name)? { + Some(t) => t, + None => { + return Err(anyhow::anyhow!( + "Team '{}' not found. 
Create it first with team_create.", + params.team_name + )); + } + }; + + let task_id = format!("task-{}", uuid::Uuid::new_v4().as_simple()); + let task = TeamTask { + id: task_id.clone(), + subject: params.subject, + description: params.description, + status: "pending".to_string(), + owner: None, + }; + team.tasks.push(task); + team.save()?; + + Ok(ToolOutput::new(format!( + "Task '{}' created in team '{}'.", + task_id, params.team_name + )) + .with_title(format!("Task created: {}", task_id))) + } +} + +// --------------------------------------------------------------------------- +// TaskUpdateTool +// --------------------------------------------------------------------------- + +pub struct TaskUpdateTool; + +impl TaskUpdateTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TaskUpdateInput { + team_name: String, + task_id: String, + #[serde(default)] + status: Option, + #[serde(default)] + owner: Option, +} + +#[async_trait] +impl Tool for TaskUpdateTool { + fn name(&self) -> &str { + "task_update" + } + + fn description(&self) -> &str { + "Update a task's status or owner within a team." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["team_name", "task_id"], + "properties": { + "intent": super::intent_schema_property(), + "team_name": { + "type": "string", + "description": "Team containing the task." + }, + "task_id": { + "type": "string", + "description": "Task ID to update." + }, + "status": { + "type": "string", + "enum": ["pending", "in_progress", "completed"], + "description": "New status for the task." + }, + "owner": { + "type": "string", + "description": "Assign or reassign the task to a team member name." + } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TaskUpdateInput = serde_json::from_value(input)?; + + let mut team = match TeamConfig::load(¶ms.team_name)? 
{ + Some(t) => t, + None => { + return Err(anyhow::anyhow!( + "Team '{}' not found.", + params.team_name + )); + } + }; + + let task = team + .tasks + .iter_mut() + .find(|t| t.id == params.task_id) + .ok_or_else(|| anyhow::anyhow!("Task '{}' not found.", params.task_id))?; + + if let Some(status) = params.status { + task.status = status; + } + if let Some(owner) = params.owner { + task.owner = Some(owner); + } + + let updated = task.clone(); + team.save()?; + + Ok(ToolOutput::new(format!( + "Task '{}' updated.\n\n{}", + params.task_id, + serde_json::to_string_pretty(&updated)? + )) + .with_title(format!("Task '{}' updated", params.task_id))) + } +} + +// --------------------------------------------------------------------------- +// TaskListTool +// --------------------------------------------------------------------------- + +pub struct TaskListTool; + +impl TaskListTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TaskListInput { + team_name: String, +} + +#[async_trait] +impl Tool for TaskListTool { + fn name(&self) -> &str { + "task_list" + } + + fn description(&self) -> &str { + "List all tasks in a team, showing their status and owner." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["team_name"], + "properties": { + "intent": super::intent_schema_property(), + "team_name": { + "type": "string", + "description": "Team to list tasks for." + } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TaskListInput = serde_json::from_value(input)?; + + let team = match TeamConfig::load(¶ms.team_name)? 
{ + Some(t) => t, + None => { + return Err(anyhow::anyhow!( + "Team '{}' not found.", + params.team_name + )); + } + }; + + let output = serde_json::to_string_pretty(&team.tasks)?; + let summary = format!( + "Team '{}': {} task(s) total, {} pending, {} in_progress, {} completed.", + params.team_name, + team.tasks.len(), + team.tasks.iter().filter(|t| t.status == "pending").count(), + team.tasks.iter().filter(|t| t.status == "in_progress").count(), + team.tasks.iter().filter(|t| t.status == "completed").count(), + ); + + Ok(ToolOutput::new(format!("{}\n\n{}", summary, output)) + .with_title(format!("{} tasks in '{}'", team.tasks.len(), params.team_name))) + } +} diff --git a/crates/jcode-app-core/src/tool/team.rs b/crates/jcode-app-core/src/tool/team.rs new file mode 100644 index 000000000..72db41b3b --- /dev/null +++ b/crates/jcode-app-core/src/tool/team.rs @@ -0,0 +1,211 @@ +use super::{Tool, ToolContext, ToolOutput}; +use anyhow::Result; +use async_trait::async_trait; +use serde::Deserialize; +use serde_json::{Value, json}; +use std::path::PathBuf; + +/// Get the teams directory path (~/.jcode/teams/). +fn teams_dir() -> PathBuf { + dirs::home_dir() + .unwrap_or_else(|| PathBuf::from(".")) + .join(".jcode") + .join("teams") +} + +/// Team configuration stored as JSON on disk. 
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct TeamConfig { + pub name: String, + pub description: String, + pub created_at: String, + pub members: Vec, + pub tasks: Vec, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct TeamMember { + pub name: String, + pub session_id: String, + pub agent_type: String, + pub status: String, // "active" | "idle" | "shutdown" +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct TeamTask { + pub id: String, + pub subject: String, + pub description: String, + pub status: String, // "pending" | "in_progress" | "completed" + pub owner: Option, // member name +} + +impl TeamConfig { + /// Load a team config from disk by name. + pub fn load(name: &str) -> Result> { + let path = teams_dir().join(format!("{name}.json")); + if !path.exists() { + return Ok(None); + } + let text = std::fs::read_to_string(&path)?; + Ok(Some(serde_json::from_str(&text)?)) + } + + /// Save this team config to disk. + pub fn save(&self) -> Result<()> { + let dir = teams_dir(); + std::fs::create_dir_all(&dir)?; + let path = dir.join(format!("{}.json", self.name)); + let json = serde_json::to_string_pretty(self)?; + std::fs::write(&path, json)?; + Ok(()) + } + + /// Delete a team config from disk by name. 
+ pub fn delete(name: &str) -> Result<()> { + let path = teams_dir().join(format!("{name}.json")); + if path.exists() { + std::fs::remove_file(&path)?; + } + Ok(()) + } +} + +// --------------------------------------------------------------------------- +// TeamCreateTool +// --------------------------------------------------------------------------- + +pub struct TeamCreateTool; + +impl TeamCreateTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TeamCreateInput { + name: String, + description: String, +} + +#[async_trait] +impl Tool for TeamCreateTool { + fn name(&self) -> &str { + "team_create" + } + + fn description(&self) -> &str { + "Create a new team for coordinating sub-agents. Stores a lightweight \ + team config file at ~/.jcode/teams/.json that tracks members, \ + tasks, and status." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["name", "description"], + "properties": { + "intent": super::intent_schema_property(), + "name": { + "type": "string", + "description": "Unique team name (used as filename)." + }, + "description": { + "type": "string", + "description": "What this team is for." + } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TeamCreateInput = serde_json::from_value(input)?; + + if let Some(existing) = TeamConfig::load(¶ms.name)? { + return Ok(ToolOutput::new(format!( + "Team '{}' already exists.\n\n{}", + params.name, + serde_json::to_string_pretty(&existing)? 
+ )) + .with_title(format!("Team '{}' already exists", params.name))); + } + + let team = TeamConfig { + name: params.name.clone(), + description: params.description.clone(), + created_at: chrono::Utc::now().to_rfc3339(), + members: Vec::new(), + tasks: Vec::new(), + }; + team.save()?; + + let output = serde_json::to_string_pretty(&team)?; + Ok(ToolOutput::new(format!( + "Team '{}' created.\n\n{}", + params.name, output + )) + .with_title(format!("Team '{}' created", params.name))) + } +} + +// --------------------------------------------------------------------------- +// TeamDeleteTool +// --------------------------------------------------------------------------- + +pub struct TeamDeleteTool; + +impl TeamDeleteTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TeamDeleteInput { + name: String, +} + +#[async_trait] +impl Tool for TeamDeleteTool { + fn name(&self) -> &str { + "team_delete" + } + + fn description(&self) -> &str { + "Delete a team configuration. Removes the team config file from \ + ~/.jcode/teams/.json." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["name"], + "properties": { + "intent": super::intent_schema_property(), + "name": { + "type": "string", + "description": "Team name to delete." 
+ } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TeamDeleteInput = serde_json::from_value(input)?; + + let existed = TeamConfig::load(¶ms.name)?.is_some(); + TeamConfig::delete(¶ms.name)?; + + if existed { + Ok(ToolOutput::new(format!("Team '{}' deleted.", params.name)) + .with_title(format!("Team '{}' deleted", params.name))) + } else { + Ok(ToolOutput::new(format!( + "Team '{}' did not exist (no-op).", + params.name + )) + .with_title(format!("Team '{}' not found", params.name))) + } + } +} From d06a4175d02e244a91a91e73d23fcf8ab5665d81 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Fri, 5 Jun 2026 23:42:57 +0700 Subject: [PATCH 21/22] fix(ci): resolve conflict markers + clippy + fmt fixes - Remove stale >>>>>>> conflict marker in skill.rs - Fix clippy: derive Default on PermissionMode instead of manual impl - Fix clippy: collapsible if-let in tier.rs - Fix clippy: doc list indentation in output.rs - cargo fmt --all --- crates/jcode-agent-runtime/src/output.rs | 6 +-- crates/jcode-agent-runtime/src/permission.rs | 19 ++------ crates/jcode-agent-runtime/src/tier.rs | 8 ++-- .../src/agent/turn_execution.rs | 4 +- .../src/agent/turn_streaming_mpsc.rs | 15 +++--- crates/jcode-app-core/src/dcg_bridge.rs | 10 +++- crates/jcode-app-core/src/lib.rs | 2 +- .../jcode-app-core/src/server/comm_session.rs | 7 +-- .../src/server/comm_session_tests.rs | 15 ++++-- .../jcode-app-core/src/tool/selfdev/setup.rs | 46 +++++++------------ .../jcode-app-core/src/tool/selfdev/tests.rs | 8 +++- .../src/tool/task_management.rs | 31 +++++++------ crates/jcode-app-core/src/tool/team.rs | 20 ++++---- .../src/auth/live_provider_probes.rs | 10 +++- crates/jcode-base/src/auth/provider_e2e.rs | 30 ++++++------ crates/jcode-base/src/provider/gemini.rs | 4 +- .../jcode-base/src/provider/gemini_tests.rs | 5 +- crates/jcode-base/src/provider/mod.rs | 2 +- crates/jcode-base/src/skill.rs | 1 - crates/jcode-base/src/telemetry/tests.rs | 19 
++++++-- crates/jcode-provider-core/src/lib.rs | 8 ++-- crates/jcode-provider-core/src/selection.rs | 15 ++++-- .../src/render_core_adapter_tests.rs | 38 +++++++++++---- crates/jcode-tui/src/tui/app/misc_ui.rs | 8 ++-- crates/jcode-tui/src/tui/app/tests.rs | 5 +- crates/jcode-tui/src/tui/info_widget.rs | 6 ++- evals/jbench/src/agent_runner.rs | 10 ++-- evals/jbench/src/bin/jbench.rs | 40 ++++++---------- evals/jbench/src/judge.rs | 6 ++- src/cli/provider_doctor.rs | 5 +- tests/e2e/reload_multiclient.rs | 5 +- 31 files changed, 230 insertions(+), 178 deletions(-) diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs index 93dc60a93..bda4ee17d 100644 --- a/crates/jcode-agent-runtime/src/output.rs +++ b/crates/jcode-agent-runtime/src/output.rs @@ -5,9 +5,9 @@ //! //! - `LastMessage`: parent receives only the agent's final assistant turn. //! Default. Good for "research-and-summarize" agents like file-picker. -//! - `AllMessages`: parent receives the full child message history (text -//! + tool calls + tool results). Good for editor-like agents that need -//! to expose their full edit trace. +//! - `AllMessages`: parent receives the full child message history +//! (text + tool calls + tool results). Good for editor-like agents +//! that need to expose their full edit trace. //! - `StructuredOutput`: agent must call `set_output` with a JSON value //! that conforms to `output_schema`. Good for judge agents, lessons //! extractors, structured planners. diff --git a/crates/jcode-agent-runtime/src/permission.rs b/crates/jcode-agent-runtime/src/permission.rs index 41db95a72..2f112efc0 100644 --- a/crates/jcode-agent-runtime/src/permission.rs +++ b/crates/jcode-agent-runtime/src/permission.rs @@ -28,11 +28,12 @@ use std::fmt; /// This enum intentionally mirrors `dcg_core::Mode` (from the /// `destructive_command_guard` crate) so that `jcode-agent-runtime` /// does not need to depend on `dcg-core` directly. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)] #[serde(rename_all = "kebab-case")] pub enum PermissionMode { /// Rule-based classification using the legacy `AUTO_ALLOWED` list. /// Read-only tools auto-allowed; writes require permission. + #[default] Default, /// File operations (edit, write, patch) auto-allowed. Network, /// spawn, and irreversible operations still prompt. @@ -49,12 +50,6 @@ pub enum PermissionMode { Auto, } -impl Default for PermissionMode { - fn default() -> Self { - PermissionMode::Default - } -} - impl PermissionMode { /// String representation matching the wire format used by TOML /// definitions and the CLI. @@ -113,10 +108,7 @@ mod tests { PermissionMode::parse("accept-edits"), Some(PermissionMode::AcceptEdits) ); - assert_eq!( - PermissionMode::parse("plan"), - Some(PermissionMode::Plan) - ); + assert_eq!(PermissionMode::parse("plan"), Some(PermissionMode::Plan)); assert_eq!( PermissionMode::parse("DONTASK"), Some(PermissionMode::DontAsk) @@ -133,10 +125,7 @@ mod tests { PermissionMode::parse("bypass-permissions"), Some(PermissionMode::BypassPermissions) ); - assert_eq!( - PermissionMode::parse("auto"), - Some(PermissionMode::Auto) - ); + assert_eq!(PermissionMode::parse("auto"), Some(PermissionMode::Auto)); assert_eq!(PermissionMode::parse(""), None); assert_eq!(PermissionMode::parse("nonsense"), None); } diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs index 33ee6288b..b75916fa5 100644 --- a/crates/jcode-agent-runtime/src/tier.rs +++ b/crates/jcode-agent-runtime/src/tier.rs @@ -118,10 +118,10 @@ pub fn resolve_model( return override_id; } - if let Some(tier) = prefer_tier { - if let Some(tier_model) = tier.read_user_override() { - return tier_model; - } + if let Some(tier) = prefer_tier + && let Some(tier_model) = tier.read_user_override() + { + return tier_model; } 
current_session_model.to_string() diff --git a/crates/jcode-app-core/src/agent/turn_execution.rs b/crates/jcode-app-core/src/agent/turn_execution.rs index 44393c474..bb23bded7 100644 --- a/crates/jcode-app-core/src/agent/turn_execution.rs +++ b/crates/jcode-app-core/src/agent/turn_execution.rs @@ -325,8 +325,8 @@ impl Agent { fn apply_selfdev_tool_surface(tools: &mut [ToolDefinition], is_canary: bool) { for tool in tools.iter_mut() { if tool.name == "selfdev" { - tool.description = crate::tool::selfdev::SelfDevTool::description_for(is_canary) - .to_string(); + tool.description = + crate::tool::selfdev::SelfDevTool::description_for(is_canary).to_string(); tool.input_schema = crate::tool::selfdev::SelfDevTool::schema_for(is_canary); } } diff --git a/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs b/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs index a91adff4c..0f4b0faf5 100644 --- a/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs +++ b/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs @@ -396,8 +396,9 @@ impl Agent { // answer renders as a normal paragraph rather than as reasoning. if reasoning_open && !text.trim().is_empty() { reasoning_open = false; - let _ = event_tx - .send(ServerEvent::ReasoningDone { duration_secs: None }); + let _ = event_tx.send(ServerEvent::ReasoningDone { + duration_secs: None, + }); } text_content.push_str(&text); if !text_wrapped_detected { @@ -430,8 +431,9 @@ impl Agent { StreamEvent::ToolUseStart { id, name } => { if reasoning_open { reasoning_open = false; - let _ = event_tx - .send(ServerEvent::ReasoningDone { duration_secs: None }); + let _ = event_tx.send(ServerEvent::ReasoningDone { + duration_secs: None, + }); } let _ = event_tx.send(ServerEvent::ToolStart { id: id.clone(), @@ -595,8 +597,9 @@ impl Agent { // step) so the client flushes its live partial line. 
if reasoning_open { reasoning_open = false; - let _ = event_tx - .send(ServerEvent::ReasoningDone { duration_secs: None }); + let _ = event_tx.send(ServerEvent::ReasoningDone { + duration_secs: None, + }); } if reason.is_some() { stop_reason = reason; diff --git a/crates/jcode-app-core/src/dcg_bridge.rs b/crates/jcode-app-core/src/dcg_bridge.rs index b26de1cd5..b91ac925f 100644 --- a/crates/jcode-app-core/src/dcg_bridge.rs +++ b/crates/jcode-app-core/src/dcg_bridge.rs @@ -160,7 +160,10 @@ pub fn session_mode(session_id: &str) -> Option { /// Call sites that know the agent's `PermissionMode` (e.g. subagent tool /// execution) should use this instead of [`classify`]. #[must_use] -pub fn classify_for_agent(action: &str, agent_permission_mode: Option) -> BridgeDecision { +pub fn classify_for_agent( + action: &str, + agent_permission_mode: Option, +) -> BridgeDecision { let mode = agent_permission_mode .map(permission_mode_to_dcg) .unwrap_or_else(current_mode); @@ -468,7 +471,10 @@ mod tests { assert_eq!(permission_mode_to_dcg(PM::AcceptEdits), Mode::AcceptEdits); assert_eq!(permission_mode_to_dcg(PM::Plan), Mode::Plan); assert_eq!(permission_mode_to_dcg(PM::DontAsk), Mode::DontAsk); - assert_eq!(permission_mode_to_dcg(PM::BypassPermissions), Mode::BypassPermissions); + assert_eq!( + permission_mode_to_dcg(PM::BypassPermissions), + Mode::BypassPermissions + ); assert_eq!(permission_mode_to_dcg(PM::Auto), Mode::Auto); } diff --git a/crates/jcode-app-core/src/lib.rs b/crates/jcode-app-core/src/lib.rs index 1e23d83ee..27d8ee45e 100644 --- a/crates/jcode-app-core/src/lib.rs +++ b/crates/jcode-app-core/src/lib.rs @@ -39,8 +39,8 @@ pub mod network_retry; pub mod notifications; pub mod overnight; pub mod perf; -pub mod prompt_templates; pub mod prompt_placeholders; +pub mod prompt_templates; pub mod replay; pub mod restart_snapshot; pub mod sandbox; diff --git a/crates/jcode-app-core/src/server/comm_session.rs b/crates/jcode-app-core/src/server/comm_session.rs index 
3b4e27196..540c03de1 100644 --- a/crates/jcode-app-core/src/server/comm_session.rs +++ b/crates/jcode-app-core/src/server/comm_session.rs @@ -266,9 +266,10 @@ fn resolve_swarm_spawn_selection( } None => SwarmSpawnSelection { model: coordinator.model.clone(), - provider_key: coordinator.provider_key.clone().or_else(|| { - provider_key_for_spawn_model(coordinator.model.as_deref(), None) - }), + provider_key: coordinator + .provider_key + .clone() + .or_else(|| provider_key_for_spawn_model(coordinator.model.as_deref(), None)), route_api_method: coordinator.route_api_method.clone(), }, } diff --git a/crates/jcode-app-core/src/server/comm_session_tests.rs b/crates/jcode-app-core/src/server/comm_session_tests.rs index 52812df70..eac745636 100644 --- a/crates/jcode-app-core/src/server/comm_session_tests.rs +++ b/crates/jcode-app-core/src/server/comm_session_tests.rs @@ -466,7 +466,11 @@ fn resolve_swarm_spawn_model_inherits_coordinator_auth_route_for_oauth_vs_api() // the same API route, not Claude OAuth (the config default). 
let selection = resolve_swarm_spawn_selection( None, - &coordinator_identity(Some("claude-opus-4-6"), Some("claude-api"), Some("claude-api")), + &coordinator_identity( + Some("claude-opus-4-6"), + Some("claude-api"), + Some("claude-api"), + ), ); assert_eq!(selection.model.as_deref(), Some("claude-opus-4-6")); @@ -478,7 +482,11 @@ fn resolve_swarm_spawn_model_inherits_coordinator_auth_route_for_oauth_vs_api() fn resolve_swarm_spawn_model_keeps_provider_key_when_config_matches_coordinator() { let selection = resolve_swarm_spawn_selection( Some("custom-model".to_string()), - &coordinator_identity(Some("custom-model"), Some("custom-provider"), Some("custom-route")), + &coordinator_identity( + Some("custom-model"), + Some("custom-provider"), + Some("custom-route"), + ), ); assert_eq!(selection.model.as_deref(), Some("custom-model")); @@ -541,8 +549,7 @@ async fn coordinator_identity_falls_back_to_persisted_session_when_agent_busy() // Persist a coordinator session that records a concrete model + auth route. // Persist after the agent is built so it reflects the authoritative on-disk // snapshot the spawn path will read when the agent lock is unavailable. 
- let mut session = - crate::session::Session::create_with_id("coord_busy".to_string(), None, None); + let mut session = crate::session::Session::create_with_id("coord_busy".to_string(), None, None); session.model = Some("claude-opus-4-6".to_string()); session.provider_key = Some("claude-api".to_string()); session.route_api_method = Some("claude-api".to_string()); diff --git a/crates/jcode-app-core/src/tool/selfdev/setup.rs b/crates/jcode-app-core/src/tool/selfdev/setup.rs index 3f07483fb..496329daf 100644 --- a/crates/jcode-app-core/src/tool/selfdev/setup.rs +++ b/crates/jcode-app-core/src/tool/selfdev/setup.rs @@ -21,11 +21,7 @@ impl SetupCheck { } } - fn missing( - name: &'static str, - detail: impl Into, - fix: impl Into, - ) -> Self { + fn missing(name: &'static str, detail: impl Into, fix: impl Into) -> Self { Self { name, ok: false, @@ -102,36 +98,25 @@ impl SelfDevTool { if repo_dir.is_none() { // Only attempt a clone when git is available and we're not in a // synthetic test session. 
- let git_available = checks - .iter() - .any(|check| check.name == "git" && check.ok); + let git_available = checks.iter().any(|check| check.name == "git" && check.ok); if SelfDevTool::is_test_session() { - clone_note = Some( - "Test mode: skipped cloning the jcode source.".to_string(), - ); + clone_note = Some("Test mode: skipped cloning the jcode source.".to_string()); } else if git_available { match Self::clone_selfdev_source() { Ok(path) => { - clone_note = Some(format!( - "Cloned jcode source into {}.", - path.display() - )); + clone_note = Some(format!("Cloned jcode source into {}.", path.display())); repo_dir = Some(path); } Err(err) => { - clone_note = Some(format!( - "Could not clone jcode source automatically: {err}", - )); + clone_note = + Some(format!("Could not clone jcode source automatically: {err}",)); } } } } match &repo_dir { - Some(path) => checks.push(SetupCheck::ok( - "repository", - path.display().to_string(), - )), + Some(path) => checks.push(SetupCheck::ok("repository", path.display().to_string())), None => { let target = Self::selfdev_clone_dir() .map(|p| p.display().to_string()) @@ -152,10 +137,9 @@ impl SelfDevTool { // build before `selfdev reload`/`enter` can hand off into a dev binary. 
if let Some(repo) = repo_dir.as_deref() { match build::find_dev_binary(repo) { - Some(binary) => checks.push(SetupCheck::ok( - "dev binary", - binary.display().to_string(), - )), + Some(binary) => { + checks.push(SetupCheck::ok("dev binary", binary.display().to_string())) + } None => checks.push(SetupCheck::missing( "dev binary", "no built binary in target/selfdev or target/release", @@ -222,7 +206,11 @@ impl SelfDevTool { let format_path = |path: Option<&std::path::Path>| match path { Some(p) => { let exists = p.exists(); - format!("{} {}", p.display(), if exists { "(exists)" } else { "(missing)" }) + format!( + "{} {}", + p.display(), + if exists { "(exists)" } else { "(missing)" } + ) } None => "unavailable".to_string(), }; @@ -293,9 +281,7 @@ impl SelfDevTool { /// is strictly newer than the running process). pub(super) async fn do_reload_to_newer_build(&self, _ctx: &ToolContext) -> Result { if SelfDevTool::is_test_session() { - return Ok(ToolOutput::new( - "Test mode: skipped reload-to-newer-build.", - )); + return Ok(ToolOutput::new("Test mode: skipped reload-to-newer-build.")); } if !server::server_has_newer_binary() { diff --git a/crates/jcode-app-core/src/tool/selfdev/tests.rs b/crates/jcode-app-core/src/tool/selfdev/tests.rs index 4f633c3e6..d569cda02 100644 --- a/crates/jcode-app-core/src/tool/selfdev/tests.rs +++ b/crates/jcode-app-core/src/tool/selfdev/tests.rs @@ -325,7 +325,13 @@ fn non_selfdev_schema_only_exposes_onramp_actions() { sorted, vec!["enter", "find-config", "reload", "setup", "status"] ); - for hidden in ["build", "test", "cancel-build", "socket-info", "socket-help"] { + for hidden in [ + "build", + "test", + "cancel-build", + "socket-info", + "socket-help", + ] { assert!( !actions.contains(&hidden), "on-ramp schema should not expose {hidden}" diff --git a/crates/jcode-app-core/src/tool/task_management.rs b/crates/jcode-app-core/src/tool/task_management.rs index 896e89093..6533f0b27 100644 --- 
a/crates/jcode-app-core/src/tool/task_management.rs +++ b/crates/jcode-app-core/src/tool/task_management.rs @@ -1,5 +1,5 @@ -use super::{Tool, ToolContext, ToolOutput}; use super::team::{TeamConfig, TeamTask}; +use super::{Tool, ToolContext, ToolOutput}; use anyhow::Result; use async_trait::async_trait; use serde::Deserialize; @@ -154,10 +154,7 @@ impl Tool for TaskUpdateTool { let mut team = match TeamConfig::load(¶ms.team_name)? { Some(t) => t, None => { - return Err(anyhow::anyhow!( - "Team '{}' not found.", - params.team_name - )); + return Err(anyhow::anyhow!("Team '{}' not found.", params.team_name)); } }; @@ -233,10 +230,7 @@ impl Tool for TaskListTool { let team = match TeamConfig::load(¶ms.team_name)? { Some(t) => t, None => { - return Err(anyhow::anyhow!( - "Team '{}' not found.", - params.team_name - )); + return Err(anyhow::anyhow!("Team '{}' not found.", params.team_name)); } }; @@ -246,11 +240,22 @@ impl Tool for TaskListTool { params.team_name, team.tasks.len(), team.tasks.iter().filter(|t| t.status == "pending").count(), - team.tasks.iter().filter(|t| t.status == "in_progress").count(), - team.tasks.iter().filter(|t| t.status == "completed").count(), + team.tasks + .iter() + .filter(|t| t.status == "in_progress") + .count(), + team.tasks + .iter() + .filter(|t| t.status == "completed") + .count(), ); - Ok(ToolOutput::new(format!("{}\n\n{}", summary, output)) - .with_title(format!("{} tasks in '{}'", team.tasks.len(), params.team_name))) + Ok( + ToolOutput::new(format!("{}\n\n{}", summary, output)).with_title(format!( + "{} tasks in '{}'", + team.tasks.len(), + params.team_name + )), + ) } } diff --git a/crates/jcode-app-core/src/tool/team.rs b/crates/jcode-app-core/src/tool/team.rs index 72db41b3b..6c8ad7738 100644 --- a/crates/jcode-app-core/src/tool/team.rs +++ b/crates/jcode-app-core/src/tool/team.rs @@ -36,7 +36,7 @@ pub struct TeamTask { pub id: String, pub subject: String, pub description: String, - pub status: String, // "pending" | 
"in_progress" | "completed" + pub status: String, // "pending" | "in_progress" | "completed" pub owner: Option, // member name } @@ -141,11 +141,10 @@ impl Tool for TeamCreateTool { team.save()?; let output = serde_json::to_string_pretty(&team)?; - Ok(ToolOutput::new(format!( - "Team '{}' created.\n\n{}", - params.name, output - )) - .with_title(format!("Team '{}' created", params.name))) + Ok( + ToolOutput::new(format!("Team '{}' created.\n\n{}", params.name, output)) + .with_title(format!("Team '{}' created", params.name)), + ) } } @@ -201,11 +200,10 @@ impl Tool for TeamDeleteTool { Ok(ToolOutput::new(format!("Team '{}' deleted.", params.name)) .with_title(format!("Team '{}' deleted", params.name))) } else { - Ok(ToolOutput::new(format!( - "Team '{}' did not exist (no-op).", - params.name - )) - .with_title(format!("Team '{}' not found", params.name))) + Ok( + ToolOutput::new(format!("Team '{}' did not exist (no-op).", params.name)) + .with_title(format!("Team '{}' not found", params.name)), + ) } } } diff --git a/crates/jcode-base/src/auth/live_provider_probes.rs b/crates/jcode-base/src/auth/live_provider_probes.rs index e551bc0d4..a47f4697a 100644 --- a/crates/jcode-base/src/auth/live_provider_probes.rs +++ b/crates/jcode-base/src/auth/live_provider_probes.rs @@ -1341,7 +1341,10 @@ pub async fn run_live_native_provider_smoke( .with_duration_ms(started.elapsed().as_millis() as u64) .with_evidence("model", serde_json::json!(model)) .with_evidence("matched_expected_content", serde_json::json!(true)) - .with_evidence("stop_reason", serde_json::json!(outcome.stop_reason.clone())); + .with_evidence( + "stop_reason", + serde_json::json!(outcome.stop_reason.clone()), + ); if let Some(usage) = outcome.usage_evidence() { stage = stage.with_evidence("usage", usage); } @@ -1429,7 +1432,10 @@ pub async fn run_live_native_provider_stream_smoke( .with_evidence("attempts", serde_json::json!(attempts)) .with_evidence("total_events", serde_json::json!(outcome.total_events)) 
.with_evidence("matched_expected_content", serde_json::json!(true)) - .with_evidence("stop_reason", serde_json::json!(outcome.stop_reason.clone())); + .with_evidence( + "stop_reason", + serde_json::json!(outcome.stop_reason.clone()), + ); if let Some(usage) = outcome.usage_evidence() { stage = stage.with_evidence("usage", usage); } diff --git a/crates/jcode-base/src/auth/provider_e2e.rs b/crates/jcode-base/src/auth/provider_e2e.rs index cf1f2b3c1..391de4515 100644 --- a/crates/jcode-base/src/auth/provider_e2e.rs +++ b/crates/jcode-base/src/auth/provider_e2e.rs @@ -1321,8 +1321,8 @@ impl NativeProviderKind { /// Returns an error only when the runtime cannot be constructed at all (e.g. /// Copilot with no credential file); model selection happens later. fn build_runtime(self) -> anyhow::Result> { - use anyhow::Context as _; use crate::provider::Provider; + use anyhow::Context as _; let runtime: std::sync::Arc = match self { Self::OpenAi => { let credentials = crate::auth::codex::load_credentials().unwrap_or_else(|_| { @@ -1337,9 +1337,7 @@ impl NativeProviderKind { std::sync::Arc::new(crate::provider::openai::OpenAIProvider::new(credentials)) } Self::Gemini => std::sync::Arc::new(crate::provider::gemini::GeminiProvider::new()), - Self::Cursor => { - std::sync::Arc::new(crate::provider::cursor::CursorCliProvider::new()) - } + Self::Cursor => std::sync::Arc::new(crate::provider::cursor::CursorCliProvider::new()), Self::Copilot => { // `new()` requires a loadable GitHub token; fall back to an empty // token so the offline tier can still construct the runtime for @@ -1354,18 +1352,14 @@ impl NativeProviderKind { crate::env::set_var("JCODE_COPILOT_PREFETCH_STARTUP_GRACE_MS", "0"); let runtime = match crate::provider::copilot::CopilotApiProvider::new() { Ok(runtime) => runtime, - Err(_) => crate::provider::copilot::CopilotApiProvider::new_with_token( - String::new(), - ), + Err(_) => { + crate::provider::copilot::CopilotApiProvider::new_with_token(String::new()) + } }; 
std::sync::Arc::new(runtime) } - Self::Bedrock => { - std::sync::Arc::new(crate::provider::bedrock::BedrockProvider::new()) - } - Self::Jcode => { - std::sync::Arc::new(crate::provider::jcode::JcodeProvider::new()) - } + Self::Bedrock => std::sync::Arc::new(crate::provider::bedrock::BedrockProvider::new()), + Self::Jcode => std::sync::Arc::new(crate::provider::jcode::JcodeProvider::new()), Self::Azure => { // Azure OpenAI is the OpenRouter transport configured via Azure // env; apply that env (endpoint/key/header wiring) before building @@ -1696,8 +1690,14 @@ pub async fn run_generic_native_e2e( )); } } else { - run_generic_native_api_checks(runtime.as_ref(), &selected, spec.label, &mut checks, &mut spend) - .await; + run_generic_native_api_checks( + runtime.as_ref(), + &selected, + spec.label, + &mut checks, + &mut spend, + ) + .await; } } else { for checkpoint in API_DEPENDENT_CHECKPOINTS { diff --git a/crates/jcode-base/src/provider/gemini.rs b/crates/jcode-base/src/provider/gemini.rs index 8e8dc9174..485fb0786 100644 --- a/crates/jcode-base/src/provider/gemini.rs +++ b/crates/jcode-base/src/provider/gemini.rs @@ -849,9 +849,7 @@ impl Provider for GeminiProvider { .await; let _ = tx.send(Ok(StreamEvent::ToolUseEnd)).await; if let Some(signature) = signature { - let _ = tx - .send(Ok(StreamEvent::ToolUseSignature(signature))) - .await; + let _ = tx.send(Ok(StreamEvent::ToolUseSignature(signature))).await; } } else if let Some(signature) = part_signature { // Standalone signature part; remember it for the next diff --git a/crates/jcode-base/src/provider/gemini_tests.rs b/crates/jcode-base/src/provider/gemini_tests.rs index 8d2917a04..21c3bcc6f 100644 --- a/crates/jcode-base/src/provider/gemini_tests.rs +++ b/crates/jcode-base/src/provider/gemini_tests.rs @@ -386,7 +386,10 @@ fn build_tools_strips_additional_properties_for_gemini_schema_compatibility() { assert!(!schema_contains_key(parameters, "additionalProperties")); assert!(!schema_contains_key(parameters, 
"$schema")); // Real schema content is preserved. - assert_eq!(parameters["properties"]["file_path"]["type"], json!("string")); + assert_eq!( + parameters["properties"]["file_path"]["type"], + json!("string") + ); assert_eq!( parameters["properties"]["opts"]["properties"]["limit"]["type"], json!("integer") diff --git a/crates/jcode-base/src/provider/mod.rs b/crates/jcode-base/src/provider/mod.rs index ef4011e37..bfeccd7f9 100644 --- a/crates/jcode-base/src/provider/mod.rs +++ b/crates/jcode-base/src/provider/mod.rs @@ -48,6 +48,7 @@ pub use catalog_routes::{ remote_model_routes_lightweight_fallback, remote_model_should_offer_copilot_route, remote_openai_compatible_route_for_model, simplified_model_routes_for_picker, }; +pub use jcode_provider_core::cli_provider_arg_for_session_key; pub use jcode_provider_core::{ ALL_CLAUDE_MODELS, ALL_OPENAI_MODELS, CHEAPNESS_REFERENCE_INPUT_TOKENS, CHEAPNESS_REFERENCE_OUTPUT_TOKENS, DEFAULT_CONTEXT_LIMIT, EventStream, JCODE_USER_AGENT, @@ -58,7 +59,6 @@ pub use jcode_provider_core::{ normalize_copilot_model_name, provider_from_model_key, shared_http_client, summarize_model_catalog_refresh, }; -pub use jcode_provider_core::cli_provider_arg_for_session_key; pub use jcode_provider_core::{ProviderFailoverPrompt, parse_failover_prompt_message}; pub use route_builders::{ build_anthropic_oauth_route, build_copilot_route, build_openai_api_key_route, diff --git a/crates/jcode-base/src/skill.rs b/crates/jcode-base/src/skill.rs index f704d04ac..3ca1799aa 100644 --- a/crates/jcode-base/src/skill.rs +++ b/crates/jcode-base/src/skill.rs @@ -924,6 +924,5 @@ mod invocation_parse_tests { ); let skill = SkillRegistry::parse_skill(&path).unwrap(); assert_eq!(skill.tags, vec!["rust", "perf"]); ->>>>>>> origin/master } } diff --git a/crates/jcode-base/src/telemetry/tests.rs b/crates/jcode-base/src/telemetry/tests.rs index 0cade87aa..a5871b080 100644 --- a/crates/jcode-base/src/telemetry/tests.rs +++ b/crates/jcode-base/src/telemetry/tests.rs @@ -30,12 
+30,25 @@ fn test_do_not_track() { fn test_is_ci_detects_ci_env() { let _guard = lock_test_env(); // Clear any inherited CI markers so the baseline is deterministic. - for key in ["CI", "GITHUB_ACTIONS", "BUILDKITE", "JENKINS_URL", "GITLAB_CI", "CIRCLECI"] { + for key in [ + "CI", + "GITHUB_ACTIONS", + "BUILDKITE", + "JENKINS_URL", + "GITLAB_CI", + "CIRCLECI", + ] { crate::env::remove_var(key); } - assert!(!is_ci(), "expected non-CI baseline after clearing CI markers"); + assert!( + !is_ci(), + "expected non-CI baseline after clearing CI markers" + ); crate::env::set_var("CI", "true"); - assert!(is_ci(), "CI env var should mark the run as CI (gates install skip)"); + assert!( + is_ci(), + "CI env var should mark the run as CI (gates install skip)" + ); crate::env::remove_var("CI"); assert!(!is_ci()); } diff --git a/crates/jcode-provider-core/src/lib.rs b/crates/jcode-provider-core/src/lib.rs index 93fe676e7..73433d8ad 100644 --- a/crates/jcode-provider-core/src/lib.rs +++ b/crates/jcode-provider-core/src/lib.rs @@ -26,10 +26,10 @@ pub use models::{ provider_for_model_with_hint as core_provider_for_model_with_hint, provider_key_from_hint, }; pub use selection::{ - ActiveProvider, ProviderAvailability, auto_default_provider, - cli_provider_arg_for_session_key, dedupe_model_routes, explicit_model_provider_prefix, - fallback_sequence, model_name_for_provider, parse_provider_hint, provider_from_model_key, - provider_key, provider_label, + ActiveProvider, ProviderAvailability, auto_default_provider, cli_provider_arg_for_session_key, + dedupe_model_routes, explicit_model_provider_prefix, fallback_sequence, + model_name_for_provider, parse_provider_hint, provider_from_model_key, provider_key, + provider_label, }; use anyhow::Result; diff --git a/crates/jcode-provider-core/src/selection.rs b/crates/jcode-provider-core/src/selection.rs index 1c4139cba..bc83ae280 100644 --- a/crates/jcode-provider-core/src/selection.rs +++ b/crates/jcode-provider-core/src/selection.rs @@ 
-361,16 +361,25 @@ mod tests { Some("anthropic-api") ); // Anthropic OAuth -> claude. - assert_eq!(cli_provider_arg_for_session_key("claude-oauth"), Some("claude")); + assert_eq!( + cli_provider_arg_for_session_key("claude-oauth"), + Some("claude") + ); assert_eq!(cli_provider_arg_for_session_key("claude"), Some("claude")); // OpenAI variants. - assert_eq!(cli_provider_arg_for_session_key("openai-oauth"), Some("openai")); + assert_eq!( + cli_provider_arg_for_session_key("openai-oauth"), + Some("openai") + ); assert_eq!( cli_provider_arg_for_session_key("openai-api-key"), Some("openai-api") ); // Passthrough providers. - assert_eq!(cli_provider_arg_for_session_key("openrouter"), Some("openrouter")); + assert_eq!( + cli_provider_arg_for_session_key("openrouter"), + Some("openrouter") + ); assert_eq!(cli_provider_arg_for_session_key("copilot"), Some("copilot")); assert_eq!(cli_provider_arg_for_session_key("gemini"), Some("gemini")); assert_eq!(cli_provider_arg_for_session_key("bedrock"), Some("bedrock")); diff --git a/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs b/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs index 2b4a35903..220a2bd14 100644 --- a/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs +++ b/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs @@ -290,13 +290,34 @@ impl Rng { } const WORDS: &[&str] = &[ - "alpha", "beta", "gamma", "delta", "x", "y", "z", "the", "quick", "brown", - "fox", "中文", "데이터", "emoji", "lorem", "ipsum", "a", "I", "we", "code", + "alpha", + "beta", + "gamma", + "delta", + "x", + "y", + "z", + "the", + "quick", + "brown", + "fox", + "中文", + "데이터", + "emoji", + "lorem", + "ipsum", + "a", + "I", + "we", + "code", ]; fn gen_words(rng: &mut Rng, max: usize) -> String { let n = 1 + rng.below(max); - (0..n).map(|_| *rng.pick(WORDS)).collect::>().join(" ") + (0..n) + .map(|_| *rng.pick(WORDS)) + .collect::>() + .join(" ") } /// Generate an inline fragment (no leading/trailing block structure). 
@@ -307,7 +328,11 @@ fn gen_inline(rng: &mut Rng, depth: usize) -> String { 2 => format!("_{}_", gen_words(rng, 3)), 3 => format!("`{}`", gen_words(rng, 2)), 4 => format!("~~{}~~", gen_words(rng, 2)), - 5 => format!("[{}](http://example.com/{})", gen_words(rng, 2), rng.below(99)), + 5 => format!( + "[{}](http://example.com/{})", + gen_words(rng, 2), + rng.below(99) + ), 6 => format!("${}+{}$", rng.pick(WORDS), rng.pick(WORDS)), 7 => format!("${}", rng.below(999)), // currency _ => format!( @@ -536,8 +561,3 @@ fn fuzz_random_documents_wrapped_parity() { .join("\n\n") ); } - - - - - diff --git a/crates/jcode-tui/src/tui/app/misc_ui.rs b/crates/jcode-tui/src/tui/app/misc_ui.rs index 0d408cc80..58789734d 100644 --- a/crates/jcode-tui/src/tui/app/misc_ui.rs +++ b/crates/jcode-tui/src/tui/app/misc_ui.rs @@ -38,9 +38,8 @@ impl ResolvedTokenPricing { cache_read_tokens: u64, cache_creation_tokens: u64, ) -> f32 { - let split_accounting = self.is_anthropic - || cache_creation_tokens > 0 - || cache_read_tokens > input_tokens; + let split_accounting = + self.is_anthropic || cache_creation_tokens > 0 || cache_read_tokens > input_tokens; let fresh_input_tokens = if split_accounting { input_tokens @@ -275,8 +274,7 @@ impl App { let model = ::provider_model(self); let provider_name = ::provider_name(self).to_lowercase(); - let is_anthropic = - provider_name.contains("anthropic") || provider_name.contains("claude"); + let is_anthropic = provider_name.contains("anthropic") || provider_name.contains("claude"); let is_openai = provider_name.contains("openai"); // The server resolves the active credential authoritatively; only bill diff --git a/crates/jcode-tui/src/tui/app/tests.rs b/crates/jcode-tui/src/tui/app/tests.rs index ea81b409b..0cc1ef805 100644 --- a/crates/jcode-tui/src/tui/app/tests.rs +++ b/crates/jcode-tui/src/tui/app/tests.rs @@ -452,7 +452,10 @@ fn skills_command_marks_active_skill_in_remote_mode() { assert!(content.contains("- /optimization (active)"), "{content}"); 
assert!(content.contains("- /firefox-browser\n"), "{content}"); // Endorsed list should mark remote-installed skills as installed. - assert!(content.contains("/firefox-browser [installed]"), "{content}"); + assert!( + content.contains("/firefox-browser [installed]"), + "{content}" + ); } #[test] diff --git a/crates/jcode-tui/src/tui/info_widget.rs b/crates/jcode-tui/src/tui/info_widget.rs index 9448ba98f..e6d669fbe 100644 --- a/crates/jcode-tui/src/tui/info_widget.rs +++ b/crates/jcode-tui/src/tui/info_widget.rs @@ -419,7 +419,11 @@ pub struct CacheMissAttribution { impl CacheHitInfo { /// Effective total prompt tokens across the session (read denominator). fn effective_reported_tokens(&self) -> u64 { - effective_prompt_tokens(self.reported_input_tokens, self.read_tokens, self.creation_tokens) + effective_prompt_tokens( + self.reported_input_tokens, + self.read_tokens, + self.creation_tokens, + ) } /// Fraction of the session's prompt tokens that were served from cache. diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs index 5fd3f3031..8b56d4a46 100644 --- a/evals/jbench/src/agent_runner.rs +++ b/evals/jbench/src/agent_runner.rs @@ -116,9 +116,9 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { let line = timeout(timeout_duration, lines_stream.next_line()).await; match line { Ok(Ok(Some(l))) => trace_lines.push(l), - Ok(Ok(None)) => break false, // EOF — clean exit - Ok(Err(_)) => break false, // read error - Err(_) => break true, // timeout + Ok(Ok(None)) => break false, // EOF — clean exit + Ok(Err(_)) => break false, // read error + Err(_) => break true, // timeout } }; @@ -130,7 +130,9 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { return Ok(EvalRun { commit_sha: String::new(), prompt: config.prompt, - diff: extract_diff_from_repo(&config.repo_path).await.unwrap_or_default(), + diff: extract_diff_from_repo(&config.repo_path) + .await + .unwrap_or_default(), judging: Default::default(), 
cost_usd: 0.0, duration_ms: start.elapsed().as_millis() as u64, diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs index bce8442a4..5e3c651d0 100644 --- a/evals/jbench/src/bin/jbench.rs +++ b/evals/jbench/src/bin/jbench.rs @@ -131,7 +131,9 @@ async fn main() -> Result<()> { .await?; } #[cfg(not(feature = "agent-runner"))] - anyhow::bail!("'jbench run' requires the 'agent-runner' feature. Enable with: cargo build --features agent-runner"); + anyhow::bail!( + "'jbench run' requires the 'agent-runner' feature. Enable with: cargo build --features agent-runner" + ); } Command::Judge { runs_dir, @@ -190,11 +192,7 @@ async fn pick_commits_impl( } let sha = lines[0].trim(); - let parent_sha = lines[1] - .split_whitespace() - .next() - .unwrap_or("") - .to_string(); + let parent_sha = lines[1].split_whitespace().next().unwrap_or("").to_string(); let subject = lines[2].trim(); // Skip root commits (no parent). @@ -287,13 +285,8 @@ async fn gen_evals_impl(input: &PathBuf, output: &PathBuf) -> Result<()> { })?; // git diff to get the full unified diff. 
- let full_diff = run_git(&[ - "diff", - &format!("{}..{}", pc.parent_sha, pc.sha), - ]) - .with_context(|| { - format!("git diff failed for {}..{}", pc.parent_sha, pc.sha) - })?; + let full_diff = run_git(&["diff", &format!("{}..{}", pc.parent_sha, pc.sha)]) + .with_context(|| format!("git diff failed for {}..{}", pc.parent_sha, pc.sha))?; let file_diffs = parse_diffs(&name_status, &full_diff); @@ -318,8 +311,8 @@ async fn gen_evals_impl(input: &PathBuf, output: &PathBuf) -> Result<()> { eval_commits, }; - let json = serde_json::to_string_pretty(&eval_data) - .context("failed to serialize EvalDataV2")?; + let json = + serde_json::to_string_pretty(&eval_data).context("failed to serialize EvalDataV2")?; std::fs::write(output, &json) .with_context(|| format!("failed to write output file {}", output.display()))?; @@ -562,7 +555,11 @@ fn parse_diffs(name_status: &str, full_diff: &str) -> Vec { // Renamed: "R100\told_path\tnew_path" if parts.len() >= 3 { - (FileDiffStatus::Renamed, parts[2].to_owned(), Some(parts[1].to_owned())) + ( + FileDiffStatus::Renamed, + parts[2].to_owned(), + Some(parts[1].to_owned()), + ) } else { (FileDiffStatus::Modified, parts[1].to_owned(), None) } @@ -583,10 +580,7 @@ fn parse_diffs(name_status: &str, full_diff: &str) -> Vec std::collections::HashMap run_claude_native_e2e(provider, model, tier).await?, Some("antigravity") => run_antigravity_native_e2e(provider, model, tier).await?, Some(other) => { - let kind = NativeProviderKind::from_normalized(other).ok_or_else(|| { - anyhow!("`{provider}` has no native provider-doctor driver") - })?; + let kind = NativeProviderKind::from_normalized(other) + .ok_or_else(|| anyhow!("`{provider}` has no native provider-doctor driver"))?; run_generic_native_e2e(kind, model, tier).await? 
} None => anyhow::bail!("`{provider}` has no native provider-doctor driver"), diff --git a/tests/e2e/reload_multiclient.rs b/tests/e2e/reload_multiclient.rs index dd8fd6b6f..8e6e077cd 100644 --- a/tests/e2e/reload_multiclient.rs +++ b/tests/e2e/reload_multiclient.rs @@ -160,7 +160,10 @@ async fn reload_notifies_successor_after_session_takeover() -> Result<()> { assert!( b_saw, "the live successor connection must be told the server is reloading; saw: {:?}", - b_events.iter().map(|e| format!("{e:?}")).collect::>() + b_events + .iter() + .map(|e| format!("{e:?}")) + .collect::>() ); // The superseded original connection must end (disconnect) rather than From 188a857b353b8d269cd54c85fa35ba2038ba33c1 Mon Sep 17 00:00:00 2001 From: quangdang46 Date: Sat, 6 Jun 2026 01:22:57 +0700 Subject: [PATCH 22/22] fix: address review swarm findings for PR #313 Security fixes: - H1: Add validate_team_name() to prevent path traversal in TeamConfig - H4: Reject BypassPermissions in project-local TOML agent definitions Runtime wiring: - H2: Wire shared AgentRegistry into production Registry::new sites - H3: Add classify_for_session() that checks per-session mode overrides - H5: Add max_turns enforcement in Agent turn loop - H6: Wire agent_def.resolve_model() into SubagentTool model resolution Code quality: - M4: Remove deny_unknown_fields from AgentDefinition for forward compat - M5: Align PermissionMode::parse() with serde kebab-case - M6: Gate experimental team/task tools behind JCODE_EXPERIMENTAL_TOOLS env - M7: Document parent session mutation race condition - M8: Add SessionModeGuard RAII for automatic session mode cleanup All 63 agent-runtime tests pass. cargo check clean. 
--- Cargo.lock | 1 + crates/jcode-agent-runtime/Cargo.toml | 1 + crates/jcode-agent-runtime/src/definition.rs | 13 +- crates/jcode-agent-runtime/src/permission.rs | 35 ++--- crates/jcode-agent-runtime/src/registry.rs | 11 ++ crates/jcode-app-core/src/agent.rs | 5 + .../src/agent/turn_execution.rs | 6 + crates/jcode-app-core/src/agent/turn_loops.rs | 18 +++ crates/jcode-app-core/src/ambient/runner.rs | 8 +- crates/jcode-app-core/src/dcg_bridge.rs | 43 ++++++ crates/jcode-app-core/src/overnight.rs | 1 + crates/jcode-app-core/src/server.rs | 3 +- .../src/server/client_lifecycle.rs | 2 +- .../src/server/client_session.rs | 3 + crates/jcode-app-core/src/server/headless.rs | 2 +- crates/jcode-app-core/src/tool/mod.rs | 122 +++++++++++++----- crates/jcode-app-core/src/tool/task.rs | 56 +++++--- crates/jcode-app-core/src/tool/team.rs | 24 ++++ .../jcode-base/src/provider/gemini_tests.rs | 2 +- src/bin/harness.rs | 2 +- src/cli/commands.rs | 2 +- src/cli/commands_tests.rs | 4 +- src/cli/provider_init.rs | 4 +- src/cli/selfdev_tests.rs | 4 +- tests/e2e/ambient.rs | 8 +- tests/e2e/provider_behavior.rs | 8 +- tests/e2e/session_flow.rs | 4 +- 27 files changed, 287 insertions(+), 105 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 038002b3f..efbedfaaf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5172,6 +5172,7 @@ dependencies = [ "thiserror 1.0.69", "tokio", "toml", + "tracing", ] [[package]] diff --git a/crates/jcode-agent-runtime/Cargo.toml b/crates/jcode-agent-runtime/Cargo.toml index f66eb40ce..9a769a299 100644 --- a/crates/jcode-agent-runtime/Cargo.toml +++ b/crates/jcode-agent-runtime/Cargo.toml @@ -14,6 +14,7 @@ serde = { version = "1", features = ["derive"] } serde_json = "1" toml = "0.8" anyhow = "1" +tracing = "0.1" [dev-dependencies] serde_json = "1" diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs index c26e5f3ad..61ce6190b 100644 --- a/crates/jcode-agent-runtime/src/definition.rs +++ 
b/crates/jcode-agent-runtime/src/definition.rs @@ -47,7 +47,6 @@ pub const DEFAULT_AGENT_VERSION: &str = "0.1.0"; /// Intentionally `Clone` so the runtime can hand each spawn its own copy /// without locking the registry. Definitions are small (a few KB at most). #[derive(Debug, Clone, Serialize, Deserialize)] -#[serde(deny_unknown_fields)] pub struct AgentDefinition { // ----------------------------------------------------------------- // Identity @@ -646,19 +645,15 @@ mod tests { } #[test] - fn toml_unknown_field_is_rejected() { + fn toml_unknown_field_is_silently_ignored() { let src = r#" id = "ok" display_name = "ok" unknown_future_field = "value" "#; - let err = toml::from_str::(src).unwrap_err(); - assert!( - err.to_string().contains("unknown field") - || err.to_string().contains("unknown") - || err.to_string().contains("`unknown_future_field`"), - "expected denial of unknown field, got: {err}" - ); + let def = toml::from_str::(src).expect("unknown fields should be ignored for forward compat"); + assert_eq!(def.id, "ok"); + assert_eq!(def.display_name, "ok"); } // ----------------------------------------------------------------- diff --git a/crates/jcode-agent-runtime/src/permission.rs b/crates/jcode-agent-runtime/src/permission.rs index 2f112efc0..045922933 100644 --- a/crates/jcode-agent-runtime/src/permission.rs +++ b/crates/jcode-agent-runtime/src/permission.rs @@ -64,16 +64,15 @@ impl PermissionMode { } } - /// Parse a permission mode from a string, accepting common variants. + /// Parse a permission mode from a string. Only accepts kebab-case + /// variants matching the serde wire format for consistency. 
pub fn parse(s: &str) -> Option { match s.trim().to_ascii_lowercase().as_str() { "default" => Some(PermissionMode::Default), - "acceptedits" | "accept_edits" | "accept-edits" => Some(PermissionMode::AcceptEdits), + "accept-edits" => Some(PermissionMode::AcceptEdits), "plan" => Some(PermissionMode::Plan), - "dontask" | "dont_ask" | "dont-ask" => Some(PermissionMode::DontAsk), - "bypasspermissions" | "bypass_permissions" | "bypass-permissions" => { - Some(PermissionMode::BypassPermissions) - } + "dont-ask" => Some(PermissionMode::DontAsk), + "bypass-permissions" => Some(PermissionMode::BypassPermissions), "auto" => Some(PermissionMode::Auto), _ => None, } @@ -91,36 +90,20 @@ mod tests { use super::*; #[test] - fn parse_accepts_common_variants() { + fn parse_accepts_kebab_case_only() { assert_eq!( PermissionMode::parse("default"), Some(PermissionMode::Default) ); - assert_eq!( - PermissionMode::parse("AcceptEdits"), - Some(PermissionMode::AcceptEdits) - ); - assert_eq!( - PermissionMode::parse("accept_edits"), - Some(PermissionMode::AcceptEdits) - ); assert_eq!( PermissionMode::parse("accept-edits"), Some(PermissionMode::AcceptEdits) ); assert_eq!(PermissionMode::parse("plan"), Some(PermissionMode::Plan)); assert_eq!( - PermissionMode::parse("DONTASK"), + PermissionMode::parse("dont-ask"), Some(PermissionMode::DontAsk) ); - assert_eq!( - PermissionMode::parse("dont_ask"), - Some(PermissionMode::DontAsk) - ); - assert_eq!( - PermissionMode::parse("bypass_permissions"), - Some(PermissionMode::BypassPermissions) - ); assert_eq!( PermissionMode::parse("bypass-permissions"), Some(PermissionMode::BypassPermissions) @@ -128,6 +111,10 @@ mod tests { assert_eq!(PermissionMode::parse("auto"), Some(PermissionMode::Auto)); assert_eq!(PermissionMode::parse(""), None); assert_eq!(PermissionMode::parse("nonsense"), None); + // Non-kebab-case variants are rejected for serde consistency + assert_eq!(PermissionMode::parse("accept_edits"), None); + 
assert_eq!(PermissionMode::parse("AcceptEdits"), None); + assert_eq!(PermissionMode::parse("bypass_permissions"), None); } #[test] diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs index d322e6a3c..9bc2398a8 100644 --- a/crates/jcode-agent-runtime/src/registry.rs +++ b/crates/jcode-agent-runtime/src/registry.rs @@ -21,6 +21,7 @@ //! session start. Self-dev is welcome to call `reload_from_disk()`. use crate::definition::{AgentDefinition, DefinitionError}; +use crate::permission::PermissionMode; use serde::{Deserialize, Serialize}; use std::collections::HashMap; @@ -228,6 +229,16 @@ impl AgentRegistry { AgentSource::ProjectLocal { path: path.clone() } } }; + let mut definition = definition; + if matches!(source, AgentSource::ProjectLocal { .. }) + && definition.permission_mode == Some(PermissionMode::BypassPermissions) + { + tracing::warn!( + agent_id = %definition.id, + "project-local agent definition attempted to set bypass-permissions; downgrading to default" + ); + definition.permission_mode = None; + } self.insert(LoadedAgent { definition, source }); loaded += 1; } diff --git a/crates/jcode-app-core/src/agent.rs b/crates/jcode-app-core/src/agent.rs index f62fb15e7..5518ef25e 100644 --- a/crates/jcode-app-core/src/agent.rs +++ b/crates/jcode-app-core/src/agent.rs @@ -270,6 +270,10 @@ pub struct Agent { mcp_late_register_resolved: bool, /// Override system prompt (used by ambient mode to inject a custom prompt) system_prompt_override: Option, + /// Maximum number of tool-call turns before the agent is forced to + /// stop. `None` means unlimited. Set by `SubagentTool` from the agent + /// definition's `max_turns` field. + max_turns: Option, /// Whether memory features are enabled for this session memory_enabled: bool, /// One-step undo snapshot captured before the most recent rewind. 
@@ -328,6 +332,7 @@ impl Agent { locked_tools: None, mcp_late_register_resolved: false, system_prompt_override: crate::config::config().provider.system_prompt.clone(), + max_turns: None, memory_enabled: crate::config::config().features.memory, rewind_undo_snapshot: None, stdin_request_tx: None, diff --git a/crates/jcode-app-core/src/agent/turn_execution.rs b/crates/jcode-app-core/src/agent/turn_execution.rs index bb23bded7..f60916a0c 100644 --- a/crates/jcode-app-core/src/agent/turn_execution.rs +++ b/crates/jcode-app-core/src/agent/turn_execution.rs @@ -215,6 +215,10 @@ impl Agent { self.system_prompt_override = Some(prompt.to_string()); } + pub fn set_max_turns(&mut self, max: u32) { + self.max_turns = Some(max); + } + pub fn set_debug(&mut self, is_debug: bool) { self.session.set_debug(is_debug); if let Err(err) = self.session.save() { @@ -246,6 +250,7 @@ impl Agent { pub(super) async fn tool_definitions(&mut self) -> Vec { if self.session.is_canary { self.registry.register_selfdev_tools().await; + self.registry.register_experimental_tools().await; } // Return locked tools if available (prevents cache invalidation from @@ -358,6 +363,7 @@ impl Agent { pub async fn tool_definitions_for_debug(&self) -> Vec { if self.session.is_canary { self.registry.register_selfdev_tools().await; + self.registry.register_experimental_tools().await; } let mut tools = self.registry.definitions(self.allowed_tools.as_ref()).await; if !self.disabled_tools.is_empty() { diff --git a/crates/jcode-app-core/src/agent/turn_loops.rs b/crates/jcode-app-core/src/agent/turn_loops.rs index 8be6df2db..96ccdbd15 100644 --- a/crates/jcode-app-core/src/agent/turn_loops.rs +++ b/crates/jcode-app-core/src/agent/turn_loops.rs @@ -14,8 +14,26 @@ impl Agent { let mut context_limit_retries = 0u32; let mut incomplete_continuations = 0u32; let mut empty_post_tool_continuations = 0u32; + let mut turn_count = 0u32; loop { + turn_count += 1; + if let Some(max) = self.max_turns { + if turn_count > max { + 
logging::info(&format!( + "max_turns limit reached ({}); forcing turn completion", + max + )); + if final_text.is_empty() { + final_text = format!( + "[agent stopped: reached max_turns limit of {}]", + max + ); + } + break; + } + } + let repaired = self.repair_missing_tool_outputs(); if repaired > 0 { logging::warn(&format!( diff --git a/crates/jcode-app-core/src/ambient/runner.rs b/crates/jcode-app-core/src/ambient/runner.rs index 092f17486..8a973d842 100644 --- a/crates/jcode-app-core/src/ambient/runner.rs +++ b/crates/jcode-app-core/src/ambient/runner.rs @@ -385,9 +385,10 @@ impl AmbientRunnerHandle { ) -> anyhow::Result<()> { let session = Session::load(session_id)?; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone(), None).await; + let registry = tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await; if session.is_canary { registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } // Issue #89: ambient cycles previously skipped MCP registration, so // user-installed MCP tools were invisible to the cycle agent — @@ -470,9 +471,10 @@ impl AmbientRunnerHandle { let child_is_canary = child.is_canary; let child_is_debug = child.is_debug; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone(), None).await; + let registry = tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await; if child_is_canary { registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } // Issue #89: register MCP tools for ambient cycles (same as main session). 
registry @@ -928,7 +930,7 @@ impl AmbientRunnerHandle { self.set_running_detail("setting up tools").await; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone(), None).await; + let registry = tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await; registry.register_ambient_tools().await; // Issue #89: register MCP tools so user-installed MCP servers are // available to the ambient agent — without this, the cycle agent diff --git a/crates/jcode-app-core/src/dcg_bridge.rs b/crates/jcode-app-core/src/dcg_bridge.rs index b91ac925f..1612992b3 100644 --- a/crates/jcode-app-core/src/dcg_bridge.rs +++ b/crates/jcode-app-core/src/dcg_bridge.rs @@ -153,6 +153,36 @@ pub fn session_mode(session_id: &str) -> Option { .and_then(|guard| guard.get(session_id).copied()) } +/// RAII guard that clears a per-session permission mode on drop. +/// +/// Use this instead of manual `set_session_mode` / `clear_session_mode` +/// pairs to guarantee cleanup even when the subagent exits via early +/// return or error path. +pub struct SessionModeGuard { + session_id: String, +} + +impl SessionModeGuard { + /// Set the per-session mode and return a guard that will clear it on + /// drop. If `mode` is `None`, no override is set and the guard is a + /// no-op on drop (but still safe to hold). + #[must_use] + pub fn new(session_id: &str, mode: Option) -> Self { + if let Some(mode) = mode { + set_session_mode(session_id, mode); + } + Self { + session_id: session_id.to_string(), + } + } +} + +impl Drop for SessionModeGuard { + fn drop(&mut self) { + clear_session_mode(&self.session_id); + } +} + /// Classify an action using the agent-specific permission mode when /// provided, falling back to the global mode otherwise. 
/// @@ -170,6 +200,19 @@ pub fn classify_for_agent( classify_with_mode(action, mode) } +/// Classify an action using the per-session mode override when one exists +/// for `session_id`, falling back to the global mode otherwise. +/// +/// This is the session-aware variant of [`classify`]. Call sites that +/// know the session id (e.g. tool execution within a subagent) should +/// prefer this over the global [`classify`] so that per-session +/// permission overrides set via [`set_session_mode`] are honoured. +#[must_use] +pub fn classify_for_session(action: &str, session_id: &str) -> BridgeDecision { + let mode = session_mode(session_id).unwrap_or_else(current_mode); + classify_with_mode(action, mode) +} + /// Three-state outcome from the bridge. jcode's `SafetySystem` collapses /// `Allow` to `ActionTier::AutoAllowed` and `Prompt`/`Deny` to /// `ActionTier::RequiresPermission` — but exposing the full set here diff --git a/crates/jcode-app-core/src/overnight.rs b/crates/jcode-app-core/src/overnight.rs index a619cdaaf..ee181ef3c 100644 --- a/crates/jcode-app-core/src/overnight.rs +++ b/crates/jcode-app-core/src/overnight.rs @@ -253,6 +253,7 @@ async fn run_supervisor( if child_is_canary { registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } let mut agent = Agent::new_with_session(provider, registry, child, None); diff --git a/crates/jcode-app-core/src/server.rs b/crates/jcode-app-core/src/server.rs index 8669c9404..6ae36c4bc 100644 --- a/crates/jcode-app-core/src/server.rs +++ b/crates/jcode-app-core/src/server.rs @@ -636,9 +636,10 @@ impl Server { let previous_status = session.status.clone(); let provider = self.provider.fork(); - let registry = crate::tool::Registry::new(provider.clone(), None).await; + let registry = crate::tool::Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await; if session.is_canary { registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } registry 
.register_mcp_tools( diff --git a/crates/jcode-app-core/src/server/client_lifecycle.rs b/crates/jcode-app-core/src/server/client_lifecycle.rs index e52e2dd05..38fc6d646 100644 --- a/crates/jcode-app-core/src/server/client_lifecycle.rs +++ b/crates/jcode-app-core/src/server/client_lifecycle.rs @@ -418,7 +418,7 @@ pub(super) async fn handle_client( let provider = provider_template.fork(); let t0 = std::time::Instant::now(); - let registry = Registry::new(provider.clone(), None).await; + let registry = Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await; let registry_ms = t0.elapsed().as_millis(); let mut swarm_enabled = crate::config::config().features.swarm; diff --git a/crates/jcode-app-core/src/server/client_session.rs b/crates/jcode-app-core/src/server/client_session.rs index 01b229fd1..d0542800a 100644 --- a/crates/jcode-app-core/src/server/client_session.rs +++ b/crates/jcode-app-core/src/server/client_session.rs @@ -592,6 +592,7 @@ pub(super) async fn handle_subscribe( } drop(agent_guard); registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } let mcp_register_ms = if register_mcp_tools { @@ -1039,6 +1040,7 @@ pub(super) async fn handle_resume_session( if is_canary { *client_selfdev = true; registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } *client_session_id = session_id.clone(); @@ -1233,6 +1235,7 @@ pub(super) async fn handle_resume_session( if result.is_ok() && is_canary { *client_selfdev = true; registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } match result { diff --git a/crates/jcode-app-core/src/server/headless.rs b/crates/jcode-app-core/src/server/headless.rs index ca2093a3f..8dc03feaa 100644 --- a/crates/jcode-app-core/src/server/headless.rs +++ b/crates/jcode-app-core/src/server/headless.rs @@ -50,7 +50,7 @@ pub(super) async fn create_headless_session( }; let provider = provider_template.fork(); - let registry = 
Registry::new(provider.clone(), None).await; + let registry = Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await; registry.enable_memory_test_mode().await; diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs index ecb9cc2f1..2299e5d88 100644 --- a/crates/jcode-app-core/src/tool/mod.rs +++ b/crates/jcode-app-core/src/tool/mod.rs @@ -98,6 +98,26 @@ fn session_tool_policy(session_id: &str) -> Option { .cloned() } +static SHARED_AGENT_REGISTRY: LazyLock>> = + LazyLock::new(|| { + let home = dirs::home_dir(); + let cwd = std::env::current_dir().ok(); + let mut registry = jcode_agent_runtime::AgentRegistry::new(); + registry.discover_standard_paths( + home.as_deref(), + cwd.as_deref(), + ); + if registry.is_empty() { + None + } else { + Some(Arc::new(registry)) + } + }); + +pub fn shared_agent_registry() -> Option> { + SHARED_AGENT_REGISTRY.clone() +} + /// Registry of available tools (Arc-wrapped for sharing) /// /// Clone creates a fresh CompactionManager so each subagent gets independent @@ -254,36 +274,6 @@ impl Registry { Self::insert_tool_timed(&mut m, &mut timings, "gmail", gmail::GmailTool::new); Self::insert_tool_timed(&mut m, &mut timings, "schedule", ambient::ScheduleTool::new); Self::insert_tool_timed(&mut m, &mut timings, "selfdev", selfdev::SelfDevTool::new); - Self::insert_tool_timed( - &mut m, - &mut timings, - "team_create", - team::TeamCreateTool::new, - ); - Self::insert_tool_timed( - &mut m, - &mut timings, - "team_delete", - team::TeamDeleteTool::new, - ); - Self::insert_tool_timed( - &mut m, - &mut timings, - "task_create", - task_management::TaskCreateTool::new, - ); - Self::insert_tool_timed( - &mut m, - &mut timings, - "task_update", - task_management::TaskUpdateTool::new, - ); - Self::insert_tool_timed( - &mut m, - &mut timings, - "task_list", - task_management::TaskListTool::new, - ); let nonzero: Vec = timings .iter() .filter(|(_, ms)| *ms > 0) @@ -383,6 +373,45 @@ impl Registry { 
Self::insert_tool(&mut tools_map, "dcp_recompress", DcpRecompressTool::new()); } + // Register experimental team/task tools when opted in via env var. + // Canary sessions register these explicitly via register_experimental_tools(). + let experimental_tools_enabled = matches!( + std::env::var("JCODE_EXPERIMENTAL_TOOLS") + .ok() + .as_deref() + .map(str::trim) + .map(str::to_ascii_lowercase) + .as_deref(), + Some("1") | Some("true") | Some("yes") | Some("on") + ); + if experimental_tools_enabled && !no_builtin { + Self::insert_tool( + &mut tools_map, + "team_create", + team::TeamCreateTool::new(), + ); + Self::insert_tool( + &mut tools_map, + "team_delete", + team::TeamDeleteTool::new(), + ); + Self::insert_tool( + &mut tools_map, + "task_create", + task_management::TaskCreateTool::new(), + ); + Self::insert_tool( + &mut tools_map, + "task_update", + task_management::TaskUpdateTool::new(), + ); + Self::insert_tool( + &mut tools_map, + "task_list", + task_management::TaskListTool::new(), + ); + } + let write_start = std::time::Instant::now(); *registry.tools.write().await = tools_map; let write_ms = write_start.elapsed().as_millis(); @@ -995,6 +1024,39 @@ impl Registry { .await; } + /// Register experimental team/task tools. + /// + /// Gated behind `JCODE_EXPERIMENTAL_TOOLS=1` or canary sessions. + /// These tools expose team and task management primitives that are + /// still under active development and not yet ready for general use. 
+ pub async fn register_experimental_tools(&self) { + self.register( + "team_create".to_string(), + Arc::new(team::TeamCreateTool::new()) as Arc, + ) + .await; + self.register( + "team_delete".to_string(), + Arc::new(team::TeamDeleteTool::new()) as Arc, + ) + .await; + self.register( + "task_create".to_string(), + Arc::new(task_management::TaskCreateTool::new()) as Arc, + ) + .await; + self.register( + "task_update".to_string(), + Arc::new(task_management::TaskUpdateTool::new()) as Arc, + ) + .await; + self.register( + "task_list".to_string(), + Arc::new(task_management::TaskListTool::new()) as Arc, + ) + .await; + } + /// Register ambient-mode tools (only for ambient sessions) pub async fn register_ambient_tools(&self) { self.register( diff --git a/crates/jcode-app-core/src/tool/task.rs b/crates/jcode-app-core/src/tool/task.rs index 6c87e65ef..31546dddf 100644 --- a/crates/jcode-app-core/src/tool/task.rs +++ b/crates/jcode-app-core/src/tool/task.rs @@ -193,19 +193,39 @@ impl Tool for SubagentTool { }; let parent_subagent_model = Self::preferred_parent_subagent_model(&ctx.session_id); let provider_model = self.provider.model(); - let resolved_model = Self::resolve_model( - params.model.as_deref(), - session.model.as_deref(), - parent_subagent_model.as_deref(), - &provider_model, - ); + // When the agent definition specifies model_override or prefer_tier, + // use its resolve_model() which honours those fields. Otherwise fall + // back to the standard resolution chain. 
+ let resolved_model = if let Some(def) = agent_def { + if def.model_override.is_some() || def.prefer_tier.is_some() { + def.resolve_model(&provider_model) + } else { + Self::resolve_model( + params.model.as_deref(), + session.model.as_deref(), + parent_subagent_model.as_deref(), + &provider_model, + ) + } + } else { + Self::resolve_model( + params.model.as_deref(), + session.model.as_deref(), + parent_subagent_model.as_deref(), + &provider_model, + ) + }; session.model = Some(resolved_model.clone()); if let Some(ref working_dir) = ctx.working_dir { session.working_dir = Some(working_dir.display().to_string()); } - // Register child in parent's session + // Register child in parent's session. + // NOTE: This load→mutate→save sequence is not atomic. Concurrent + // subagent spawns sharing the same parent could clobber each + // other's `children` entries. Acceptable for experimental Phase 0; + // a file-lock or in-memory session cache would fix this properly. if let Ok(mut parent_session) = Session::load(&ctx.session_id) { parent_session.add_child(session.id.clone()); let _ = parent_session.save(); @@ -214,16 +234,19 @@ impl Tool for SubagentTool { session.save()?; // Propagate the effective permission mode to the child session so - // that `dcg_bridge::classify_for_agent` / `session_mode` observe it - // during the child's tool execution. + // that `dcg_bridge::classify_for_session` / `session_mode` observe + // it during the child's tool execution. The guard clears the + // override on drop (both success and error paths). 
let child_session_id = session.id.clone(); - if let Some(pm) = effective_permission_mode { - let dcg_mode = dcg_bridge::permission_mode_to_dcg(pm); - dcg_bridge::set_session_mode(&child_session_id, dcg_mode); + let _mode_guard = dcg_bridge::SessionModeGuard::new( + &child_session_id, + effective_permission_mode.map(dcg_bridge::permission_mode_to_dcg), + ); + if effective_permission_mode.is_some() { logging::info(&format!( "[tool:subagent] session {} permission mode: {} (from agent definition)", child_session_id, - pm.as_str(), + effective_permission_mode.unwrap().as_str(), )); } @@ -325,8 +348,9 @@ impl Tool for SubagentTool { )); } if let Some(max_turns) = def.max_turns { + agent.set_max_turns(max_turns); logging::info(&format!( - "[tool:subagent] agent definition '{}' specifies max_turns={}", + "[tool:subagent] agent definition '{}' max_turns={} enforced", params.subagent_type, max_turns, )); } @@ -344,7 +368,6 @@ impl Tool for SubagentTool { resolved_model, err )); - dcg_bridge::clear_session_mode(&child_session_id); return Err(err); } }; @@ -367,8 +390,7 @@ impl Tool for SubagentTool { start.elapsed().as_secs_f64() )); - // Clean up per-session permission mode to prevent unbounded growth. - dcg_bridge::clear_session_mode(&child_session_id); + // _mode_guard drops here, clearing the per-session permission override. listener.abort(); diff --git a/crates/jcode-app-core/src/tool/team.rs b/crates/jcode-app-core/src/tool/team.rs index 6c8ad7738..39b48fc75 100644 --- a/crates/jcode-app-core/src/tool/team.rs +++ b/crates/jcode-app-core/src/tool/team.rs @@ -40,9 +40,31 @@ pub struct TeamTask { pub owner: Option, // member name } +/// Validate that a team name is safe for use as a filename. +/// Rejects path traversal attempts and special characters. 
+fn validate_team_name(name: &str) -> Result<()> { + if name.is_empty() { + anyhow::bail!("Team name cannot be empty"); + } + if name.contains("..") || name.contains('/') || name.contains('\\') { + anyhow::bail!( + "Team name '{}' is invalid: must not contain '..', '/', or '\\'", + name + ); + } + if !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-') { + anyhow::bail!( + "Team name '{}' is invalid: only alphanumeric, hyphen, and underscore allowed", + name + ); + } + Ok(()) +} + impl TeamConfig { /// Load a team config from disk by name. pub fn load(name: &str) -> Result> { + validate_team_name(name)?; let path = teams_dir().join(format!("{name}.json")); if !path.exists() { return Ok(None); @@ -53,6 +75,7 @@ impl TeamConfig { /// Save this team config to disk. pub fn save(&self) -> Result<()> { + validate_team_name(&self.name)?; let dir = teams_dir(); std::fs::create_dir_all(&dir)?; let path = dir.join(format!("{}.json", self.name)); @@ -63,6 +86,7 @@ impl TeamConfig { /// Delete a team config from disk by name. 
pub fn delete(name: &str) -> Result<()> { + validate_team_name(name)?; let path = teams_dir().join(format!("{name}.json")); if path.exists() { std::fs::remove_file(&path)?; diff --git a/crates/jcode-base/src/provider/gemini_tests.rs b/crates/jcode-base/src/provider/gemini_tests.rs index 21c3bcc6f..b59ce9225 100644 --- a/crates/jcode-base/src/provider/gemini_tests.rs +++ b/crates/jcode-base/src/provider/gemini_tests.rs @@ -400,7 +400,7 @@ fn build_tools_strips_additional_properties_for_gemini_schema_compatibility() { #[tokio::test] async fn build_tools_from_registry_definitions_omits_const_keywords() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; let built = build_tools(&defs).expect("gemini tools"); diff --git a/src/bin/harness.rs b/src/bin/harness.rs index d6e9a301d..e0a467f98 100644 --- a/src/bin/harness.rs +++ b/src/bin/harness.rs @@ -73,7 +73,7 @@ async fn main() -> Result<()> { eprintln!("Harness workspace: {}", workspace.display()); let provider: Arc = Arc::new(NoopProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let session_id = new_id("harness"); let base_ctx = ToolContext { diff --git a/src/cli/commands.rs b/src/cli/commands.rs index cbe734875..a257ce34e 100644 --- a/src/cli/commands.rs +++ b/src/cli/commands.rs @@ -2595,7 +2595,7 @@ pub async fn run_single_message_command( } else { super::provider_init::init_provider_for_validation(choice, model).await? 
}; - let registry = crate::tool::Registry::new(provider.clone()).await; + let registry = crate::tool::Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await; let mut agent = crate::agent::Agent::new(provider.clone(), registry); restore_agent_session_if_requested(&mut agent, resume_session)?; diff --git a/src/cli/commands_tests.rs b/src/cli/commands_tests.rs index c8aba0c90..224e4bceb 100644 --- a/src/cli/commands_tests.rs +++ b/src/cli/commands_tests.rs @@ -952,7 +952,7 @@ async fn restore_agent_session_if_requested_restores_resumed_session() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut original = crate::agent::Agent::new(provider.clone(), registry); let original_session_id = original.session_id().to_string(); original @@ -960,7 +960,7 @@ async fn restore_agent_session_if_requested_restores_resumed_session() { .await .expect("seed session"); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut resumed = crate::agent::Agent::new(provider, registry); let fresh_session_id = resumed.session_id().to_string(); assert_ne!(fresh_session_id, original_session_id); diff --git a/src/cli/provider_init.rs b/src/cli/provider_init.rs index 7fce11689..f6efc81a7 100644 --- a/src/cli/provider_init.rs +++ b/src/cli/provider_init.rs @@ -1780,7 +1780,7 @@ pub async fn init_provider_and_registry( model: Option<&str>, ) -> Result<(Arc, tool::Registry)> { let provider = init_provider(choice, model).await?; - let registry = tool::Registry::new(provider.clone()).await; + let registry = tool::Registry::new(provider.clone(), tool::shared_agent_registry()).await; Ok((provider, registry)) } @@ -1789,7 +1789,7 @@ pub async fn init_provider_and_registry_for_validation( model: Option<&str>, ) -> Result<(Arc, tool::Registry)> { let 
provider = init_provider_for_validation(choice, model).await?; - let registry = tool::Registry::new(provider.clone()).await; + let registry = tool::Registry::new(provider.clone(), tool::shared_agent_registry()).await; Ok((provider, registry)) } diff --git a/src/cli/selfdev_tests.rs b/src/cli/selfdev_tests.rs index 643f73902..0836c9df7 100644 --- a/src/cli/selfdev_tests.rs +++ b/src/cli/selfdev_tests.rs @@ -130,7 +130,7 @@ async fn test_selfdev_tool_registration() { assert!(session.is_canary, "Session should be marked as canary"); let provider = Arc::new(TestProvider) as Arc; - let registry = tool::Registry::new(provider).await; + let registry = tool::Registry::new(provider, None).await; let tools_before: Vec = registry.tool_names().await; let has_selfdev_before = tools_before.contains(&"selfdev".to_string()); @@ -167,7 +167,7 @@ async fn test_selfdev_session_and_registry() { assert!(loaded.is_canary, "Loaded session should be canary"); let provider = Arc::new(TestProvider) as Arc; - let registry = tool::Registry::new(provider.clone()).await; + let registry = tool::Registry::new(provider.clone(), None).await; let tools_before = registry.tool_names().await; assert!( diff --git a/tests/e2e/ambient.rs b/tests/e2e/ambient.rs index d92012834..9438f2b0f 100644 --- a/tests/e2e/ambient.rs +++ b/tests/e2e/ambient.rs @@ -203,7 +203,7 @@ async fn test_ambient_end_cycle_tool() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_ambient_tools().await; let mut agent = Agent::new(provider, registry); @@ -261,7 +261,7 @@ async fn test_ambient_request_permission_tool() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_ambient_tools().await; let mut agent = Agent::new(provider, registry); @@ -309,7 
+309,7 @@ async fn test_ambient_schedule_tool() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_ambient_tools().await; let mut agent = Agent::new(provider, registry); @@ -585,7 +585,7 @@ async fn test_full_ambient_cycle_simulation() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_ambient_tools().await; let mut agent = Agent::new(provider.clone(), registry); diff --git a/tests/e2e/provider_behavior.rs b/tests/e2e/provider_behavior.rs index 5bce2b96f..f82213547 100644 --- a/tests/e2e/provider_behavior.rs +++ b/tests/e2e/provider_behavior.rs @@ -25,7 +25,7 @@ async fn test_multi_turn_conversation() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); // First turn @@ -60,7 +60,7 @@ async fn test_token_usage() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let response = agent.run_once_capture("Test").await?; @@ -84,7 +84,7 @@ async fn test_stream_error() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let result = agent.run_once_capture("Test").await; @@ -800,7 +800,7 @@ async fn test_system_prompt_no_claude_code_identity() -> Result<()> { // Keep a clone of Arc before converting to Arc let provider_for_check = provider.clone(); let provider_dyn: Arc = 
provider; - let registry = Registry::new(provider_dyn.clone()).await; + let registry = Registry::new(provider_dyn.clone(), None).await; let mut agent = Agent::new(provider_dyn, registry); // Run a simple query - we just need to trigger a complete() call diff --git a/tests/e2e/session_flow.rs b/tests/e2e/session_flow.rs index b84df85a1..587781d8b 100644 --- a/tests/e2e/session_flow.rs +++ b/tests/e2e/session_flow.rs @@ -138,7 +138,7 @@ async fn test_simple_response() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let response = agent.run_once_capture("Say hello").await?; @@ -154,7 +154,7 @@ async fn test_agent_clear_preserves_debug_flag() -> Result<()> { let _env = setup_test_env()?; let provider = MockProvider::new(); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); agent.set_debug(true); let old_session_id = agent.session_id().to_string();