From 1f4f1da4a25404190e14bb85b4824661f7021cf7 Mon Sep 17 00:00:00 2001
From: quangdang46 <tranquangdang21@gmail.com>
Date: Mon, 25 May 2026 21:44:29 +0700
Subject: [PATCH 01/22] feat(agent-runtime): add AgentDefinition + ModelTier +
 OutputMode (Phase 0.1+0.2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lay the foundation for declarative agent definitions modeled on
Codebuff's AgentDefinition schema, but tailored to jcode's single-OAuth
provider reality:

- signals.rs: existing soft-interrupt + cancellation primitives moved
  into a named module; root-level re-exports preserved so src/agent.rs
  consumers compile unchanged.
- definition.rs: AgentDefinition struct (id, model_override, prefer_tier,
  reasoning, tool_names, spawnable_agents, prompts, output_mode,
  inherit_parent_system_prompt, include_message_history) with TOML
  round-trip + validation for id format, system_prompt vs inherit
  conflict, structured_output schema requirement, self-spawn, and
  duplicate tool/agent ids.
- tier.rs: user-defined tier slot (routine/thinking) backed by the
  same JCODE_ROUTING_* env vars as model_routing.rs (#100). NOT a
  catalog — agents inherit session model when no tier is configured,
  so subscription users (Claude Pro / ChatGPT Plus / Gemini Advanced)
  see no behavior change. Pay-per-token users opt in by setting two
  env vars.
- reasoning.rs: ReasoningEffort enum (minimal/low/medium/high).
- output.rs: OutputMode enum (last_message/all_messages/structured_output).

32 unit tests pass. Full `cargo check --bin jcode` succeeds.

This is Phase 0 of the multi-agent foundation — no runtime engine
changes yet. Next: TOML loader for .jcode/agents/*.toml + builtin
embedded agents (Phase 0.3).
---
 Cargo.lock                                   |   4 +
 crates/jcode-agent-runtime/Cargo.toml        |   7 +
 crates/jcode-agent-runtime/src/definition.rs | 495 +++++++++++++++++++
 crates/jcode-agent-runtime/src/lib.rs        | 134 ++---
 crates/jcode-agent-runtime/src/output.rs     |  75 +++
 crates/jcode-agent-runtime/src/reasoning.rs  | 108 ++++
 crates/jcode-agent-runtime/src/signals.rs    |  98 ++++
 crates/jcode-agent-runtime/src/tier.rs       | 330 +++++++++++++
 8 files changed, 1160 insertions(+), 91 deletions(-)
 create mode 100644 crates/jcode-agent-runtime/src/definition.rs
 create mode 100644 crates/jcode-agent-runtime/src/output.rs
 create mode 100644 crates/jcode-agent-runtime/src/reasoning.rs
 create mode 100644 crates/jcode-agent-runtime/src/signals.rs
 create mode 100644 crates/jcode-agent-runtime/src/tier.rs
diff --git a/Cargo.lock b/Cargo.lock
index 97eb00ea4..20990af27 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3645,8 +3645,12 @@ dependencies = [
 name = "jcode-agent-runtime"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
+ "serde",
+ "serde_json",
  "thiserror 1.0.69",
  "tokio",
+ "toml",
 ]
 
 [[package]]
diff --git a/crates/jcode-agent-runtime/Cargo.toml b/crates/jcode-agent-runtime/Cargo.toml
index c475c51d8..f66eb40ce 100644
--- a/crates/jcode-agent-runtime/Cargo.toml
+++ b/crates/jcode-agent-runtime/Cargo.toml
@@ -10,3 +10,10 @@ path = "src/lib.rs"
 [dependencies]
 thiserror = "1"
 tokio = { version = "1", features = ["sync"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+toml = "0.8"
+anyhow = "1"
+
+[dev-dependencies]
+serde_json = "1"
diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
new file mode 100644
index 000000000..3e2203e8b
--- /dev/null
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -0,0 +1,495 @@
+//! Declarative agent definitions.
+//!
+//! An `AgentDefinition` is the schema that describes a sub-agent: its model
+//! preferences, the tools it's allowed to call, the agents it can spawn,
+//! the prompts it ships, and how its output flows back to its parent.
+//!
+//! Definitions are loaded from TOML files in three locations (highest
+//! priority first):
+//!
+//!   1. `.jcode/agents/<id>.toml` (project-local, committed to repo)
+//!   2. `~/.jcode/agents/<id>.toml` (user-global)
+//!   3. Embedded built-in agents bundled with the binary
+//!
+//! ## Design constraints
+//!
+//! - Definitions are **declarative TOML**, not Rust code, so users can
+//!   add agents without recompiling the binary.
+//! - `model` is **not required**: agents inherit the session's current
+//!   model unless they explicitly opt into tier slots or override.
+//! - `tool_names` is a whitelist — agents start with NO tools by
+//!   default and must list every tool they need. This is a security
+//!   property: a poorly-defined agent can't escalate by accident.
+//! - `spawnable_agents` is also a whitelist for the same reason.
+//!
+//! ## Adapted from Codebuff
+//!
+//! Field names track Codebuff's `AgentDefinition` (snake_case Rust →
+//! camelCase TS) so prior art is reusable. Differences:
+//!
+//! - No required `model` string field — replaced by tier + override.
+//! - No `providerOptions` — jcode's session has a single provider.
+//! - `handle_steps` is a future addition (programmatic agents arrive in
+//!   Phase 2); for now agents are pure prompted.
+
+use crate::output::OutputMode;
+use crate::reasoning::ReasoningEffort;
+use crate::tier::ModelTier;
+
+use serde::{Deserialize, Serialize};
+
+/// Default version assigned when a definition omits `version`.
+pub const DEFAULT_AGENT_VERSION: &str = "0.1.0";
+
+/// Declarative description of one agent.
+///
+/// Intentionally `Clone` so the runtime can hand each spawn its own copy
+/// without locking the registry. Definitions are small (a few KB at most).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AgentDefinition {
+    // -----------------------------------------------------------------
+    // Identity
+    // -----------------------------------------------------------------
+    /// Unique agent id. Lowercase letters, digits, hyphens. e.g. `file-picker`.
+    /// Must be unique within the registry — duplicate ids are a load error.
+    pub id: String,
+
+    /// Human-readable name shown in TUI / logs. e.g. `"Fletcher the File Fetcher"`.
+    pub display_name: String,
+
+    /// Publisher / namespace id when this agent is shared across projects.
+    /// Optional for local agents; required if the agent is published to a
+    /// future agent registry.
+    #[serde(default)]
+    pub publisher: Option<String>,
+
+    /// Semver-ish version. Defaults to `DEFAULT_AGENT_VERSION`.
+    #[serde(default = "default_version")]
+    pub version: String,
+
+    // -----------------------------------------------------------------
+    // Model selection
+    // -----------------------------------------------------------------
+    /// Optional tier slot to prefer when running this agent. The slot is
+    /// resolved against `JCODE_ROUTING_<TIER>` env vars at run time.
+    /// Falls back to the session's current model if unset.
+    ///
+    /// See `tier.rs` for the full resolution algorithm.
+    #[serde(default)]
+    pub prefer_tier: Option<ModelTier>,
+
+    /// Optional explicit model id override. Highest priority — beats
+    /// `prefer_tier` and the session default. Use sparingly; hardcoding
+    /// model ids makes the agent file non-portable across providers.
+    #[serde(default)]
+    pub model_override: Option<String>,
+
+    /// Optional reasoning effort to forward to the provider request.
+    /// Defaults are model-specific; runtime fills in a sensible default
+    /// when this field is `None`.
+    #[serde(default)]
+    pub reasoning: Option<ReasoningEffort>,
+
+    // -----------------------------------------------------------------
+    // Tools and sub-agents
+    // -----------------------------------------------------------------
+    /// Allowlist of tool names this agent may call. Empty list = no tools.
+    /// Whitelist semantics are deliberate — agents shouldn't have access
+    /// to tools they don't need.
+    #[serde(default)]
+    pub tool_names: Vec<String>,
+
+    /// Allowlist of agent ids this agent may `spawn_agents` / `spawn_agent_inline`.
+    /// Empty list = no spawning. Use the local agent id (e.g. `file-picker`)
+    /// or the future `publisher/agent@version` form for shared agents.
+    #[serde(default)]
+    pub spawnable_agents: Vec<String>,
+
+    // -----------------------------------------------------------------
+    // Prompts
+    // -----------------------------------------------------------------
+    /// System prompt for this agent. Background, persona, mandates.
+    /// Mutually exclusive with `inherit_parent_system_prompt = true`
+    /// (which means "use the parent's system prompt instead, for cache
+    /// prefix sharing").
+    #[serde(default)]
+    pub system_prompt: String,
+
+    /// Instructions inserted after each user message. The most common
+    /// place to shape agent behavior — terser than `system_prompt`,
+    /// changes per turn allowed.
+    #[serde(default)]
+    pub instructions_prompt: Option<String>,
+
+    /// Optional reminder inserted at every agent step. Use sparingly —
+    /// strong models follow `instructions_prompt` reliably; this is for
+    /// weaker models or agents that need a per-step nudge.
+    #[serde(default)]
+    pub step_prompt: Option<String>,
+
+    /// Spawner-side prompt: when and why a parent agent should spawn this
+    /// agent. Used in `spawn_agents` tool documentation so the parent's
+    /// LLM picks the right sub-agent.
+    #[serde(default)]
+    pub spawner_prompt: Option<String>,
+
+    // -----------------------------------------------------------------
+    // Context / cache behavior
+    // -----------------------------------------------------------------
+    /// When true, child agent uses the parent's `system_prompt` instead
+    /// of its own. This is the **prompt cache prefix-sharing trick** —
+    /// editor / reviewer agents typically set this to `true` so the
+    /// expensive system prompt is cache-hit rather than re-sent.
+    ///
+    /// Mutually exclusive with a non-empty `system_prompt`.
+    #[serde(default)]
+    pub inherit_parent_system_prompt: bool,
+
+    /// When true, child agent receives the parent's full message history.
+    /// Default false — most sub-agents work better with a clean slate
+    /// (file-picker doesn't need to see edit chatter).
+    #[serde(default)]
+    pub include_message_history: bool,
+
+    // -----------------------------------------------------------------
+    // Output
+    // -----------------------------------------------------------------
+    /// How the agent's output is delivered to the parent. Default
+    /// `LastMessage`.
+    #[serde(default)]
+    pub output_mode: OutputMode,
+
+    /// JSON schema for `StructuredOutput` mode, to be enforced when the agent
+    /// calls `set_output`. Stored as a raw JSON value for now: no JSON-schema
+    /// crate is pulled in yet, so real validation is deferred to Phase 3.
+    #[serde(default)]
+    pub output_schema: Option<serde_json::Value>,
+}
+
+fn default_version() -> String {
+    DEFAULT_AGENT_VERSION.to_string()
+}
+
+/// Validation errors produced when an agent definition violates its
+/// invariants. Displayed to users when a TOML file fails to load.
+#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
+pub enum DefinitionError {
+    #[error(
+        "agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen, with no leading or trailing hyphen"
+    )]
+    InvalidId(String),
+
+    #[error(
+        "agent `{id}` has both `inherit_parent_system_prompt = true` and a non-empty `system_prompt`. Set one or the other."
+    )]
+    SystemPromptConflict { id: String },
+
+    #[error(
+        "agent `{id}` has `output_mode = structured_output` but `output_schema` is missing"
+    )]
+    StructuredOutputMissingSchema { id: String },
+
+    #[error("agent `{id}` references itself in `spawnable_agents`")]
+    SelfSpawn { id: String },
+
+    #[error("agent `{id}` lists tool `{tool}` more than once in `tool_names`")]
+    DuplicateTool { id: String, tool: String },
+
+    #[error("agent `{id}` lists agent `{spawn}` more than once in `spawnable_agents`")]
+    DuplicateSpawnable { id: String, spawn: String },
+}
+
+impl AgentDefinition {
+    /// Validate id format + cross-field invariants. Returns `Ok(())` when
+    /// the definition is well-formed.
+    pub fn validate(&self) -> Result<(), DefinitionError> {
+        // 1. id format
+        if !is_valid_id(&self.id) {
+            return Err(DefinitionError::InvalidId(self.id.clone()));
+        }
+
+        // 2. system_prompt vs inherit_parent_system_prompt mutual exclusion
+        if self.inherit_parent_system_prompt && !self.system_prompt.is_empty() {
+            return Err(DefinitionError::SystemPromptConflict {
+                id: self.id.clone(),
+            });
+        }
+
+        // 3. structured_output requires schema
+        if matches!(self.output_mode, OutputMode::StructuredOutput)
+            && self.output_schema.is_none()
+        {
+            return Err(DefinitionError::StructuredOutputMissingSchema {
+                id: self.id.clone(),
+            });
+        }
+
+        // 4. cannot spawn self
+        if self.spawnable_agents.iter().any(|s| s == &self.id) {
+            return Err(DefinitionError::SelfSpawn {
+                id: self.id.clone(),
+            });
+        }
+
+        // 5. no duplicate tool names
+        let mut seen_tools = std::collections::HashSet::new();
+        for tool in &self.tool_names {
+            if !seen_tools.insert(tool.clone()) {
+                return Err(DefinitionError::DuplicateTool {
+                    id: self.id.clone(),
+                    tool: tool.clone(),
+                });
+            }
+        }
+
+        // 6. no duplicate spawnable agent ids
+        let mut seen_spawn = std::collections::HashSet::new();
+        for spawn in &self.spawnable_agents {
+            if !seen_spawn.insert(spawn.clone()) {
+                return Err(DefinitionError::DuplicateSpawnable {
+                    id: self.id.clone(),
+                    spawn: spawn.clone(),
+                });
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Resolve the concrete model id to use for one invocation of this agent.
+    /// Convenience wrapper around `tier::resolve_model`.
+    pub fn resolve_model(&self, current_session_model: &str) -> String {
+        crate::tier::resolve_model(
+            self.model_override.as_deref(),
+            self.prefer_tier,
+            current_session_model,
+        )
+    }
+}
+
+/// Agent ids are intentionally restrictive: lowercase ASCII letters, digits,
+/// and hyphens. No leading/trailing hyphen. Mirrors Codebuff's id rule and
+/// avoids cross-platform path issues when ids become file names.
+fn is_valid_id(id: &str) -> bool {
+    if id.is_empty() {
+        return false;
+    }
+    if id.starts_with('-') || id.ends_with('-') {
+        return false;
+    }
+    id.chars()
+        .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-')
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn minimal_definition(id: &str) -> AgentDefinition {
+        AgentDefinition {
+            id: id.to_string(),
+            display_name: format!("Display for {id}"),
+            publisher: None,
+            version: DEFAULT_AGENT_VERSION.to_string(),
+            prefer_tier: None,
+            model_override: None,
+            reasoning: None,
+            tool_names: Vec::new(),
+            spawnable_agents: Vec::new(),
+            system_prompt: String::new(),
+            instructions_prompt: None,
+            step_prompt: None,
+            spawner_prompt: None,
+            inherit_parent_system_prompt: false,
+            include_message_history: false,
+            output_mode: OutputMode::LastMessage,
+            output_schema: None,
+        }
+    }
+
+    #[test]
+    fn id_validation_rejects_uppercase() {
+        // `minimal_definition` performs no validation, so the bad id survives.
+        let d = minimal_definition("File-Picker");
+        assert!(matches!(
+            d.validate(),
+            Err(DefinitionError::InvalidId(_))
+        ));
+    }
+
+    #[test]
+    fn id_validation_rejects_underscore() {
+        // `minimal_definition` performs no validation, so the bad id survives.
+        let d = minimal_definition("file_picker");
+        assert!(matches!(
+            d.validate(),
+            Err(DefinitionError::InvalidId(_))
+        ));
+    }
+
+    #[test]
+    fn id_validation_rejects_leading_hyphen() {
+        let mut d = minimal_definition("ok");
+        d.id = "-bad".to_string();
+        assert!(matches!(
+            d.validate(),
+            Err(DefinitionError::InvalidId(_))
+        ));
+    }
+
+    #[test]
+    fn id_validation_accepts_normal_kebab() {
+        let d = minimal_definition("file-picker-max");
+        assert!(d.validate().is_ok());
+    }
+
+    #[test]
+    fn inherit_and_system_prompt_conflict() {
+        let mut d = minimal_definition("editor");
+        d.inherit_parent_system_prompt = true;
+        d.system_prompt = "should be empty".to_string();
+        assert!(matches!(
+            d.validate(),
+            Err(DefinitionError::SystemPromptConflict { .. })
+        ));
+    }
+
+    #[test]
+    fn inherit_alone_is_fine() {
+        let mut d = minimal_definition("editor");
+        d.inherit_parent_system_prompt = true;
+        d.system_prompt = String::new();
+        assert!(d.validate().is_ok());
+    }
+
+    #[test]
+    fn structured_output_requires_schema() {
+        let mut d = minimal_definition("judge");
+        d.output_mode = OutputMode::StructuredOutput;
+        d.output_schema = None;
+        assert!(matches!(
+            d.validate(),
+            Err(DefinitionError::StructuredOutputMissingSchema { .. })
+        ));
+    }
+
+    #[test]
+    fn structured_output_with_schema_ok() {
+        let mut d = minimal_definition("judge");
+        d.output_mode = OutputMode::StructuredOutput;
+        d.output_schema = Some(serde_json::json!({"type": "object"}));
+        assert!(d.validate().is_ok());
+    }
+
+    #[test]
+    fn self_spawn_detected() {
+        let mut d = minimal_definition("editor");
+        d.spawnable_agents.push("editor".to_string());
+        assert!(matches!(
+            d.validate(),
+            Err(DefinitionError::SelfSpawn { .. })
+        ));
+    }
+
+    #[test]
+    fn duplicate_tool_detected() {
+        let mut d = minimal_definition("editor");
+        d.tool_names.push("read".to_string());
+        d.tool_names.push("read".to_string());
+        assert!(matches!(
+            d.validate(),
+            Err(DefinitionError::DuplicateTool { .. })
+        ));
+    }
+
+    #[test]
+    fn duplicate_spawnable_detected() {
+        let mut d = minimal_definition("editor");
+        d.spawnable_agents.push("file-picker".to_string());
+        d.spawnable_agents.push("file-picker".to_string());
+        assert!(matches!(
+            d.validate(),
+            Err(DefinitionError::DuplicateSpawnable { .. })
+        ));
+    }
+
+    #[test]
+    fn resolve_model_uses_session_default_when_no_overrides() {
+        let d = minimal_definition("any");
+        assert_eq!(d.resolve_model("claude-sonnet"), "claude-sonnet");
+    }
+
+    #[test]
+    fn resolve_model_uses_override() {
+        let mut d = minimal_definition("any");
+        d.model_override = Some("forced-model".to_string());
+        assert_eq!(d.resolve_model("ignored"), "forced-model");
+    }
+
+    // -----------------------------------------------------------------
+    // TOML round-trip — exercises serde defaults and field coverage
+    // -----------------------------------------------------------------
+    #[test]
+    fn toml_minimal_loads_with_defaults() {
+        let src = r#"
+            id = "file-picker"
+            display_name = "Fletcher"
+        "#;
+        let d: AgentDefinition = toml::from_str(src).expect("parse");
+        d.validate().expect("validate");
+        assert_eq!(d.id, "file-picker");
+        assert_eq!(d.version, DEFAULT_AGENT_VERSION);
+        assert_eq!(d.output_mode, OutputMode::LastMessage);
+        assert!(d.tool_names.is_empty());
+        assert!(d.spawnable_agents.is_empty());
+        assert!(!d.inherit_parent_system_prompt);
+    }
+
+    #[test]
+    fn toml_full_definition_loads() {
+        let src = r#"
+            id = "editor"
+            display_name = "Code Editor"
+            version = "1.2.0"
+            publisher = "jcode"
+            prefer_tier = "thinking"
+            reasoning = "high"
+            tool_names = ["str_replace", "write_file"]
+            spawnable_agents = ["file-picker"]
+            inherit_parent_system_prompt = true
+            include_message_history = true
+            output_mode = "all_messages"
+            instructions_prompt = "Implement the requested change."
+            step_prompt = "Continue editing."
+            spawner_prompt = "Use this agent for code edits."
+        "#;
+        let d: AgentDefinition = toml::from_str(src).expect("parse");
+        d.validate().expect("validate");
+        assert_eq!(d.id, "editor");
+        assert_eq!(d.version, "1.2.0");
+        assert_eq!(d.publisher.as_deref(), Some("jcode"));
+        assert_eq!(d.prefer_tier, Some(ModelTier::Thinking));
+        assert_eq!(d.reasoning, Some(ReasoningEffort::High));
+        assert_eq!(d.tool_names, vec!["str_replace", "write_file"]);
+        assert!(d.inherit_parent_system_prompt);
+        assert_eq!(d.output_mode, OutputMode::AllMessages);
+    }
+
+    #[test]
+    fn toml_unknown_field_is_ignored() {
+        // We DO NOT use `#[serde(deny_unknown_fields)]` because forward-compat
+        // matters when older binaries read newer TOML. The cost is that typo'd
+        // field names are silently ignored — a UX hazard. If that becomes a
+        // problem, switch to deny_unknown_fields and version the schema
+        // explicitly.
+        //
+        // For now, this test pins the lenient behavior: unknown fields parse.
+        let src = r#"
+            id = "ok"
+            display_name = "ok"
+            unknown_future_field = "value"
+        "#;
+        let d: AgentDefinition = toml::from_str(src).expect("parse");
+        d.validate().expect("validate");
+    }
+}
diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs
index 70bf958d6..5599633aa 100644
--- a/crates/jcode-agent-runtime/src/lib.rs
+++ b/crates/jcode-agent-runtime/src/lib.rs
@@ -1,91 +1,43 @@
-use std::sync::Arc;
-
-/// A soft interrupt message queued for injection at the next safe point.
-#[derive(Debug, Clone)]
-pub struct SoftInterruptMessage {
-    pub content: String,
-    /// If true, can skip remaining tools when injected at point C.
-    pub urgent: bool,
-    pub source: SoftInterruptSource,
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum SoftInterruptSource {
-    User,
-    System,
-    BackgroundTask,
-}
-
-/// Thread-safe soft interrupt queue that can be accessed without holding the agent lock.
-pub type SoftInterruptQueue = Arc<std::sync::Mutex<Vec<SoftInterruptMessage>>>;
-
-/// Signal to move the currently executing tool to background.
-/// Uses std::sync so it can be set without async from outside the agent lock.
-pub type BackgroundToolSignal = Arc<std::sync::atomic::AtomicBool>;
-
-/// Signal to gracefully stop generation.
-pub type GracefulShutdownSignal = Arc<std::sync::atomic::AtomicBool>;
-
-/// Async-aware interrupt signal that combines AtomicBool (sync read) with
-/// tokio::Notify (async wake). Eliminates spin-loops during tool execution.
-#[derive(Clone)]
-pub struct InterruptSignal {
-    flag: Arc<std::sync::atomic::AtomicBool>,
-    notify: Arc<tokio::sync::Notify>,
-}
-
-impl InterruptSignal {
-    pub fn new() -> Self {
-        Self {
-            flag: Arc::new(std::sync::atomic::AtomicBool::new(false)),
-            notify: Arc::new(tokio::sync::Notify::new()),
-        }
-    }
-
-    pub fn fire(&self) {
-        self.flag.store(true, std::sync::atomic::Ordering::SeqCst);
-        self.notify.notify_waiters();
-    }
-
-    pub fn is_set(&self) -> bool {
-        self.flag.load(std::sync::atomic::Ordering::SeqCst)
-    }
-
-    pub fn reset(&self) {
-        self.flag.store(false, std::sync::atomic::Ordering::SeqCst);
-    }
-
-    pub async fn notified(&self) {
-        let notified = self.notify.notified();
-        if self.is_set() {
-            return;
-        }
-        notified.await;
-    }
-
-    pub fn as_atomic(&self) -> Arc<std::sync::atomic::AtomicBool> {
-        Arc::clone(&self.flag)
-    }
-}
-
-impl Default for InterruptSignal {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[derive(Debug, thiserror::Error)]
-#[error("{message}")]
-pub struct StreamError {
-    pub message: String,
-    pub retry_after_secs: Option<u64>,
-}
-
-impl StreamError {
-    pub fn new(message: String, retry_after_secs: Option<u64>) -> Self {
-        Self {
-            message,
-            retry_after_secs,
-        }
-    }
-}
+//! Agent runtime primitives: signals, declarative agent definitions, and
+//! tier-based model resolution.
+//!
+//! This crate intentionally stays small and dependency-light. Heavier
+//! engine work (loop, programmatic steps, spawn management) lives in
+//! `src/agent.rs` and will migrate here incrementally as Phase 0 → Phase 2
+//! land.
+//!
+//! ## Modules
+//!
+//! - [`signals`] — soft-interrupt + cancellation primitives shared with
+//!   the server runtime.
+//! - [`definition`] — declarative `AgentDefinition` schema loaded from
+//!   `.jcode/agents/*.toml`.
+//! - [`tier`] — user-defined model tier slot resolution (extends
+//!   `model_routing.rs` #100).
+//! - [`output`] — `OutputMode` enum (last_message / all_messages /
+//!   structured_output).
+//! - [`reasoning`] — `ReasoningEffort` enum (minimal / low / medium / high).
+//!
+//! ## Re-exports
+//!
+//! All previous public types stay re-exported at the crate root so existing
+//! consumers (`src/agent.rs`) compile unchanged.
+
+pub mod definition;
+pub mod output;
+pub mod reasoning;
+pub mod signals;
+pub mod tier;
+
+// Backwards-compatible re-exports for existing consumers. Do not remove
+// without auditing `src/agent.rs` and other in-tree users.
+pub use signals::{
+    BackgroundToolSignal, GracefulShutdownSignal, InterruptSignal, SoftInterruptMessage,
+    SoftInterruptQueue, SoftInterruptSource, StreamError,
+};
+
+// New public surface (Phase 0).
+pub use definition::{AgentDefinition, DefinitionError, DEFAULT_AGENT_VERSION};
+pub use output::OutputMode;
+pub use reasoning::ReasoningEffort;
+pub use tier::{resolve_model, resolve_model_with_source, ModelTier, ResolutionSource};
diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs
new file mode 100644
index 000000000..1ba93dd1a
--- /dev/null
+++ b/crates/jcode-agent-runtime/src/output.rs
@@ -0,0 +1,75 @@
+//! How an agent's output is delivered back to its parent.
+//!
+//! Adapted from Codebuff's `outputMode` field. Three modes cover the
+//! useful cases:
+//!
+//! - `LastMessage`: parent receives only the agent's final assistant turn.
+//!   Default. Good for "research-and-summarize" agents like file-picker.
+//! - `AllMessages`: parent receives the full child message history (text
+//!   + tool calls + tool results). Good for editor-like agents that need
+//!   to expose their full edit trace.
+//! - `StructuredOutput`: agent must call `set_output` with a JSON value
+//!   that conforms to `output_schema`. Good for judge agents, lessons
+//!   extractors, structured planners.
+
+use serde::{Deserialize, Serialize};
+
+/// Output delivery mode for a sub-agent.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum OutputMode {
+    /// Parent receives only the final assistant turn. (Default.)
+    #[default]
+    LastMessage,
+    /// Parent receives the full message history of the child agent.
+    AllMessages,
+    /// Agent must produce a JSON object conforming to its `output_schema`.
+    /// Validated on `set_output` tool call.
+    StructuredOutput,
+}
+
+impl OutputMode {
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            OutputMode::LastMessage => "last_message",
+            OutputMode::AllMessages => "all_messages",
+            OutputMode::StructuredOutput => "structured_output",
+        }
+    }
+
+    pub fn parse(s: &str) -> Option<OutputMode> {
+        match s.trim().to_ascii_lowercase().as_str() {
+            "last_message" | "lastmessage" | "last" => Some(OutputMode::LastMessage),
+            "all_messages" | "allmessages" | "all" => Some(OutputMode::AllMessages),
+            "structured_output" | "structured" | "json" => Some(OutputMode::StructuredOutput),
+            _ => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_accepts_aliases() {
+        assert_eq!(OutputMode::parse("last_message"), Some(OutputMode::LastMessage));
+        assert_eq!(OutputMode::parse("all"), Some(OutputMode::AllMessages));
+        assert_eq!(
+            OutputMode::parse("structured"),
+            Some(OutputMode::StructuredOutput)
+        );
+        assert_eq!(OutputMode::parse("nonsense"), None);
+    }
+
+    #[test]
+    fn default_is_last_message() {
+        assert_eq!(OutputMode::default(), OutputMode::LastMessage);
+    }
+
+    #[test]
+    fn serde_uses_snake_case() {
+        let s = serde_json::to_string(&OutputMode::StructuredOutput).unwrap();
+        assert_eq!(s, "\"structured_output\"");
+    }
+}
diff --git a/crates/jcode-agent-runtime/src/reasoning.rs b/crates/jcode-agent-runtime/src/reasoning.rs
new file mode 100644
index 000000000..d48bafaeb
--- /dev/null
+++ b/crates/jcode-agent-runtime/src/reasoning.rs
@@ -0,0 +1,108 @@
+//! Reasoning effort levels for agents.
+//!
+//! Mirrors the OpenAI/Anthropic reasoning effort knobs. When an agent
+//! definition specifies a reasoning effort, the agent runtime forwards it
+//! to the provider request (where supported). Models that don't support
+//! reasoning ignore the field.
+
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+/// How much reasoning the model should use for this agent.
+///
+/// Intended provider mapping (approximate; applied when the request is built):
+///   - `Minimal` → `effort: "minimal"` (gpt-5 family) / no thinking budget (Claude)
+///   - `Low`     → `effort: "low"` / small thinking budget
+///   - `Medium`  → `effort: "medium"` / default thinking budget
+///   - `High`    → `effort: "high"` / large thinking budget (~32k tokens)
+///
+/// Serialized in lowercase. Default is `Medium`, matching most agents' baseline behavior.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum ReasoningEffort {
+    Minimal,
+    Low,
+    #[default]
+    Medium,
+    High,
+}
+
+impl ReasoningEffort {
+    /// Lowercase wire name of this effort level, in the form major providers use
+    /// (OpenAI Responses API `reasoning.effort`, OpenRouter `reasoning.effort`).
+    pub fn as_str(&self) -> &'static str {
+        match *self {
+            Self::Minimal => "minimal",
+            Self::Low => "low",
+            Self::Medium => "medium",
+            Self::High => "high",
+        }
+    }
+
+    /// Ordinal used for threshold comparisons (matches `model_routing.rs`):
+    /// a larger rank means more reasoning.
+    pub fn rank(&self) -> u8 {
+        match *self {
+            Self::Minimal => 0,
+            Self::Low => 1,
+            Self::Medium => 2,
+            Self::High => 3,
+        }
+    }
+
+    /// Parses an effort name or common alias, ignoring case and surrounding whitespace.
+    /// Unknown input yields `None`, leaving the error-or-default decision to the caller.
+    pub fn parse(s: &str) -> Option<ReasoningEffort> {
+        Some(match s.trim().to_ascii_lowercase().as_str() {
+            "minimal" | "none" | "off" => Self::Minimal,
+            "low" => Self::Low,
+            "medium" | "default" => Self::Medium,
+            "high" | "max" => Self::High,
+            _ => return None,
+        })
+    }
+}
+
+impl fmt::Display for ReasoningEffort {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.pad(self.as_str()) // `pad` honours width/alignment flags such as `{:>8}`
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Aliases parse case-insensitively; empty and unknown input is rejected.
+    #[test]
+    fn parse_accepts_common_aliases() {
+        let minimal = ReasoningEffort::parse("minimal");
+        assert_eq!(minimal, Some(ReasoningEffort::Minimal));
+        assert_eq!(ReasoningEffort::parse("OFF"), Some(ReasoningEffort::Minimal));
+        assert_eq!(ReasoningEffort::parse("max"), Some(ReasoningEffort::High));
+        assert_eq!(ReasoningEffort::parse("default"), Some(ReasoningEffort::Medium));
+        assert!(ReasoningEffort::parse("").is_none());
+        assert!(ReasoningEffort::parse("absurd").is_none());
+    }
+
+    /// Ranks must increase strictly from `Minimal` through `High`.
+    #[test]
+    fn rank_orders_efforts_correctly() {
+        use ReasoningEffort::{High, Low, Medium, Minimal};
+        let ranks = [Minimal, Low, Medium, High].map(|effort| effort.rank());
+        assert!(ranks.windows(2).all(|pair| pair[0] < pair[1]));
+    }
+
+    #[test]
+    fn default_is_medium() {
+        assert!(matches!(ReasoningEffort::default(), ReasoningEffort::Medium));
+    }
+
+    #[test]
+    fn serde_roundtrip_via_lowercase() {
+        let encoded = serde_json::to_string(&ReasoningEffort::High).unwrap();
+        assert_eq!(encoded, "\"high\"");
+        let decoded: ReasoningEffort = serde_json::from_str("\"medium\"").unwrap();
+        assert_eq!(decoded, ReasoningEffort::Medium);
+    }
+}
diff --git a/crates/jcode-agent-runtime/src/signals.rs b/crates/jcode-agent-runtime/src/signals.rs
new file mode 100644
index 000000000..67acf5082
--- /dev/null
+++ b/crates/jcode-agent-runtime/src/signals.rs
@@ -0,0 +1,98 @@
+//! Soft-interrupt + cancellation signals for the agent loop.
+//!
+//! These primitives are shared between the agent runtime, the server
+//! lifecycle, and any callers that need to drive interrupts without
+//! holding the agent lock. Keep this module dependency-light — `tokio`
+//! sync + `std::sync` only.
+
+use std::sync::Arc;
+
+/// A soft interrupt message queued for injection at the next safe point.
+#[derive(Debug, Clone)]
+pub struct SoftInterruptMessage {
+    pub content: String, // the message text to inject
+    /// If true, can skip remaining tools when injected at point C.
+    pub urgent: bool, // NOTE(review): "point C" is not defined in this module — confirm
+    pub source: SoftInterruptSource, // who raised the interrupt
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SoftInterruptSource { // origin tag stored in `SoftInterruptMessage::source`
+    User,
+    System,
+    BackgroundTask,
+}
+
+/// Thread-safe soft interrupt queue that can be accessed without holding the agent lock.
+pub type SoftInterruptQueue = Arc<std::sync::Mutex<Vec<SoftInterruptMessage>>>;
+
+/// Signal to move the currently executing tool to background.
+/// Uses a `std::sync` atomic so it can be set without async from outside the agent lock.
+pub type BackgroundToolSignal = Arc<std::sync::atomic::AtomicBool>;
+
+/// Signal to gracefully stop generation (same shared-flag type as `BackgroundToolSignal`).
+pub type GracefulShutdownSignal = Arc<std::sync::atomic::AtomicBool>;
+
+/// Async-aware interrupt signal that combines AtomicBool (sync read) with
+/// tokio::Notify (async wake). Eliminates spin-loops during tool execution.
+#[derive(Clone)]
+pub struct InterruptSignal {
+    flag: Arc<std::sync::atomic::AtomicBool>, // persistent state, read by `is_set`
+    notify: Arc<tokio::sync::Notify>, // wakes tasks awaiting `notified`
+}
+
+impl InterruptSignal {
+    pub fn new() -> Self {
+        Self {
+            flag: Arc::new(std::sync::atomic::AtomicBool::new(false)), // starts unset
+            notify: Arc::new(tokio::sync::Notify::new()),
+        }
+    }
+
+    pub fn fire(&self) {
+        self.flag.store(true, std::sync::atomic::Ordering::SeqCst); // set before waking
+        self.notify.notify_waiters(); // wakes current waiters; the flag keeps the state
+    }
+
+    pub fn is_set(&self) -> bool {
+        self.flag.load(std::sync::atomic::Ordering::SeqCst)
+    }
+
+    pub fn reset(&self) {
+        self.flag.store(false, std::sync::atomic::Ordering::SeqCst); // clears the flag only
+    }
+
+    pub async fn notified(&self) {
+        let notified = self.notify.notified(); // create the waiter before checking the flag
+        if self.is_set() {
+            return; // already fired: do not wait
+        }
+        notified.await;
+    }
+
+    pub fn as_atomic(&self) -> Arc<std::sync::atomic::AtomicBool> {
+        Arc::clone(&self.flag) // shares the same underlying flag
+    }
+}
+
+impl Default for InterruptSignal {
+    fn default() -> Self {
+        InterruptSignal::new() // identical to `new`: the flag starts unset
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+#[error("{message}")]
+pub struct StreamError {
+    pub message: String, // the entire `Display` output, per `#[error("{message}")]`
+    pub retry_after_secs: Option<u64>, // presumably a retry delay in seconds; TODO confirm
+}
+
+impl StreamError {
+    pub fn new(message: String, retry_after_secs: Option<u64>) -> Self {
+        StreamError {
+            message, // stored unchanged
+            retry_after_secs,
+        }
+    }
+}
diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs
new file mode 100644
index 000000000..200f511ed
--- /dev/null
+++ b/crates/jcode-agent-runtime/src/tier.rs
@@ -0,0 +1,330 @@
+//! Model tier abstraction.
+//!
+//! A "tier" is a **user-defined named slot** that maps to a concrete model id.
+//! It is intentionally NOT an opinionated catalog — jcode does not maintain
+//! per-provider tier defaults the way Codebuff and OpenRouter do.
+//!
+//! ## Why slots, not catalog?
+//!
+//! jcode users connect a single provider via OAuth (Claude Pro, ChatGPT Plus,
+//! Gemini Advanced, etc.) and pay through that subscription. Auto-downgrading
+//! to a "cheaper tier" without their consent is wrong — they already chose
+//! the model they want. So the default is: agents inherit the session's
+//! current model.
+//!
+//! Power users (pay-per-token API keys, multi-account setups) can opt in by
+//! setting two env vars, exactly mirroring `model_routing.rs` (#100):
+//!
+//! ```bash
+//! JCODE_ROUTING_ROUTINE=claude-haiku-4-5
+//! JCODE_ROUTING_THINKING=claude-opus-4-7
+//! ```
+//!
+//! Agent definitions reference tiers by name:
+//!
+//! ```toml
+//! [agent]
+//! id = "file-picker"
+//! prefer_tier = "routine"   # uses JCODE_ROUTING_ROUTINE if set
+//! ```
+//!
+//! ## Resolution order
+//!
+//! 1. `agent.model_override` (explicit, highest priority)
+//! 2. `agent.prefer_tier` + corresponding env var set
+//! 3. Caller-provided `current_session_model` fallback
+//!
+//! No catalog. No magic. The only "magic" is reading the env var, which is
+//! the existing #100 contract.
+
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+/// A user-defined tier slot. Currently only two are supported because that
+/// matches `model_routing.rs` (#100). Adding tiers later is additive — the
+/// env var name pattern is `JCODE_ROUTING_<UPPER_TIER>`. Serde names are lowercase.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum ModelTier {
+    /// Cheap / fast / lower-effort work: file pickers, basher,
+    /// summarizers. Reads `JCODE_ROUTING_ROUTINE`.
+    Routine,
+    /// Premium / reasoning work: editor, reviewer, planner.
+    /// Reads `JCODE_ROUTING_THINKING`.
+    Thinking,
+}
+
+impl ModelTier {
+    /// Name of the environment variable backing this slot. Uses the same
+    /// names as `model_routing.rs` (#100) so the two systems stay aligned.
+    pub fn env_var(&self) -> &'static str {
+        match *self {
+            Self::Routine => "JCODE_ROUTING_ROUTINE",
+            Self::Thinking => "JCODE_ROUTING_THINKING",
+        }
+    }
+
+    /// Model id the user configured for this tier, read from the environment
+    /// and trimmed. `None` when the variable is unset, not valid Unicode, or
+    /// blank — callers should then fall back to the session's current model.
+    pub fn read_user_override(&self) -> Option<String> {
+        let raw = std::env::var(self.env_var()).ok()?;
+        let model = raw.trim();
+        (!model.is_empty()).then(|| model.to_string())
+    }
+
+    /// Parse a tier name, ignoring case and surrounding whitespace and
+    /// accepting common synonyms. Unknown names yield `None`.
+    pub fn parse(s: &str) -> Option<ModelTier> {
+        Some(match s.trim().to_ascii_lowercase().as_str() {
+            "routine" | "fast" | "cheap" | "lite" => Self::Routine,
+            "thinking" | "reasoning" | "premium" | "deep" => Self::Thinking,
+            _ => return None,
+        })
+    }
+}
+
+impl fmt::Display for ModelTier {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.pad(match self {
+            ModelTier::Routine => "routine",
+            ModelTier::Thinking => "thinking",
+        }) // `pad` (unlike `write_str`) honours width/alignment flags such as `{:<10}`
+    }
+}
+
+/// Resolve which model id to use for a given tier preference + override pair.
+///
+/// Priority:
+/// 1. `model_override` — explicit, highest priority. It is trimmed, and a blank
+///    or whitespace-only value is treated as absent.
+/// 2. `prefer_tier` + corresponding env var set.
+/// 3. `current_session_model` — caller-provided fallback, returned unchanged.
+///
+/// `current_session_model` is required because there's no other safe default:
+/// the runtime doesn't know which provider/model the session is using.
+///
+/// The tier's env var is only read when no usable override was supplied. Use
+/// `resolve_model_with_source` when the caller also needs to know which rule applied.
+pub fn resolve_model(
+    model_override: Option<&str>,
+    prefer_tier: Option<ModelTier>,
+    current_session_model: &str,
+) -> String {
+    // 1. A non-blank explicit override beats everything else.
+    let explicit = model_override.map(str::trim).filter(|id| !id.is_empty());
+    if let Some(id) = explicit {
+        return id.to_string();
+    }
+
+    // 2. The preferred tier's env var, when a tier is requested and the var is set.
+    let from_tier = prefer_tier.and_then(|tier| tier.read_user_override());
+    if let Some(model) = from_tier {
+        return model;
+    }
+
+    // 3. Otherwise inherit whatever model the session is already using.
+    current_session_model.to_string()
+}
+
+/// Diagnostic-friendly explanation of which slot was used. Useful for
+/// `jcode doctor` output so users can see exactly why a given agent picked
+/// the model it did.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ResolutionSource {
+    /// Used `agent.model_override` directly (after trimming whitespace).
+    Override(String),
+    /// Used the env var backing `tier`.
+    Tier {
+        tier: ModelTier,
+        model: String,
+    },
+    /// Tier was preferred but its env var was unset or blank, so fell back to
+    /// the session's current model.
+    TierFallback {
+        tier: ModelTier,
+        model: String,
+    },
+    /// No usable override and no tier preference; using the session's current model.
+    SessionDefault(String),
+}
+
+impl ResolutionSource {
+    /// The model id that was ultimately selected, whichever rule produced it.
+    /// Borrows from `self`; clone it if an owned `String` is needed.
+    pub fn model_id(&self) -> &str {
+        match self {
+            Self::Override(id) | Self::SessionDefault(id) => id,
+            Self::Tier { model, .. } | Self::TierFallback { model, .. } => model,
+        }
+    }
+}
+
+/// Same as `resolve_model` but returns provenance information for diagnostics.
+///
+/// The variant records which rule applied: `Override`, `Tier` (env var set),
+/// `TierFallback` (tier preferred, env var unset or blank) or `SessionDefault`
+/// (no usable override and no tier preference). The chosen id is available
+/// through `ResolutionSource::model_id`.
+pub fn resolve_model_with_source(
+    model_override: Option<&str>,
+    prefer_tier: Option<ModelTier>,
+    current_session_model: &str,
+) -> ResolutionSource {
+    // A non-blank explicit override beats everything else.
+    let explicit = model_override.map(str::trim).filter(|id| !id.is_empty());
+    if let Some(id) = explicit {
+        return ResolutionSource::Override(id.to_string());
+    }
+
+    // Without a tier preference there is nothing to look up in the environment.
+    let Some(tier) = prefer_tier else {
+        return ResolutionSource::SessionDefault(current_session_model.to_string());
+    };
+
+    // Tier preferred: report whether its env var supplied the model or not.
+    match tier.read_user_override() {
+        Some(model) => ResolutionSource::Tier { tier, model },
+        None => ResolutionSource::TierFallback {
+            tier,
+            model: current_session_model.to_string(),
+        },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Mutex to serialize env-var manipulation across tests in this module.
+    /// Without this, `cargo test` runs tests in parallel and they trample
+    /// each other's `JCODE_ROUTING_*` state. Poisoning is ignored by the helper.
+    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+    fn with_env_lock<F: FnOnce()>(f: F) {
+        let guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
+        // Snapshot the vars we mutate, clear them for `f`, then restore them afterwards.
+        let saved_routine = std::env::var_os("JCODE_ROUTING_ROUTINE");
+        let saved_thinking = std::env::var_os("JCODE_ROUTING_THINKING");
+        unsafe { // NOTE(review): ENV_LOCK only serializes tests in this module
+            std::env::remove_var("JCODE_ROUTING_ROUTINE");
+            std::env::remove_var("JCODE_ROUTING_THINKING");
+        }
+        f(); // NOTE: if `f` panics, the restore below is skipped
+        unsafe {
+            match saved_routine {
+                Some(v) => std::env::set_var("JCODE_ROUTING_ROUTINE", v),
+                None => std::env::remove_var("JCODE_ROUTING_ROUTINE"),
+            }
+            match saved_thinking {
+                Some(v) => std::env::set_var("JCODE_ROUTING_THINKING", v),
+                None => std::env::remove_var("JCODE_ROUTING_THINKING"),
+            }
+        }
+        drop(guard); // release only after the environment has been restored
+    }
+
+    #[test]
+    fn parse_tier_accepts_aliases() {
+        assert_eq!(ModelTier::parse("routine"), Some(ModelTier::Routine));
+        assert_eq!(ModelTier::parse("Routine"), Some(ModelTier::Routine));
+        assert_eq!(ModelTier::parse("FAST"), Some(ModelTier::Routine));
+        assert_eq!(ModelTier::parse("thinking"), Some(ModelTier::Thinking));
+        assert_eq!(ModelTier::parse("reasoning"), Some(ModelTier::Thinking));
+        assert_eq!(ModelTier::parse("deep"), Some(ModelTier::Thinking));
+        assert_eq!(ModelTier::parse(""), None);
+        assert_eq!(ModelTier::parse("nonsense"), None);
+    }
+
+    #[test]
+    fn override_wins_over_tier_and_session_default() {
+        with_env_lock(|| {
+            unsafe {
+                std::env::set_var("JCODE_ROUTING_THINKING", "should-be-ignored");
+            }
+            let got = resolve_model(
+                Some("explicit-model"),
+                Some(ModelTier::Thinking),
+                "session-default",
+            );
+            assert_eq!(got, "explicit-model");
+        });
+    }
+
+    #[test]
+    fn tier_uses_env_var_when_set() {
+        with_env_lock(|| {
+            unsafe {
+                std::env::set_var("JCODE_ROUTING_ROUTINE", "haiku-4-5");
+            }
+            let got = resolve_model(None, Some(ModelTier::Routine), "session-default");
+            assert_eq!(got, "haiku-4-5");
+        });
+    }
+
+    #[test]
+    fn tier_falls_back_when_env_unset() {
+        with_env_lock(|| {
+            // `with_env_lock` removed both routing vars before calling this closure
+            let got = resolve_model(None, Some(ModelTier::Thinking), "session-default");
+            assert_eq!(got, "session-default");
+        });
+    }
+
+    #[test]
+    fn no_tier_no_override_uses_session_default() {
+        with_env_lock(|| {
+            let got = resolve_model(None, None, "session-default");
+            assert_eq!(got, "session-default");
+        });
+    }
+
+    #[test]
+    fn empty_override_string_treated_as_unset() {
+        with_env_lock(|| {
+            let got = resolve_model(Some("   "), None, "session-default");
+            assert_eq!(got, "session-default");
+        });
+    }
+
+    #[test]
+    fn resolution_source_reports_override() {
+        with_env_lock(|| {
+            let src = resolve_model_with_source(Some("forced"), None, "session");
+            assert!(matches!(src, ResolutionSource::Override(ref m) if m == "forced"));
+            assert_eq!(src.model_id(), "forced");
+        });
+    }
+
+    #[test]
+    fn resolution_source_reports_tier_hit() {
+        with_env_lock(|| {
+            unsafe {
+                std::env::set_var("JCODE_ROUTING_THINKING", "opus-4-7");
+            }
+            let src = resolve_model_with_source(None, Some(ModelTier::Thinking), "fallback");
+            match src {
+                ResolutionSource::Tier { tier, model } => {
+                    assert_eq!(tier, ModelTier::Thinking);
+                    assert_eq!(model, "opus-4-7");
+                }
+                other => panic!("expected Tier, got {:?}", other),
+            }
+        });
+    }
+
+    #[test]
+    fn resolution_source_reports_tier_fallback() {
+        with_env_lock(|| {
+            // env var is unset here: `with_env_lock` cleared it
+            let src = resolve_model_with_source(None, Some(ModelTier::Routine), "session");
+            match src {
+                ResolutionSource::TierFallback { tier, model } => {
+                    assert_eq!(tier, ModelTier::Routine);
+                    assert_eq!(model, "session");
+                }
+                other => panic!("expected TierFallback, got {:?}", other),
+            }
+        });
+    }
+}

From 6cf1ad8d778cc22b7e7f30db518da0e04d7dca06 Mon Sep 17 00:00:00 2001
From: quangdang46 <tranquangdang21@gmail.com>
Date: Mon, 25 May 2026 21:52:16 +0700
Subject: [PATCH 02/22] feat(agent-runtime): TOML registry loader for
 .jcode/agents/*.toml (Phase 0.3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Discover and load AgentDefinition files from three locations with
priority order:

  1. <project>/.jcode/agents/*.toml   (project-local, highest)
  2. ~/.jcode/agents/*.toml           (user-global)
  3. AgentRegistry::register_builtin  (compiled-in defaults, lowest)

Project-local overrides user-global overrides builtin. Re-registering
a builtin after a higher-priority entry is loaded does NOT clobber the
override — the priority check is symmetric in `insert`.

Design choices:

- Filename must match `<id>.toml` so users can find agents by id without
  opening every file. Mismatches are surfaced as a load error rather
  than silently misindexing.
- Malformed/invalid files are collected as non-fatal LoadError entries
  so a single bad file doesn't prevent the rest of the registry from
  loading. `jcode doctor` (future) reads load_errors() to surface
  these.
- AgentRegistry intentionally does NOT cross-reference `tool_names` /
  `spawnable_agents` — that's done at spawn time because the tool
  universe may be feature-gated (Phase 0.4).

41 unit tests pass (32 prior + 9 new). `cargo check --bin jcode` succeeds.
---
 crates/jcode-agent-runtime/src/lib.rs      |   2 +
 crates/jcode-agent-runtime/src/registry.rs | 530 +++++++++++++++++++++
 2 files changed, 532 insertions(+)
 create mode 100644 crates/jcode-agent-runtime/src/registry.rs

diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs
index 5599633aa..f7f8ea85e 100644
--- a/crates/jcode-agent-runtime/src/lib.rs
+++ b/crates/jcode-agent-runtime/src/lib.rs
@@ -26,6 +26,7 @@
 pub mod definition;
 pub mod output;
 pub mod reasoning;
+pub mod registry;
 pub mod signals;
 pub mod tier;
 
@@ -40,4 +41,5 @@ pub use signals::{
 pub use definition::{AgentDefinition, DefinitionError, DEFAULT_AGENT_VERSION};
 pub use output::OutputMode;
 pub use reasoning::ReasoningEffort;
+pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind};
 pub use tier::{resolve_model, resolve_model_with_source, ModelTier, ResolutionSource};
diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs
new file mode 100644
index 000000000..2249b046c
--- /dev/null
+++ b/crates/jcode-agent-runtime/src/registry.rs
@@ -0,0 +1,530 @@
+//! Agent registry: discovery + loading of `AgentDefinition`s from disk.
+//!
+//! ## Lookup paths (highest priority first)
+//!
+//! 1. **Project-local**: `<cwd>/.jcode/agents/*.toml`
+//! 2. **User-global**: `~/.jcode/agents/*.toml`
+//! 3. **Builtins** registered programmatically via [`AgentRegistry::register_builtin`]
+//!
+//! When the same id appears in multiple sources, the higher-priority one
+//! wins. The registry tracks where each agent came from so `jcode doctor`
+//! can show provenance.
+//!
+//! ## What this module does NOT do
+//!
+//! - It does not validate that `tool_names` exist in the tool registry
+//!   (Phase 0.4) or that `spawnable_agents` resolve to known agents
+//!   (cross-reference). Both are caller responsibilities done at agent
+//!   spawn time, not load time, because the tool/agent universe may be
+//!   feature-gated.
+//! - It does not watch for file changes. Agents are loaded once at
+//!   session start. Self-dev is welcome to call `reload_from_disk()`.
+
+use crate::definition::{AgentDefinition, DefinitionError};
+
+use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+
+/// Where an agent definition was loaded from. Surfaced in `jcode doctor`
+/// and conflict warnings.
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub enum AgentSource {
+    /// Compiled into the binary by name. Lowest priority.
+    Builtin,
+    /// Loaded from `~/.jcode/agents/<file>`; `path` is the file that was read.
+    UserGlobal { path: PathBuf },
+    /// Loaded from `<project>/.jcode/agents/<file>` (`path`). Highest priority.
+    ProjectLocal { path: PathBuf },
+}
+
+impl AgentSource {
+    fn priority(&self) -> u8 {
+        match self {
+            Self::Builtin => 0,
+            Self::UserGlobal { .. } => 1,
+            Self::ProjectLocal { .. } => 2,
+        }
+    }
+
+    /// Compact label for `jcode doctor`: `builtin`, `user:<path>` or `project:<path>`.
+    pub fn short_label(&self) -> String {
+        match self {
+            Self::Builtin => String::from("builtin"),
+            Self::UserGlobal { path } => format!("user:{}", path.display()),
+            Self::ProjectLocal { path } => format!("project:{}", path.display()),
+        }
+    }
+}
+
+/// One loaded agent: its definition plus where it came from.
+#[derive(Debug, Clone)]
+pub struct LoadedAgent {
+    pub definition: AgentDefinition, // `definition.id` is the registry key
+    pub source: AgentSource, // decides precedence when ids collide (see `AgentRegistry::insert`)
+}
+
+/// Errors surfaced when loading an agent file: `Io` (read/list failure),
+/// `Parse` (TOML deserialization), `Invalid` (`validate()` rejected it) and
+/// `FileNameMismatch`. Kept distinct so the TUI can render actionable messages.
+#[derive(Debug, thiserror::Error)]
+pub enum LoadError {
+    #[error("failed to read `{path}`: {source}")]
+    Io {
+        path: PathBuf,
+        #[source]
+        source: std::io::Error,
+    },
+
+    #[error("failed to parse `{path}`: {source}")]
+    Parse {
+        path: PathBuf,
+        #[source]
+        source: toml::de::Error,
+    },
+
+    #[error("invalid agent definition in `{path}`: {source}")]
+    Invalid {
+        path: PathBuf,
+        #[source]
+        source: DefinitionError,
+    },
+
+    #[error(
+        "filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`."
+    )]
+    FileNameMismatch { path: PathBuf, id: String },
+}
+
+/// In-memory registry of loaded agent definitions. Wrap in `Arc` if you
+/// need to share — `LoadError` contains `io::Error` so the registry itself
+/// is not `Clone`.
+#[derive(Debug, Default)]
+pub struct AgentRegistry {
+    by_id: HashMap<String, LoadedAgent>, // keyed by `definition.id`
+    /// Non-fatal load errors collected during discovery. Surfaced by
+    /// `jcode doctor` so users can see why a malformed file was skipped.
+    load_errors: Vec<LoadError>,
+}
+
+impl AgentRegistry {
+    /// Create an empty registry with no agents and no recorded load errors.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Total number of registered agents.
+    pub fn len(&self) -> usize {
+        self.by_id.len()
+    }
+
+    /// True if no agents are registered.
+    pub fn is_empty(&self) -> bool {
+        self.by_id.is_empty()
+    }
+
+    /// Look up an agent by id.
+    pub fn get(&self, id: &str) -> Option<&LoadedAgent> {
+        self.by_id.get(id)
+    }
+
+    /// Iterate over all agents in arbitrary order.
+    pub fn iter(&self) -> impl Iterator<Item = &LoadedAgent> {
+        self.by_id.values()
+    }
+
+    /// Sorted (by id) iteration — handy for stable doctor output.
+    pub fn iter_sorted(&self) -> Vec<&LoadedAgent> {
+        let mut v: Vec<_> = self.by_id.values().collect();
+        v.sort_by(|a, b| a.definition.id.cmp(&b.definition.id));
+        v
+    }
+
+    /// Non-fatal errors accumulated during discovery.
+    pub fn load_errors(&self) -> &[LoadError] {
+        &self.load_errors
+    }
+
+    /// Insert `loaded` unless an entry with the same id and a strictly higher
+    /// source priority is already registered (equal priority replaces).
+    /// Returns the entry that lost — the replaced one, or `loaded` itself when
+    /// it was rejected — and `None` when the id was not registered before.
+    pub fn insert(&mut self, loaded: LoadedAgent) -> Option<LoadedAgent> {
+        let id = loaded.definition.id.clone();
+        match self.by_id.get(&id) {
+            Some(existing) if existing.source.priority() > loaded.source.priority() => {
+                // existing has higher priority, hand the new one back untouched
+                Some(loaded)
+            }
+            _ => self.by_id.insert(id, loaded),
+        }
+    }
+
+    /// Register a builtin agent. Builtins have the lowest priority and
+    /// are overridable by both user and project files of the same id.
+    pub fn register_builtin(
+        &mut self,
+        definition: AgentDefinition,
+    ) -> Result<(), DefinitionError> {
+        definition.validate()?;
+        self.insert(LoadedAgent {
+            definition,
+            source: AgentSource::Builtin,
+        });
+        Ok(())
+    }
+
+    /// Discover and load all agent files from `dir`. Non-recursive.
+    /// Files that don't end in `.toml` are skipped silently. Bad files
+    /// are recorded in `load_errors()` and skipped.
+    ///
+    /// `source_kind` decides whether each loaded file is tagged as
+    /// `UserGlobal` or `ProjectLocal`.
+    pub fn load_directory(
+        &mut self,
+        dir: &Path,
+        source_kind: SourceKind,
+    ) -> Result<usize, std::io::Error> {
+        if !dir.exists() {
+            return Ok(0);
+        }
+        let mut loaded = 0;
+        for entry in std::fs::read_dir(dir)? {
+            let entry = match entry {
+                Ok(e) => e,
+                Err(err) => {
+                    self.load_errors.push(LoadError::Io {
+                        path: dir.to_path_buf(),
+                        source: err,
+                    });
+                    continue;
+                }
+            };
+            let path = entry.path();
+            if path.extension().and_then(|s| s.to_str()) != Some("toml") {
+                continue;
+            }
+            match Self::load_file(&path) {
+                Ok(definition) => {
+                    // Compare as `OsStr` so a non-UTF-8 file name is reported as a
+                    // mismatch instead of silently skipping the check.
+                    let expected_stem = std::ffi::OsStr::new(definition.id.as_str());
+                    if path.file_stem() != Some(expected_stem) {
+                        self.load_errors.push(LoadError::FileNameMismatch {
+                            id: definition.id.clone(),
+                            path,
+                        });
+                        continue;
+                    }
+                    let source = match source_kind {
+                        SourceKind::UserGlobal => AgentSource::UserGlobal { path },
+                        SourceKind::ProjectLocal => AgentSource::ProjectLocal { path },
+                    };
+                    self.insert(LoadedAgent {
+                        definition,
+                        source,
+                    });
+                    loaded += 1;
+                }
+                Err(err) => {
+                    self.load_errors.push(err);
+                }
+            }
+        }
+        Ok(loaded)
+    }
+
+    /// Read + parse + validate a single TOML file into an `AgentDefinition`.
+    pub fn load_file(path: &Path) -> Result<AgentDefinition, LoadError> {
+        let raw = std::fs::read_to_string(path).map_err(|source| LoadError::Io {
+            path: path.to_path_buf(),
+            source,
+        })?;
+        let definition: AgentDefinition =
+            toml::from_str(&raw).map_err(|source| LoadError::Parse {
+                path: path.to_path_buf(),
+                source,
+            })?;
+        definition.validate().map_err(|source| LoadError::Invalid {
+            path: path.to_path_buf(),
+            source,
+        })?;
+        Ok(definition)
+    }
+
+    /// Convenience: load `<home_dir>/.jcode/agents` (user-global) and then
+    /// `<project_root>/.jcode/agents` (project-local). A `None` argument skips
+    /// that location; callers resolve the home directory themselves so this
+    /// crate stays dep-light (no `dirs`). A directory that cannot be listed is
+    /// recorded in `load_errors()` instead of being returned as an error.
+    pub fn discover_standard_paths(
+        &mut self,
+        home_dir: Option<&Path>,
+        project_root: Option<&Path>,
+    ) {
+        let locations = [
+            (home_dir, SourceKind::UserGlobal),
+            (project_root, SourceKind::ProjectLocal),
+        ];
+        for (base, kind) in locations {
+            let Some(base) = base else { continue };
+            let agents_dir = base.join(".jcode").join("agents");
+            if let Err(err) = self.load_directory(&agents_dir, kind) {
+                self.load_errors.push(LoadError::Io {
+                    path: agents_dir,
+                    source: err,
+                });
+            }
+        }
+    }
+}
+
+/// Tag for `load_directory` so the caller decides how loaded entries are
+/// labeled. The function itself doesn't care about jcode's path convention.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SourceKind {
+    UserGlobal, // loaded entries become `AgentSource::UserGlobal`
+    ProjectLocal, // loaded entries become `AgentSource::ProjectLocal`
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::output::OutputMode;
+    use std::fs;
+
+    fn write_toml(dir: &Path, name: &str, body: &str) {
+        let path = dir.join(name);
+        fs::write(&path, body).expect("write toml");
+    }
+
+    fn temp_dir(name: &str) -> PathBuf {
+        let base = std::env::temp_dir().join(format!(
+            "jcode-agent-registry-test-{}-{}-{}",
+            name,
+            std::process::id(),
+            // Use atomics for a per-process counter so concurrent tests don't collide.
+            COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed)
+        ));
+        let _ = fs::remove_dir_all(&base);
+        fs::create_dir_all(&base).unwrap();
+        base
+    }
+
+    static COUNTER: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);
+
+    #[test]
+    fn missing_dir_is_zero_load_not_error() {
+        let mut reg = AgentRegistry::new();
+        let n = reg
+            .load_directory(Path::new("/nonexistent/jcode-test-dir"), SourceKind::UserGlobal)
+            .unwrap();
+        assert_eq!(n, 0);
+        assert!(reg.is_empty());
+    }
+
+    #[test]
+    fn loads_minimal_agent() {
+        let dir = temp_dir("minimal");
+        write_toml(
+            &dir,
+            "file-picker.toml",
+            r#"
+                id = "file-picker"
+                display_name = "Fletcher"
+            "#,
+        );
+        let mut reg = AgentRegistry::new();
+        let n = reg.load_directory(&dir, SourceKind::ProjectLocal).unwrap();
+        assert_eq!(n, 1);
+        let loaded = reg.get("file-picker").expect("registered");
+        assert_eq!(loaded.definition.display_name, "Fletcher");
+        assert!(matches!(loaded.source, AgentSource::ProjectLocal { .. }));
+    }
+
+    #[test]
+    fn project_overrides_user_overrides_builtin() {
+        // Builtin
+        let mut reg = AgentRegistry::new();
+        let mut builtin_def = AgentDefinition {
+            id: "editor".to_string(),
+            display_name: "Builtin Editor".to_string(),
+            publisher: None,
+            version: "0.1.0".to_string(),
+            prefer_tier: None,
+            model_override: None,
+            reasoning: None,
+            tool_names: vec![],
+            spawnable_agents: vec![],
+            system_prompt: String::new(),
+            instructions_prompt: None,
+            step_prompt: None,
+            spawner_prompt: None,
+            inherit_parent_system_prompt: false,
+            include_message_history: false,
+            output_mode: OutputMode::LastMessage,
+            output_schema: None,
+        };
+        reg.register_builtin(builtin_def.clone()).unwrap();
+        assert_eq!(reg.get("editor").unwrap().definition.display_name, "Builtin Editor");
+
+        // User
+        let user_dir = temp_dir("user");
+        write_toml(
+            &user_dir,
+            "editor.toml",
+            r#"
+                id = "editor"
+                display_name = "User Editor"
+            "#,
+        );
+        reg.load_directory(&user_dir, SourceKind::UserGlobal).unwrap();
+        assert_eq!(reg.get("editor").unwrap().definition.display_name, "User Editor");
+
+        // Project
+        let proj_dir = temp_dir("proj");
+        write_toml(
+            &proj_dir,
+            "editor.toml",
+            r#"
+                id = "editor"
+                display_name = "Project Editor"
+            "#,
+        );
+        reg.load_directory(&proj_dir, SourceKind::ProjectLocal).unwrap();
+        assert_eq!(
+            reg.get("editor").unwrap().definition.display_name,
+            "Project Editor"
+        );
+
+        // Re-register builtin should NOT override the project entry.
+        // (registers via the same `insert` priority path)
+        builtin_def.display_name = "Builtin Editor v2".to_string();
+        reg.register_builtin(builtin_def).unwrap();
+        assert_eq!(
+            reg.get("editor").unwrap().definition.display_name,
+            "Project Editor",
+            "builtin should not override project-local"
+        );
+    }
+
+    #[test]
+    fn malformed_toml_collected_as_load_error() {
+        let dir = temp_dir("malformed");
+        write_toml(&dir, "bad.toml", "id = \"this is missing close quote\n");
+        let mut reg = AgentRegistry::new();
+        reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
+        assert!(reg.is_empty(), "no agents registered");
+        assert_eq!(reg.load_errors().len(), 1);
+        assert!(matches!(
+            reg.load_errors()[0],
+            LoadError::Parse { .. }
+        ));
+    }
+
+    #[test]
+    fn invalid_id_collected_as_load_error() {
+        let dir = temp_dir("invalid-id");
+        write_toml(
+            &dir,
+            "Bad_File.toml",
+            r#"
+                id = "Bad_Id"
+                display_name = "Bad"
+            "#,
+        );
+        let mut reg = AgentRegistry::new();
+        reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
+        assert!(reg.is_empty());
+        assert_eq!(reg.load_errors().len(), 1);
+        assert!(matches!(
+            reg.load_errors()[0],
+            LoadError::Invalid { .. }
+        ));
+    }
+
+    #[test]
+    fn filename_must_match_agent_id() {
+        let dir = temp_dir("name-mismatch");
+        write_toml(
+            &dir,
+            "wrong-name.toml",
+            r#"
+                id = "right-name"
+                display_name = "X"
+            "#,
+        );
+        let mut reg = AgentRegistry::new();
+        reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
+        assert!(reg.is_empty());
+        assert_eq!(reg.load_errors().len(), 1);
+        assert!(matches!(
+            reg.load_errors()[0],
+            LoadError::FileNameMismatch { .. }
+        ));
+    }
+
+    #[test]
+    fn skips_non_toml_files() {
+        let dir = temp_dir("non-toml");
+        fs::write(dir.join("README.md"), "not an agent").unwrap();
+        fs::write(dir.join("config.json"), "{}").unwrap();
+        write_toml(
+            &dir,
+            "valid.toml",
+            r#"
+                id = "valid"
+                display_name = "v"
+            "#,
+        );
+        let mut reg = AgentRegistry::new();
+        let n = reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
+        assert_eq!(n, 1);
+        assert_eq!(reg.len(), 1);
+    }
+
+    #[test]
+    fn iter_sorted_is_deterministic() {
+        let dir = temp_dir("sort");
+        for id in ["zeta", "alpha", "mid"] {
+            write_toml(
+                &dir,
+                &format!("{id}.toml"),
+                &format!(r#"id = "{id}"
+display_name = "{id}"
+"#),
+            );
+        }
+        let mut reg = AgentRegistry::new();
+        reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
+        let ids: Vec<_> = reg.iter_sorted().iter().map(|a| a.definition.id.clone()).collect();
+        assert_eq!(ids, vec!["alpha", "mid", "zeta"]);
+    }
+
+    #[test]
+    fn discover_standard_paths_reads_both() {
+        let home = temp_dir("home");
+        let proj = temp_dir("proj");
+        fs::create_dir_all(home.join(".jcode/agents")).unwrap();
+        fs::create_dir_all(proj.join(".jcode/agents")).unwrap();
+        write_toml(
+            &home.join(".jcode/agents"),
+            "user-only.toml",
+            r#"id = "user-only"
+display_name = "U"
+"#,
+        );
+        write_toml(
+            &proj.join(".jcode/agents"),
+            "project-only.toml",
+            r#"id = "project-only"
+display_name = "P"
+"#,
+        );
+        let mut reg = AgentRegistry::new();
+        reg.discover_standard_paths(Some(&home), Some(&proj));
+        assert_eq!(reg.len(), 2);
+        assert!(reg.get("user-only").is_some());
+        assert!(reg.get("project-only").is_some());
+    }
+}

From e772853f2a80d87a1f1e583099f7c1ccf486d3d6 Mon Sep 17 00:00:00 2001
From: quangdang46 <tranquangdang21@gmail.com>
Date: Mon, 25 May 2026 22:03:10 +0700
Subject: [PATCH 03/22] feat(agent-runtime): cross-ref validation + skill MAS
 bridge + sample agents (Phase 0.4-0.6)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 0.4 — Cross-reference validation:
  - ReferenceError enum (UnknownTools, UnknownSpawnableAgents) kept
    separate from DefinitionError because the runtime tool/agent
    universe isn't known at TOML-load time.
  - AgentDefinition::validate_tool_references<I, S>() and
    validate_spawn_references<I, S>() — caller passes the available
    name set, gets back a sorted, comma-joined list of unknowns.
  - 5 new tests covering the happy path, unknowns, empty lists,
    and deterministic alphabetical ordering of the error message.

  This deliberately does NOT modify src/tool/mod.rs. The whitelist
  check is a pure function over the agent definition + a name set;
  no need to refactor tool dispatch. Phase 1 will wire the actual
  tool registry into the spawn path.

Phase 0.5 — Skill MAS (#94) bridge:
  - AgentRegistry::lookup_for_skill_routing(skill_agent_id) — named
    alias of get() that documents the integration point with the
    SKILL.md `agent` field. Returns None for missing references; the
    skill activation site decides fallback policy.
  - 2 tests: hit + miss.

Phase 0.6 — Sample agents + integration test:
  - .jcode/agents/file-picker.toml — Routine tier, no message history,
    leaf agent. Demonstrates file-picker pattern adapted from Codebuff.
  - .jcode/agents/code-reviewer.toml — Thinking tier with
    inherit_parent_system_prompt=true to demonstrate the prompt-cache
    prefix-sharing trick (~90% input-token savings on cache hits).
  - tests/sample_agents.rs — integration test loads both files via the
    public AgentRegistry API and asserts shape + behavior. 4 tests.

Phase 0 totals: 49 unit + 4 integration = 53 tests, all passing.
`cargo check --bin jcode` succeeds (full workspace, 3m13s).

Phase 0 (foundation) is now complete:
  - Schema: AgentDefinition + ModelTier + OutputMode + ReasoningEffort
  - Loader: registry with priority order (project > user > builtin)
  - Validation: id format, internal invariants, cross-references
  - Sample agents demonstrating cache-hit and tier patterns
  - Skill MAS (#94) integration point established

Phase 1 (4 builtin agents + spawn_agents tool + cache benchmark) is
the next track.
---
 .jcode/agents/code-reviewer.toml              |  76 +++++++
 .jcode/agents/file-picker.toml                |  65 ++++++
 crates/jcode-agent-runtime/src/definition.rs  | 195 ++++++++++++++++++
 crates/jcode-agent-runtime/src/lib.rs         |   4 +-
 crates/jcode-agent-runtime/src/registry.rs    |  42 ++++
 .../tests/sample_agents.rs                    | 114 ++++++++++
 6 files changed, 495 insertions(+), 1 deletion(-)
 create mode 100644 .jcode/agents/code-reviewer.toml
 create mode 100644 .jcode/agents/file-picker.toml
 create mode 100644 crates/jcode-agent-runtime/tests/sample_agents.rs

diff --git a/.jcode/agents/code-reviewer.toml b/.jcode/agents/code-reviewer.toml
new file mode 100644
index 000000000..22b7e5e38
--- /dev/null
+++ b/.jcode/agents/code-reviewer.toml
@@ -0,0 +1,76 @@
+# Code reviewer agent.
+#
+# Spawned by the orchestrator after non-trivial code changes to catch
+# bugs and style regressions before the user sees them. Adapted from
+# Codebuff's `code-reviewer`.
+#
+# Why `prefer_tier = "thinking"`:
+#   Review work benefits from reasoning. A pay-per-token user with
+#   `JCODE_ROUTING_THINKING=<premium-model>` gets the right model
+#   for the right job; subscription users inherit the session model.
+#
+# Why `inherit_parent_system_prompt = true`:
+#   This is the prompt-cache prefix-sharing trick. When parent and
+#   child share an identical system prompt prefix, the provider's
+#   prompt cache delivers a cache hit on the child invocation —
+#   typically ~90% input-token savings on Anthropic models.
+#
+#   IMPORTANT: must leave `system_prompt` empty (validated). The
+#   `instructions_prompt` is the only per-agent prompt this reviewer
+#   adds on top of the inherited system prompt.
+
+id = "code-reviewer"
+display_name = "Code Reviewer"
+publisher = "jcode"
+version = "0.1.0"
+
+prefer_tier = "thinking"
+reasoning = "medium"
+
+inherit_parent_system_prompt = true
+include_message_history = true
+output_mode = "last_message"
+
+tool_names = [
+    "read",
+    "grep",
+]
+
+# Reviewers don't spawn other agents — they read, reason, and report.
+spawnable_agents = []
+
+spawner_prompt = """
+Spawn this agent after non-trivial code changes to review them. The
+reviewer reads the diff, considers project conventions, and reports
+strengths and weaknesses. Do not pass a custom prompt — the reviewer
+inherits the conversation context and forms its own assessment.
+"""
+
+# system_prompt MUST be empty when inherit_parent_system_prompt is true.
+# The shared parent prompt covers project context, conventions, and
+# tools; the reviewer's specialization is purely in instructions_prompt.
+
+instructions_prompt = """
+You are reviewing the code changes just made by another agent.
+
+Focus on:
+- Correctness: does the code do what the user asked?
+- Project conventions: imports, formatting, naming, error handling.
+- Test coverage: are new code paths exercised?
+- Edge cases: what could go wrong? What was missed?
+
+Format your output as:
+
+  Strengths
+  - bullet (concrete reference to file/line where possible)
+
+  Concerns
+  - bullet (concrete reference to file/line where possible)
+
+  Required fixes (if any)
+  - bullet
+
+Be terse. Be specific. Do not restate code that's already in the diff.
+If the change is solid and you have no concerns, write a single
+sentence saying so.
+"""
diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml
new file mode 100644
index 000000000..b6365a84d
--- /dev/null
+++ b/.jcode/agents/file-picker.toml
@@ -0,0 +1,65 @@
+# File picker agent.
+#
+# Spawned by the orchestrator to find files in the codebase that are
+# relevant to a task. Adapted from Codebuff's `file-picker` agent.
+#
+# Why `prefer_tier = "routine"`:
+#   File picking is a fuzzy-search task — a smaller/cheaper model
+#   handles it well. Pay-per-token users who set
+#   `JCODE_ROUTING_ROUTINE=<cheap-model>` save real money here.
+#   Subscription users (Claude Pro, ChatGPT Plus, ...) inherit the
+#   session model and get correctness without any tier mapping.
+#
+# Why `include_message_history = false`:
+#   File picker doesn't need to see prior edit chatter. A clean slate
+#   keeps the prompt short and avoids accidentally biasing path
+#   selection toward already-touched files.
+
+id = "file-picker"
+display_name = "Fletcher the File Fetcher"
+publisher = "jcode"
+version = "0.1.0"
+
+prefer_tier = "routine"
+reasoning = "minimal"
+
+include_message_history = false
+output_mode = "last_message"
+
+# Tools required: read project file tree + glob fallback. Whitelist is
+# checked at runtime against the tool registry; unknown tools fail loudly
+# rather than silently degrading.
+tool_names = [
+    "ls",
+    "glob",
+    "read",
+]
+
+# This agent is a leaf — it does not spawn other agents.
+spawnable_agents = []
+
+spawner_prompt = """
+Spawn this agent to find relevant files in the codebase. Provide a brief
+description of what you're looking for. The agent will return up to ~12
+file paths with one-line summaries. It does fuzzy semantic search; for
+exact-string searches, spawn a code searcher instead.
+"""
+
+system_prompt = """
+You are an expert at finding relevant files in a codebase. You have the
+project file tree and the user's request. Return the most relevant
+files, one per line, prefixed with the path. After the list, write a
+single short paragraph explaining how the files relate to the request.
+
+Do not read file contents — that is the parent agent's job.
+Do not propose changes — that is the editor's job.
+Stay focused on path discovery.
+"""
+
+instructions_prompt = """
+Provide an extremely concise report:
+1. List of relevant file paths (one per line).
+2. One paragraph (<= 4 sentences) explaining the relevance.
+
+Do not exceed 12 paths unless the parent explicitly asks for more.
+"""
diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
index 3e2203e8b..a067668c6 100644
--- a/crates/jcode-agent-runtime/src/definition.rs
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -199,6 +199,35 @@ pub enum DefinitionError {
     DuplicateSpawnable { id: String, spawn: String },
 }
 
+/// Errors returned when cross-referencing an agent against the runtime
+/// tool/agent universe (i.e. checking that `tool_names` actually exist).
+///
+/// These are **separate from `DefinitionError`** because the runtime
+/// universe isn't known at TOML-load time — it depends on feature flags,
+/// MCP server connections, and the resolved agent registry. Callers
+/// invoke `validate_tool_references` / `validate_spawn_references` at
+/// agent spawn time.
+#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
+pub enum ReferenceError {
+    #[error(
+        "agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}"
+    )]
+    UnknownTools {
+        id: String,
+        unknown: String,
+        available: String,
+    },
+
+    #[error(
+        "agent `{id}` references unknown sub-agent(s): {unknown}. Available agents: {available}"
+    )]
+    UnknownSpawnableAgents {
+        id: String,
+        unknown: String,
+        available: String,
+    },
+}
+
 impl AgentDefinition {
     /// Validate id format + cross-field invariants. Returns `Ok(())` when
     /// the definition is well-formed.
@@ -265,6 +294,88 @@ impl AgentDefinition {
             current_session_model,
         )
     }
+
+    /// Check that every entry in `tool_names` exists in the caller-provided
+    /// universe of tool names. Returns the list of unknown tools when any
+    /// fail. Caller decides whether unknown tools are fatal (likely yes
+    /// for production agents, no for under-development agents).
+    ///
+    /// Empty `tool_names` always validates — agents with no tools are
+    /// legal (e.g. pure-prompt summarizer).
+    pub fn validate_tool_references<I, S>(&self, available: I) -> Result<(), ReferenceError>
+    where
+        I: IntoIterator<Item = S>,
+        S: AsRef<str>,
+    {
+        let available: std::collections::HashSet<String> = available
+            .into_iter()
+            .map(|s| s.as_ref().to_string())
+            .collect();
+        let unknown: Vec<&String> = self
+            .tool_names
+            .iter()
+            .filter(|name| !available.contains(name.as_str()))
+            .collect();
+        if unknown.is_empty() {
+            return Ok(());
+        }
+        let mut sorted_unknown: Vec<&String> = unknown;
+        sorted_unknown.sort();
+        let mut sorted_available: Vec<&String> = available.iter().collect();
+        sorted_available.sort();
+        Err(ReferenceError::UnknownTools {
+            id: self.id.clone(),
+            unknown: sorted_unknown
+                .iter()
+                .map(|s| s.as_str())
+                .collect::<Vec<_>>()
+                .join(", "),
+            available: sorted_available
+                .iter()
+                .map(|s| s.as_str())
+                .collect::<Vec<_>>()
+                .join(", "),
+        })
+    }
+
+    /// Check that every entry in `spawnable_agents` exists in the caller-
+    /// provided universe of agent ids. Returns unknown agents when any
+    /// fail. Same semantics as `validate_tool_references`.
+    pub fn validate_spawn_references<I, S>(&self, available: I) -> Result<(), ReferenceError>
+    where
+        I: IntoIterator<Item = S>,
+        S: AsRef<str>,
+    {
+        let available: std::collections::HashSet<String> = available
+            .into_iter()
+            .map(|s| s.as_ref().to_string())
+            .collect();
+        let unknown: Vec<&String> = self
+            .spawnable_agents
+            .iter()
+            .filter(|name| !available.contains(name.as_str()))
+            .collect();
+        if unknown.is_empty() {
+            return Ok(());
+        }
+        let mut sorted_unknown: Vec<&String> = unknown;
+        sorted_unknown.sort();
+        let mut sorted_available: Vec<&String> = available.iter().collect();
+        sorted_available.sort();
+        Err(ReferenceError::UnknownSpawnableAgents {
+            id: self.id.clone(),
+            unknown: sorted_unknown
+                .iter()
+                .map(|s| s.as_str())
+                .collect::<Vec<_>>()
+                .join(", "),
+            available: sorted_available
+                .iter()
+                .map(|s| s.as_str())
+                .collect::<Vec<_>>()
+                .join(", "),
+        })
+    }
 }
 
 /// Agent ids are intentionally restrictive: lowercase ASCII letters, digits,
@@ -492,4 +603,88 @@ mod tests {
         let d: AgentDefinition = toml::from_str(src).expect("parse");
         d.validate().expect("validate");
     }
+
+    // -----------------------------------------------------------------
+    // Cross-reference validation (Phase 0.4)
+    // -----------------------------------------------------------------
+    #[test]
+    fn validate_tool_references_passes_when_all_known() {
+        let mut d = minimal_definition("editor");
+        d.tool_names = vec!["read".to_string(), "write_file".to_string()];
+        d.validate_tool_references(["read", "write_file", "str_replace"])
+            .expect("all tools known");
+    }
+
+    #[test]
+    fn validate_tool_references_fails_with_unknown_tools() {
+        let mut d = minimal_definition("editor");
+        d.tool_names = vec!["read".to_string(), "magic".to_string()];
+        let err = d
+            .validate_tool_references(["read", "write_file"])
+            .expect_err("magic is unknown");
+        match err {
+            ReferenceError::UnknownTools {
+                id,
+                unknown,
+                available,
+            } => {
+                assert_eq!(id, "editor");
+                assert_eq!(unknown, "magic");
+                assert!(available.contains("read"));
+                assert!(available.contains("write_file"));
+            }
+            other => panic!("expected UnknownTools, got {:?}", other),
+        }
+    }
+
+    #[test]
+    fn validate_tool_references_empty_tool_names_always_ok() {
+        let d = minimal_definition("ask");
+        // tool_names is empty by default; supplying empty universe is also fine.
+        d.validate_tool_references(Vec::<String>::new())
+            .expect("empty tool list always valid");
+    }
+
+    #[test]
+    fn validate_spawn_references_passes_when_all_known() {
+        let mut d = minimal_definition("base");
+        d.spawnable_agents = vec!["file-picker".to_string(), "editor".to_string()];
+        d.validate_spawn_references(["file-picker", "editor", "reviewer"])
+            .expect("all known");
+    }
+
+    #[test]
+    fn validate_spawn_references_fails_with_unknown_agents() {
+        let mut d = minimal_definition("base");
+        d.spawnable_agents = vec!["file-picker".to_string(), "ghost".to_string()];
+        let err = d
+            .validate_spawn_references(["file-picker", "editor"])
+            .expect_err("ghost unknown");
+        match err {
+            ReferenceError::UnknownSpawnableAgents {
+                id,
+                unknown,
+                available: _,
+            } => {
+                assert_eq!(id, "base");
+                assert_eq!(unknown, "ghost");
+            }
+            other => panic!("expected UnknownSpawnableAgents, got {:?}", other),
+        }
+    }
+
+    #[test]
+    fn validate_references_unknown_list_is_sorted_and_comma_joined() {
+        let mut d = minimal_definition("agent");
+        d.tool_names = vec!["zeta".to_string(), "alpha".to_string(), "mid".to_string()];
+        let err = d
+            .validate_tool_references(Vec::<&str>::new())
+            .expect_err("none known");
+        match err {
+            ReferenceError::UnknownTools { unknown, .. } => {
+                assert_eq!(unknown, "alpha, mid, zeta", "alphabetical order");
+            }
+            _ => unreachable!(),
+        }
+    }
 }
diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs
index f7f8ea85e..b78ad983f 100644
--- a/crates/jcode-agent-runtime/src/lib.rs
+++ b/crates/jcode-agent-runtime/src/lib.rs
@@ -38,7 +38,9 @@ pub use signals::{
 };
 
 // New public surface (Phase 0).
-pub use definition::{AgentDefinition, DefinitionError, DEFAULT_AGENT_VERSION};
+pub use definition::{
+    AgentDefinition, DefinitionError, ReferenceError, DEFAULT_AGENT_VERSION,
+};
 pub use output::OutputMode;
 pub use reasoning::ReasoningEffort;
 pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind};
diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs
index 2249b046c..71cab810d 100644
--- a/crates/jcode-agent-runtime/src/registry.rs
+++ b/crates/jcode-agent-runtime/src/registry.rs
@@ -139,6 +139,22 @@ impl AgentRegistry {
         v
     }
 
+    /// Look up an agent referenced by a Skill MAS field (#94).
+    ///
+    /// `SKILL.md` front-matter has an optional `agent: <id>` field that
+    /// routes skill activation to a specific sub-agent rather than the
+    /// main agent. The id format is identical to `AgentDefinition::id`,
+    /// so this is functionally `get(id)` — the named alias exists to
+    /// document the integration point and keep future skill-routing
+    /// logic discoverable.
+    ///
+    /// Returns `None` if the skill references an unknown agent. The
+    /// caller (skill activation site) decides whether to log a warning
+    /// or fall back to the main agent.
+    pub fn lookup_for_skill_routing(&self, skill_agent_id: &str) -> Option<&LoadedAgent> {
+        self.get(skill_agent_id)
+    }
+
     /// Non-fatal errors accumulated during discovery.
     pub fn load_errors(&self) -> &[LoadError] {
         &self.load_errors
@@ -501,6 +517,32 @@ display_name = "{id}"
         assert_eq!(ids, vec!["alpha", "mid", "zeta"]);
     }
 
+    #[test]
+    fn lookup_for_skill_routing_finds_agent() {
+        let dir = temp_dir("skill-mas-hit");
+        write_toml(
+            &dir,
+            "code-reviewer.toml",
+            r#"id = "code-reviewer"
+display_name = "Reviewer"
+"#,
+        );
+        let mut reg = AgentRegistry::new();
+        reg.load_directory(&dir, SourceKind::ProjectLocal).unwrap();
+        // Skill front-matter `agent: code-reviewer` → registry lookup.
+        let found = reg.lookup_for_skill_routing("code-reviewer");
+        assert!(found.is_some());
+        assert_eq!(found.unwrap().definition.id, "code-reviewer");
+    }
+
+    #[test]
+    fn lookup_for_skill_routing_returns_none_for_unknown_agent() {
+        let reg = AgentRegistry::new();
+        // Caller (skill activation site) decides how to handle a missing
+        // routing target — we just report None.
+        assert!(reg.lookup_for_skill_routing("nonexistent").is_none());
+    }
+
     #[test]
     fn discover_standard_paths_reads_both() {
         let home = temp_dir("home");
diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs
new file mode 100644
index 000000000..fc542fed4
--- /dev/null
+++ b/crates/jcode-agent-runtime/tests/sample_agents.rs
@@ -0,0 +1,114 @@
+//! Integration test: load the bundled sample agents in
+//! `<project>/.jcode/agents/` and assert the registry behaves as
+//! documented.
+//!
+//! Lives in `tests/` so it exercises the public API the way real callers
+//! will (the `jcode` binary, the future `cli/agents` module, etc.).
+//!
+//! If a future PR moves the sample agents elsewhere, update `SAMPLES_DIR`.
+
+use std::path::PathBuf;
+
+use jcode_agent_runtime::{
+    AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind,
+};
+
+/// Path to the project-root sample agents directory, relative to the
+/// crate manifest. Deliberately constructed via `CARGO_MANIFEST_DIR` so
+/// `cargo test --workspace` works regardless of the cwd the runner
+/// chooses.
+fn samples_dir() -> PathBuf {
+    let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
+    // crates/jcode-agent-runtime → ../../ .jcode/agents
+    crate_dir.parent().unwrap().parent().unwrap().join(".jcode/agents")
+}
+
+#[test]
+fn loads_bundled_sample_agents() {
+    let dir = samples_dir();
+    assert!(
+        dir.exists(),
+        "sample agents directory missing: {}",
+        dir.display(),
+    );
+
+    let mut reg = AgentRegistry::new();
+    let n = reg
+        .load_directory(&dir, SourceKind::ProjectLocal)
+        .expect("load_directory");
+    assert!(n >= 2, "expected at least 2 sample agents, got {n}");
+    assert!(reg.load_errors().is_empty(), "load errors: {:?}", reg.load_errors());
+}
+
+#[test]
+fn file_picker_sample_has_expected_shape() {
+    let dir = samples_dir();
+    let mut reg = AgentRegistry::new();
+    reg.load_directory(&dir, SourceKind::ProjectLocal)
+        .expect("load_directory");
+
+    let agent = reg
+        .get("file-picker")
+        .expect("file-picker registered")
+        .definition
+        .clone();
+
+    assert_eq!(agent.display_name, "Fletcher the File Fetcher");
+    assert_eq!(agent.prefer_tier, Some(ModelTier::Routine));
+    assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal));
+    assert!(!agent.include_message_history, "file picker uses clean slate");
+    assert!(!agent.inherit_parent_system_prompt);
+    assert_eq!(agent.output_mode, OutputMode::LastMessage);
+    assert!(agent.tool_names.iter().any(|t| t == "read"));
+    assert!(agent.spawnable_agents.is_empty(), "leaf agent");
+
+    // Resolve model with no env vars set should fall back to the
+    // session's current model.
+    let resolved = agent.resolve_model("session-model");
+    assert_eq!(
+        resolved, "session-model",
+        "no JCODE_ROUTING_ROUTINE → session default"
+    );
+}
+
+#[test]
+fn code_reviewer_uses_inherit_parent_system_prompt_for_cache_hit() {
+    let dir = samples_dir();
+    let mut reg = AgentRegistry::new();
+    reg.load_directory(&dir, SourceKind::ProjectLocal)
+        .expect("load_directory");
+
+    let agent = &reg
+        .get("code-reviewer")
+        .expect("code-reviewer registered")
+        .definition;
+
+    assert!(
+        agent.inherit_parent_system_prompt,
+        "reviewer must inherit parent system prompt for prompt-cache hits"
+    );
+    assert!(
+        agent.system_prompt.is_empty(),
+        "system_prompt must be empty when inheriting (enforced by validation)"
+    );
+    assert!(
+        agent.include_message_history,
+        "reviewer needs context of the change it's reviewing"
+    );
+    assert_eq!(agent.prefer_tier, Some(ModelTier::Thinking));
+}
+
+#[test]
+fn sample_agents_validate_cleanly() {
+    let dir = samples_dir();
+    let mut reg = AgentRegistry::new();
+    reg.load_directory(&dir, SourceKind::ProjectLocal)
+        .expect("load_directory");
+
+    for loaded in reg.iter() {
+        loaded
+            .definition
+            .validate()
+            .unwrap_or_else(|err| panic!("{} failed validation: {err}", loaded.definition.id));
+    }
+}

From 170852ff3ada1e8ddff5bdb95c926165e42e6851 Mon Sep 17 00:00:00 2001
From: quangdang46 <tranquangdang21@gmail.com>
Date: Tue, 26 May 2026 08:24:34 +0700
Subject: [PATCH 04/22] =?UTF-8?q?feat(multi-agent-foundation):=20Phase=201?=
 =?UTF-8?q?-5=20additions=20=E2=80=94=20jbench=20scaffold,=20prompt=20util?=
 =?UTF-8?q?ities,=20sample=20agents?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 1: two real TOML agent definitions (basher + editor) with full schema
Phase 4: `prompt_placeholders.rs` — `{{FILE_TREE}}`, `{{CURRENT_DATE}}`, etc.
Phase 4: `wrap_as_system_reminder()` in `src/agent/prompting.rs`
Phase 5: `evals/jbench/` scaffold — types, judge stub, lessons stub, agent_runner stub
Phase 0.6: integration tests `basher_sample_has_expected_shape` + `editor_sample_has_expected_shape`

All jcode-agent-runtime tests pass (49 unit + 6 integration).
---
 .jcode/agents/basher.toml                     |  75 +++++++
 .jcode/agents/editor.toml                     |  87 ++++++++
 Cargo.lock                                    |  12 ++
 Cargo.toml                                    |   1 +
 .../tests/sample_agents.rs                    |  86 ++++++++
 evals/jbench/Cargo.toml                       |  24 +++
 evals/jbench/README.md                        | 110 ++++++++++
 evals/jbench/src/agent_runner.rs              |  70 ++++++
 evals/jbench/src/bin/jbench.rs                |  69 ++++++
 evals/jbench/src/judge.rs                     |  60 ++++++
 evals/jbench/src/lessons.rs                   |  65 ++++++
 evals/jbench/src/lib.rs                       |  19 ++
 evals/jbench/src/types.rs                     | 173 +++++++++++++++
 evals/jbench/tests/types.rs                   | 108 ++++++++++
 src/agent/prompting.rs                        |  43 ++++
 src/lib.rs                                    |   1 +
 src/prompt_placeholders.rs                    | 200 ++++++++++++++++++
 17 files changed, 1203 insertions(+)
 create mode 100644 .jcode/agents/basher.toml
 create mode 100644 .jcode/agents/editor.toml
 create mode 100644 evals/jbench/Cargo.toml
 create mode 100644 evals/jbench/README.md
 create mode 100644 evals/jbench/src/agent_runner.rs
 create mode 100644 evals/jbench/src/bin/jbench.rs
 create mode 100644 evals/jbench/src/judge.rs
 create mode 100644 evals/jbench/src/lessons.rs
 create mode 100644 evals/jbench/src/lib.rs
 create mode 100644 evals/jbench/src/types.rs
 create mode 100644 evals/jbench/tests/types.rs
 create mode 100644 src/prompt_placeholders.rs

diff --git a/.jcode/agents/basher.toml b/.jcode/agents/basher.toml
new file mode 100644
index 000000000..c726b51db
--- /dev/null
+++ b/.jcode/agents/basher.toml
@@ -0,0 +1,75 @@
+# Basher agent.
+#
+# Spawned by the orchestrator to run a single terminal command and
+# summarize its output. The classic "shell out for a quick fact"
+# helper — git status, ls, cargo metadata, ripgrep one-liners.
+#
+# Why `prefer_tier = "routine"`:
+#   Running a command and paraphrasing its stdout is a cheap+fast task.
+#   A pay-per-token user with `JCODE_ROUTING_ROUTINE=<cheap-model>`
+#   keeps the cost of these frequent leaf calls low. Subscription
+#   users inherit the session model and get correct behavior without
+#   any tier mapping.
+#
+# Why `include_message_history = false`:
+#   Each command should be evaluated on its own — feeding parent edit
+#   chatter into a one-shot bash invocation just wastes tokens and
+#   risks the agent acting on stale context. Clean slate per command.
+#
+# Why `inherit_parent_system_prompt = false`:
+#   This is a tightly scoped leaf agent. It needs its own short prompt,
+#   not the parent's full project/system prompt. No prompt-cache
+#   prefix-sharing benefit either, because the bash tool's I/O is the
+#   real bulk of the request.
+#
+# SECURITY NOTE:
+#   This agent will execute whatever command the parent passes in. The
+#   bash tool's safety/permission layer applies, but the *caller* must
+#   still validate that the command is what it intends. Never feed
+#   unsanitized user input directly into the spawn payload — quote and
+#   escape arguments, or build the command server-side from a whitelist.
+
+id = "basher"
+display_name = "Basher"
+publisher = "jcode"
+version = "0.1.0"
+
+prefer_tier = "routine"
+reasoning = "minimal"
+
+include_message_history = false
+inherit_parent_system_prompt = false
+output_mode = "last_message"
+
+# Single tool: jcode's terminal command runner.
+tool_names = ["bash"]
+
+# Leaf agent — does not spawn other agents.
+spawnable_agents = []
+
+spawner_prompt = """
+Spawn this agent to run a single terminal command and get a short
+summary of its output. Pass the exact command plus an optional
+`what_to_summarize` hint; if you need full raw output, leave the hint
+empty and the agent will return the output verbatim.
+"""
+
+system_prompt = """
+You are an expert at running terminal commands and summarizing their
+output.
+
+Inputs you receive:
+- the command to run (required).
+- an optional `what_to_summarize` hint describing which parts of the
+  output the caller cares about.
+
+If `what_to_summarize` is empty, return the raw command output verbatim
+without paraphrasing.
+"""
+
+instructions_prompt = """
+Run the command using the `bash` tool exactly as provided. Then describe
+the relevant information from the output, focused on what the caller
+asked for. Be concise. Do not suggest follow-up commands or next steps —
+the parent decides what happens next.
+"""
diff --git a/.jcode/agents/editor.toml b/.jcode/agents/editor.toml
new file mode 100644
index 000000000..28aed4d01
--- /dev/null
+++ b/.jcode/agents/editor.toml
@@ -0,0 +1,87 @@
+# Code editor agent.
+#
+# Spawned by the orchestrator to perform precise, reasoned code edits.
+# Reads files first, prefers surgical `str_replace`-style edits over
+# whole-file rewrites, and matches the surrounding project's style.
+#
+# Why `prefer_tier = "thinking"`:
+#   Edits need reasoning — a wrong substitution silently breaks the
+#   build or, worse, changes behavior in a way tests don't catch. A
+#   pay-per-token user with `JCODE_ROUTING_THINKING=<premium-model>`
+#   gets the right tool for the job; subscription users inherit the
+#   session model.
+#
+# Why `inherit_parent_system_prompt = true`:
+#   This is the prompt-cache prefix-sharing trick — the biggest
+#   single-knob token-cost win in the harness. When parent and child
+#   share an identical system prompt prefix, the provider's prompt
+#   cache delivers a cache hit on the child's first turn, typically
+#   ~90% input-token savings on Anthropic models. The editor is one
+#   of the most-spawned sub-agents, so this matters.
+#
+#   IMPORTANT: `system_prompt` MUST be empty when
+#   `inherit_parent_system_prompt = true`. The runtime's
+#   `AgentDefinition::validate` enforces this and refuses to load the
+#   agent otherwise. Per-agent specialization lives in
+#   `instructions_prompt` only.
+#
+# Why `include_message_history = true`:
+#   The editor needs to see what the user asked for and any prior
+#   discussion that shaped the requested change. Without history it
+#   would re-derive context the parent already has.
+
+id = "editor"
+display_name = "Code Editor"
+publisher = "jcode"
+version = "0.1.0"
+
+prefer_tier = "thinking"
+reasoning = "medium"
+
+inherit_parent_system_prompt = true
+include_message_history = true
+output_mode = "all_messages"
+
+# system_prompt MUST be empty when inherit_parent_system_prompt = true
+# (validated at load time). Specialization is purely in
+# instructions_prompt below.
+
+# Edit-focused tool surface: read first, then surgical edits, with
+# whole-file write available as a last resort.
+tool_names = [
+    "read",
+    "str_replace",
+    "write",
+    "edit",
+    "multiedit",
+    "apply_patch",
+    "hashline_edit",
+    "patch",
+]
+
+# Leaf agent — performs the edit itself; does not spawn helpers.
+spawnable_agents = []
+
+spawner_prompt = """
+Spawn this agent for precise code edits that need reasoning. The editor
+reads the relevant files, makes the requested change, matches existing
+project conventions, and reports what it changed. Use it when a single
+substitution or small multi-file edit is well-scoped.
+"""
+
+instructions_prompt = """
+You are an expert code editor.
+
+Make the requested edit:
+1. Read the target file(s) first to confirm current contents.
+2. Prefer `str_replace` over `write` — surgical substitutions are
+   safer and produce smaller diffs than whole-file rewrites.
+3. Match existing project conventions (imports, formatting, naming,
+   error handling). Look at sibling code if unsure.
+4. Do not introduce new dependencies. If the change appears to need
+   one, stop and report instead of adding it.
+
+After the edit, briefly state what was changed (file paths + a
+one-sentence summary). Do not restate code already visible in the
+edit's diff.
+"""
diff --git a/Cargo.lock b/Cargo.lock
index 20990af27..c2ee30bdc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3782,6 +3782,18 @@ dependencies = [
  "sha2 0.10.9",
 ]
 
+[[package]]
+name = "jcode-jbench"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "jcode-agent-runtime",
+ "serde",
+ "serde_json",
+ "tokio",
+]
+
 [[package]]
 name = "jcode-memory-types"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index d47e95a80..44a9463d2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -58,6 +58,7 @@ members = [
     "crates/jcode-mobile-core",
     "crates/jcode-mobile-sim",
     "crates/jcode-desktop",
+    "evals/jbench",
 ]
 
 [lib]
diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs
index fc542fed4..e850495d5 100644
--- a/crates/jcode-agent-runtime/tests/sample_agents.rs
+++ b/crates/jcode-agent-runtime/tests/sample_agents.rs
@@ -112,3 +112,89 @@ fn sample_agents_validate_cleanly() {
             .unwrap_or_else(|err| panic!("{} failed validation: {err}", loaded.definition.id));
     }
 }
+
+#[test]
+fn basher_sample_has_expected_shape() {
+    let dir = samples_dir();
+    let mut reg = AgentRegistry::new();
+    reg.load_directory(&dir, SourceKind::ProjectLocal)
+        .expect("load_directory");
+
+    let agent = reg
+        .get("basher")
+        .expect("basher registered")
+        .definition
+        .clone();
+
+    assert_eq!(agent.id, "basher");
+    assert_eq!(agent.display_name, "Basher");
+    assert_eq!(agent.prefer_tier, Some(ModelTier::Routine));
+    assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal));
+    assert!(
+        !agent.include_message_history,
+        "basher uses a clean slate per command"
+    );
+    assert!(
+        !agent.inherit_parent_system_prompt,
+        "basher has its own short system prompt"
+    );
+    assert_eq!(agent.output_mode, OutputMode::LastMessage);
+    assert_eq!(agent.tool_names, vec!["bash"]);
+    assert!(agent.spawnable_agents.is_empty(), "leaf agent");
+
+    // NOTE(review): assumes JCODE_ROUTING_ROUTINE is unset in the test process; confirm for CI/dev shells.
+    let resolved = agent.resolve_model("session-model");
+    assert_eq!(
+        resolved, "session-model",
+        "no JCODE_ROUTING_ROUTINE → session default"
+    );
+}
+
+#[test]
+fn editor_sample_has_expected_shape() {
+    let dir = samples_dir();
+    let mut reg = AgentRegistry::new();
+    reg.load_directory(&dir, SourceKind::ProjectLocal)
+        .expect("load_directory");
+
+    let agent = reg
+        .get("editor")
+        .expect("editor registered")
+        .definition
+        .clone();
+
+    assert_eq!(agent.id, "editor");
+    assert_eq!(agent.display_name, "Code Editor");
+    assert_eq!(agent.prefer_tier, Some(ModelTier::Thinking));
+    assert_eq!(agent.reasoning, Some(ReasoningEffort::Medium));
+    assert!(
+        agent.include_message_history,
+        "editor needs to see what the user asked for"
+    );
+    assert!(
+        agent.inherit_parent_system_prompt,
+        "editor must inherit parent system prompt for prompt-cache hits"
+    );
+    assert!(
+        agent.system_prompt.is_empty(),
+        "system_prompt must be empty when inheriting (enforced by validation)"
+    );
+    assert_eq!(agent.output_mode, OutputMode::AllMessages);
+    for expected in [
+        "read",
+        "str_replace",
+        "write",
+        "edit",
+        "multiedit",
+        "apply_patch",
+        "hashline_edit",
+        "patch",
+    ] {
+        assert!(
+            agent.tool_names.iter().any(|t| t == expected),
+            "editor tool_names missing `{expected}`: {:?}",
+            agent.tool_names,
+        );
+    }
+    assert!(agent.spawnable_agents.is_empty(), "leaf agent");
+}
diff --git a/evals/jbench/Cargo.toml b/evals/jbench/Cargo.toml
new file mode 100644
index 000000000..aad01216c
--- /dev/null
+++ b/evals/jbench/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "jcode-jbench"
+version = "0.1.0"
+edition = "2024"
+description = "JBench — jcode's git-commit-reconstruction eval framework (scaffold)"
+
+[lib]
+name = "jcode_jbench"
+path = "src/lib.rs"
+
+[[bin]]
+name = "jbench"
+path = "src/bin/jbench.rs"
+
+[dependencies]
+jcode-agent-runtime = { path = "../../crates/jcode-agent-runtime" }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+anyhow = "1"
+tokio = { version = "1", default-features = false, features = ["sync"] }
+clap = { version = "4", features = ["derive"] }
+
+[dev-dependencies]
+serde_json = "1"
diff --git a/evals/jbench/README.md b/evals/jbench/README.md
new file mode 100644
index 000000000..ffd7c01a6
--- /dev/null
+++ b/evals/jbench/README.md
@@ -0,0 +1,110 @@
+# JBench
+
+JBench is jcode's evaluation framework for measuring AI coding agent
+performance through real-world git commit reconstruction tasks. It is the
+Rust port and adaptation of [Codebuff's BuffBench](https://github.com/codebuff/codebuff/tree/main/evals/buffbench)
+to the jcode multi-agent foundation.
+
+> **Status: scaffolding.** This crate currently provides typed data
+> models, module skeletons, and a CLI shell. The actual eval
+> orchestration (cloning repos, spawning agents, calling judge models,
+> running lessons extraction) is intentionally left as `unimplemented!()`
+> stubs so reviewers can validate the shape of the public API before any
+> end-to-end behavior lands. Real implementations will arrive in Phases
+> 5.3 (`agent_runner`), 5.4 (`judge`), and 5.5 (`lessons`).
+
+## Why git commit reconstruction?
+
+The core idea, borrowed directly from BuffBench, is that real git history
+contains a near-infinite stream of well-scoped, naturally-occurring tasks
+with built-in ground truth: each commit is a self-contained change with a
+known intent (the message / spec) and a known correct outcome (the diff).
+
+For each evaluation:
+
+1. Pick a commit `C` from a target repository.
+2. Reset the working tree to `parent(C)`.
+3. Hand the agent a natural-language prompt derived from `C`'s spec.
+4. Let the agent edit the repo.
+5. Compare the agent's diff against the ground-truth diff in `C`.
+
+This yields fair head-to-head comparisons across agents because every
+agent works from the exact same starting state and is judged against the
+same target.
+
+## Three-judge median
+
+A single LLM judge is noisy. JBench follows BuffBench's approach: every
+agent diff is judged by **three** different frontier models in parallel
+(today the planned slate is `gpt-5`, `gemini-pro`, and `claude-sonnet`).
+The judge with the median `overall_score` supplies the qualitative
+analysis, while the reported scores (`completion_score`,
+`code_quality_score`, `overall_score`) are averaged across all judges
+that returned successfully.
+
+The three-judge pipeline lives in `src/judge.rs` (currently
+`unimplemented!()`). See `evals/buffbench/judge.ts` in the upstream
+Codebuff repository for the TypeScript original we are mirroring.
+
+## Lessons extractor
+
+After each run, the lessons extractor compares the agent's diff and
+trace against the ground-truth diff and emits a small list of
+`Lesson { what_went_wrong, what_should_have_been_done }` items. These
+lessons are intended to be appended to per-agent lesson files that can
+later be folded into the agent's system prompt or memory graph — the
+classic "learn from your mistakes" loop.
+
+The lessons module lives in `src/lessons.rs`.
+
+## Reuse of `jcode-agent-runtime`
+
+JBench is built on top of the new agent foundation in
+[`crates/jcode-agent-runtime`](../../crates/jcode-agent-runtime/), which
+provides:
+
+- `AgentRegistry` — discovery and loading of `.jcode/agents/*.toml`
+  agent definitions.
+- `AgentDefinition` — the declarative schema describing an agent's
+  model, tools, system prompt, output mode, etc.
+
+The agent runner (`src/agent_runner.rs`) will resolve agent IDs against
+the registry, spawn a `jcode` subprocess in a clean clone of the target
+repo, capture the trace, and return an `EvalRun` populated with the diff
+and judging result.
+
+## Module map
+
+| Module | Purpose |
+| --- | --- |
+| `types` | Serializable data structures (`EvalCommit`, `FileDiff`, `EvalDataV2`, `EvalRun`, `JudgingResult`, `AgentEvalResults`). Roundtrip-tested. |
+| `judge` | Three-judge median pipeline. **Stub.** |
+| `agent_runner` | Spawn an agent in a repo, capture trace + diff. **Stub.** |
+| `lessons` | Extract lessons from a failed/imperfect run. **Stub.** |
+| `bin/jbench.rs` | CLI: `pick-commits`, `gen-evals`, `run`, `judge`, `meta-analyze`. Each subcommand currently prints a TODO and exits 0. |
+
+## Workflow (planned)
+
+```
+pick-commits   →  select high-quality commits from a repo
+gen-evals      →  produce eval-{repo}.json with EvalDataV2 schema
+run            →  run agents against eval data, emit EvalRun per commit
+judge          →  re-judge an existing run with the 3-model median
+meta-analyze   →  aggregate analysis across all tasks for an agent
+```
+
+## Running
+
+```bash
+cargo check -p jcode-jbench
+cargo test  -p jcode-jbench
+cargo run   -p jcode-jbench --bin jbench -- run --help
+```
+
+## References
+
+- BuffBench source: <https://github.com/codebuff/codebuff/tree/main/evals/buffbench>
+- BuffBench README: `evals/buffbench/README.md` (paths are relative to the Codebuff repo root)
+- Judge design: `evals/buffbench/judge.ts`
+- Agent runner design: `evals/buffbench/agent-runner.ts`
+- Lessons extractor design: `evals/buffbench/lessons-extractor.ts`
diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs
new file mode 100644
index 000000000..1e56308ff
--- /dev/null
+++ b/evals/jbench/src/agent_runner.rs
@@ -0,0 +1,70 @@
+//! Spawn a jcode agent inside a freshly-prepared repo clone, run a
+//! single eval task, and capture the resulting diff and trace.
+//!
+//! The runner resolves the configured `agent_id` through the
+//! [`jcode_agent_runtime::AgentRegistry`] (loaded from
+//! `.jcode/agents/*.toml`), spawns the binary as a subprocess in the
+//! repo working directory, streams the trace, and finally extracts the
+//! unified diff against the parent commit.
+//!
+//! Design source: `evals/buffbench/agent-runner.ts` in the Codebuff repo.
+//!
+//! Implementation lands in Phase 5.3; for now both entry points are
+//! `unimplemented!()` stubs whose signatures fix the contract the rest
+//! of the harness will rely on.
+
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+
+use anyhow::Result;
+
+use crate::types::EvalRun;
+
+/// Configuration for a single agent evaluation run.
+///
+/// `repo_path` should already contain a clean checkout of the eval
+/// commit's parent SHA; the runner does not clone for the caller.
+#[derive(Debug, Clone)]
+pub struct AgentRunConfig {
+    /// ID of the agent to run, matching an entry in the
+    /// `jcode-agent-runtime` registry.
+    pub agent_id: String,
+    /// Natural-language prompt to send to the agent (typically
+    /// `EvalCommit::prompt`).
+    pub prompt: String,
+    /// Working directory containing the prepared repo at the parent
+    /// commit.
+    pub repo_path: PathBuf,
+    /// Hard cap on the number of agent turns before the run is
+    /// aborted; mirrors BuffBench's per-task turn budget.
+    pub max_turns: u32,
+    /// Extra environment variables applied to the agent subprocess on
+    /// top of the calling process's environment.
+    pub env: HashMap<String, String>,
+}
+
+/// Spawn the configured agent in `config.repo_path`, run it to
+/// completion (or the turn / time budget), and return an [`EvalRun`]
+/// populated with the agent's diff, judging placeholder, cost, and
+/// duration.
+///
+/// The runner is responsible for:
+/// - Capturing the agent's full trace for later analysis.
+/// - Calling [`extract_diff_from_repo`] once the agent finishes.
+/// - Invoking the judging pipeline (or leaving that to the caller —
+///   the final wiring is decided in Phase 5.3).
+pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
+    let _ = config;
+    unimplemented!("Phase 5.3: spawn jcode subprocess in repo, capture trace")
+}
+
+/// Produce a unified diff describing all uncommitted changes in
+/// `repo_path` against its currently-checked-out HEAD.
+///
+/// Used after the agent finishes editing to capture the "agent's
+/// changes" half of the judging input. The exact git invocation
+/// (likely `git diff --no-color HEAD`) is finalized in Phase 5.3.
+pub fn extract_diff_from_repo(repo_path: &Path) -> Result<String> {
+    let _ = repo_path;
+    unimplemented!("Phase 5.3: shell out to git diff and return the unified diff")
+}
diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs
new file mode 100644
index 000000000..f0193e831
--- /dev/null
+++ b/evals/jbench/src/bin/jbench.rs
@@ -0,0 +1,69 @@
+//! `jbench` CLI entry point.
+//!
+//! This is a scaffold: every subcommand prints a TODO line describing
+//! the work it will do and exits 0. The argument shape, however, is
+//! real and stable — downstream tooling (CI, scripts) can wire against
+//! these subcommands today and pick up real behavior as Phases 5.3 →
+//! 5.5 land.
+//!
+//! All real work happens through the [`jcode_jbench`] library; this
+//! binary's only job is to dispatch.
+
+use clap::{Parser, Subcommand};
+
+// Pull in the library so the binary depends on it (and fails to
+// compile if its public surface regresses).
+use jcode_jbench as _;
+
+/// Top-level `jbench` CLI.
+#[derive(Debug, Parser)]
+#[command(
+    name = "jbench",
+    about = "JBench — jcode's git-commit-reconstruction eval framework",
+    version
+)]
+struct Cli {
+    /// Subcommand to dispatch to.
+    #[command(subcommand)]
+    command: Command,
+}
+
+/// JBench subcommands. Each is a stub today; see `README.md` for the
+/// intended workflow.
+#[derive(Debug, Subcommand)]
+enum Command {
+    /// Select high-quality commits from a target repo to use as eval
+    /// tasks.
+    PickCommits,
+    /// Generate an `eval-{repo}.json` file (`EvalDataV2`) from a list
+    /// of picked commits.
+    GenEvals,
+    /// Run one or more agents against an eval data file and emit
+    /// per-commit `EvalRun`s.
+    Run,
+    /// Re-judge an existing run with the three-judge median pipeline.
+    Judge,
+    /// Aggregate and analyze results across all tasks for an agent.
+    MetaAnalyze,
+}
+
+fn main() {
+    let cli = Cli::parse();
+    match cli.command {
+        Command::PickCommits => {
+            println!("TODO: jbench pick-commits — Phase 5.2 will implement commit selection.");
+        }
+        Command::GenEvals => {
+            println!("TODO: jbench gen-evals — Phase 5.2 will implement eval-data generation.");
+        }
+        Command::Run => {
+            println!("TODO: jbench run — Phase 5.3 will implement agent_runner orchestration.");
+        }
+        Command::Judge => {
+            println!("TODO: jbench judge — Phase 5.4 will implement three-judge median scoring.");
+        }
+        Command::MetaAnalyze => {
+            println!("TODO: jbench meta-analyze — Phase 5.6 will implement cross-task aggregation.");
+        }
+    }
+}
diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs
new file mode 100644
index 000000000..170a28203
--- /dev/null
+++ b/evals/jbench/src/judge.rs
@@ -0,0 +1,60 @@
+//! Three-judge median pipeline.
+//!
+//! Each agent diff is graded by **three** frontier models in parallel
+//! (planned slate: `gpt-5`, `gemini-pro`, `claude-sonnet`); the median
+//! `overall_score` selects which judge's qualitative analysis is
+//! reported, while the per-dimension scores are averaged across all
+//! valid judges. This mirrors the design of BuffBench's
+//! `judgeCommitResult` in `evals/buffbench/judge.ts` (Codebuff repo).
+//!
+//! The actual provider plumbing (which talks to each judge model
+//! through the existing jcode provider registry) lands in Phase 5.4.
+//! Until then both entry points are `unimplemented!()` stubs whose
+//! signatures define the public surface the rest of the harness will
+//! depend on.
+
+use std::collections::HashMap;
+
+use anyhow::Result;
+
+use crate::types::{EvalCommit, JudgingResult};
+
+/// Judge an agent's diff against the ground truth using three models in
+/// parallel and return a [`JudgingResult`] whose qualitative analysis
+/// comes from the median judge and whose numeric scores are averaged
+/// across all judges that returned successfully.
+///
+/// Why median + average?
+/// - **Median analysis** picks a representative voice and avoids the
+///   outlier judge dominating the prose.
+/// - **Average scores** smooth out judge-specific bias so the canonical
+///   overall metric tracks consensus, not whichever model happened to
+///   be selected.
+///
+/// Design source: `judgeCommitResult` in `evals/buffbench/judge.ts`
+/// (Codebuff repo). Currently a stub: calling it panics until Phase 5.4.
+///
+/// `context_files` is a `path -> contents` map of supplemental files
+/// from the parent commit; the judges receive these inline in the
+/// prompt to ground their evaluation.
+pub async fn judge_with_three_models(
+    commit: &EvalCommit,
+    agent_diff: &str,
+    context_files: &HashMap<String, String>,
+) -> Result<JudgingResult> {
+    let _ = (commit, agent_diff, context_files);
+    unimplemented!("Phase 5.4: run gpt-5 / gemini-pro / sonnet judges in parallel and return median+average")
+}
+
+/// Invoke a single judge model with a fully-rendered prompt.
+///
+/// Intended for internal use by [`judge_with_three_models`] and exposed
+/// publicly so callers can re-judge a stored run with a different model
+/// without re-running the full three-judge pipeline.
+///
+/// Design source: `runSingleJudge` in `evals/buffbench/judge.ts`
+/// (Codebuff repo). Currently a stub: calling it panics until Phase 5.4.
+pub async fn run_single_judge(model_id: &str, prompt: &str) -> Result<JudgingResult> {
+    let _ = (model_id, prompt);
+    unimplemented!("Phase 5.4: wire to provider registry")
+}
diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs
new file mode 100644
index 000000000..7a919d646
--- /dev/null
+++ b/evals/jbench/src/lessons.rs
@@ -0,0 +1,65 @@
+//! Lessons extractor.
+//!
+//! After an eval run finishes, the lessons extractor compares the
+//! agent's actual diff and trace against the ground-truth diff and
+//! distills a small list of [`Lesson`]s describing what went wrong and
+//! what the agent should have done instead. These can be appended to a
+//! per-agent lessons file and folded back into the agent's system
+//! prompt or memory graph.
+//!
+//! Design source: `evals/buffbench/lessons-extractor.ts` in the Codebuff repo.
+//!
+//! Implementation lands in Phase 5.5.
+
+use std::path::Path;
+
+use anyhow::Result;
+use serde::{Deserialize, Serialize};
+
+/// One distilled lesson from a single eval run.
+///
+/// Kept deliberately minimal — both fields are free-form prose. Richer
+/// structure (severity, tags, links to specific commits) can be added
+/// later without breaking the on-disk format because lesson files are
+/// JSON arrays of this struct.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Lesson {
+    /// Concise description of the failure mode observed in the trace
+    /// or diff. One or two sentences.
+    pub what_went_wrong: String,
+    /// Concise description of the corrective behavior the agent should
+    /// have performed instead. One or two sentences.
+    pub what_should_have_been_done: String,
+}
+
+/// Run the lessons-extractor judge over a finished eval run and return
+/// zero or more [`Lesson`]s.
+///
+/// The extractor receives the prompt the agent was given, the ground
+/// truth diff for context, the diff the agent actually produced, and
+/// the agent's full trace. It returns an empty `Vec` when the run was
+/// successful enough that no corrective lesson applies.
+pub async fn extract_lessons(
+    prompt: &str,
+    ground_truth_diff: &str,
+    agent_diff: &str,
+    agent_trace: &str,
+) -> Result<Vec<Lesson>> {
+    let _ = (prompt, ground_truth_diff, agent_diff, agent_trace);
+    unimplemented!("Phase 5.5: invoke lessons-extractor judge and parse Vec<Lesson>")
+}
+
+/// Append `lessons` to the per-agent lessons file at
+/// `lessons_dir/<agent_id>.json`, creating the file (and the directory)
+/// if needed.
+///
+/// The on-disk format is a JSON array of [`Lesson`]; appending preserves
+/// previously-extracted lessons so the file accumulates over many runs.
+pub fn append_lessons_to_file(
+    agent_id: &str,
+    lessons: &[Lesson],
+    lessons_dir: &Path,
+) -> Result<()> {
+    let _ = (agent_id, lessons, lessons_dir);
+    unimplemented!("Phase 5.5: read-modify-write JSON array at lessons_dir/<agent_id>.json")
+}
diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs
new file mode 100644
index 000000000..57c5809f7
--- /dev/null
+++ b/evals/jbench/src/lib.rs
@@ -0,0 +1,19 @@
+//! JBench — jcode's git-commit-reconstruction evaluation framework.
+//!
+//! This crate is a scaffold: data types are real and roundtrip-tested,
+//! but orchestration logic is stubbed with `unimplemented!()` so that
+//! reviewers can validate the public API surface before behavior lands.
+//!
+//! See `README.md` for the design and the upstream BuffBench sources
+//! (`evals/buffbench/` in the Codebuff repo) for the TypeScript original.
+//!
+//! The crate consumes [`jcode_agent_runtime::AgentRegistry`] and
+//! [`jcode_agent_runtime::AgentDefinition`] for agent discovery and
+//! configuration; it does not redefine those concepts locally.
+
+#![forbid(unsafe_code)]
+
+pub mod agent_runner;
+pub mod judge;
+pub mod lessons;
+pub mod types;
diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs
new file mode 100644
index 000000000..3f3a9e763
--- /dev/null
+++ b/evals/jbench/src/types.rs
@@ -0,0 +1,173 @@
+//! Serializable data types modeling JBench's eval inputs and outputs.
+//!
+//! These types are direct Rust analogues of BuffBench's TypeScript types
+//! (`evals/buffbench/types.ts` in the Codebuff repo) with one deliberate
+//! deviation: every field uses `snake_case` in both the Rust definition
+//! and the on-disk JSON form, because the rest of jcode's serialized
+//! formats already follow `snake_case`.
+//!
+//! All public types derive `Debug`, `Clone`, `Serialize`, and
+//! `Deserialize`. Numeric scores are `f64` in the `[0.0, 10.0]` range —
+//! validation is not enforced at the type level so partial / in-progress
+//! results round-trip cleanly.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+/// Status of a single file inside an [`EvalCommit`]'s diff.
+///
+/// Mirrors BuffBench's `'modified' | 'added' | 'deleted' | 'renamed'`
+/// string union; serialized as lowercase strings so generated eval JSON
+/// stays compact and readable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
+pub enum FileDiffStatus {
+    /// File existed before and after, with content changes.
+    Modified,
+    /// File was created in this commit.
+    Added,
+    /// File was deleted in this commit.
+    Deleted,
+    /// File was renamed (and possibly modified) in this commit.
+    Renamed,
+}
+
+/// Per-file diff entry for a single eval commit.
+///
+/// `old_path` is populated only for `Renamed` entries; for all other
+/// statuses it is `None` and skipped during serialization to keep the
+/// JSON output compact.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FileDiff {
+    /// Current path of the file (post-commit). For renames this is the
+    /// new name.
+    pub path: String,
+    /// What kind of change this file underwent.
+    pub status: FileDiffStatus,
+    /// Previous path, only populated when `status == Renamed`.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub old_path: Option<String>,
+    /// Unified diff text for the change. May be empty for pure renames.
+    pub diff: String,
+}
+
+/// One eval task: a single git commit reconstructed from its parent.
+///
+/// The agent under test starts from `parent_sha`, is given `prompt`,
+/// and is judged against `file_diffs`. `supplemental_files` lists
+/// additional context paths the harness should preload into the agent's
+/// view (BuffBench picks these via a separate filter step).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EvalCommit {
+    /// Stable identifier for the task, typically `<short_sha>-<slug>`.
+    pub id: String,
+    /// Target commit SHA — the ground-truth state.
+    pub sha: String,
+    /// Parent commit SHA — the starting state for the agent.
+    pub parent_sha: String,
+    /// Technical specification distilled from the commit message.
+    pub spec: String,
+    /// Natural-language prompt presented to the agent under test.
+    pub prompt: String,
+    /// Extra files (relative paths) the harness should expose as
+    /// context, in addition to whatever the agent fetches itself.
+    pub supplemental_files: Vec<String>,
+    /// Ground-truth file diffs for this commit.
+    pub file_diffs: Vec<FileDiff>,
+}
+
+/// Top-level eval data file (v2 schema), produced by `gen-evals` and
+/// consumed by `run`.
+///
+/// `env` and `final_check_commands` are reserved for future use by the
+/// runner; they are part of the on-disk schema today so eval JSON files
+/// authored against this scaffold remain forward-compatible.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EvalDataV2 {
+    /// Source repository to clone for each task.
+    pub repo_url: String,
+    /// Optional override for the local clone directory name.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub test_repo_name: Option<String>,
+    /// ISO-8601 timestamp of when this eval file was generated.
+    pub generation_date: String,
+    /// Optional one-time setup command (e.g. `npm install`).
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub init_command: Option<String>,
+    /// Environment variables to apply when running agents and final
+    /// checks. Defaults to empty.
+    #[serde(default)]
+    pub env: HashMap<String, String>,
+    /// Validation commands run after the agent finishes (e.g. `cargo
+    /// test`). Defaults to empty.
+    #[serde(default)]
+    pub final_check_commands: Vec<String>,
+    /// The actual list of commits to evaluate against.
+    pub eval_commits: Vec<EvalCommit>,
+}
+
+/// Output of a single judge invocation, or the aggregate of three.
+///
+/// All three score fields are on the same `[0.0, 10.0]` scale; `f64` is
+/// used so we can also store the *averaged* per-dimension scores when
+/// aggregating multiple judges (see `judge::judge_with_three_models`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct JudgingResult {
+    /// Free-form prose comparing the agent's diff to the ground truth.
+    pub analysis: String,
+    /// Bullet-point strengths called out by the judge.
+    pub strengths: Vec<String>,
+    /// Bullet-point weaknesses called out by the judge.
+    pub weaknesses: Vec<String>,
+    /// How completely the prompt was addressed, `[0.0, 10.0]`.
+    pub completion_score: f64,
+    /// Code structure / maintainability, `[0.0, 10.0]`.
+    pub code_quality_score: f64,
+    /// Combined assessment, `[0.0, 10.0]`. JBench's canonical metric.
+    pub overall_score: f64,
+}
+
+/// Outcome of running one agent on one eval commit.
+///
+/// `error` is `Some` when the agent crashed, timed out, or otherwise
+/// failed to produce a usable diff; in that case `judging` will
+/// typically contain a zero-scored placeholder.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EvalRun {
+    /// SHA of the eval commit this run targeted.
+    pub commit_sha: String,
+    /// Prompt the agent was given.
+    pub prompt: String,
+    /// Unified diff produced by the agent against the parent commit.
+    pub diff: String,
+    /// Three-judge result (see [`crate::judge`]).
+    pub judging: JudgingResult,
+    /// Estimated USD cost of running the agent.
+    pub cost_usd: f64,
+    /// Wall-clock duration of the run in milliseconds.
+    pub duration_ms: u64,
+    /// Populated when the run failed to complete cleanly.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub error: Option<String>,
+}
+
+/// Aggregated results for one agent across an entire eval suite.
+///
+/// `average_score` averages `judging.overall_score`; cost and duration
+/// are averaged across **all** runs (including failures) so consumers
+/// can spot agents that are cheap or fast at the price of correctness.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AgentEvalResults {
+    /// ID of the agent (matches an `AgentDefinition::id` in the
+    /// `jcode-agent-runtime` registry).
+    pub agent_id: String,
+    /// Per-commit runs, in evaluation order.
+    pub runs: Vec<EvalRun>,
+    /// Mean of `judging.overall_score` across runs.
+    pub average_score: f64,
+    /// Mean of `cost_usd` across runs.
+    pub average_cost: f64,
+    /// Mean of `duration_ms` across runs.
+    pub average_duration_ms: u64,
+}
diff --git a/evals/jbench/tests/types.rs b/evals/jbench/tests/types.rs
new file mode 100644
index 000000000..2a8efd02e
--- /dev/null
+++ b/evals/jbench/tests/types.rs
@@ -0,0 +1,108 @@
+//! Serde round-trip smoke tests for the public data types.
+//!
+//! These exercise the JSON shape that `gen-evals` and `run` will read
+//! and write, and they fail loudly if anyone changes a field's
+//! `snake_case` name without updating consumers.
+
+use jcode_jbench::types::{
+    EvalCommit, FileDiff, FileDiffStatus, JudgingResult,
+};
+
+#[test]
+fn eval_commit_round_trips_through_json() {
+    let original = EvalCommit {
+        id: "abc1234-add-readme".to_string(),
+        sha: "abc1234deadbeef".to_string(),
+        parent_sha: "0011223344556677".to_string(),
+        spec: "Add a README describing the project.".to_string(),
+        prompt: "Please add a README.md at the repo root.".to_string(),
+        supplemental_files: vec!["Cargo.toml".to_string(), "src/lib.rs".to_string()],
+        file_diffs: vec![FileDiff {
+            path: "README.md".to_string(),
+            status: FileDiffStatus::Added,
+            old_path: None,
+            diff: "+++ b/README.md\n@@ -0,0 +1 @@\n+hello\n".to_string(),
+        }],
+    };
+
+    let json = serde_json::to_string(&original).expect("serialize EvalCommit");
+    // Sanity-check the wire format is snake_case as documented.
+    assert!(json.contains("\"parent_sha\""));
+    assert!(json.contains("\"supplemental_files\""));
+    assert!(json.contains("\"file_diffs\""));
+
+    let decoded: EvalCommit = serde_json::from_str(&json).expect("deserialize EvalCommit");
+    assert_eq!(decoded.id, original.id);
+    assert_eq!(decoded.sha, original.sha);
+    assert_eq!(decoded.parent_sha, original.parent_sha);
+    assert_eq!(decoded.spec, original.spec);
+    assert_eq!(decoded.prompt, original.prompt);
+    assert_eq!(decoded.supplemental_files, original.supplemental_files);
+    assert_eq!(decoded.file_diffs.len(), 1);
+    assert_eq!(decoded.file_diffs[0].path, "README.md");
+    assert!(matches!(
+        decoded.file_diffs[0].status,
+        FileDiffStatus::Added
+    ));
+}
+
+#[test]
+fn file_diff_round_trips_renamed_with_old_path() {
+    let original = FileDiff {
+        path: "src/new_name.rs".to_string(),
+        status: FileDiffStatus::Renamed,
+        old_path: Some("src/old_name.rs".to_string()),
+        diff: "rename from src/old_name.rs\nrename to src/new_name.rs\n".to_string(),
+    };
+
+    let json = serde_json::to_string(&original).expect("serialize FileDiff");
+    assert!(json.contains("\"status\":\"renamed\""));
+    assert!(json.contains("\"old_path\":\"src/old_name.rs\""));
+
+    let decoded: FileDiff = serde_json::from_str(&json).expect("deserialize FileDiff");
+    assert_eq!(decoded.path, original.path);
+    assert!(matches!(decoded.status, FileDiffStatus::Renamed));
+    assert_eq!(decoded.old_path.as_deref(), Some("src/old_name.rs"));
+    assert_eq!(decoded.diff, original.diff);
+
+    // And a Modified entry should omit `old_path` from the JSON.
+    let modified = FileDiff {
+        path: "src/lib.rs".to_string(),
+        status: FileDiffStatus::Modified,
+        old_path: None,
+        diff: "@@ -1 +1 @@\n-old\n+new\n".to_string(),
+    };
+    let modified_json = serde_json::to_string(&modified).expect("serialize Modified FileDiff");
+    assert!(
+        !modified_json.contains("old_path"),
+        "old_path should be skipped when None, got: {modified_json}"
+    );
+}
+
+#[test]
+fn judging_result_round_trips_through_json() {
+    let original = JudgingResult {
+        analysis: "The agent addressed the prompt and produced clean code.".to_string(),
+        strengths: vec![
+            "Followed existing module structure.".to_string(),
+            "Added a passing test.".to_string(),
+        ],
+        weaknesses: vec!["Missed an edge case in error handling.".to_string()],
+        completion_score: 8.5,
+        code_quality_score: 7.0,
+        overall_score: 7.75,
+    };
+
+    let json = serde_json::to_string(&original).expect("serialize JudgingResult");
+    assert!(json.contains("\"completion_score\""));
+    assert!(json.contains("\"code_quality_score\""));
+    assert!(json.contains("\"overall_score\""));
+
+    let decoded: JudgingResult = serde_json::from_str(&json).expect("deserialize JudgingResult");
+    assert_eq!(decoded.analysis, original.analysis);
+    assert_eq!(decoded.strengths, original.strengths);
+    assert_eq!(decoded.weaknesses, original.weaknesses);
+    assert!((decoded.completion_score - original.completion_score).abs() < f64::EPSILON);
+    assert!((decoded.code_quality_score - original.code_quality_score).abs() < f64::EPSILON);
+    assert!((decoded.overall_score - original.overall_score).abs() < f64::EPSILON);
+}
diff --git a/src/agent/prompting.rs b/src/agent/prompting.rs
index d3735d65b..ba9719985 100644
--- a/src/agent/prompting.rs
+++ b/src/agent/prompting.rs
@@ -121,3 +121,46 @@ impl Agent {
         self.build_memory_prompt_nonblocking_shared(messages.to_vec().into(), _memory_event_tx)
     }
 }
+
+
+/// Wrap a step prompt body in `<system_reminder>...</system_reminder>` tags.
+///
+/// Step prompts are emitted by the harness (not typed by the user), but they
+/// arrive in the conversation transcript at the same position a user message
+/// would. Without disambiguation, the LLM tends to treat them as a fresh user
+/// turn — re-greeting, re-asking, or otherwise breaking flow.
+///
+/// Wrapping the body in `<system_reminder>` tags signals "this is harness
+/// scaffolding, not the user speaking" and lets the model continue its
+/// existing turn cleanly. Returns an empty string when `prompt` is empty so
+/// callers don't end up emitting an empty tag pair.
+///
+/// This helper is intentionally not yet wired into step-prompt emission;
+/// integration will land alongside the Phase 1 `AgentDefinition.step_prompt`
+/// changes.
+pub fn wrap_as_system_reminder(prompt: &str) -> String {
+    if prompt.is_empty() {
+        String::new()
+    } else {
+        format!("<system_reminder>{}</system_reminder>", prompt)
+    }
+}
+
+#[cfg(test)]
+mod wrap_as_system_reminder_tests {
+    use super::wrap_as_system_reminder;
+
+    #[test]
+    fn wrap_as_system_reminder_empty_input_returns_empty() {
+        assert_eq!(wrap_as_system_reminder(""), "");
+    }
+
+    #[test]
+    fn wrap_as_system_reminder_non_empty_input_wrapped_correctly() {
+        let body = "remaining steps: 3";
+        assert_eq!(
+            wrap_as_system_reminder(body),
+            "<system_reminder>remaining steps: 3</system_reminder>"
+        );
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 223df89ef..f9c310a64 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -65,6 +65,7 @@ pub mod prefix_cache_stable;
 pub mod process_memory;
 pub mod process_title;
 pub mod prompt;
+pub mod prompt_placeholders;
 pub mod prompt_templates;
 pub mod protocol;
 pub mod provider;
diff --git a/src/prompt_placeholders.rs b/src/prompt_placeholders.rs
new file mode 100644
index 000000000..68cee139a
--- /dev/null
+++ b/src/prompt_placeholders.rs
@@ -0,0 +1,200 @@
+//! Phase 4 prompt placeholder substitution helper.
+//!
+//! Provides a small `String -> String` transformation that replaces a fixed
+//! set of `{{PLACEHOLDER}}` tokens with values supplied through a
+//! [`PlaceholderContext`]. Designed to be a pure utility: no I/O, no errors,
+//! no global state. Callers are responsible for assembling the context and
+//! choosing where to apply substitution (system prompt, step prompt, etc.).
+//!
+//! Supported tokens (case-sensitive, exact match including the surrounding
+//! double curly braces):
+//!
+//! - `{{FILE_TREE_SMALL}}`   — truncated project tree, max 2500 chars.
+//! - `{{FILE_TREE}}`         — fuller project tree, max 10000 chars.
+//! - `{{KNOWLEDGE_FILES}}`   — concatenated knowledge / context files (no limit).
+//! - `{{GIT_CHANGES}}`       — `git diff` / status summary, max 30000 chars.
+//! - `{{CURRENT_DATE}}`      — ISO `YYYY-MM-DD` date string.
+//! - `{{REMAINING_STEPS}}`   — remaining-step counter (u32, decimal).
+//! - `{{SYSTEM_INFO}}`       — OS / arch / shell summary.
+//!
+//! Empty `String` fields and `remaining_steps == 0` are replaced with an
+//! empty string rather than the literal placeholder text. Tokens that are
+//! not in the supported list are left untouched in the output, so this
+//! function is safe to apply to text that may contain other Mustache-like
+//! syntax.
+
+/// Maximum char count retained for [`PlaceholderContext::file_tree_small`].
+pub const FILE_TREE_SMALL_MAX_CHARS: usize = 2_500;
+
+/// Maximum char count retained for [`PlaceholderContext::file_tree`].
+pub const FILE_TREE_MAX_CHARS: usize = 10_000;
+
+/// Maximum char count retained for [`PlaceholderContext::git_changes`].
+pub const GIT_CHANGES_MAX_CHARS: usize = 30_000;
+
+/// Container for values that can be substituted into prompt templates.
+///
+/// All `String` fields default to empty and `remaining_steps` defaults to 0.
+/// Use [`PlaceholderContext::default`] and assign the fields you have data
+/// for; missing fields will simply substitute as empty.
+#[derive(Debug, Default, Clone)]
+pub struct PlaceholderContext {
+    /// Compact project file tree. Truncated to [`FILE_TREE_SMALL_MAX_CHARS`]
+    /// chars during substitution.
+    pub file_tree_small: String,
+    /// Fuller project file tree. Truncated to [`FILE_TREE_MAX_CHARS`] chars
+    /// during substitution.
+    pub file_tree: String,
+    /// Concatenated knowledge/context files. No length limit is applied.
+    pub knowledge_files: String,
+    /// Git diff / status summary. Truncated to [`GIT_CHANGES_MAX_CHARS`]
+    /// chars during substitution.
+    pub git_changes: String,
+    /// Current date in ISO `YYYY-MM-DD` form.
+    pub current_date: String,
+    /// Remaining steps allowed for the current run/turn. Zero substitutes
+    /// to an empty string.
+    pub remaining_steps: u32,
+    /// Free-form system info (OS / arch / shell).
+    pub system_info: String,
+}
+
+/// Return at most `max_chars` characters from `s`, respecting char
+/// boundaries. If `s` already fits within the limit it is returned
+/// unchanged (cloned).
+fn truncate_chars(s: &str, max_chars: usize) -> String {
+    if s.chars().count() <= max_chars {
+        s.to_string()
+    } else {
+        s.chars().take(max_chars).collect()
+    }
+}
+
+/// Replace every supported placeholder token in `prompt` with the matching
+/// value from `ctx`. Unknown `{{TOKENS}}` are preserved verbatim. Empty
+/// values (and `remaining_steps == 0`) replace the placeholder with an
+/// empty string.
+///
+/// `prompt` is scanned once and substituted values are never re-scanned, so
+/// token-like text inside a value (say, a knowledge file that mentions
+/// `{{GIT_CHANGES}}`) is emitted literally. Length caps documented on
+/// [`PlaceholderContext`] are applied to the values before insertion.
+pub fn substitute_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String {
+    let file_tree_small = truncate_chars(&ctx.file_tree_small, FILE_TREE_SMALL_MAX_CHARS);
+    let file_tree = truncate_chars(&ctx.file_tree, FILE_TREE_MAX_CHARS);
+    let git_changes = truncate_chars(&ctx.git_changes, GIT_CHANGES_MAX_CHARS);
+    let remaining_steps =
+        (ctx.remaining_steps != 0).then(|| ctx.remaining_steps.to_string()).unwrap_or_default();
+
+    let replacements: [(&str, &str); 7] = [
+        ("{{FILE_TREE_SMALL}}", file_tree_small.as_str()),
+        ("{{FILE_TREE}}", file_tree.as_str()),
+        ("{{KNOWLEDGE_FILES}}", ctx.knowledge_files.as_str()),
+        ("{{GIT_CHANGES}}", git_changes.as_str()),
+        ("{{CURRENT_DATE}}", ctx.current_date.as_str()),
+        ("{{REMAINING_STEPS}}", remaining_steps.as_str()),
+        ("{{SYSTEM_INFO}}", ctx.system_info.as_str()),
+    ];
+
+    let mut out = String::with_capacity(prompt.len());
+    let mut rest = prompt;
+    while let Some(start) = rest.find("{{") {
+        let (before, tail) = rest.split_at(start);
+        out.push_str(before);
+        // Consume a whole token on a match; otherwise emit one brace and
+        // rescan so that `{{{FILE_TREE}}` still resolves its inner token.
+        let hit = replacements.iter().find(|(token, _)| tail.starts_with(*token));
+        let (text, used) = hit.map_or(("{", 1), |(token, value)| (*value, token.len()));
+        out.push_str(text);
+        rest = &tail[used..];
+    }
+    out.push_str(rest);
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn empty_context_replaces_all_placeholders_with_empty() {
+        let ctx = PlaceholderContext::default();
+        let input = "tree=[{{FILE_TREE_SMALL}}] full=[{{FILE_TREE}}] \
+                     k=[{{KNOWLEDGE_FILES}}] git=[{{GIT_CHANGES}}] \
+                     date=[{{CURRENT_DATE}}] steps=[{{REMAINING_STEPS}}] \
+                     sys=[{{SYSTEM_INFO}}]";
+        let out = substitute_placeholders(input, &ctx);
+        assert_eq!(
+            out,
+            "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]"
+        );
+    }
+
+    #[test]
+    fn individual_placeholder_works() {
+        let ctx = PlaceholderContext {
+            current_date: "2026-05-25".to_string(),
+            ..Default::default()
+        };
+        let out = substitute_placeholders("today is {{CURRENT_DATE}}.", &ctx);
+        assert_eq!(out, "today is 2026-05-25.");
+
+        // Unrelated placeholder stays empty in the same call.
+        let out2 = substitute_placeholders(
+            "date={{CURRENT_DATE}} steps={{REMAINING_STEPS}}",
+            &ctx,
+        );
+        assert_eq!(out2, "date=2026-05-25 steps=");
+    }
+
+    #[test]
+    fn multiple_placeholders_in_same_string_work() {
+        let ctx = PlaceholderContext {
+            file_tree_small: "src/\n  lib.rs".to_string(),
+            knowledge_files: "AGENTS.md contents".to_string(),
+            current_date: "2026-05-25".to_string(),
+            remaining_steps: 7,
+            system_info: "linux x86_64".to_string(),
+            ..Default::default()
+        };
+        let input = "## Tree\n{{FILE_TREE_SMALL}}\n\n## Knowledge\n\
+                     {{KNOWLEDGE_FILES}}\n\n## Meta\n\
+                     date={{CURRENT_DATE}} steps={{REMAINING_STEPS}} \
+                     sys={{SYSTEM_INFO}}";
+        let out = substitute_placeholders(input, &ctx);
+        let expected = "## Tree\nsrc/\n  lib.rs\n\n## Knowledge\n\
+                        AGENTS.md contents\n\n## Meta\n\
+                        date=2026-05-25 steps=7 sys=linux x86_64";
+        assert_eq!(out, expected);
+    }
+
+    #[test]
+    fn unknown_placeholder_text_remains_as_is() {
+        let ctx = PlaceholderContext {
+            current_date: "2026-05-25".to_string(),
+            ..Default::default()
+        };
+        let input = "known={{CURRENT_DATE}} unknown={{NOT_A_REAL_TOKEN}} \
+                     other={{ALSO_BOGUS}}";
+        let out = substitute_placeholders(input, &ctx);
+        assert_eq!(
+            out,
+            "known=2026-05-25 unknown={{NOT_A_REAL_TOKEN}} other={{ALSO_BOGUS}}"
+        );
+    }
+
+    #[test]
+    fn truncation_caps_long_inputs() {
+        // Build a string longer than the file-tree-small cap.
+        let big: String = "x".repeat(FILE_TREE_SMALL_MAX_CHARS + 1234);
+        let ctx = PlaceholderContext {
+            file_tree_small: big.clone(),
+            ..Default::default()
+        };
+        let out = substitute_placeholders("[{{FILE_TREE_SMALL}}]", &ctx);
+        // Two bracket characters plus the cap.
+        assert_eq!(out.chars().count(), FILE_TREE_SMALL_MAX_CHARS + 2);
+        assert!(out.starts_with('['));
+        assert!(out.ends_with(']'));
+    }
+}

From 8a1963d97cbaf52263d89f942910428c122981d0 Mon Sep 17 00:00:00 2001
From: quangdang46 <tranquangdang21@gmail.com>
Date: Tue, 26 May 2026 11:29:45 +0700
Subject: [PATCH 05/22] =?UTF-8?q?feat(jbench):=20implement=20Phase=205.3-5?=
 =?UTF-8?q?.5=20stubs=20=E2=80=94=20judge=20pipeline,=20lessons,=20CLI?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 5.3 (agent_runner): `run_agent_in_repo()` spawns jcode subprocess
  with prompt on stdin, streams stdout, captures trace + diff via
  `git diff HEAD`. Uses `timeout()` for per-run deadline.

Phase 5.4 (judge): `judge_with_three_models()` runs GPT + Gemini +
  Claude judges in parallel via OpenAI Responses API + Anthropic
  Messages API. Median analysis, averaged scores. `run_single_judge()`
  exposes per-judge entry point.

Phase 5.5 (lessons): `extract_lessons()` calls lessons extractor model
  via Responses API. `append_lessons_to_file()` accumulates lessons in
  per-agent JSON files with read-modify-write.

Phase 5.6 (CLI): Full `jbench run` implemented (loads eval JSON, iterates
  commits, calls `run_agent_in_repo`, writes `.run.json` files).
  `jbench meta-analyze` aggregates results. Other subcommands print
  Phase stubs and exit 0.

Bug fixes:
- `JudgingResult: Default` impl added (needed for EvalRun init)
- `OnceLock` for lazy reqwest static client (fixes const-eval restrictions)
- `context` method from `anyhow::Context` imported in bin
---
 Cargo.lock                       |   3 +
 evals/jbench/Cargo.toml          |   7 +-
 evals/jbench/src/agent_runner.rs | 137 +++++++--
 evals/jbench/src/bin/jbench.rs   | 248 +++++++++++++++--
 evals/jbench/src/judge.rs        | 458 ++++++++++++++++++++++++++++---
 evals/jbench/src/lessons.rs      | 306 +++++++++++++++++++--
 evals/jbench/src/lib.rs          |   5 +
 evals/jbench/src/types.rs        |  13 +
 8 files changed, 1072 insertions(+), 105 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c2ee30bdc..102c8eb23 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3788,9 +3788,12 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "clap",
+ "futures",
  "jcode-agent-runtime",
+ "reqwest",
  "serde",
  "serde_json",
+ "tempfile",
  "tokio",
 ]
 
diff --git a/evals/jbench/Cargo.toml b/evals/jbench/Cargo.toml
index aad01216c..b9db6899a 100644
--- a/evals/jbench/Cargo.toml
+++ b/evals/jbench/Cargo.toml
@@ -17,8 +17,11 @@ jcode-agent-runtime = { path = "../../crates/jcode-agent-runtime" }
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 anyhow = "1"
-tokio = { version = "1", default-features = false, features = ["sync"] }
-clap = { version = "4", features = ["derive"] }
+tokio = { version = "1", default-features = false, features = ["rt-multi-thread", "macros", "io-util", "process", "time", "sync"] }
+futures = "0.3"
+reqwest = { version = "0.12", features = ["json"] }
+clap = { version = "4", features = ["derive", "env"] }
 
 [dev-dependencies]
 serde_json = "1"
+tempfile = "3"
diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs
index 1e56308ff..de922e71d 100644
--- a/evals/jbench/src/agent_runner.rs
+++ b/evals/jbench/src/agent_runner.rs
@@ -8,15 +8,16 @@
 //! unified diff against the parent commit.
 //!
 //! Design source: `/tmp/codebuff/evals/buffbench/agent-runner.ts`.
-//!
-//! Implementation lands in Phase 5.3; for now both entry points are
-//! `unimplemented!()` stubs whose signatures fix the contract the rest
-//! of the harness will rely on.
 
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};
+use std::process::Stdio;
+use std::time::{Duration, Instant};
 
-use anyhow::Result;
+use anyhow::{Context, Result};
+use tokio::io::{AsyncBufReadExt, BufReader};
+use tokio::process::Command;
+use tokio::time::timeout;
 
 use crate::types::EvalRun;
 
@@ -38,33 +39,131 @@ pub struct AgentRunConfig {
     /// Hard cap on the number of agent turns before the run is
     /// aborted; mirrors BuffBench's per-task turn budget.
     pub max_turns: u32,
+    /// Timeout for the entire run in seconds (defaults to 60 minutes).
+    pub timeout_secs: u64,
     /// Extra environment variables applied to the agent subprocess on
     /// top of the calling process's environment.
     pub env: HashMap<String, String>,
+    /// Path to the `jcode` binary. Defaults to searching $PATH.
+    pub jcode_binary: Option<PathBuf>,
+}
+
+impl Default for AgentRunConfig {
+    fn default() -> Self {
+        Self {
+            agent_id: String::new(),
+            prompt: String::new(),
+            repo_path: PathBuf::new(),
+            max_turns: 100,
+            timeout_secs: 60 * 60,
+            env: HashMap::new(),
+            jcode_binary: None,
+        }
+    }
 }
 
 /// Spawn the configured agent in `config.repo_path`, run it to
 /// completion (or the turn / time budget), and return an [`EvalRun`]
 /// populated with the agent's diff, judging placeholder, cost, and
 /// duration.
-///
-/// The runner is responsible for:
-/// - Capturing the agent's full trace for later analysis.
-/// - Calling [`extract_diff_from_repo`] once the agent finishes.
-/// - Invoking the judging pipeline (or leaving that to the caller —
-///   the final wiring is decided in Phase 5.3).
 pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
-    let _ = config;
-    unimplemented!("Phase 5.3: spawn jcode subprocess in repo, capture trace")
+    let start = Instant::now();
+    let timeout_duration = Duration::from_secs(config.timeout_secs);
+    let jcode_bin = config.jcode_binary.clone().unwrap_or_else(|| PathBuf::from("jcode"));
+
+    // The child inherits this process's environment by default, so only the
+    // overrides are set explicitly; JCODE_AGENT_ID goes last so it wins.
+    let mut child = Command::new(&jcode_bin)
+        .current_dir(&config.repo_path)
+        .envs(&config.env)
+        .env("JCODE_AGENT_ID", &config.agent_id)
+        .args([
+            "agent", "run",
+            "--agent", &config.agent_id,
+            "--output-mode", "stream",
+            "--no-interactive",
+        ])
+        .stdin(Stdio::piped())
+        .stdout(Stdio::piped())
+        // stderr is never read here; an undrained pipe would stall the child.
+        .stderr(Stdio::inherit())
+        // Do not leak the child if this future is dropped (caller timeout).
+        .kill_on_drop(true)
+        .spawn()
+        .with_context(|| format!("failed to spawn jcode binary at {:?}", jcode_bin))?;
+
+    let mut child_stdin = child.stdin.take().expect("stdin captured");
+    let stdout = child.stdout.take().expect("stdout captured");
+
+    // Send the prompt, then drop the handle: the child only sees EOF on
+    // stdin once the write end of the pipe is closed.
+    {
+        use tokio::io::AsyncWriteExt;
+        child_stdin.write_all(config.prompt.as_bytes()).await?;
+        child_stdin.flush().await?;
+        drop(child_stdin);
+    }
+
+    // Collect stdout until EOF, a read error, or the run-wide deadline.
+    let mut trace_lines = Vec::new();
+    let mut lines_stream = BufReader::new(stdout).lines();
+    let timed_out = loop {
+        let remaining = timeout_duration.saturating_sub(start.elapsed());
+        match timeout(remaining, lines_stream.next_line()).await {
+            Ok(Ok(Some(l))) => trace_lines.push(l),
+            Ok(_) => break false,
+            Err(_) => break true,
+        }
+    };
+    if timed_out {
+        child.kill().await.context("failed to kill timed-out jcode subprocess")?;
+    }
+    let status = child.wait().await.context("failed to wait for jcode subprocess")?;
+
+    let diff = extract_diff_from_repo(&config.repo_path)?;
+    let error = if timed_out {
+        Some(format!("jcode timed out after {}s", config.timeout_secs))
+    } else if !status.success() {
+        Some(format!("jcode exited with status {:?}", status))
+    } else {
+        None
+    };
+
+    Ok(EvalRun {
+        commit_sha: String::new(),
+        prompt: config.prompt,
+        diff,
+        judging: Default::default(),
+        cost_usd: 0.0,
+        duration_ms: start.elapsed().as_millis() as u64,
+        error,
+    })
 }
 
 /// Produce a unified diff describing all uncommitted changes in
 /// `repo_path` against its currently-checked-out HEAD.
-///
-/// Used after the agent finishes editing to capture the "agent's
-/// changes" half of the judging input. The exact git invocation
-/// (likely `git diff --no-color HEAD`) is finalized in Phase 5.3.
 pub fn extract_diff_from_repo(repo_path: &Path) -> Result<String> {
-    let _ = repo_path;
-    unimplemented!("Phase 5.3: shell out to git diff and return the unified diff")
+    let output = std::process::Command::new("git")
+        .args(["diff", "--no-color", "HEAD"])
+        .current_dir(repo_path)
+        .output()
+        .context("git diff failed")?;
+
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        anyhow::bail!("git diff exited with error: {stderr}");
+    }
+
+    Ok(String::from_utf8_lossy(&output.stdout).to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn extract_diff_from_repo_nonexistent() {
+        let result = extract_diff_from_repo(Path::new("/tmp/does-not-exist"));
+        assert!(result.is_err());
+    }
 }
diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs
index f0193e831..90808dc60 100644
--- a/evals/jbench/src/bin/jbench.rs
+++ b/evals/jbench/src/bin/jbench.rs
@@ -1,19 +1,18 @@
 //! `jbench` CLI entry point.
 //!
-//! This is a scaffold: every subcommand prints a TODO line describing
-//! the work it will do and exits 0. The argument shape, however, is
-//! real and stable — downstream tooling (CI, scripts) can wire against
-//! these subcommands today and pick up real behavior as Phases 5.3 →
-//! 5.5 land.
-//!
-//! All real work happens through the [`jcode_jbench`] library; this
-//! binary's only job is to dispatch.
+//! Dispatches to the [`jcode_jbench`] library for real work.
+
+use std::path::PathBuf;
 
+use anyhow::{Context, Result};
 use clap::{Parser, Subcommand};
 
-// Pull in the library so the binary depends on it (and fails to
-// compile if its public surface regresses).
-use jcode_jbench as _;
+use jcode_jbench::{
+    agent_runner::AgentRunConfig,
+    judge::{judge_with_three_models, JudgeConfig},
+    lessons::{append_lessons_to_file, extract_lessons, LessonsConfig},
+    types::{AgentEvalResults, EvalDataV2, EvalRun},
+};
 
 /// Top-level `jbench` CLI.
 #[derive(Debug, Parser)]
@@ -28,42 +27,233 @@ struct Cli {
     command: Command,
 }
 
-/// JBench subcommands. Each is a stub today; see `README.md` for the
-/// intended workflow.
+/// JBench subcommands.
 #[derive(Debug, Subcommand)]
 enum Command {
     /// Select high-quality commits from a target repo to use as eval
     /// tasks.
-    PickCommits,
+    PickCommits {
+        /// URL of the repository to pick commits from.
+        repo_url: String,
+        /// Minimum commit message length.
+        #[arg(long, default_value = "10")]
+        min_msg_len: usize,
+        /// Maximum number of commits to pick.
+        #[arg(long, default_value = "50")]
+        max_picks: usize,
+        /// Output file (default: stdout).
+        #[arg(short, long)]
+        output: Option<PathBuf>,
+    },
     /// Generate an `eval-{repo}.json` file (`EvalDataV2`) from a list
     /// of picked commits.
-    GenEvals,
+    GenEvals {
+        /// Input commit list (from pick-commits).
+        input: PathBuf,
+        /// Output eval JSON file.
+        #[arg(short, long)]
+        output: PathBuf,
+    },
     /// Run one or more agents against an eval data file and emit
     /// per-commit `EvalRun`s.
-    Run,
+    Run {
+        /// Path to eval data JSON file.
+        eval_file: PathBuf,
+        /// Agent ID to run (must be registered in jcode registry).
+        #[arg(short, long)]
+        agent_id: String,
+        /// Output directory for EvalRun JSON files.
+        #[arg(short, long)]
+        output_dir: PathBuf,
+        /// Path to jcode binary (auto-detected if not set).
+        #[arg(long)]
+        jcode_binary: Option<PathBuf>,
+        /// Maximum turns per run.
+        #[arg(long, default_value = "100")]
+        max_turns: u32,
+        /// Timeout per run in seconds.
+        #[arg(long, default_value = "3600")]
+        timeout_secs: u64,
+    },
     /// Re-judge an existing run with the three-judge median pipeline.
-    Judge,
+    Judge {
+        /// Directory containing EvalRun JSON files.
+        runs_dir: PathBuf,
+        /// API base URL.
+        #[arg(long, env = "JBENCH_API_BASE")]
+        api_base: Option<String>,
+        /// API key.
+        #[arg(long, env = "JBENCH_API_KEY")]
+        api_key: Option<String>,
+    },
     /// Aggregate and analyze results across all tasks for an agent.
-    MetaAnalyze,
+    MetaAnalyze {
+        /// Directory containing EvalRun JSON files.
+        runs_dir: PathBuf,
+        /// Output file for aggregated results.
+        #[arg(short, long)]
+        output: Option<PathBuf>,
+    },
 }
 
-fn main() {
+#[tokio::main]
+async fn main() -> Result<()> {
     let cli = Cli::parse();
     match cli.command {
-        Command::PickCommits => {
-            println!("TODO: jbench pick-commits — Phase 5.2 will implement commit selection.");
+        Command::PickCommits { repo_url, min_msg_len, max_picks, output } => {
+            pick_commits_impl(&repo_url, min_msg_len, max_picks, output).await?;
+        }
+        Command::GenEvals { input, output } => {
+            gen_evals_impl(&input, &output).await?;
         }
-        Command::GenEvals => {
-            println!("TODO: jbench gen-evals — Phase 5.2 will implement eval-data generation.");
+        Command::Run { eval_file, agent_id, output_dir, jcode_binary, max_turns, timeout_secs } => {
+            run_impl(&eval_file, &agent_id, &output_dir, jcode_binary.as_ref(), max_turns, timeout_secs).await?;
         }
-        Command::Run => {
-            println!("TODO: jbench run — Phase 5.3 will implement agent_runner orchestration.");
+        Command::Judge { runs_dir, api_base, api_key } => {
+            judge_impl(&runs_dir, api_base.as_deref(), api_key.as_deref()).await?;
         }
-        Command::Judge => {
-            println!("TODO: jbench judge — Phase 5.4 will implement three-judge median scoring.");
+        Command::MetaAnalyze { runs_dir, output } => {
+            meta_analyze_impl(&runs_dir, output.as_ref()).await?;
         }
-        Command::MetaAnalyze => {
-            println!("TODO: jbench meta-analyze — Phase 5.6 will implement cross-task aggregation.");
+    }
+    Ok(())
+}
+
+async fn pick_commits_impl(
+    _repo_url: &str,
+    _min_msg_len: usize,
+    _max_picks: usize,
+    _output: Option<PathBuf>,
+) -> Result<()> {
+    todo_step("Phase 5.2: commit selection via git log heuristics + message quality filter")
+}
+
+async fn gen_evals_impl(_input: &PathBuf, _output: &PathBuf) -> Result<()> {
+    todo_step("Phase 5.2: read commit list, fetch each SHA, render EvalDataV2 JSON")
+}
+
+/// Run `agent_id` against every commit in `eval_file`, writing one
+/// `<commit id>.run.json` (a serialized `EvalRun`) per commit to `output_dir`.
+async fn run_impl(
+    eval_file: &PathBuf,
+    agent_id: &str,
+    output_dir: &PathBuf,
+    jcode_binary: Option<&PathBuf>,
+    max_turns: u32,
+    timeout_secs: u64,
+) -> Result<()> {
+    use std::fs;
+    use std::time::Duration;
+    use tokio::time::timeout as tk_timeout;
+
+    let eval_data: EvalDataV2 = {
+        let text = fs::read_to_string(eval_file)?;
+        serde_json::from_str(&text).context("failed to parse eval JSON")?
+    };
+    // `create_dir_all` already succeeds when the directory exists.
+    fs::create_dir_all(output_dir)?;
+
+    for commit in &eval_data.eval_commits {
+        let config = AgentRunConfig {
+            agent_id: agent_id.to_owned(),
+            prompt: commit.prompt.clone(),
+            repo_path: output_dir.join(&commit.id), // per-commit working dir
+            max_turns,
+            timeout_secs,
+            env: eval_data.env.clone(),
+            jcode_binary: jcode_binary.cloned(),
+            ..Default::default()
+        };
+        // A timeout is recorded as a failed run instead of aborting the suite.
+        let outcome = tk_timeout(
+            Duration::from_secs(timeout_secs),
+            jcode_jbench::agent_runner::run_agent_in_repo(config),
+        )
+        .await;
+        let mut result = match outcome {
+            Ok(run) => run?,
+            Err(_elapsed) => EvalRun {
+                commit_sha: String::new(),
+                prompt: commit.prompt.clone(),
+                diff: String::new(),
+                judging: Default::default(),
+                cost_usd: 0.0,
+                duration_ms: timeout_secs.saturating_mul(1000),
+                error: Some("Timed out waiting for run_agent_in_repo".to_owned()),
+            },
+        };
+        // The runner leaves `commit_sha` empty; stamp the commit it ran for.
+        result.commit_sha = commit.sha.clone();
+
+        let run_file = output_dir.join(format!("{}.run.json", commit.id));
+        let json = serde_json::to_string_pretty(&result).context("failed to serialize EvalRun")?;
+        fs::write(&run_file, json)?;
+        println!("Wrote {}", run_file.display());
+    }
+
+    Ok(())
+}
+
+async fn judge_impl(
+    _runs_dir: &PathBuf,
+    _api_base: Option<&str>,
+    _api_key: Option<&str>,
+) -> Result<()> {
+    todo_step("Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields")
+}
+
+/// Aggregate every `*.run.json` file in `runs_dir` into one `AgentEvalResults`
+/// summary, written to `output` or printed to stdout when `output` is `None`.
+async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Result<()> {
+    use std::fs;
+
+    let mut all_runs = Vec::new();
+    for entry in fs::read_dir(runs_dir)? {
+        let path = entry?.path();
+        // `Path::extension()` is just "json" for "x.run.json", so the compound
+        // suffix has to be matched against the whole file name instead.
+        let name = path.file_name().and_then(|s| s.to_str()).unwrap_or_default();
+        if name.ends_with(".run.json") {
+            let text = fs::read_to_string(&path)?;
+            match serde_json::from_str::<EvalRun>(&text) {
+                Ok(run) => all_runs.push(run),
+                Err(err) => eprintln!("Skipping {}: {err}", path.display()),
+            }
         }
     }
+
+    if all_runs.is_empty() {
+        anyhow::bail!("No .run.json files found in {}", runs_dir.display());
+    }
+
+    let avg_score = all_runs.iter().map(|r| r.judging.overall_score).sum::<f64>()
+        / all_runs.len() as f64;
+    let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::<f64>()
+        / all_runs.len() as f64;
+    let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::<u64>()
+        / all_runs.len() as u64;
+
+    let summary = AgentEvalResults {
+        agent_id: "unknown".to_owned(),
+        runs: all_runs,
+        average_score: (avg_score * 10.0).round() / 10.0,
+        average_cost: (avg_cost * 100.0).round() / 100.0,
+        average_duration_ms: avg_duration,
+    };
+
+    let json = serde_json::to_string_pretty(&summary).context("failed to serialize summary")?;
+
+    if let Some(out) = output {
+        fs::write(out, &json)?;
+        println!("Wrote {}", out.display());
+    } else {
+        println!("{json}");
+    }
+
+    Ok(())
+}
+
+fn todo_step(phase: &str) -> Result<()> {
+    eprintln!("{phase}");
+    std::process::exit(0);
 }
diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs
index 170a28203..589e5749d 100644
--- a/evals/jbench/src/judge.rs
+++ b/evals/jbench/src/judge.rs
@@ -7,54 +7,450 @@
 //! valid judges. This mirrors the design of BuffBench's
 //! `judgeCommitResult` in `/tmp/codebuff/evals/buffbench/judge.ts`.
 //!
-//! The actual provider plumbing (which talks to each judge model
-//! through the existing jcode provider registry) lands in Phase 5.4.
-//! Until then both entry points are `unimplemented!()` stubs whose
-//! signatures define the public surface the rest of the harness will
-//! depend on.
+//! Judge prompts are rendered from fixed templates (deduced from the TS
+//! original); the judge agent definitions are embedded here so the
+//! pipeline stays self-contained and does not depend on the full jcode
+//! agent runtime at evaluation time.
 
 use std::collections::HashMap;
+use std::sync::OnceLock;
+use std::time::Duration;
 
-use anyhow::Result;
+use anyhow::{Context, Result};
+use reqwest::Client;
+use serde::{Deserialize, Serialize};
+use tokio::time::timeout;
 
-use crate::types::{EvalCommit, JudgingResult};
+// Re-export JudgingResult so callers get it from the public types.
+pub use crate::types::JudgingResult;
+
+use crate::types::{EvalCommit, JudgingResult as Scorecard};
+
+/// Timeout for a single judge call.
+const JUDGE_TIMEOUT_SECS: u64 = 20 * 60;
+
+/// How many judges must succeed for the pipeline to produce a result.
+/// If fewer succeed, we return a zero-score error result.
+const MIN_JUDGE_SUCCESS_COUNT: usize = 2;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum JudgeProviderKind {
+    OpenAI,    // sent to `{api_base}/v1/responses` with a forced `set_output` tool call
+    Anthropic, // sent to `{api_base}/v1/messages`; the reply's last text block is parsed as JSON
+}
+
+impl JudgeProviderKind {
+    pub fn for_model(model: &str) -> Self {
+        if model.contains("claude") || model.contains("anthropic") {
+            Self::Anthropic
+        } else {
+            Self::OpenAI
+        }
+    }
+}
+
+/// Configuration for the judging pipeline.
+#[derive(Debug, Clone)]
+pub struct JudgeConfig {
+    /// Base URL of the judge backend; `/v1/responses` or `/v1/messages` is appended to it.
+    pub api_base: String,
+    /// API key secret.
+    pub api_key: String,
+    /// Model IDs for the three judges. The median judge is chosen by sorting
+    /// results on `overall_score`, not by position in this array.
+    pub models: [String; 3],
+    /// Optional deadline override in seconds; `JUDGE_TIMEOUT_SECS` is used when `None`.
+    pub timeout_secs: Option<u64>,
+    /// Custom HTTP client (uses shared client if None).
+    pub http_client: Option<Client>,
+}
+
+impl Default for JudgeConfig {
+    fn default() -> Self {
+        Self {
+            // Sensible defaults — override before use in production
+            api_base: std::env::var("JBENCH_API_BASE")
+                .unwrap_or_else(|_| "https://api.openai.com".to_owned()),
+            api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(),
+            models: [
+                "gpt-5-2026-05".to_owned(),
+                "google/gemini-3.1-pro".to_owned(),
+                "anthropic/claude-sonnet-4-2026-05".to_owned(),
+            ],
+            timeout_secs: None,
+            http_client: None,
+        }
+    }
+}
+
+/// Render the full judge prompt from commit + diff + context.
+///
+/// Context files are emitted in ascending path order: `HashMap` iteration
+/// order is unspecified, so an unsorted prompt could differ between runs on
+/// identical inputs.
+fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &HashMap<String, String>) -> String {
+    let ground_truth_diffs = commit
+        .file_diffs
+        .iter()
+        .map(|fd| format!("### {}\n```diff\n{}\n```", fd.path, fd.diff))
+        .collect::<Vec<_>>()
+        .join("\n\n");
+
+    let mut context_entries: Vec<_> = context_files.iter().collect();
+    context_entries.sort_unstable_by(|a, b| a.0.cmp(b.0));
+    let context_content = context_entries
+        .into_iter()
+        .map(|(path, content)| format!("### {path}\n```\n{content}\n```"))
+        .collect::<Vec<_>>()
+        .join("\n\n");
+
+    format!(
+        "## User Prompt (What the agent was asked to do)\n{}\n\n## Context Files (from parent commit)\n{}\n\n## Ground Truth Changes (One valid implementation)\n{}\n\n## Agent's Changes (What the agent actually did)\n```diff\n{}\n```",
+        commit.prompt,
+        context_content,
+        ground_truth_diffs,
+        agent_diff
+    )
+}
+
+/// System prompt for the judge agent (mirrors the TS `judgeAgentBase.systemPrompt`).
+fn judge_system_prompt() -> &'static str {
+    r#"You are an expert software engineer evaluating AI-generated code changes with empathy for the task given.
+
+## Your Role
+
+You will receive:
+1. The user prompt that the coding agent was given
+2. Context files from the codebase
+3. The ground truth changes (expected outcome)
+4. The agent's actual changes
+
+## Evaluation Philosophy
+
+**Judge based on what the agent was asked to do, not on perfection.**
+
+- If the prompt is vague or high-level (e.g., "add authentication"), be lenient and accept any reasonable implementation that achieves the goal
+- If the prompt is specific and detailed, expect the implementation to match those details more closely
+- Focus on whether the agent understood and addressed the user's intent
+- Consider that there are often multiple valid ways to implement the same feature
+
+## Evaluation Criteria
+
+- **Completion** (0-10): How well did the agent address what was asked in the prompt? Consider the specificity of the prompt.
+- **Code Quality** (0-10): How well-structured and maintainable is the code?
+- **Overall** (0-10): Combined assessment of whether the agent successfully completed the task as requested
+
+## Ground Truth
+
+The ground truth shows ONE valid implementation, but it's not the only correct answer. The agent's implementation should be judged on:
+- Does it achieve the same functional outcome?
+- Is it a reasonable approach given the prompt?
+- Does it maintain code quality?
+
+Provide detailed analysis, strengths, weaknesses, and numerical scores."#
+}
+
+#[derive(Serialize)]
+struct JudgeRequest<'a> {
+    model: &'a str,
+    input: &'a str,
+    tools: &'a [serde_json::Value],
+    #[serde(skip_serializing_if = "Option::is_none")]
+    output_schema: Option<&'a serde_json::Value>,
+}
+
+#[derive(Deserialize)]
+struct JudgeResponse {
+    output: Option<serde_json::Value>,
+    #[serde(default)]
+    choices: Vec<serde_json::Value>,
+}
+
+/// Invoke a single judge model with a fully-rendered prompt.
+///
+/// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` (`runSingleJudge`).
+pub async fn run_single_judge(
+    model: &str,
+    prompt: &str,
+    api_base: &str,
+    api_key: &str,
+    http_client: &Client,
+) -> Result<Scorecard> {
+    let kind = JudgeProviderKind::for_model(model);
+    let system = judge_system_prompt();
+
+    if kind == JudgeProviderKind::OpenAI {
+        run_openai_judge(model, prompt, system, api_base, api_key, http_client).await
+    } else {
+        run_anthropic_judge(model, prompt, system, api_base, api_key, http_client).await
+    }
+}
+
+async fn run_openai_judge(
+    model: &str,
+    prompt: &str,
+    system: &str,
+    api_base: &str,
+    api_key: &str,
+    http_client: &Client,
+) -> Result<Scorecard> {
+    let output_schema = serde_json::json!({
+        "type": "object",
+        "properties": {
+            "analysis": { "type": "string", "description": "Detailed analysis comparing agent changes to ground truth" },
+            "strengths": { "type": "array", "items": { "type": "string" }, "description": "Key strengths of the implementation" },
+            "weaknesses": { "type": "array", "items": { "type": "string" }, "description": "Key weaknesses or issues found" },
+            "completionScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "How completely the prompt was addressed" },
+            "codeQualityScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "Code structure and maintainability" },
+            "overallScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "Combined assessment" }
+        },
+        "required": ["analysis", "strengths", "weaknesses", "completionScore", "codeQualityScore", "overallScore"]
+    });
+
+    let request_body = serde_json::json!({
+        "model": model,
+        "input": [
+            { "role": "system", "content": system },
+            { "role": "user", "content": prompt }
+        ],
+        "tools": [
+            {
+                "type": "function",
+                "name": "set_output",
+                "description": "Submit the evaluation result",
+                "parameters": output_schema.clone()
+            }
+        ],
+        "tool_choice": { "type": "function", "name": "set_output" },
+        "output_schema": output_schema,
+    });
+
+    let url = format!("{api_base}/v1/responses");
+    let response = http_client
+        .post(&url)
+        .header("Authorization", format!("Bearer {api_key}"))
+        .header("Content-Type", "application/json")
+        .json(&request_body)
+        .timeout(Duration::from_secs(JUDGE_TIMEOUT_SECS))
+        .send()
+        .await
+        .context("judge HTTP request failed")?;
+
+    let status = response.status();
+    let body: serde_json::Value = response
+        .json()
+        .await
+        .context("failed to parse judge response")?;
+
+    if !status.is_success() {
+        anyhow::bail!("judge API returned {status}: {body}");
+    }
+
+    let output = body
+        .get("output")
+        .and_then(|o| o.as_array())
+        .and_then(|arr| arr.first())
+        .and_then(|item| item.get("content"))
+        .and_then(|c| c.as_array())
+        .and_then(|arr| arr.first())
+        .and_then(|item| item.get("text"))
+        .and_then(|t| t.as_str());
+
+    let output_value = output
+        .and_then(|t| serde_json::from_str::<serde_json::Value>(t).ok())
+        .or_else(|| body.get("output").cloned())
+        .unwrap_or(serde_json::json!({
+            "analysis": "No structured output received",
+            "strengths": [],
+            "weaknesses": ["Judge failed to return structured output"],
+            "completionScore": 0,
+            "codeQualityScore": 0,
+            "overallScore": 0
+        }));
+
+    parse_scorecard(output_value)
+}
+
+async fn run_anthropic_judge(
+    model: &str,
+    prompt: &str,
+    system: &str,
+    api_base: &str,
+    api_key: &str,
+    http_client: &Client,
+) -> Result<Scorecard> {
+    let request_body = serde_json::json!({
+        "model": model,
+        "messages": [
+            { "role": "user", "content": prompt }
+        ],
+        "system": system,
+        "max_tokens": 4096,
+        "thinking": {
+            "type": "enabled",
+            "budget_tokens": 1024
+        },
+    });
+
+    let url = format!("{api_base}/v1/messages");
+    let response = http_client
+        .post(&url)
+        .header("Authorization", format!("Bearer {api_key}"))
+        .header("Content-Type", "application/json")
+        .header("anthropic-version", "2023-06-01")
+        .json(&request_body)
+        .timeout(Duration::from_secs(JUDGE_TIMEOUT_SECS))
+        .send()
+        .await
+        .context("judge HTTP request failed")?;
+
+    let body: serde_json::Value = response
+        .json()
+        .await
+        .context("failed to parse anthropic judge response")?;
+
+    // Anthropic returns content blocks — try to parse the final text block as JSON
+    let text = body
+        .get("content")
+        .and_then(|c| c.as_array())
+        .and_then(|arr| arr.last())
+        .and_then(|item| item.get("text"))
+        .and_then(|t| t.as_str())
+        .unwrap_or_default();
+
+    let parsed = serde_json::from_str::<serde_json::Value>(text)
+        .unwrap_or(serde_json::json!({
+            "analysis": text.to_owned(),
+            "strengths": [],
+            "weaknesses": ["Could not parse structured output from Anthropic judge"],
+            "completionScore": 0,
+            "codeQualityScore": 0,
+            "overallScore": 0
+        }));
+
+    parse_scorecard(parsed)
+}
+
+fn parse_scorecard(value: serde_json::Value) -> Result<Scorecard> {
+    serde_json::from_value(value).context("failed to parse JudgingResult from judge output")
+}
 
 /// Judge an agent's diff against the ground truth using three models in
 /// parallel and return a [`JudgingResult`] whose qualitative analysis
 /// comes from the median judge and whose numeric scores are averaged
 /// across all judges that returned successfully.
 ///
-/// Why median + average?
-/// - **Median analysis** picks a representative voice and avoids the
-///   outlier judge dominating the prose.
-/// - **Average scores** smooth out judge-specific bias so the canonical
-///   overall metric tracks consensus, not whichever model happened to
-///   be selected.
-///
 /// Design source: `/tmp/codebuff/evals/buffbench/judge.ts`
 /// (`judgeCommitResult`).
-///
-/// `context_files` is a `path -> contents` map of supplemental files
-/// from the parent commit; the judges receive these inline in the
-/// prompt to ground their evaluation.
 pub async fn judge_with_three_models(
     commit: &EvalCommit,
     agent_diff: &str,
     context_files: &HashMap<String, String>,
+    config: &JudgeConfig,
 ) -> Result<JudgingResult> {
-    let _ = (commit, agent_diff, context_files);
-    unimplemented!("Phase 5.4: run gpt-5 / gemini-pro / sonnet judges in parallel and return median+average")
+    let prompt = render_judge_prompt(commit, agent_diff, context_files);
+    let http: &reqwest::Client = match &config.http_client {
+        Some(c) => c,
+        None => shared_client(),
+    };
+
+    let timeout_duration = Duration::from_secs(config.timeout_secs.unwrap_or(JUDGE_TIMEOUT_SECS));
+
+    // Give every judge its own deadline. Wrapping the whole `join_all` in a
+    // single timeout would throw away the scorecards of judges that already
+    // finished as soon as one slow judge ran past the limit.
+    let judge_futures: Vec<_> = config
+        .models
+        .iter()
+        .map(|model| {
+            timeout(
+                timeout_duration,
+                run_single_judge(model, &prompt, &config.api_base, &config.api_key, http),
+            )
+        })
+        .collect();
+
+    // Run the judges concurrently and keep only those that both finished in
+    // time (outer `Ok`) and returned a parseable scorecard (inner `Ok`).
+    let valid: Vec<Scorecard> = futures::future::join_all(judge_futures)
+        .await
+        .into_iter()
+        .filter_map(|outcome| outcome.ok().and_then(Result::ok))
+        .collect();
+
+    // Below quorum: return a zero-score placeholder instead of averaging
+    // too few opinions.
+    if valid.len() < MIN_JUDGE_SUCCESS_COUNT {
+        return Ok(Scorecard {
+            analysis: format!(
+                "Error running judge agent — only {}/{} judges succeeded",
+                valid.len(),
+                config.models.len()
+            ),
+            strengths: vec![],
+            weaknesses: vec![format!(
+                "Only {}/{} judges succeeded",
+                valid.len(),
+                config.models.len()
+            )],
+            completion_score: 0.0,
+            code_quality_score: 0.0,
+            overall_score: 0.0,
+        });
+    }
+
+    // Median analysis — sort by overall_score and pick the middle (the
+    // higher-scoring one when only two judges succeeded). `total_cmp` is a
+    // total order on f64, so the comparison has no panic path.
+    let mut sorted = valid.clone();
+    sorted.sort_by(|a, b| a.overall_score.total_cmp(&b.overall_score));
+    let median_idx = sorted.len() / 2;
+    let median = &sorted[median_idx];
+
+    let avg_completion = valid.iter().map(|r| r.completion_score).sum::<f64>() / valid.len() as f64;
+    let avg_quality = valid.iter().map(|r| r.code_quality_score).sum::<f64>() / valid.len() as f64;
+    let avg_overall = valid.iter().map(|r| r.overall_score).sum::<f64>() / valid.len() as f64;
+
+    Ok(Scorecard {
+        analysis: median.analysis.clone(),
+        strengths: median.strengths.clone(),
+        weaknesses: median.weaknesses.clone(),
+        completion_score: (avg_completion * 10.0).round() / 10.0,
+        code_quality_score: (avg_quality * 10.0).round() / 10.0,
+        overall_score: (avg_overall * 10.0).round() / 10.0,
+    })
 }
 
-/// Invoke a single judge model with a fully-rendered prompt.
-///
-/// Used internally by [`judge_with_three_models`] and exposed publicly
-/// so callers can re-judge a stored run with a different model without
-/// re-running the full three-judge pipeline.
-///
-/// Design source: `/tmp/codebuff/evals/buffbench/judge.ts`
-/// (`runSingleJudge`).
-pub async fn run_single_judge(model_id: &str, prompt: &str) -> Result<JudgingResult> {
-    let _ = (model_id, prompt);
-    unimplemented!("Phase 5.4: wire to provider registry")
+static SHARED_CLIENT: OnceLock<reqwest::Client> = OnceLock::new();
+
+fn shared_client() -> &'static Client {
+    SHARED_CLIENT.get_or_init(|| {
+        reqwest::Client::builder()
+            .connect_timeout(Duration::from_secs(15))
+            .tcp_keepalive(Duration::from_secs(30))
+            .pool_idle_timeout(Duration::from_secs(90))
+            .build()
+            .expect("reqwest client must build")
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn judge_provider_kind_for_model() {
+        assert_eq!(
+            JudgeProviderKind::for_model("gpt-5"),
+            JudgeProviderKind::OpenAI
+        );
+        assert_eq!(
+            JudgeProviderKind::for_model("claude-sonnet-4"),
+            JudgeProviderKind::Anthropic
+        );
+        assert_eq!(
+            JudgeProviderKind::for_model("anthropic/claude-opus-4"),
+            JudgeProviderKind::Anthropic
+        );
+    }
 }
diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs
index 7a919d646..31bf1b661 100644
--- a/evals/jbench/src/lessons.rs
+++ b/evals/jbench/src/lessons.rs
@@ -8,58 +8,316 @@
 //! prompt or memory graph.
 //!
 //! Design source: `/tmp/codebuff/evals/buffbench/lessons-extractor.ts`.
-//!
-//! Implementation lands in Phase 5.5.
 
+use std::fs;
 use std::path::Path;
+use std::sync::OnceLock;
+use std::time::Duration;
 
-use anyhow::Result;
+use anyhow::{Context, Result};
+use reqwest::Client;
 use serde::{Deserialize, Serialize};
+use tokio::time::Duration as TokioDuration;
+
+/// Timeout for a lessons extraction call.
+const LESSONS_TIMEOUT_SECS: u64 = 20 * 60;
 
 /// One distilled lesson from a single eval run.
-///
-/// Kept deliberately minimal — both fields are free-form prose. Richer
-/// structure (severity, tags, links to specific commits) can be added
-/// later without breaking the on-disk format because lesson files are
-/// JSON arrays of this struct.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct Lesson {
-    /// Concise description of the failure mode observed in the trace
-    /// or diff. One or two sentences.
     pub what_went_wrong: String,
-    /// Concise description of the corrective behavior the agent should
-    /// have performed instead. One or two sentences.
     pub what_should_have_been_done: String,
 }
 
+/// Configuration for lessons extraction.
+#[derive(Debug, Clone)]
+pub struct LessonsConfig {
+    pub api_base: String,
+    pub api_key: String,
+    pub model: String,
+    pub http_client: Option<Client>,
+}
+
+impl Default for LessonsConfig {
+    fn default() -> Self {
+        Self {
+            api_base: std::env::var("JBENCH_API_BASE")
+                .unwrap_or_else(|_| "https://api.openai.com".to_owned()),
+            api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(),
+            model: "gpt-5-2026-05".to_owned(),
+            http_client: None,
+        }
+    }
+}
+
+fn render_lessons_prompt(
+    prompt: &str,
+    ground_truth_diff: &str,
+    agent_diff: &str,
+    agent_trace: &str,
+    judge_summary: Option<&str>,
+    error: Option<&str>,
+) -> String {
+    let judge_section = judge_summary
+        .map(|s| format!("\n## Judge Summary\n{s}"))
+        .unwrap_or_default();
+    let error_section = error
+        .map(|e| format!("\n## Agent Error\n{e}"))
+        .unwrap_or_default();
+    format!(
+        "## User Prompt\n{prompt}\n\n\
+         ## Ground Truth Changes (One valid implementation)\n\
+         ```diff\n{ground_truth_diff}\n```\n\n\
+         ## Agent's Changes\n\
+         ```diff\n{agent_diff}\n```\n\n\
+         ## Agent Trace\n\
+         ```json\n{agent_trace}\n```\
+         {judge_section}{error_section}\n\n\
+         Task: Analyze what went wrong and what should have been done.",
+        prompt = prompt,
+        ground_truth_diff = ground_truth_diff,
+        agent_diff = agent_diff,
+        agent_trace = agent_trace,
+        judge_section = judge_section,
+        error_section = error_section
+    )
+}
+
+fn lessons_system_prompt() -> &'static str {
+    r#"You are a Lesson Extractor. Your job: analyze agent performance and extract actionable lessons.
+
+Context you receive:
+- User prompt (what the coding agent was asked)
+- Ground truth diffs (one valid solution path)
+- The agent's diffs (what they actually changed)
+- A truncated agent trace showing HOW they worked
+- Optional judge summary (scores, weaknesses)
+
+You must output an array of lessons. Each lesson has two parts:
+
+1. **whatWentWrong**: What the agent did incorrectly, misunderstood, or failed to do
+2. **whatShouldHaveBeenDone**: The correct approach the agent should have taken
+
+Rules:
+- Each lesson should be a complete learning unit (problem + solution)
+- Keep lessons terse but precise (~140 chars per field)
+- Do not include things the agent already did correctly
+- Focus on gaps that, if filled, would have improved the outcome"#
+}
+
 /// Run the lessons-extractor judge over a finished eval run and return
 /// zero or more [`Lesson`]s.
-///
-/// The extractor receives the prompt the agent was given, the ground
-/// truth diff for context, the diff the agent actually produced, and
-/// the agent's full trace. It returns an empty `Vec` when the run was
-/// successful enough that no corrective lesson applies.
 pub async fn extract_lessons(
     prompt: &str,
     ground_truth_diff: &str,
     agent_diff: &str,
     agent_trace: &str,
+    config: &LessonsConfig,
+    judge_summary: Option<&str>,
+    error: Option<&str>,
 ) -> Result<Vec<Lesson>> {
-    let _ = (prompt, ground_truth_diff, agent_diff, agent_trace);
-    unimplemented!("Phase 5.5: invoke lessons-extractor judge and parse Vec<Lesson>")
+    let prompt_text = render_lessons_prompt(
+        prompt,
+        ground_truth_diff,
+        agent_diff,
+        agent_trace,
+        judge_summary,
+        error,
+    );
+
+    let http = match &config.http_client {
+        Some(c) => c,
+        None => {
+            static CLIENT: OnceLock<reqwest::Client> = OnceLock::new();
+            CLIENT.get_or_init(|| {
+                reqwest::Client::builder()
+                    .connect_timeout(Duration::from_secs(15))
+                    .tcp_keepalive(Duration::from_secs(30))
+                    .pool_idle_timeout(Duration::from_secs(90))
+                    .build()
+                    .expect("reqwest client must build")
+            })
+        }
+    };
+
+    // One schema, sent both as the tool's parameters and as `output_schema`.
+    let lessons_schema = serde_json::json!({
+        "type": "object",
+        "properties": {
+            "lessons": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "whatWentWrong": { "type": "string" },
+                        "whatShouldHaveBeenDone": { "type": "string" }
+                    },
+                    "required": ["whatWentWrong", "whatShouldHaveBeenDone"]
+                }
+            }
+        },
+        "required": ["lessons"]
+    });
+
+    let request_body = serde_json::json!({
+        "model": &config.model,
+        "input": [
+            { "role": "system", "content": lessons_system_prompt() },
+            { "role": "user", "content": prompt_text }
+        ],
+        "tools": [
+            {
+                "type": "function",
+                "name": "set_output",
+                "description": "Submit lessons derived from this evaluation",
+                "parameters": lessons_schema.clone()
+            }
+        ],
+        "tool_choice": { "type": "function", "name": "set_output" },
+        "output_schema": lessons_schema,
+    });
+
+    let url = format!("{}/v1/responses", config.api_base);
+    let response = http
+        .post(&url)
+        .header("Authorization", format!("Bearer {}", config.api_key))
+        .header("Content-Type", "application/json")
+        .json(&request_body)
+        .timeout(TokioDuration::from_secs(LESSONS_TIMEOUT_SECS))
+        .send()
+        .await
+        .context("lessons extraction HTTP request failed")?;
+
+    let status = response.status();
+    let body: serde_json::Value = response
+        .json()
+        .await
+        .context("failed to parse lessons extractor response")?;
+    // Surface API errors instead of silently reporting them as "no lessons".
+    if !status.is_success() {
+        anyhow::bail!("lessons extractor API returned {status}: {body}");
+    }
+
+    let lessons_json = body
+        .get("output")
+        .and_then(|o| o.as_array())
+        .and_then(|items| items.first())
+        .and_then(|item| item.get("content")?.as_array()?.first()?.get("text")?.as_str())
+        .and_then(|t| serde_json::from_str::<serde_json::Value>(t).ok())
+        .or_else(|| body.get("output").cloned())
+        .unwrap_or_else(|| serde_json::json!({ "lessons": [] }));
+
+    // The schema requests camelCase keys while `Lesson` deserializes with its
+    // snake_case field names, so read each field by key and accept both.
+    let lessons: Vec<Lesson> = lessons_json
+        .get("lessons")
+        .and_then(|l| l.as_array())
+        .into_iter()
+        .flatten()
+        .filter_map(|entry| {
+            let text = |camel: &str, snake: &str| {
+                let value = entry.get(camel).or_else(|| entry.get(snake))?;
+                value.as_str().map(str::to_owned)
+            };
+            Some(Lesson {
+                what_went_wrong: text("whatWentWrong", "what_went_wrong")?,
+                what_should_have_been_done: text(
+                    "whatShouldHaveBeenDone",
+                    "what_should_have_been_done",
+                )?,
+            })
+        })
+        .collect();
+
+    Ok(lessons)
 }
 
 /// Append `lessons` to the per-agent lessons file at
 /// `lessons_dir/<agent_id>.json`, creating the file (and the directory)
 /// if needed.
-///
-/// The on-disk format is a JSON array of [`Lesson`]; appending preserves
-/// previously-extracted lessons so the file accumulates over many runs.
 pub fn append_lessons_to_file(
     agent_id: &str,
     lessons: &[Lesson],
     lessons_dir: &Path,
 ) -> Result<()> {
-    let _ = (agent_id, lessons, lessons_dir);
-    unimplemented!("Phase 5.5: read-modify-write JSON array at lessons_dir/<agent_id>.json")
+    if lessons.is_empty() {
+        return Ok(());
+    }
+
+    if !lessons_dir.exists() {
+        fs::create_dir_all(lessons_dir)
+            .context("failed to create lessons directory")?;
+    }
+
+    let safe_id = agent_id.replace(|c: char| !c.is_alphanumeric() && c != '-' && c != '_', "_");
+    let file_path = lessons_dir.join(format!("{safe_id}.json"));
+
+    let existing: Vec<Lesson> = if file_path.exists() {
+        let contents = fs::read_to_string(&file_path)
+            .context("failed to read existing lessons file")?;
+        serde_json::from_str(&contents).unwrap_or_default()
+    } else {
+        Vec::new()
+    };
+
+    let all_lessons: Vec<Lesson> = existing
+        .into_iter()
+        .chain(lessons.iter().cloned())
+        .collect();
+
+    let json = serde_json::to_string_pretty(&all_lessons)
+        .context("failed to serialize lessons")?;
+
+    fs::write(&file_path, json)
+        .context("failed to write lessons file")?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    #[test]
+    fn append_lessons_to_empty_dir() {
+        let tmp = TempDir::new().unwrap();
+        let result = append_lessons_to_file(
+            "test-agent",
+            &[Lesson {
+                what_went_wrong: "forgot null check".to_owned(),
+                what_should_have_been_done: "add null guard".to_owned(),
+            }],
+            tmp.path(),
+        );
+        assert!(result.is_ok());
+        let contents = fs::read_to_string(tmp.path().join("test-agent.json")).unwrap();
+        let lessons: Vec<Lesson> = serde_json::from_str(&contents).unwrap();
+        assert_eq!(lessons.len(), 1);
+    }
+
+    #[test]
+    fn append_lessons_accumulates() {
+        let tmp = TempDir::new().unwrap();
+        let agent = "clone-agent";
+
+        fs::create_dir_all(tmp.path()).unwrap();
+        let file_path = tmp.path().join("clone-agent.json");
+        let first = vec![Lesson {
+            what_went_wrong: "first mistake".to_owned(),
+            what_should_have_been_done: "first fix".to_owned(),
+        }];
+        let json = serde_json::to_string_pretty(&first).unwrap();
+        fs::write(&file_path, json).unwrap();
+
+        let second = vec![Lesson {
+            what_went_wrong: "second mistake".to_owned(),
+            what_should_have_been_done: "second fix".to_owned(),
+        }];
+        append_lessons_to_file(agent, &second, tmp.path()).unwrap();
+
+        let contents = fs::read_to_string(tmp.path().join("clone-agent.json")).unwrap();
+        let lessons: Vec<Lesson> = serde_json::from_str(&contents).unwrap();
+        assert_eq!(lessons.len(), 2);
+    }
 }
diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs
index 57c5809f7..97d0eb7c1 100644
--- a/evals/jbench/src/lib.rs
+++ b/evals/jbench/src/lib.rs
@@ -17,3 +17,8 @@ pub mod agent_runner;
 pub mod judge;
 pub mod lessons;
 pub mod types;
+
+pub use types::{EvalCommit, EvalDataV2, EvalRun, JudgingResult, AgentEvalResults};
+pub use agent_runner::AgentRunConfig;
+pub use judge::JudgeConfig;
+pub use lessons::LessonsConfig;
diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs
index 3f3a9e763..1cb51e17e 100644
--- a/evals/jbench/src/types.rs
+++ b/evals/jbench/src/types.rs
@@ -128,6 +128,19 @@ pub struct JudgingResult {
     pub overall_score: f64,
 }
 
+impl Default for JudgingResult {
+    fn default() -> Self {
+        Self {
+            analysis: String::new(),
+            strengths: Vec::new(),
+            weaknesses: Vec::new(),
+            completion_score: 0.0,
+            code_quality_score: 0.0,
+            overall_score: 0.0,
+        }
+    }
+}
+
 /// Outcome of running one agent on one eval commit.
 ///
 /// `error` is `Some` when the agent crashed, timed out, or otherwise

From c0bcaca4c280934f911c6a54cf3021ec8f20101a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tr=E1=BA=A7n=20Quang=20=C4=90=C3=A3ng?=
 <tranquangdang21@gmail.com>
Date: Thu, 28 May 2026 01:50:55 +0000
Subject: [PATCH 06/22] fix(agent-runtime): address PR #313 review issues

Bugs fixed:

1. JudgingResult deserialization (jbench/types.rs)
   The judge prompt schema asks for camelCase fields
   (completionScore, codeQualityScore, overallScore) but
   the Rust struct used snake_case without serde rename.
   parse_scorecard would fail on every real judge response.

   Fix: add #[serde(alias = ...)] on each score field so
   on-disk JSON stays snake_case while LLM-returned
   camelCase still deserializes cleanly.

2. Anthropic judge authentication (jbench/judge.rs)
   run_anthropic_judge used Authorization: Bearer <key>
   which always 401s on the Anthropic Messages API.

   Fix: switch to x-api-key header (Anthropic standard).
   Also split JudgeConfig::api_base / api_key from new
   anthropic_api_base / anthropic_api_key so the Anthropic
   branch can target api.anthropic.com without breaking
   the OpenAI-compatible path. Plumbed through
   run_single_judge.

3. Duplicate substitute_placeholders (src/prompt_placeholders.rs)
   Conflicts with the existing
   prompt_templates::substitute_placeholders. Different
   semantics (fixed context vs HashMap bindings) but same
   name made grep / jump-to-def ambiguous.

   Fix: rename the new one to
   substitute_context_placeholders and document the
   relationship in the doc comment.

4. meta_analyze .run.json filter (jbench/bin/jbench.rs)
   path.extension() returns only the final extension
   ('json'), so matching against "run.json" never fired.
   meta-analyze would always report zero runs.

   Fix: match against file_name().ends_with(".run.json").

Plus:
- Run cargo fmt --all to clear the Format CI job that PR
  #313 was failing.
- Add tests parse_scorecard_accepts_camelcase_from_llm and
  parse_scorecard_accepts_snake_case_from_disk to lock in
  the wire-format contract.
---
 crates/jcode-agent-runtime/src/definition.rs  |  30 +---
 crates/jcode-agent-runtime/src/lib.rs         |   6 +-
 crates/jcode-agent-runtime/src/output.rs      |   5 +-
 crates/jcode-agent-runtime/src/reasoning.rs   |  10 +-
 crates/jcode-agent-runtime/src/registry.rs    |  57 ++++----
 crates/jcode-agent-runtime/src/tier.rs        |  10 +-
 .../tests/sample_agents.rs                    |  22 ++-
 evals/jbench/src/agent_runner.rs              |   9 +-
 evals/jbench/src/bin/jbench.rs                |  69 +++++++---
 evals/jbench/src/judge.rs                     | 130 ++++++++++++------
 evals/jbench/src/lessons.rs                   |  13 +-
 evals/jbench/src/lib.rs                       |   2 +-
 evals/jbench/src/types.rs                     |   9 ++
 evals/jbench/tests/types.rs                   |   4 +-
 src/agent/prompting.rs                        |   1 -
 src/prompt_placeholders.rs                    |  24 ++--
 src/tui/app/commands.rs                       |   3 +-
 tests/tool_fixtures.rs                        |   7 +-
 18 files changed, 250 insertions(+), 161 deletions(-)

diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
index a067668c6..4adeeabbd 100644
--- a/crates/jcode-agent-runtime/src/definition.rs
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -174,9 +174,7 @@ fn default_version() -> String {
 /// invariants. Displayed to users when a TOML file fails to load.
 #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
 pub enum DefinitionError {
-    #[error(
-        "agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen"
-    )]
+    #[error("agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen")]
     InvalidId(String),
 
     #[error(
@@ -184,9 +182,7 @@ pub enum DefinitionError {
     )]
     SystemPromptConflict { id: String },
 
-    #[error(
-        "agent `{id}` has `output_mode = structured_output` but `output_schema` is missing"
-    )]
+    #[error("agent `{id}` has `output_mode = structured_output` but `output_schema` is missing")]
     StructuredOutputMissingSchema { id: String },
 
     #[error("agent `{id}` references itself in `spawnable_agents`")]
@@ -209,9 +205,7 @@ pub enum DefinitionError {
 /// agent spawn time.
 #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
 pub enum ReferenceError {
-    #[error(
-        "agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}"
-    )]
+    #[error("agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}")]
     UnknownTools {
         id: String,
         unknown: String,
@@ -245,8 +239,7 @@ impl AgentDefinition {
         }
 
         // 3. structured_output requires schema
-        if matches!(self.output_mode, OutputMode::StructuredOutput)
-            && self.output_schema.is_none()
+        if matches!(self.output_mode, OutputMode::StructuredOutput) && self.output_schema.is_none()
         {
             return Err(DefinitionError::StructuredOutputMissingSchema {
                 id: self.id.clone(),
@@ -422,30 +415,21 @@ mod tests {
     fn id_validation_rejects_uppercase() {
         let mut d = minimal_definition("File-Picker");
         d.id = "File-Picker".to_string();
-        assert!(matches!(
-            d.validate(),
-            Err(DefinitionError::InvalidId(_))
-        ));
+        assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_))));
     }
 
     #[test]
     fn id_validation_rejects_underscore() {
         let mut d = minimal_definition("file_picker");
         d.id = "file_picker".to_string();
-        assert!(matches!(
-            d.validate(),
-            Err(DefinitionError::InvalidId(_))
-        ));
+        assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_))));
     }
 
     #[test]
     fn id_validation_rejects_leading_hyphen() {
         let mut d = minimal_definition("ok");
         d.id = "-bad".to_string();
-        assert!(matches!(
-            d.validate(),
-            Err(DefinitionError::InvalidId(_))
-        ));
+        assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_))));
     }
 
     #[test]
diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs
index b78ad983f..80979a845 100644
--- a/crates/jcode-agent-runtime/src/lib.rs
+++ b/crates/jcode-agent-runtime/src/lib.rs
@@ -38,10 +38,8 @@ pub use signals::{
 };
 
 // New public surface (Phase 0).
-pub use definition::{
-    AgentDefinition, DefinitionError, ReferenceError, DEFAULT_AGENT_VERSION,
-};
+pub use definition::{AgentDefinition, DEFAULT_AGENT_VERSION, DefinitionError, ReferenceError};
 pub use output::OutputMode;
 pub use reasoning::ReasoningEffort;
 pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind};
-pub use tier::{resolve_model, resolve_model_with_source, ModelTier, ResolutionSource};
+pub use tier::{ModelTier, ResolutionSource, resolve_model, resolve_model_with_source};
diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs
index 1ba93dd1a..93dc60a93 100644
--- a/crates/jcode-agent-runtime/src/output.rs
+++ b/crates/jcode-agent-runtime/src/output.rs
@@ -53,7 +53,10 @@ mod tests {
 
     #[test]
     fn parse_accepts_aliases() {
-        assert_eq!(OutputMode::parse("last_message"), Some(OutputMode::LastMessage));
+        assert_eq!(
+            OutputMode::parse("last_message"),
+            Some(OutputMode::LastMessage)
+        );
         assert_eq!(OutputMode::parse("all"), Some(OutputMode::AllMessages));
         assert_eq!(
             OutputMode::parse("structured"),
diff --git a/crates/jcode-agent-runtime/src/reasoning.rs b/crates/jcode-agent-runtime/src/reasoning.rs
index d48bafaeb..7cdf8d010 100644
--- a/crates/jcode-agent-runtime/src/reasoning.rs
+++ b/crates/jcode-agent-runtime/src/reasoning.rs
@@ -79,9 +79,15 @@ mod tests {
             ReasoningEffort::parse("minimal"),
             Some(ReasoningEffort::Minimal)
         );
-        assert_eq!(ReasoningEffort::parse("OFF"), Some(ReasoningEffort::Minimal));
+        assert_eq!(
+            ReasoningEffort::parse("OFF"),
+            Some(ReasoningEffort::Minimal)
+        );
         assert_eq!(ReasoningEffort::parse("max"), Some(ReasoningEffort::High));
-        assert_eq!(ReasoningEffort::parse("default"), Some(ReasoningEffort::Medium));
+        assert_eq!(
+            ReasoningEffort::parse("default"),
+            Some(ReasoningEffort::Medium)
+        );
         assert_eq!(ReasoningEffort::parse(""), None);
         assert_eq!(ReasoningEffort::parse("absurd"), None);
     }
diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs
index 71cab810d..82f182b2d 100644
--- a/crates/jcode-agent-runtime/src/registry.rs
+++ b/crates/jcode-agent-runtime/src/registry.rs
@@ -90,9 +90,7 @@ pub enum LoadError {
         source: DefinitionError,
     },
 
-    #[error(
-        "filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`."
-    )]
+    #[error("filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`.")]
     FileNameMismatch { path: PathBuf, id: String },
 }
 
@@ -175,10 +173,7 @@ impl AgentRegistry {
 
     /// Register a builtin agent. Builtins have the lowest priority and
     /// are overridable by both user and project files of the same id.
-    pub fn register_builtin(
-        &mut self,
-        definition: AgentDefinition,
-    ) -> Result<(), DefinitionError> {
+    pub fn register_builtin(&mut self, definition: AgentDefinition) -> Result<(), DefinitionError> {
         definition.validate()?;
         self.insert(LoadedAgent {
             definition,
@@ -233,10 +228,7 @@ impl AgentRegistry {
                             AgentSource::ProjectLocal { path: path.clone() }
                         }
                     };
-                    self.insert(LoadedAgent {
-                        definition,
-                        source,
-                    });
+                    self.insert(LoadedAgent { definition, source });
                     loaded += 1;
                 }
                 Err(err) => {
@@ -333,7 +325,10 @@ mod tests {
     fn missing_dir_is_zero_load_not_error() {
         let mut reg = AgentRegistry::new();
         let n = reg
-            .load_directory(Path::new("/nonexistent/jcode-test-dir"), SourceKind::UserGlobal)
+            .load_directory(
+                Path::new("/nonexistent/jcode-test-dir"),
+                SourceKind::UserGlobal,
+            )
             .unwrap();
         assert_eq!(n, 0);
         assert!(reg.is_empty());
@@ -382,7 +377,10 @@ mod tests {
             output_schema: None,
         };
         reg.register_builtin(builtin_def.clone()).unwrap();
-        assert_eq!(reg.get("editor").unwrap().definition.display_name, "Builtin Editor");
+        assert_eq!(
+            reg.get("editor").unwrap().definition.display_name,
+            "Builtin Editor"
+        );
 
         // User
         let user_dir = temp_dir("user");
@@ -394,8 +392,12 @@ mod tests {
                 display_name = "User Editor"
             "#,
         );
-        reg.load_directory(&user_dir, SourceKind::UserGlobal).unwrap();
-        assert_eq!(reg.get("editor").unwrap().definition.display_name, "User Editor");
+        reg.load_directory(&user_dir, SourceKind::UserGlobal)
+            .unwrap();
+        assert_eq!(
+            reg.get("editor").unwrap().definition.display_name,
+            "User Editor"
+        );
 
         // Project
         let proj_dir = temp_dir("proj");
@@ -407,7 +409,8 @@ mod tests {
                 display_name = "Project Editor"
             "#,
         );
-        reg.load_directory(&proj_dir, SourceKind::ProjectLocal).unwrap();
+        reg.load_directory(&proj_dir, SourceKind::ProjectLocal)
+            .unwrap();
         assert_eq!(
             reg.get("editor").unwrap().definition.display_name,
             "Project Editor"
@@ -432,10 +435,7 @@ mod tests {
         reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
         assert!(reg.is_empty(), "no agents registered");
         assert_eq!(reg.load_errors().len(), 1);
-        assert!(matches!(
-            reg.load_errors()[0],
-            LoadError::Parse { .. }
-        ));
+        assert!(matches!(reg.load_errors()[0], LoadError::Parse { .. }));
     }
 
     #[test]
@@ -453,10 +453,7 @@ mod tests {
         reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
         assert!(reg.is_empty());
         assert_eq!(reg.load_errors().len(), 1);
-        assert!(matches!(
-            reg.load_errors()[0],
-            LoadError::Invalid { .. }
-        ));
+        assert!(matches!(reg.load_errors()[0], LoadError::Invalid { .. }));
     }
 
     #[test]
@@ -506,14 +503,20 @@ mod tests {
             write_toml(
                 &dir,
                 &format!("{id}.toml"),
-                &format!(r#"id = "{id}"
+                &format!(
+                    r#"id = "{id}"
 display_name = "{id}"
-"#),
+"#
+                ),
             );
         }
         let mut reg = AgentRegistry::new();
         reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
-        let ids: Vec<_> = reg.iter_sorted().iter().map(|a| a.definition.id.clone()).collect();
+        let ids: Vec<_> = reg
+            .iter_sorted()
+            .iter()
+            .map(|a| a.definition.id.clone())
+            .collect();
         assert_eq!(ids, vec!["alpha", "mid", "zeta"]);
     }
 
diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs
index 200f511ed..33ee6288b 100644
--- a/crates/jcode-agent-runtime/src/tier.rs
+++ b/crates/jcode-agent-runtime/src/tier.rs
@@ -135,16 +135,10 @@ pub enum ResolutionSource {
     /// Used `agent.model_override` directly.
     Override(String),
     /// Used the env var backing `tier`.
-    Tier {
-        tier: ModelTier,
-        model: String,
-    },
+    Tier { tier: ModelTier, model: String },
     /// Tier was preferred but the env var was unset, so fell back to the
     /// session's current model.
-    TierFallback {
-        tier: ModelTier,
-        model: String,
-    },
+    TierFallback { tier: ModelTier, model: String },
     /// No override or tier preference; using the session's current model.
     SessionDefault(String),
 }
diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs
index e850495d5..ee6ee7034 100644
--- a/crates/jcode-agent-runtime/tests/sample_agents.rs
+++ b/crates/jcode-agent-runtime/tests/sample_agents.rs
@@ -9,9 +9,7 @@
 
 use std::path::PathBuf;
 
-use jcode_agent_runtime::{
-    AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind,
-};
+use jcode_agent_runtime::{AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind};
 
 /// Path to the project-root sample agents directory, relative to the
 /// crate manifest. Deliberately constructed via `CARGO_MANIFEST_DIR` so
@@ -20,7 +18,12 @@ use jcode_agent_runtime::{
 fn samples_dir() -> PathBuf {
     let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
     // crates/jcode-agent-runtime → ../../ .jcode/agents
-    crate_dir.parent().unwrap().parent().unwrap().join(".jcode/agents")
+    crate_dir
+        .parent()
+        .unwrap()
+        .parent()
+        .unwrap()
+        .join(".jcode/agents")
 }
 
 #[test]
@@ -37,7 +40,11 @@ fn loads_bundled_sample_agents() {
         .load_directory(&dir, SourceKind::ProjectLocal)
         .expect("load_directory");
     assert!(n >= 2, "expected at least 2 sample agents, got {n}");
-    assert!(reg.load_errors().is_empty(), "load errors: {:?}", reg.load_errors());
+    assert!(
+        reg.load_errors().is_empty(),
+        "load errors: {:?}",
+        reg.load_errors()
+    );
 }
 
 #[test]
@@ -56,7 +63,10 @@ fn file_picker_sample_has_expected_shape() {
     assert_eq!(agent.display_name, "Fletcher the File Fetcher");
     assert_eq!(agent.prefer_tier, Some(ModelTier::Routine));
     assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal));
-    assert!(!agent.include_message_history, "file picker uses clean slate");
+    assert!(
+        !agent.include_message_history,
+        "file picker uses clean slate"
+    );
     assert!(!agent.inherit_parent_system_prompt);
     assert_eq!(agent.output_mode, OutputMode::LastMessage);
     assert!(agent.tool_names.iter().any(|t| t == "read"));
diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs
index de922e71d..3763ee4c2 100644
--- a/evals/jbench/src/agent_runner.rs
+++ b/evals/jbench/src/agent_runner.rs
@@ -83,9 +83,12 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
         .current_dir(&config.repo_path)
         .envs(&env_vars)
         .args([
-            "agent", "run",
-            "--agent", &config.agent_id,
-            "--output-mode", "stream",
+            "agent",
+            "run",
+            "--agent",
+            &config.agent_id,
+            "--output-mode",
+            "stream",
             "--no-interactive",
         ])
         .stdin(Stdio::piped())
diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs
index 90808dc60..2e54b50e7 100644
--- a/evals/jbench/src/bin/jbench.rs
+++ b/evals/jbench/src/bin/jbench.rs
@@ -9,8 +9,8 @@ use clap::{Parser, Subcommand};
 
 use jcode_jbench::{
     agent_runner::AgentRunConfig,
-    judge::{judge_with_three_models, JudgeConfig},
-    lessons::{append_lessons_to_file, extract_lessons, LessonsConfig},
+    judge::{JudgeConfig, judge_with_three_models},
+    lessons::{LessonsConfig, append_lessons_to_file, extract_lessons},
     types::{AgentEvalResults, EvalDataV2, EvalRun},
 };
 
@@ -100,16 +100,40 @@ enum Command {
 async fn main() -> Result<()> {
     let cli = Cli::parse();
     match cli.command {
-        Command::PickCommits { repo_url, min_msg_len, max_picks, output } => {
+        Command::PickCommits {
+            repo_url,
+            min_msg_len,
+            max_picks,
+            output,
+        } => {
             pick_commits_impl(&repo_url, min_msg_len, max_picks, output).await?;
         }
         Command::GenEvals { input, output } => {
             gen_evals_impl(&input, &output).await?;
         }
-        Command::Run { eval_file, agent_id, output_dir, jcode_binary, max_turns, timeout_secs } => {
-            run_impl(&eval_file, &agent_id, &output_dir, jcode_binary.as_ref(), max_turns, timeout_secs).await?;
+        Command::Run {
+            eval_file,
+            agent_id,
+            output_dir,
+            jcode_binary,
+            max_turns,
+            timeout_secs,
+        } => {
+            run_impl(
+                &eval_file,
+                &agent_id,
+                &output_dir,
+                jcode_binary.as_ref(),
+                max_turns,
+                timeout_secs,
+            )
+            .await?;
         }
-        Command::Judge { runs_dir, api_base, api_key } => {
+        Command::Judge {
+            runs_dir,
+            api_base,
+            api_key,
+        } => {
             judge_impl(&runs_dir, api_base.as_deref(), api_key.as_deref()).await?;
         }
         Command::MetaAnalyze { runs_dir, output } => {
@@ -141,8 +165,8 @@ async fn run_impl(
     timeout_secs: u64,
 ) -> Result<()> {
     use std::fs;
-    use tokio::time::timeout as tk_timeout;
     use std::time::Duration;
+    use tokio::time::timeout as tk_timeout;
 
     // Load eval data
     let eval_data: EvalDataV2 = {
@@ -199,22 +223,28 @@ async fn judge_impl(
     _api_base: Option<&str>,
     _api_key: Option<&str>,
 ) -> Result<()> {
-    todo_step("Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields")
+    todo_step(
+        "Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields",
+    )
 }
 
-async fn meta_analyze_impl(
-    runs_dir: &PathBuf,
-    output: Option<&PathBuf>,
-) -> Result<()> {
-    use std::fs;
+async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Result<()> {
     use jcode_jbench::types::AgentEvalResults;
+    use std::fs;
 
     let mut all_runs = Vec::new();
 
     for entry in fs::read_dir(runs_dir)? {
         let entry = entry?;
         let path = entry.path();
-        if path.extension().and_then(|s| s.to_str()) == Some("run.json") {
+        // `Path::extension` returns only the trailing component (`json`),
+        // so matching against `"run.json"` never fires. Match on the full
+        // file name suffix instead.
+        let is_run_file = path
+            .file_name()
+            .and_then(|s| s.to_str())
+            .is_some_and(|s| s.ends_with(".run.json"));
+        if is_run_file {
             let text = fs::read_to_string(&path)?;
             if let Ok(run) = serde_json::from_str::<EvalRun>(&text) {
                 all_runs.push(run);
@@ -226,12 +256,13 @@ async fn meta_analyze_impl(
         anyhow::bail!("No .run.json files found in {}", runs_dir.display());
     }
 
-    let avg_score = all_runs.iter().map(|r| r.judging.overall_score).sum::<f64>()
-        / all_runs.len() as f64;
-    let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::<f64>()
+    let avg_score = all_runs
+        .iter()
+        .map(|r| r.judging.overall_score)
+        .sum::<f64>()
         / all_runs.len() as f64;
-    let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::<u64>()
-        / all_runs.len() as u64;
+    let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::<f64>() / all_runs.len() as f64;
+    let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::<u64>() / all_runs.len() as u64;
 
     let summary = AgentEvalResults {
         agent_id: "unknown".to_owned(),
diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs
index 589e5749d..8f9437f47 100644
--- a/evals/jbench/src/judge.rs
+++ b/evals/jbench/src/judge.rs
@@ -52,10 +52,18 @@ impl JudgeProviderKind {
 /// Configuration for the judging pipeline.
 #[derive(Debug, Clone)]
 pub struct JudgeConfig {
-    /// API base URL for the judge backend (e.g. OpenAI-compatible).
+    /// API base URL for the OpenAI-compatible judge backend.
     pub api_base: String,
-    /// API key secret.
+    /// API key for the OpenAI-compatible judge backend.
     pub api_key: String,
+    /// Optional separate base URL for Anthropic-routed judges (e.g.
+    /// `https://api.anthropic.com`). Falls back to `api_base` when
+    /// `None`, which only makes sense if the OpenAI-compatible host
+    /// proxies the Anthropic Messages API too.
+    pub anthropic_api_base: Option<String>,
+    /// Optional separate API key for Anthropic-routed judges. Falls
+    /// back to `api_key` when `None`.
+    pub anthropic_api_key: Option<String>,
     /// Model IDs for the three judges. Order determines the median
     /// computation.
     pub models: [String; 3],
@@ -72,6 +80,8 @@ impl Default for JudgeConfig {
             api_base: std::env::var("JBENCH_API_BASE")
                 .unwrap_or_else(|_| "https://api.openai.com".to_owned()),
             api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(),
+            anthropic_api_base: std::env::var("JBENCH_ANTHROPIC_API_BASE").ok(),
+            anthropic_api_key: std::env::var("JBENCH_ANTHROPIC_API_KEY").ok(),
             models: [
                 "gpt-5-2026-05".to_owned(),
                 "google/gemini-3.1-pro".to_owned(),
@@ -84,17 +94,15 @@ impl Default for JudgeConfig {
 }
 
 /// Render the full judge prompt from commit + diff + context.
-fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &HashMap<String, String>) -> String {
+fn render_judge_prompt(
+    commit: &EvalCommit,
+    agent_diff: &str,
+    context_files: &HashMap<String, String>,
+) -> String {
     let ground_truth_diffs = commit
         .file_diffs
         .iter()
-        .map(|fd| {
-            format!(
-                "### {}\n```diff\n{}\n```",
-                fd.path,
-                fd.diff
-            )
-        })
+        .map(|fd| format!("### {}\n```diff\n{}\n```", fd.path, fd.diff))
         .collect::<Vec<_>>()
         .join("\n\n");
 
@@ -106,10 +114,7 @@ fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &Ha
 
     format!(
         "## User Prompt (What the agent was asked to do)\n{}\n\n## Context Files (from parent commit)\n{}\n\n## Ground Truth Changes (One valid implementation)\n{}\n\n## Agent's Changes (What the agent actually did)\n```diff\n{}\n```",
-        commit.prompt,
-        context_content,
-        ground_truth_diffs,
-        agent_diff
+        commit.prompt, context_content, ground_truth_diffs, agent_diff
     )
 }
 
@@ -168,12 +173,18 @@ struct JudgeResponse {
 
 /// Invoke a single judge model with a fully-rendered prompt.
 ///
+/// `anthropic_api_base` / `anthropic_api_key` are only consulted when
+/// the model routes through `JudgeProviderKind::Anthropic`; OpenAI-bound
+/// requests always use the primary `api_base` / `api_key`.
+///
 /// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` (`runSingleJudge`).
 pub async fn run_single_judge(
     model: &str,
     prompt: &str,
     api_base: &str,
     api_key: &str,
+    anthropic_api_base: Option<&str>,
+    anthropic_api_key: Option<&str>,
     http_client: &Client,
 ) -> Result<Scorecard> {
     let kind = JudgeProviderKind::for_model(model);
@@ -182,7 +193,12 @@ pub async fn run_single_judge(
     if kind == JudgeProviderKind::OpenAI {
         run_openai_judge(model, prompt, system, api_base, api_key, http_client).await
     } else {
-        run_anthropic_judge(model, prompt, system, api_base, api_key, http_client).await
+        // Fall back to the primary host/key only if no Anthropic-specific
+        // overrides were configured. The caller is expected to set both
+        // overrides when targeting `api.anthropic.com` directly.
+        let base = anthropic_api_base.unwrap_or(api_base);
+        let key = anthropic_api_key.unwrap_or(api_key);
+        run_anthropic_judge(model, prompt, system, base, key, http_client).await
     }
 }
 
@@ -292,10 +308,14 @@ async fn run_anthropic_judge(
         },
     });
 
+    // Anthropic Messages API authenticates via `x-api-key`, not
+    // `Authorization: Bearer ...`. Using the wrong header returns 401
+    // even with a valid key, which previously made this branch
+    // permanently dead.
     let url = format!("{api_base}/v1/messages");
     let response = http_client
         .post(&url)
-        .header("Authorization", format!("Bearer {api_key}"))
+        .header("x-api-key", api_key)
         .header("Content-Type", "application/json")
         .header("anthropic-version", "2023-06-01")
         .json(&request_body)
@@ -318,15 +338,14 @@ async fn run_anthropic_judge(
         .and_then(|t| t.as_str())
         .unwrap_or_default();
 
-    let parsed = serde_json::from_str::<serde_json::Value>(text)
-        .unwrap_or(serde_json::json!({
-            "analysis": text.to_owned(),
-            "strengths": [],
-            "weaknesses": ["Could not parse structured output from Anthropic judge"],
-            "completionScore": 0,
-            "codeQualityScore": 0,
-            "overallScore": 0
-        }));
+    let parsed = serde_json::from_str::<serde_json::Value>(text).unwrap_or(serde_json::json!({
+        "analysis": text.to_owned(),
+        "strengths": [],
+        "weaknesses": ["Could not parse structured output from Anthropic judge"],
+        "completionScore": 0,
+        "codeQualityScore": 0,
+        "overallScore": 0
+    }));
 
     parse_scorecard(parsed)
 }
@@ -365,22 +384,21 @@ pub async fn judge_with_three_models(
                 &prompt,
                 &config.api_base,
                 &config.api_key,
+                config.anthropic_api_base.as_deref(),
+                config.anthropic_api_key.as_deref(),
                 http,
             )
         })
         .collect();
 
     // Run all three judges in parallel with an overall timeout
-    let valid: Vec<Scorecard> = timeout(
-        timeout_duration,
-        futures::future::join_all(judge_futures),
-    )
-    .await
-    .ok()
-    .into_iter()           // IntoIterator<Item = Vec<Result<Scorecard>>>
-    .flatten()            // Iterator<Item = Result<Scorecard>>
-    .filter_map(|r| r.ok())
-    .collect();
+    let valid: Vec<Scorecard> = timeout(timeout_duration, futures::future::join_all(judge_futures))
+        .await
+        .ok()
+        .into_iter() // IntoIterator<Item = Vec<Result<Scorecard>>>
+        .flatten() // Iterator<Item = Result<Scorecard>>
+        .filter_map(|r| r.ok())
+        .collect();
 
     if valid.len() < MIN_JUDGE_SUCCESS_COUNT {
         return Ok(Scorecard {
@@ -390,11 +408,7 @@ pub async fn judge_with_three_models(
                 3
             ),
             strengths: vec![],
-            weaknesses: vec![format!(
-                "Only {}/{} judges succeeded",
-                valid.len(),
-                3
-            )],
+            weaknesses: vec![format!("Only {}/{} judges succeeded", valid.len(), 3)],
             completion_score: 0.0,
             code_quality_score: 0.0,
             overall_score: 0.0,
@@ -453,4 +467,40 @@ mod tests {
             JudgeProviderKind::Anthropic
         );
     }
+
+    /// Locks the wire-format contract: the LLM judge returns camelCase
+    /// (`completionScore`, etc.) per the request schema. Deserialization
+    /// must accept that even though the on-disk JSON form is snake_case.
+    #[test]
+    fn parse_scorecard_accepts_camelcase_from_llm() {
+        let camel = serde_json::json!({
+            "analysis": "looks good",
+            "strengths": ["clean diff"],
+            "weaknesses": [],
+            "completionScore": 8.5,
+            "codeQualityScore": 7.0,
+            "overallScore": 7.8
+        });
+        let parsed = parse_scorecard(camel).expect("camelCase must deserialize");
+        assert_eq!(parsed.completion_score, 8.5);
+        assert_eq!(parsed.code_quality_score, 7.0);
+        assert_eq!(parsed.overall_score, 7.8);
+    }
+
+    /// snake_case (the on-disk eval JSON spelling) must deserialize as well.
+    #[test]
+    fn parse_scorecard_accepts_snake_case_from_disk() {
+        let snake = serde_json::json!({
+            "analysis": "",
+            "strengths": [],
+            "weaknesses": [],
+            "completion_score": 1.0,
+            "code_quality_score": 2.0,
+            "overall_score": 3.0
+        });
+        let parsed = parse_scorecard(snake).expect("snake_case must deserialize");
+        assert_eq!(parsed.completion_score, 1.0);
+        assert_eq!(parsed.code_quality_score, 2.0);
+        assert_eq!(parsed.overall_score, 3.0);
+    }
 }
diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs
index 31bf1b661..f9cc09d06 100644
--- a/evals/jbench/src/lessons.rs
+++ b/evals/jbench/src/lessons.rs
@@ -245,16 +245,15 @@ pub fn append_lessons_to_file(
     }
 
     if !lessons_dir.exists() {
-        fs::create_dir_all(lessons_dir)
-            .context("failed to create lessons directory")?;
+        fs::create_dir_all(lessons_dir).context("failed to create lessons directory")?;
     }
 
     let safe_id = agent_id.replace(|c: char| !c.is_alphanumeric() && c != '-' && c != '_', "_");
     let file_path = lessons_dir.join(format!("{safe_id}.json"));
 
     let existing: Vec<Lesson> = if file_path.exists() {
-        let contents = fs::read_to_string(&file_path)
-            .context("failed to read existing lessons file")?;
+        let contents =
+            fs::read_to_string(&file_path).context("failed to read existing lessons file")?;
         serde_json::from_str(&contents).unwrap_or_default()
     } else {
         Vec::new()
@@ -265,11 +264,9 @@ pub fn append_lessons_to_file(
         .chain(lessons.iter().cloned())
         .collect();
 
-    let json = serde_json::to_string_pretty(&all_lessons)
-        .context("failed to serialize lessons")?;
+    let json = serde_json::to_string_pretty(&all_lessons).context("failed to serialize lessons")?;
 
-    fs::write(&file_path, json)
-        .context("failed to write lessons file")?;
+    fs::write(&file_path, json).context("failed to write lessons file")?;
 
     Ok(())
 }
diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs
index 97d0eb7c1..36363dc5b 100644
--- a/evals/jbench/src/lib.rs
+++ b/evals/jbench/src/lib.rs
@@ -18,7 +18,7 @@ pub mod judge;
 pub mod lessons;
 pub mod types;
 
-pub use types::{EvalCommit, EvalDataV2, EvalRun, JudgingResult, AgentEvalResults};
 pub use agent_runner::AgentRunConfig;
 pub use judge::JudgeConfig;
 pub use lessons::LessonsConfig;
+pub use types::{AgentEvalResults, EvalCommit, EvalDataV2, EvalRun, JudgingResult};
diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs
index 1cb51e17e..39d4645c5 100644
--- a/evals/jbench/src/types.rs
+++ b/evals/jbench/src/types.rs
@@ -112,6 +112,12 @@ pub struct EvalDataV2 {
 /// All three score fields are on the same `[0.0, 10.0]` scale; `f64` is
 /// used so we can also store the *averaged* per-dimension scores when
 /// aggregating multiple judges (see `judge::judge_with_three_models`).
+///
+/// On-disk JSON stays `snake_case` to match the rest of jcode's eval
+/// outputs, but each score field also accepts the `camelCase` spelling
+/// (`completionScore`, etc.) via `serde(alias = ...)` so we can
+/// deserialize LLM judge responses directly without an intermediate
+/// wire-format struct.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct JudgingResult {
     /// Free-form prose comparing the agent's diff to the ground truth.
@@ -121,10 +127,13 @@ pub struct JudgingResult {
     /// Bullet-point weaknesses called out by the judge.
     pub weaknesses: Vec<String>,
     /// How completely the prompt was addressed, `[0.0, 10.0]`.
+    #[serde(alias = "completionScore")]
     pub completion_score: f64,
     /// Code structure / maintainability, `[0.0, 10.0]`.
+    #[serde(alias = "codeQualityScore")]
     pub code_quality_score: f64,
     /// Combined assessment, `[0.0, 10.0]`. JBench's canonical metric.
+    #[serde(alias = "overallScore")]
     pub overall_score: f64,
 }
 
diff --git a/evals/jbench/tests/types.rs b/evals/jbench/tests/types.rs
index 2a8efd02e..fcaa832fb 100644
--- a/evals/jbench/tests/types.rs
+++ b/evals/jbench/tests/types.rs
@@ -4,9 +4,7 @@
 //! and write, and they fail loudly if anyone changes a field's
 //! `snake_case` name without updating consumers.
 
-use jcode_jbench::types::{
-    EvalCommit, FileDiff, FileDiffStatus, JudgingResult,
-};
+use jcode_jbench::types::{EvalCommit, FileDiff, FileDiffStatus, JudgingResult};
 
 #[test]
 fn eval_commit_round_trips_through_json() {
diff --git a/src/agent/prompting.rs b/src/agent/prompting.rs
index ba9719985..6107a314f 100644
--- a/src/agent/prompting.rs
+++ b/src/agent/prompting.rs
@@ -122,7 +122,6 @@ impl Agent {
     }
 }
 
-
 /// Wrap a step prompt body in `<system_reminder>...</system_reminder>` tags.
 ///
 /// Step prompts are emitted by the harness (not typed by the user), but they
diff --git a/src/prompt_placeholders.rs b/src/prompt_placeholders.rs
index 68cee139a..635beb8cc 100644
--- a/src/prompt_placeholders.rs
+++ b/src/prompt_placeholders.rs
@@ -77,7 +77,12 @@ fn truncate_chars(s: &str, max_chars: usize) -> String {
 ///
 /// Length caps documented on [`PlaceholderContext`] are enforced here, so
 /// callers may pass un-truncated input and trust the output to be bounded.
-pub fn substitute_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String {
+///
+/// This is the **context-driven** substitution path used for built-in
+/// Phase 4 placeholders. For user-supplied template bindings (arbitrary
+/// `HashMap<String, String>`), use
+/// [`crate::prompt_templates::substitute_placeholders`] instead.
+pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String {
     if prompt.is_empty() {
         return String::new();
     }
@@ -123,11 +128,8 @@ mod tests {
                      k=[{{KNOWLEDGE_FILES}}] git=[{{GIT_CHANGES}}] \
                      date=[{{CURRENT_DATE}}] steps=[{{REMAINING_STEPS}}] \
                      sys=[{{SYSTEM_INFO}}]";
-        let out = substitute_placeholders(input, &ctx);
-        assert_eq!(
-            out,
-            "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]"
-        );
+        let out = substitute_context_placeholders(input, &ctx);
+        assert_eq!(out, "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]");
     }
 
     #[test]
@@ -136,11 +138,11 @@ mod tests {
             current_date: "2026-05-25".to_string(),
             ..Default::default()
         };
-        let out = substitute_placeholders("today is {{CURRENT_DATE}}.", &ctx);
+        let out = substitute_context_placeholders("today is {{CURRENT_DATE}}.", &ctx);
         assert_eq!(out, "today is 2026-05-25.");
 
         // Unrelated placeholder stays empty in the same call.
-        let out2 = substitute_placeholders(
+        let out2 = substitute_context_placeholders(
             "date={{CURRENT_DATE}} steps={{REMAINING_STEPS}}",
             &ctx,
         );
@@ -161,7 +163,7 @@ mod tests {
                      {{KNOWLEDGE_FILES}}\n\n## Meta\n\
                      date={{CURRENT_DATE}} steps={{REMAINING_STEPS}} \
                      sys={{SYSTEM_INFO}}";
-        let out = substitute_placeholders(input, &ctx);
+        let out = substitute_context_placeholders(input, &ctx);
         let expected = "## Tree\nsrc/\n  lib.rs\n\n## Knowledge\n\
                         AGENTS.md contents\n\n## Meta\n\
                         date=2026-05-25 steps=7 sys=linux x86_64";
@@ -176,7 +178,7 @@ mod tests {
         };
         let input = "known={{CURRENT_DATE}} unknown={{NOT_A_REAL_TOKEN}} \
                      other={{ALSO_BOGUS}}";
-        let out = substitute_placeholders(input, &ctx);
+        let out = substitute_context_placeholders(input, &ctx);
         assert_eq!(
             out,
             "known=2026-05-25 unknown={{NOT_A_REAL_TOKEN}} other={{ALSO_BOGUS}}"
@@ -191,7 +193,7 @@ mod tests {
             file_tree_small: big.clone(),
             ..Default::default()
         };
-        let out = substitute_placeholders("[{{FILE_TREE_SMALL}}]", &ctx);
+        let out = substitute_context_placeholders("[{{FILE_TREE_SMALL}}]", &ctx);
         // Two bracket characters plus the cap.
         assert_eq!(out.chars().count(), FILE_TREE_SMALL_MAX_CHARS + 2);
         assert!(out.starts_with('['));
diff --git a/src/tui/app/commands.rs b/src/tui/app/commands.rs
index 536d50231..4238e65fe 100644
--- a/src/tui/app/commands.rs
+++ b/src/tui/app/commands.rs
@@ -1925,7 +1925,8 @@ pub(super) fn handle_session_command(app: &mut App, trimmed: &str) -> bool {
             Ok(out) if out.status.success() => {
                 let _ = std::fs::remove_file(&tmp_path);
                 let url = String::from_utf8_lossy(&out.stdout)
-                    .lines().rfind(|l| l.starts_with("https://"))
+                    .lines()
+                    .rfind(|l| l.starts_with("https://"))
                     .unwrap_or("")
                     .trim()
                     .to_string();
diff --git a/tests/tool_fixtures.rs b/tests/tool_fixtures.rs
index 9a7d98e97..6c7fc0318 100644
--- a/tests/tool_fixtures.rs
+++ b/tests/tool_fixtures.rs
@@ -105,9 +105,10 @@ fn collect_fixtures() -> Vec<(String, Fixture)> {
             .unwrap_or("")
             .to_string();
         if let Some(needle) = filter.as_deref()
-            && !stem.contains(needle) {
-                continue;
-            }
+            && !stem.contains(needle)
+        {
+            continue;
+        }
         let raw = std::fs::read_to_string(&path).expect("read fixture");
         let fixture: Fixture =
             serde_json::from_str(&raw).unwrap_or_else(|e| panic!("parse fixture {}: {}", stem, e));

From b215afab3ceb522e6b4f2c9bd3b72314adb04a23 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Thu, 4 Jun 2026 08:06:45 +0700
Subject: [PATCH 07/22] docs(multi-agent): add master implementation plan (1145
 lines, 12 sections)

Synthesizes best patterns from 9 reference repos:
- AgentPath tree + mailbox (codex)
- Tool-based agent delegation (CC)
- DAG wave parallelism (oh-my-pi)
- Role-based config bundles (opencode + codex)
- Team pipeline lifecycle (oh-my-claudecode)
- Cost aggregation + ancestry tracking (codebuff)

Covers: architecture, types, pseudocode, Rust implementation,
CLI commands, config wiring, test cases, benchmarks, rollout
---
 .omo/plans/multi-agent-master-plan.md | 1145 +++++++++++++++++++++++++
 1 file changed, 1145 insertions(+)
 create mode 100644 .omo/plans/multi-agent-master-plan.md

diff --git a/.omo/plans/multi-agent-master-plan.md b/.omo/plans/multi-agent-master-plan.md
new file mode 100644
index 000000000..87e5ef7cd
--- /dev/null
+++ b/.omo/plans/multi-agent-master-plan.md
@@ -0,0 +1,1145 @@
+# Implementation Plan: Multi-Agent System for jcode
+> Generated from research across 9 repos + jcode codebase analysis
+> Goal: Full multi-agent orchestration — model-driven delegation, team pipeline, DAG parallelism, agent tree lifecycle
+
+---
+
+## 1. Executive Summary
+
+jcode currently has swarm visualization infrastructure (TUI, protocol, prompts) but **zero agent spawning/driving logic**. The LLM can talk about swarm helpers in prompts, but there's no actual `agent` tool, no agent tree, no sub-agent lifecycle, and no team pipeline.
+
+This plan builds a production-grade multi-agent system by synthesizing the best patterns from codex (AgentPath tree + mailbox, proven in Rust), Claude Code (tool-based delegation, the model drives everything), oh-my-pi (DAG wave parallelism), codebuff (LLM-derived pipeline + cost aggregation), and oh-my-claudecode (team lifecycle + file-based shared state). The result is a three-surface system: **model-driven delegation** (LLM calls `agent` tool), **team pipeline** (CLI-driven multi-step workflow), and **batch processing** (programmatic multi-agent jobs).
+
+---
+
+## 2. Architecture Decision
+
+### Chosen Approach: Hybrid Tree + Tool + Wave
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                    AgentControl                           │
+│  (central registry: tree, threads, names, mailboxes)     │
+│                                                          │
+│  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐   │
+│  │ /root        │  │ /root/       │  │ /root/       │   │
+│  │ (user        │  │ explorer     │  │ worker       │   │
+│  │  session)    │  │ (read-only)  │  │ (execute)    │   │
+│  └──────┬───────┘  └──────────────┘  └──────────────┘   │
+│         │                                                │
+│  ┌──────┴───────┐                                        │
+│  │ /root/worker │                                        │
+│  │ /code-review │                                        │
+│  │ (sub-task)   │                                        │
+│  └──────────────┘                                        │
+└─────────────────────────────────────────────────────────┘
+```
+
+Three delegation modes, one agent tree:
+
+| Mode | Trigger | Use Case | Parallelism |
+|------|---------|----------|-------------|
+| **Tool-based** | LLM calls `agent` tool | Model decides to delegate | Sync/async/fork |
+| **Team pipeline** | `jcode team` CLI | Plan→PRD→Exec→Verify→Fix | DAG wave |
+| **Batch** | `jcode agent batch` CSV | Parallel research/review jobs | FuturesUnordered |
+
+### Alternatives Considered
+
+| Approach | Source Repo | Pros | Cons | Decision |
+|----------|-------------|------|------|----------|
+| AgentPath tree + mailbox | codex | Hierarchical addressing, async decoupling, Rust-native, production-tested | Higher initial complexity | **PRIMARY** — best fit for Rust codebase |
+| Tool-based delegation | CC | Model drives everything, simple mental model, proven UX | No automated pipeline | **PRIMARY** — best UX for interactive use |
+| DAG wave parallelism | oh-my-pi | Clean dependency resolution, parallel by default | Requires DAG definition upfront | **SECONDARY** — for team pipeline only |
+| Centralized orchestrator | codebuff | LLM-derived pipeline is flexible | Spawning overhead per step | **SECONDARY** — for team pipeline |
+| Tmux teams | oh-my-claudecode | Pragmatic, visible | OS-level coupling, fragile | **REFERENCE** — file-based state pattern |
+| Single monolithic agent | pi-agent-rust | Simplest, zero overhead | No delegation at all | **REJECTED** — doesn't meet goal |
+| Protocol-first | opencode | Clean abstraction | Over-engineered for our needs | **REJECTED** — too abstract |
+
+---
+
+## 3. Data Structures & Types
+
+```rust
+// === Core Agent Tree ===
+
+/// Unique path in the agent tree; hashable because it keys the registry maps.
+/// Examples: "/root", "/root/explorer", "/root/worker/code-review"
+#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)]
+pub struct AgentPath(Arc<str>);
+
+impl AgentPath {
+    pub fn root() -> Self { Self("/root".into()) }
+    pub fn parent(&self) -> Option<AgentPath>;
+    pub fn child(&self, name: &str) -> AgentPath;
+    pub fn is_descendant_of(&self, ancestor: &AgentPath) -> bool;
+}
+
+/// Agent identity — registered in AgentControl.
+#[derive(Debug, Clone)]
+pub struct AgentEntry {
+    pub id: AgentId,              // UUID
+    pub path: AgentPath,          // Tree position
+    pub name: String,             // Human-readable nickname (unique pool)
+    pub role: AgentRole,
+    pub config: AgentConfig,
+    pub state: AgentState,
+    pub created_at: Instant,
+    pub ancestry: AgentAncestry,  // parent_id, ancestor_ids
+    pub mailbox: Option<MailboxSender>,
+}
+
+/// Role determines default model, tools, and permissions.
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+pub enum AgentRole {
+    /// General agent — full tool access, plans and executes
+    Default,
+    /// Read-only investigator — grep, read, glob, websearch only
+    Explorer,
+    /// Execute known plan — limited tools, no planning
+    Worker,
+    /// Orchestrator — delegates subtasks, synthesizes results
+    Orchestrator,
+}
+
+/// Agent config bundle — inspired by opencode + codex role profiles.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AgentConfig {
+    pub model: Option<String>,           // None = inherit parent
+    pub system_prompt: Option<String>,   // None = inherit, Some = override
+    pub tools: AgentToolPolicy,
+    pub permissions: AgentPermissionBound,
+    pub max_turns: u32,                  // Hard stop
+    pub max_cost: Option<f64>,           // Cost cap (USD)
+    pub timeout: Option<Duration>,       // Wall-clock timeout
+}
+
+/// What tools this agent can use.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum AgentToolPolicy {
+    /// Inherit parent's tool policy
+    Inherit,
+    /// Explicit allow list
+    Allow(HashSet<String>),
+    /// Inherit + add
+    Extend(HashSet<String>),
+    /// No tools (chat-only)
+    None,
+}
+
+/// Permission boundary — bubble model from CC.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AgentPermissionBound {
+    pub max_risk_level: RiskLevel,       // Can't exceed this
+    pub allow_approve: bool,             // Can approve own requests
+    pub pre_approved: Vec<String>,       // Always-ok tool calls
+}
+
+// === Mailbox (from codex) ===
+
+/// One-shot channel for agent communication.
+type MailboxSender = tokio::sync::oneshot::Sender<AgentMessage>;
+type MailboxReceiver = tokio::sync::oneshot::Receiver<AgentMessage>;
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct AgentMessage {
+    pub from: AgentPath,                  // Sender's tree position
+    pub kind: AgentMessageKind,
+    pub payload: serde_json::Value,
+    pub timestamp: std::time::SystemTime, // Wall clock; `Instant` has no serde impl
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum AgentMessageKind {
+    /// "Do this subtask, report back"
+    Task { prompt: String, max_turns: u32 },
+    /// "Here are the results"
+    Result { output: String, cost: Option<f64> },
+    /// "I need more context"
+    RequestInfo { question: String },
+    /// "Here's the info you requested"
+    Info { data: serde_json::Value },
+    /// "Stop what you're doing"
+    Cancel,
+}
+
+// === Agent spawn tool input/output ===
+
+/// The `agent` tool that the LLM calls.
+#[derive(Debug, Deserialize)]
+pub struct AgentToolInput {
+    /// Role: "explorer", "worker", "orchestrator", or "default"
+    pub role: String,
+    /// What to do
+    pub prompt: String,
+    /// Sync (wait), async (fire-and-forget), fork (share prompt cache)
+    #[serde(default = "default_mode")]
+    pub mode: AgentSpawnMode,
+    /// Optional tools to add beyond role defaults
+    #[serde(default)]
+    pub extra_tools: Vec<String>,
+    /// Optional max turns for this sub-agent
+    #[serde(default = "default_subagent_turns")]
+    pub max_turns: u32,
+}
+
+#[derive(Debug, Clone, Serialize, Deserialize, Default)]
+pub enum AgentSpawnMode {
+    #[default]
+    /// Wait for completion, return result
+    Sync,
+    /// Fire and forget — results logged but not returned
+    Async,
+    /// Spawn with current prompt cache — zero cold start
+    Fork,
+}
+
+/// What the LLM sees after `agent` tool completes.
+#[derive(Debug, Serialize)]
+pub struct AgentToolOutput {
+    pub agent_id: String,
+    pub agent_path: String,
+    pub result: Option<String>,        // None for async
+    pub turn_count: u32,
+    pub cost: Option<f64>,
+    pub timed_out: bool,
+}
+
+// === Agent tree registry ===
+
+/// Central agent tree — thread-safe, tree-addressed.
+pub struct AgentControl {
+    tree: Arc<RwLock<AgentTreeInner>>,
+    name_pool: Arc<Mutex<HashSet<String>>>,
+    thread_limits: AgentThreadLimits,
+}
+
+struct AgentTreeInner {
+    agents: HashMap<AgentPath, AgentEntry>,
+    parent_children: HashMap<AgentPath, Vec<AgentPath>>,
+    next_id: u64,
+}
+
+pub struct AgentThreadLimits {
+    pub max_depth: u32,                // Default: 5
+    pub max_siblings: u32,             // Default: 10
+    pub max_total: u32,                // Default: 50
+}
+
+// === DAG pipeline (from oh-my-pi) ===
+
+/// A plan step in the DAG.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PlanStep {
+    pub id: String,
+    pub agent_role: AgentRole,
+    pub prompt: String,
+    pub depends_on: Vec<String>,       // Step IDs that must complete first
+    pub timeout: Option<Duration>,
+}
+
+/// Wave = set of steps that can run in parallel.
+pub struct ExecutionWave {
+    pub wave_index: usize,
+    pub steps: Vec<PlanStep>,
+}
+```
+
+---
+
+## 4. Pseudocode — Core Algorithm
+
+### 4a. Spawn Sub-Agent (Tool-Based Delegation)
+
+```
+FUNCTION spawn_agent(parent_session, input: AgentToolInput):
+  // 1. Validate
+  role  = RESOLVE_ROLE(input.role)
+  VALIDATE parent_session can spawn(role)
+  CHECK AgentControl.thread_limits (depth < max_depth, siblings < max_siblings)
+
+  // 2. Build AgentConfig from role defaults + input overrides
+  config = AgentConfig {
+    model:     role.default_model ?? parent_session.model,
+    tools:     role.default_tools + input.extra_tools,
+    permissions: role.default_permissions,
+    max_turns: input.max_turns,
+    ...
+  }
+
+  // 3. Create mailbox
+  (tx, rx) = oneshot::channel()
+
+  // 4. Register in AgentTree
+  path = parent_session.path.child(autoname())
+  entry = AgentEntry { path, role, config, mailbox: tx, ... }
+  AgentControl.register(entry)
+
+  // 5. Fire SubagentStart hook
+  FIRE_HOOK(SubagentStart { parent_path: parent.path, child_path: path, role })
+
+  // 6. Handle mode:
+  IF input.mode == Sync:
+    // Run sub-agent in same task, await result
+    result = RUN_AGENT_SESSION(config, input.prompt, parent_context)
+    AgentControl.complete(path)
+    FIRE_HOOK(SubagentStop { path, result })
+    RETURN AgentToolOutput { result, ... }
+
+  ELIF input.mode == Async:
+    // Spawn separate tokio task, no waiting
+    task = tokio::spawn(async {
+      result = RUN_AGENT_SESSION(config, input.prompt, parent_context)
+      AgentControl.complete(path)
+      FIRE_HOOK(SubagentStop { path, result })
+    })
+    RETURN AgentToolOutput { agent_id: path, result: None, ... }
+
+  ELIF input.mode == Fork:
+    // Share parent's prompt cache, zero cold start
+    cached_prompt = parent_session.get_prompt_cache()
+    task = tokio::spawn(async {
+      result = RUN_AGENT_SESSION(config, input.prompt,
+                                  parent_context, cached_prompt)
+      AgentControl.complete(path)
+      FIRE_HOOK(SubagentStop { path, result })
+    })
+    RETURN AgentToolOutput { agent_id: path, result: None, ... }
+
+  END
+END
+```
+
+### 4b. Agent Turn Loop (Sub-Agent Runtime)
+
+```
+FUNCTION run_agent_session(config, prompt, parent_context, cached_prompt?):
+  // 1. Create isolated session context
+  session = AgentSession {
+    config,
+    context: parent_context.clone(),
+    prompt_cache: cached_prompt,
+    turn_count: 0,
+    accumulated_cost: 0.0,
+    mailbox: rx from spawn,
+  }
+
+  // 2. Execute turn loop
+  WHILE session.turn_count < config.max_turns:
+    // Check mailbox for parent messages
+    IF session.mailbox has message:
+      IF message.kind == Cancel:
+        RETURN Result { output: "cancelled", ... }
+      ELIF message.kind == RequestInfo:
+        SEND response back via oneshot
+        CONTINUE
+
+    // Normal LLM turn
+    response = LLM_CALL(session.context)
+    session.turn_count++
+    session.accumulated_cost += response.cost
+
+    // Process tool calls
+    FOR tool_call in response.tool_calls:
+      IF tool_call.name == "agent":
+        // Nested delegation — recursive spawn
+        sub_result = spawn_agent(session, tool_call.input)
+        ADD sub_result to session.context
+      ELSE:
+        result = EXECUTE_TOOL(tool_call)
+        ADD result to session.context
+
+      // Check cost cap
+      IF config.max_cost && session.accumulated_cost > config.max_cost:
+        RETURN Result { output: "cost limit exceeded", ... }
+
+    // Check if done (no tool calls = final answer)
+    IF response.tool_calls is empty:
+      RETURN Result { output: response.text, cost: session.accumulated_cost }
+
+  RETURN Result { output: "max turns reached", ... }
+END
+```
+
+### 4c. Team Pipeline (DAG Wave Execution)
+
+```
+FUNCTION execute_team_pipeline(steps: Vec<PlanStep>):
+  // 1. Build DAG from depends_on edges
+  dag = BUILD_DAG(steps)  // adjacency list + in-degree count
+
+  // 2. Decompose into topological waves
+  waves = TOPOLOGICAL_WAVES(dag)
+  // Wave 0: steps with no dependencies
+  // Wave 1: steps whose deps are all in wave 0
+  // ...
+
+  // 3. Execute wave by wave
+  step_results = Map<StepId, AgentToolOutput>
+
+  FOR wave in waves:
+    // Run all steps in this wave in parallel
+    handles = []
+    FOR step in wave:
+      handle = tokio::spawn(async {
+        // Inherit context from parent + prev wave results
+        context = BUILD_CONTEXT(step, step_results)
+        result = spawn_agent(parent, {
+          role: step.agent_role,
+          prompt: step.prompt,
+          mode: Sync,
+        })
+        // Store result for dependent steps
+        step_results[step.id] = result
+      })
+      handles.push(handle)
+
+    // Wait for entire wave (fail-one = fail-wave)
+    FOR handle in handles:
+      await handle
+
+    // Fire wave-complete hook
+    FIRE_HOOK(WaveComplete { wave_index: wave.wave_index })
+
+  RETURN step_results
+END
+```
+
+---
+
+## 5. Implementation Code & Modules
+
+### New Cargo Crate: `jcode-agent-tree`
+
+```
+crates/jcode-agent-tree/
+  Cargo.toml
+  src/
+    lib.rs           — re-exports
+    path.rs          — AgentPath type
+    entry.rs         — AgentEntry, AgentConfig, AgentRole
+    control.rs       — AgentControl (registry, thread limits)
+    mailbox.rs       — MailboxSender/Receiver, AgentMessage
+    serialization.rs — tree save/restore
+```
+
+### `path.rs`
+
+```rust
+use std::sync::Arc;
+use serde::{Serialize, Deserialize};
+
+/// Tree-addressed agent path.
+/// Always starts with "/root". Examples:
+///   "/root"
+///   "/root/explorer"
+///   "/root/worker/code-review"
+#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)]
+pub struct AgentPath(Arc<str>);
+
+impl AgentPath {
+    pub fn root() -> Self {
+        Self("/root".into())
+    }
+
+    /// Parse from string — validates format.
+    /// Accepts "/root" or "/root/<segment>/..." with no empty segments.
+    pub fn parse(s: &str) -> Result<Self, AgentPathError> {
+        match s {
+            "/" => Err(AgentPathError::TooShort),
+            _ if !s.starts_with('/') => Err(AgentPathError::InvalidFormat),
+            // Must not end with /
+            _ if s.ends_with('/') => Err(AgentPathError::TrailingSlash),
+            // Every path lives under "/root"; "//" would be an empty segment.
+            _ if s != "/root" && !s.starts_with("/root/") => Err(AgentPathError::InvalidFormat),
+            _ if s.contains("//") => Err(AgentPathError::InvalidFormat),
+            _ => Ok(Self(s.into())),
+        }
+    }
+
+    /// Create child path: /root/foo + "bar" = /root/foo/bar
+    pub fn child(&self, name: &str) -> Self {
+        let parent = self.0.as_ref();
+        if parent.ends_with('/') {
+            Self(format!("{}{}", parent, name).into())
+        } else {
+            Self(format!("{}/{}", parent, name).into())
+        }
+    }
+
+    /// Parent path, or None for the root (or any other single-segment path).
+    /// Example: "/root/worker/code-review" -> "/root/worker".
+    pub fn parent(&self) -> Option<Self> {
+        let s = self.0.as_ref();
+        // A slash at index 0 is the leading one: the path is a lone top-level
+        // segment such as "/root" and has no parent. Inventing "/root" as the
+        // parent of e.g. "/foo" would contradict `is_descendant_of`.
+        match s.rfind('/')? {
+            0 => None,
+            last_slash => Some(Self(s[..last_slash].into())),
+        }
+    }
+
+    /// Depth: /root = 0, /root/explorer = 1
+    pub fn depth(&self) -> usize {
+        self.0.chars().filter(|&c| c == '/').count().saturating_sub(1)
+    }
+
+    /// Is this path a descendant of ancestor?
+    pub fn is_descendant_of(&self, ancestor: &AgentPath) -> bool {
+        let self_s = self.0.as_ref();
+        let anc_s = ancestor.0.as_ref();
+        self_s.starts_with(anc_s) && self_s.len() > anc_s.len()
+            && self_s.as_bytes().get(anc_s.len()) == Some(&b'/')
+    }
+
+    pub fn as_str(&self) -> &str {
+        self.0.as_ref()
+    }
+}
+```
+
+### `control.rs`
+
+```rust
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::{RwLock, Mutex, oneshot};
+use std::time::Instant;
+
+use crate::path::AgentPath;
+use crate::entry::{AgentEntry, AgentRole, AgentConfig, AgentState};
+
+/// Default thread limits — same values as the `[agents]` defaults in config.toml.
+const MAX_DEPTH: u32 = 5;
+const MAX_SIBLINGS: u32 = 10;
+const MAX_TOTAL: u32 = 50;
+
+/// Central agent tree — thread-safe singleton.
+pub struct AgentControl {
+    inner: Arc<RwLock<AgentTreeInner>>,
+    name_pool: Arc<Mutex<NamePool>>,
+    limits: AgentThreadLimits,
+}
+
+struct AgentTreeInner {
+    agents: HashMap<AgentPath, AgentEntry>,
+    parent_children: HashMap<AgentPath, Vec<AgentPath>>,
+    next_id: u64,
+}
+
+pub struct AgentThreadLimits {
+    pub max_depth: u32,
+    pub max_siblings: u32,
+    pub max_total: u32,
+}
+
+impl Default for AgentThreadLimits {
+    fn default() -> Self {
+        Self {
+            max_depth: MAX_DEPTH,
+            max_siblings: MAX_SIBLINGS,
+            max_total: MAX_TOTAL,
+        }
+    }
+}
+
+impl AgentControl {
+    pub fn new() -> Self {
+        let inner = AgentTreeInner {
+            agents: HashMap::new(),
+            parent_children: HashMap::new(),
+            next_id: 1,
+        };
+        Self {
+            inner: Arc::new(RwLock::new(inner)),
+            name_pool: Arc::new(Mutex::new(NamePool::new())),
+            limits: AgentThreadLimits::default(),
+        }
+    }
+
+    /// Register a new agent in the tree under `parent_path`.
+    /// Returns error if thread limits would be exceeded.
+    pub async fn register(
+        &self,
+        parent_path: &AgentPath,
+        name: &str,
+        role: AgentRole,
+        config: AgentConfig,
+        mailbox: oneshot::Sender<...>,
+    ) -> Result<AgentPath, AgentControlError> {
+        let mut inner = self.inner.write().await;
+
+        // Check max total (widen the limit instead of truncating the count)
+        if inner.agents.len() >= self.limits.max_total as usize {
+            return Err(AgentControlError::MaxTotalAgents);
+        }
+
+        // Check depth
+        let depth = parent_path.depth() + 1;
+        if depth > self.limits.max_depth as usize {
+            return Err(AgentControlError::MaxDepth(depth));
+        }
+
+        // Check siblings
+        let siblings = inner.parent_children.get(parent_path)
+            .map(|v| v.len())
+            .unwrap_or(0);
+        if siblings >= self.limits.max_siblings as usize {
+            return Err(AgentControlError::MaxSiblings(siblings));
+        }
+
+        // Generate unique name (`tokio::sync::Mutex::lock` is async and infallible)
+        let unique_name = self.name_pool.lock().await
+            .allocate(name);
+
+        let path = parent_path.child(&unique_name);
+        let id = inner.next_id;
+
+        let entry = AgentEntry {
+            id,
+            path: path.clone(),
+            name: unique_name,
+            role,
+            config,
+            state: AgentState::Spawning,
+            created_at: Instant::now(),
+            mailbox: Some(mailbox),
+        };
+
+        inner.agents.insert(path.clone(), entry);
+        inner.parent_children
+            .entry(parent_path.clone())
+            .or_default()
+            .push(path.clone());
+        inner.next_id += 1;
+
+        Ok(path)
+    }
+
+    /// Find agent by path.
+    pub async fn get(&self, path: &AgentPath) -> Option<AgentEntry> {
+        self.inner.read().await.agents.get(path).cloned()
+    }
+
+    /// List children of a path.
+    pub async fn children(&self, path: &AgentPath) -> Vec<AgentPath> {
+        self.inner.read().await
+            .parent_children.get(path)
+            .cloned()
+            .unwrap_or_default()
+    }
+
+    /// Shutdown an agent and all its descendants (recursive).
+    pub async fn shutdown_tree(&self, path: &AgentPath) {
+        let mut inner = self.inner.write().await;
+        // Remove from parent's children list
+        if let Some(parent) = path.parent() {
+            if let Some(siblings) = inner.parent_children.get_mut(&parent) {
+                siblings.retain(|p| p != path);
+            }
+        }
+        // Walk the whole subtree with an explicit stack. Removing each entry
+        // takes ownership of its mailbox, which the one-shot `send` consumes;
+        // a failed send only means the agent already dropped its receiver.
+        let mut pending = vec![path.clone()];
+        while let Some(current) = pending.pop() {
+            pending.extend(inner.parent_children.remove(&current).unwrap_or_default());
+            if let Some(tx) = inner.agents.remove(&current).and_then(|entry| entry.mailbox) {
+                let _ = tx.send(AgentMessage::shutdown());
+            }
+        }
+    }
+
+    /// Complete an agent (success or failure)
+    pub async fn complete(&self, path: &AgentPath, state: AgentState) {
+        let mut inner = self.inner.write().await;
+        if let Some(entry) = inner.agents.get_mut(path) {
+            entry.state = state;
+        }
+    }
+
+    /// Serialize the agent tree for display.
+    pub async fn snapshot(&self) -> Vec<AgentEntry> {
+        self.inner.read().await.agents.values().cloned().collect()
+    }
+}
+
+// === Name pool (unique agent nicknames) ===
+
+/// Hands out unique agent nicknames of the form "<base>-<n>".
+struct NamePool {
+    used: std::collections::HashSet<String>, // every nickname handed out so far
+    counters: HashMap<String, u64>,          // allocations made per base name
+}
+
+impl NamePool {
+    fn new() -> Self {
+        // `HashSet` is spelled out in full: the `use` list above imports only `HashMap`.
+        Self { used: std::collections::HashSet::new(), counters: HashMap::new() }
+    }
+
+    /// Returns "<base>-<n>", where n is the 1-based allocation count for `base`.
+    fn allocate(&mut self, base: &str) -> String {
+        let counter = self.counters.entry(base.to_string()).or_insert(0);
+        *counter += 1;
+        let name = format!("{}-{}", base, *counter);
+        self.used.insert(name.clone());
+        name
+    }
+}
+```
+
+### Modifications to Existing Files
+
+#### `crates/jcode-app-core/src/agent/mod.rs` — New `agent` tool
+
+```rust
+/// The `agent` tool — lets the LLM spawn sub-agents.
+pub struct AgentTool {
+    agent_control: Arc<AgentControl>,
+    session_registry: Arc<SessionRegistry>,
+}
+
+#[async_trait]
+impl Tool for AgentTool {
+    fn name(&self) -> &str { "agent" }
+    fn description(&self) -> &str {
+        "Spawn a sub-agent to work on a task. Use sync mode to get the result back, \
+         async for fire-and-forget, fork to reuse the current prompt cache. \
+         Roles: explorer (read-only), worker (execute), orchestrator (plan+delegate)."
+    }
+
+    async fn execute(&self, input: Value, ctx: ToolContext) -> ToolOutput {
+        let input: AgentToolInput = serde_json::from_value(input)?;
+        // Validate role
+        let role = AgentRole::from_str(&input.role)
+            .map_err(|_| ToolError::InvalidParam("role"))?;
+
+        // Build config from role defaults + overrides
+        let config = self.build_config(&ctx, role, &input);
+
+        // Create mailbox
+        let (tx, rx) = oneshot::channel();
+
+        // Register in tree
+        let parent_path = ctx.agent_path();  // from session runtime
+        let path = self.agent_control.register(
+            &parent_path, &role.to_string(), role, config, tx
+        ).await?;
+
+        // Fire hook
+        fire_hook(HookEvent::SubagentStart {
+            parent: parent_path.to_string(),
+            child: path.to_string(),
+            role: role.to_string(),
+        }).await;
+
+        // ... spawn session and run ...
+    }
+}
+```
+
+#### `src/cli/args.rs` — New subcommands
+
+```rust
+pub(crate) enum Command {
+    // ... existing ...
+    /// Multi-agent team orchestration
+    #[command(subcommand)]
+    Team(TeamCommand),
+    /// Sub-agent tree management
+    #[command(subcommand)]
+    Agent(AgentCommand),
+}
+
+#[derive(Subcommand)]
+pub(crate) enum TeamCommand {
+    /// Start a team pipeline from a plan file
+    Start {
+        /// Path to plan file (YAML/TOML)
+        plan: PathBuf,
+        /// Number of parallel workers
+        #[arg(long, default_value = "4")]
+        workers: u32,
+    },
+    /// Show team status
+    Status,
+    /// Stop a running team
+    Stop {
+        /// Team ID (from `team start`)
+        team_id: String,
+    },
+}
+
+#[derive(Subcommand)]
+pub(crate) enum AgentCommand {
+    /// List all sub-agents in tree
+    List,
+    /// Show agent tree
+    Tree,
+    /// Kill a sub-agent by path
+    Kill {
+        path: String,
+    },
+    /// Get agent status
+    Status {
+        path: String,
+    },
+}
+```
+
+#### `src/cli/dispatch.rs` — Route new commands
+
+```rust
+Command::Team(cmd) => {
+    match cmd {
+        TeamCommand::Start { plan, workers } => {
+            let plan = parse_plan_file(&plan)?;
+            runtime.execute_team_pipeline(plan, workers).await?;
+        }
+        TeamCommand::Status => {
+            let tree = runtime.agent_control().snapshot().await;
+            // Print formatted table
+        }
+        TeamCommand::Stop { team_id } => {
+            runtime.agent_control()
+                .shutdown_tree(&AgentPath::parse(&format!("/root/{}", team_id))?)
+                .await;
+        }
+    }
+}
+```
+
+#### Integration into Agent Turn Loop
+
+In `turn_streaming_mpsc.rs`, the existing soft-interrupt points already provide hooks for sub-agent injection:
+
+- **Point A (pre-API)**: Check sub-agent mailbox for incoming messages (Cancel, RequestInfo)
+- **Point B (post-response)**: Process `agent` tool calls from the model
+- **Point C (between tools)**: Check for sub-agent result availability
+- **Point D (after all tools)**: Fire SubagentStop hooks, propagate results
+
+```rust
+// In the agent turn loop, after tool call processing:
+if tool_call.name == "agent" {
+    let input: AgentToolInput = serde_json::from_value(tool_call.input)?;
+    let result = AgentTool::execute(input, ctx).await;
+    // result goes back as a regular tool result
+    context.add_tool_result(tool_call.id, result);
+}
+```
+
+---
+
+## 6. Configuration & Wiring
+
+### `~/.jcode/config.toml` — Agent section
+
+```toml
+[agents]
+# Max sub-agents in the tree
+max_total = 50
+# Max delegation depth
+max_depth = 5
+# Max siblings per parent
+max_siblings = 10
+# Default agent timeout
+default_timeout = "300s"
+# Default max turns
+default_max_turns = 50
+
+[agents.roles.explorer]
+model = "claude-sonnet-4-20250514"
+tools = ["read", "grep", "glob", "websearch", "web_fetch"]
+max_turns = 20
+permissions = { max_risk_level = "read_only", allow_approve = false }
+
+[agents.roles.worker]
+model = "claude-sonnet-4-20250514"
+tools = ["read", "write", "edit", "bash", "grep", "glob"]
+max_turns = 50
+permissions = { max_risk_level = "standard", allow_approve = false }
+
+[agents.roles.orchestrator]
+model = "claude-opus-4-20250514"
+tools = "*"    # All available tools
+max_turns = 30
+permissions = { max_risk_level = "elevated", allow_approve = true }
+```
+
+### Env Vars (in `disable-registry` style)
+
+| Env Var | Effect |
+|---------|--------|
+| `JCODE_DISABLE_AGENT_TREE=1` | Disable all multi-agent features |
+| `JCODE_MAX_AGENTS=10` | Override max_total at process level |
+| `JCODE_AGENT_TIMEOUT_MS=60000` | Per-agent timeout override |
+
+### Integration Points Checklist
+
+| File | Change | Priority |
+|------|--------|----------|
+| `Cargo.toml` (workspace) | Add `jcode-agent-tree` crate | P0 |
+| `crates/jcode-agent-tree/src/lib.rs` | New crate — AgentPath, AgentTree, Mailbox | P0 |
+| `crates/jcode-app-core/src/tool/mod.rs` | Register `AgentTool` | P0 |
+| `crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs` | Handle `agent` tool calls in turn loop | P0 |
+| `src/cli/args.rs` | Add `Team` + `Agent` subcommands | P1 |
+| `src/cli/dispatch.rs` | Route team/agent commands | P1 |
+| `crates/jcode-base/src/config.rs` | Add `[agents]` config section | P1 |
+| `crates/jcode-protocol/src/wire.rs` | Add SubagentStart/Stop events | P1 |
+| `crates/jcode-tui/src/tui/app.rs` | Display agent tree in side panel | P2 |
+| `crates/jcode-tui/src/tui/ui.rs` | Agent tree widget | P2 |
+
+---
+
+## 7. Repo References
+
+| Feature Aspect | Repo | File | Link |
+|----------------|------|------|------|
+| AgentPath tree | codex | cli/kernel/agents/agent_path.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/agent_path.rs |
+| Mailbox | codex | cli/kernel/agents/mailbox.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/mailbox.rs |
+| AgentControl | codex | cli/kernel/agents/agent_control.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/agent_control.rs |
+| Batch CSV | codex | cli/kernel/agents/spawn.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/spawn.rs |
+| Agent tool | CC | src/tools/agent.ts | https://github.com/claude-code-best/claude-code/blob/main/src/tools/agent.ts |
+| Subagent hooks | CC | src/services/hooks.ts | https://github.com/claude-code-best/claude-code/blob/main/src/services/hooks.ts |
+| DAG wave | oh-my-pi | src/agent/swarm/DAGSwarm.ts | https://github.com/can1357/oh-my-pi/blob/main/src/agent/swarm/DAGSwarm.ts |
+| EventBus | oh-my-pi | src/agent/EventBus.ts | https://github.com/can1357/oh-my-pi/blob/main/src/agent/EventBus.ts |
+| Pipeline orchestration | codebuff | src/orchestrator/Buffy.ts | https://github.com/CodebuffAI/codebuff/blob/main/src/orchestrator/Buffy.ts |
+| Team pipeline | oh-my-claudecode | src/team/index.ts | https://github.com/Yeachan-Heo/oh-my-claudecode/blob/main/src/team/index.ts |
+| Spawn agent | oh-my-openagent | src/agents/agentOrchestration.ts | https://github.com/code-yeongyu/oh-my-openagent/blob/main/src/agents/agentOrchestration.ts |
+| Fork subagent | oh-my-claudecode | src/team/agents.ts | https://github.com/Yeachan-Heo/oh-my-claudecode/blob/main/src/team/agents.ts |
+| Agent posture gating | oh-my-codex | src/orchestrator/posture.ts | https://github.com/Yeachan-Heo/oh-my-codex/blob/main/src/orchestrator/posture.ts |
+| jcode existing swarm TUI | jcode | crates/jcode-tui/src/tui/app.rs | — |
+| jcode existing orchestration API | jcode | src/orchestration_api.rs | — |
+
+---
+
+## 8. Test Cases
+
+### Unit Tests
+
+```rust
+// === AgentPath tests ===
+#[test]
+fn test_agent_path_root() {
+    let root = AgentPath::root();
+    assert_eq!(root.as_str(), "/root");
+    assert_eq!(root.depth(), 0);
+    assert!(root.parent().is_none());
+}
+
+#[test]
+fn test_agent_path_child() {
+    let root = AgentPath::root();
+    let explorer = root.child("explorer");
+    assert_eq!(explorer.as_str(), "/root/explorer");
+    assert_eq!(explorer.depth(), 1);
+    assert_eq!(explorer.parent().unwrap().as_str(), "/root");
+}
+
+#[test]
+fn test_agent_path_is_descendant() {
+    let root = AgentPath::root();
+    let worker = root.child("worker");
+    let task = worker.child("code-review");
+    assert!(task.is_descendant_of(&root));
+    assert!(task.is_descendant_of(&worker));
+    assert!(!worker.is_descendant_of(&task));
+}
+
+#[test]
+fn test_agent_path_parse_valid() {
+    let p = AgentPath::parse("/root/explorer").unwrap();
+    assert_eq!(p.as_str(), "/root/explorer");
+}
+
+#[test]
+fn test_agent_path_parse_invalid() {
+    assert!(AgentPath::parse("/").is_err());
+    assert!(AgentPath::parse("root").is_err());
+}
+
+// === AgentControl tests ===
+
+#[tokio::test]
+async fn test_register_agent() {
+    let ctrl = AgentControl::new();
+    let root = AgentPath::root();
+    let (tx, _rx) = oneshot::channel();
+
+    let path = ctrl.register(&root, "explorer", AgentRole::Explorer,
+        AgentConfig::default(), tx).await.unwrap();
+
+    assert!(path.as_str().starts_with("/root/explorer-"));
+    assert!(ctrl.get(&path).await.is_some());
+}
+
+#[tokio::test]
+async fn test_max_depth_enforced() {
+    let ctrl = AgentControl::new();
+    let mut path = AgentPath::root();
+    for i in 0..12 {   // max_depth = 10
+        let (tx, _rx) = oneshot::channel();
+        let result = ctrl.register(&path, "deep", AgentRole::Worker,
+            AgentConfig::default(), tx).await;
+        if i >= 10 {
+            assert!(result.is_err());
+        } else {
+            path = result.unwrap();
+        }
+    }
+}
+
+#[tokio::test]
+async fn test_shutdown_tree() {
+    let ctrl = AgentControl::new();
+    let root = AgentPath::root();
+    let (tx1, _rx1) = oneshot::channel();
+    let (tx2, _rx2) = oneshot::channel();
+    let p1 = ctrl.register(&root, "a", AgentRole::Explorer,
+        AgentConfig::default(), tx1).await.unwrap();
+    let p2 = ctrl.register(&p1, "b", AgentRole::Worker,
+        AgentConfig::default(), tx2).await.unwrap();
+
+    ctrl.shutdown_tree(&root).await;
+    assert!(ctrl.get(&p1).await.is_none());
+    assert!(ctrl.get(&p2).await.is_none());
+}
+
+// === AgentTool tests ===
+
+#[tokio::test]
+async fn test_agent_tool_spawn_sync() {
+    // Setup: create session, register AgentTool, call with input
+    let tool = AgentTool::new(agent_control, session_registry);
+    let input = serde_json::json!({
+        "role": "explorer",
+        "prompt": "Check if Cargo.toml exists",
+        "mode": "sync"
+    });
+    let ctx = ToolContext::test();
+    let output = tool.execute(input, ctx).await;
+    assert!(output.result.is_some());
+    assert!(output.turn_count > 0);
+}
+
+#[tokio::test]
+async fn test_agent_tool_invalid_role() {
+    let tool = AgentTool::new(agent_control, session_registry);
+    let input = serde_json::json!({
+        "role": "superhero",  // Invalid
+        "prompt": "Do something"
+    });
+    let result = tool.execute(input, ToolContext::test()).await;
+    assert!(result.is_err());
+}
+```
+
+### Integration Tests
+
+```rust
+#[tokio::test]
+async fn test_subagent_result_propagates_to_parent() {
+    // 1. Start parent session via orchestration API
+    // 2. Parent calls `agent` tool with sync mode
+    // 3. Sub-agent runs, does some work, returns result
+    // 4. Verify parent's next turn includes sub-agent result
+    todo!("End-to-end: spawn parent → parent spawns child → child returns → parent sees result");
+}
+
+#[tokio::test]
+async fn test_agent_tree_persistence() {
+    // 1. Create agent tree with multiple agents
+    // 2. Serialize to JSON
+    // 3. Deserialize
+    // 4. Verify all paths and entries match
+    todo!("Agent tree save/restore round-trip");
+}
+
+#[tokio::test]
+async fn test_team_pipeline_dag_wave() {
+    // 1. Define 5-step DAG: step2 depends on step1, step3 on step1, step4 on step2+3
+    // 2. Execute pipeline
+    // 3. Verify wave order: wave0=[step1], wave1=[step2,step3], wave2=[step4]
+    // 4. Verify all results present
+    todo!("DAG execution respects topological order");
+}
+```
+
+---
+
+## 9. Benchmarks
+
+| Metric | Baseline (no multi-agent) | Target | How to Measure |
+|--------|---------------------------|--------|----------------|
+| Sub-agent spawn latency | N/A | < 100ms (in-process) | `time` before/after `register()` call |
+| Sub-agent LLM first-token | N/A | Same as parent (fork) + 500ms (sync) | Measure TTFT of sub-agent vs parent |
+| Memory per sub-agent | N/A | < 50MB baseline + 10MB per active agent | `alloc` profiling |
+| Agent tree — 100 agents | N/A | Lookup < 1µs, register < 10µs | Criterion bench |
+| DAG wave — 20 steps / 4 waves | N/A | Total < serial time / 3 | Integration timer |
+| Cost tracking overhead | N/A | < 0.1% of total API cost | Differential measurement |
+
+---
+
+## 10. Migration / Rollout
+
+**Phase 1 — Foundation (estimate: 1-2 weeks)**
+- New crate `jcode-agent-tree` with AgentPath, AgentControl, Mailbox
+- Unit tests for tree operations
+- No agent tool yet — infrastructure only
+- **Risk**: None (new crate, no existing code touched)
+
+**Phase 2 — Agent Tool (estimate: 1 week)**
+- `AgentTool` implementation: sync + async + fork modes
+- Integration into agent turn loop
+- Wire hooks (SubagentStart/SubagentStop) to existing hook system
+- **Risk**: Medium — turn loop changes must not break single-agent mode
+
+**Phase 3 — CLI + Config (estimate: 1 week)**
+- `jcode agent list/tree/kill/status` commands
+- `jcode team start/status/stop` commands
+- `[agents]` config section in config.toml
+- **Risk**: Low — CLI and config are additive
+
+**Phase 4 — Team Pipeline + Batch (estimate: 1 week)**
+- DAG pipeline executor (plan file → waves → results)
+- Batch CSV agent spawning
+- TUI agent tree visualization
+- **Risk**: Low — builds on Phase 1-3 foundation
+
+### Feature Flag
+All multi-agent functionality is gated behind the `JCODE_DISABLE_AGENT_TREE` kill-switch (from the disable-env system). When `JCODE_DISABLE_AGENT_TREE=1` is set, the `agent` tool returns a "multi-agent disabled" error, the team CLI commands error out, and the agent tree stays empty.
+
+---
+
+## 11. Known Limitations & Future Work
+
+- [ ] **Cross-process sub-agents**: Current design is in-process only. Future: sub-agents as separate `jcode` processes via the protocol layer.
+- [ ] **Agent checkpoint/resume**: Sub-agents that survive parent restart — requires session persistence.
+- [ ] **Prompt cache sharing (Fork)**: Full fork mode requires the LLM provider to support prompt cache snapshots. Phase 1 fork = copy context (not true cache sharing).
+- [ ] **Inter-agent streaming**: Sub-agents can only communicate via mailbox messages (discrete), not streaming. Future: SSE-based streaming between agents.
+- [ ] **Cost optimization**: No sub-agent cost optimization yet (e.g., cheaper model for explorer).
+- [ ] **Agent governance**: No per-user agent quotas, no team-based agent pools.
+- [ ] **Swarm replay export**: jcode already has `export_swarm_video()` in the TUI — tie this into agent tree history.
+
+---
+
+## 12. Success Criteria Checklist
+
+- [ ] `AgentPath` type supports hierarchical addressing, parent/child traversal, depth checks
+- [ ] `AgentControl` enforces thread limits (depth, siblings, total)
+- [ ] Mailbox-based communication works: parent sends task, agent receives, agent sends result, parent receives
+- [ ] `agent` tool call spawns a sub-agent with correct role defaults
+- [ ] Sync mode: parent waits, gets result with turn count + cost
+- [ ] Async mode: parent continues immediately, result logged
+- [ ] SubagentStart/SubagentStop hooks fire correctly
+- [ ] `jcode agent list` shows all active agents with paths
+- [ ] `jcode agent kill /root/worker-1` terminates agent + children
+- [ ] `jcode agent tree` prints hierarchical tree view
+- [ ] `jcode team start` reads plan file, executes waves, reports results
+- [ ] `jcode team stop <id>` cancels all running agents in team
+- [ ] DAG pipeline executes steps in correct topological wave order
+- [ ] Cost aggregation: parent's cost includes all children's costs
+- [ ] `JCODE_DISABLE_AGENT_TREE=1` disables all multi-agent features
+- [ ] Existing single-agent behavior unchanged (regression tests pass)
+- [ ] 50 concurrent agents don't overwhelm the runtime

From 8feff0a2c072cdac62cc158aae76e8343b93d46b Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Thu, 4 Jun 2026 12:27:10 +0700
Subject: [PATCH 08/22] =?UTF-8?q?fix(pr-313):=20apply=20review-swarm=20fix?=
 =?UTF-8?q?es=20=E2=80=94=20feature=20gate,=20per-model=20timeout,=20field?=
 =?UTF-8?q?=20caps,=20serde=20strictness?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Gate agent_runner behind 'agent-runner' feature flag
- Add KNOWLEDGE_FILES_MAX_CHARS = 100_000 constant with truncation
- Add #[serde(deny_unknown_fields)] to AgentDefinition
- Per-model timeout in judge_with_three_models (join_all with individual timeouts)
- Fix integer truncation in meta_analyze_impl avg_duration
- Remove stray merge conflict marker in src/lib.rs
---
 Cargo.lock                                   | 128 ++++++++++++++++---
 crates/jcode-agent-runtime/src/definition.rs |  17 ++-
 evals/jbench/Cargo.toml                      |   4 +
 evals/jbench/src/agent_runner.rs             |  22 +++-
 evals/jbench/src/bin/jbench.rs               |  55 +++++---
 evals/jbench/src/judge.rs                    |  40 +++---
 evals/jbench/src/lib.rs                      |   2 +
 src/lib.rs                                   |   1 -
 src/prompt_placeholders.rs                   |  24 +++-
 9 files changed, 228 insertions(+), 65 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b7397afe3..19624196e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1469,7 +1469,7 @@ dependencies = [
  "bitflags 1.3.2",
  "core-foundation 0.9.4",
  "core-graphics-types",
- "foreign-types",
+ "foreign-types 0.5.0",
  "libc",
 ]
 
@@ -2809,6 +2809,15 @@ dependencies = [
  "ttf-parser 0.25.1",
 ]
 
+[[package]]
+name = "foreign-types"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+dependencies = [
+ "foreign-types-shared 0.1.1",
+]
+
 [[package]]
 name = "foreign-types"
 version = "0.5.0"
@@ -2816,7 +2825,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965"
 dependencies = [
  "foreign-types-macros",
- "foreign-types-shared",
+ "foreign-types-shared 0.3.1",
 ]
 
 [[package]]
@@ -2830,6 +2839,12 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "foreign-types-shared"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+
 [[package]]
 name = "foreign-types-shared"
 version = "0.3.1"
@@ -4332,6 +4347,22 @@ dependencies = [
  "webpki-roots",
 ]
 
+[[package]]
+name = "hyper-tls"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
+dependencies = [
+ "bytes",
+ "http-body-util",
+ "hyper 1.8.1",
+ "hyper-util",
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tower-service",
+]
+
 [[package]]
 name = "hyper-util"
 version = "0.1.19"
@@ -5218,17 +5249,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "jcode-logging"
-version = "0.1.0"
-dependencies = [
- "chrono",
- "jcode-core",
- "jcode-storage",
- "serde_json",
- "tokio",
-]
-
 [[package]]
 name = "jcode-jbench"
 version = "0.1.0"
@@ -5237,13 +5257,24 @@ dependencies = [
  "clap",
  "futures",
  "jcode-agent-runtime",
- "reqwest",
+ "reqwest 0.12.28",
  "serde",
  "serde_json",
  "tempfile",
  "tokio",
 ]
 
+[[package]]
+name = "jcode-logging"
+version = "0.1.0"
+dependencies = [
+ "chrono",
+ "jcode-core",
+ "jcode-storage",
+ "serde_json",
+ "tokio",
+]
+
 [[package]]
 name = "jcode-memory-types"
 version = "0.1.0"
@@ -6340,7 +6371,7 @@ dependencies = [
  "bitflags 2.11.1",
  "block",
  "core-graphics-types",
- "foreign-types",
+ "foreign-types 0.5.0",
  "log",
  "objc",
  "paste",
@@ -6444,6 +6475,23 @@ dependencies = [
  "unicode-xid",
 ]
 
+[[package]]
+name = "native-tls"
+version = "0.2.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2"
+dependencies = [
+ "libc",
+ "log",
+ "openssl",
+ "openssl-probe 0.2.1",
+ "openssl-sys",
+ "schannel",
+ "security-framework 3.6.0",
+ "security-framework-sys",
+ "tempfile",
+]
+
 [[package]]
 name = "ndarray"
 version = "0.16.1"
@@ -6863,6 +6911,31 @@ dependencies = [
  "pathdiff",
 ]
 
+[[package]]
+name = "openssl"
+version = "0.10.80"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967"
+dependencies = [
+ "bitflags 2.11.1",
+ "cfg-if",
+ "foreign-types 0.3.2",
+ "libc",
+ "openssl-macros",
+ "openssl-sys",
+]
+
+[[package]]
+name = "openssl-macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "openssl-probe"
 version = "0.1.6"
@@ -6875,6 +6948,18 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
 
+[[package]]
+name = "openssl-sys"
+version = "0.9.116"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+ "vcpkg",
+]
+
 [[package]]
 name = "option-ext"
 version = "0.2.0"
@@ -7988,10 +8073,12 @@ dependencies = [
  "http-body-util",
  "hyper 1.8.1",
  "hyper-rustls 0.27.7",
+ "hyper-tls",
  "hyper-util",
  "js-sys",
  "log",
  "mime",
+ "native-tls",
  "percent-encoding",
  "pin-project-lite",
  "quinn",
@@ -8002,6 +8089,7 @@ dependencies = [
  "serde_urlencoded",
  "sync_wrapper",
  "tokio",
+ "tokio-native-tls",
  "tokio-rustls 0.26.4",
  "tokio-util",
  "tower",
@@ -9529,6 +9617,16 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-rustls"
 version = "0.24.1"
diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
index 4adeeabbd..6304a66ed 100644
--- a/crates/jcode-agent-runtime/src/definition.rs
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -46,6 +46,7 @@ pub const DEFAULT_AGENT_VERSION: &str = "0.1.0";
 /// Intentionally `Clone` so the runtime can hand each spawn its own copy
 /// without locking the registry. Definitions are small (a few KB at most).
 #[derive(Debug, Clone, Serialize, Deserialize)]
+#[serde(deny_unknown_fields)]
 pub struct AgentDefinition {
     // -----------------------------------------------------------------
     // Identity
@@ -572,20 +573,18 @@ mod tests {
 
     #[test]
     fn toml_unknown_field_is_rejected() {
-        // We DO NOT use `#[serde(deny_unknown_fields)]` because forward-compat
-        // matters when older binaries read newer TOML. But typo'd known fields
-        // are silently ignored — that's a UX hazard. Document the tradeoff
-        // here: if this becomes a problem, switch to deny_unknown_fields and
-        // version the schema explicitly.
-        //
-        // For now, this test just verifies unknown fields don't crash.
         let src = r#"
             id = "ok"
             display_name = "ok"
             unknown_future_field = "value"
         "#;
-        let d: AgentDefinition = toml::from_str(src).expect("parse");
-        d.validate().expect("validate");
+        let err = toml::from_str::<AgentDefinition>(src).unwrap_err();
+        assert!(
+            err.to_string().contains("unknown field")
+                || err.to_string().contains("unknown")
+                || err.to_string().contains("`unknown_future_field`"),
+            "expected denial of unknown field, got: {err}"
+        );
     }
 
     // -----------------------------------------------------------------
diff --git a/evals/jbench/Cargo.toml b/evals/jbench/Cargo.toml
index b9db6899a..6a360ffc8 100644
--- a/evals/jbench/Cargo.toml
+++ b/evals/jbench/Cargo.toml
@@ -22,6 +22,10 @@ futures = "0.3"
 reqwest = { version = "0.12", features = ["json"] }
 clap = { version = "4", features = ["derive", "env"] }
 
+[features]
+default = []
+agent-runner = []
+
 [dev-dependencies]
 serde_json = "1"
 tempfile = "3"
diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs
index 3763ee4c2..d9391cc20 100644
--- a/evals/jbench/src/agent_runner.rs
+++ b/evals/jbench/src/agent_runner.rs
@@ -112,12 +112,30 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
     let mut trace_lines = Vec::new();
     let reader = BufReader::new(stdout);
     let mut lines_stream = reader.lines();
-    loop {
+    let timed_out = loop {
         let line = timeout(timeout_duration, lines_stream.next_line()).await;
         match line {
             Ok(Ok(Some(l))) => trace_lines.push(l),
-            _ => break,
+            Ok(Ok(None)) => break false,     // EOF — clean exit
+            Ok(Err(_)) => break false,       // read error
+            Err(_) => break true,            // timeout
         }
+    };
+
+    if timed_out {
+        // Kill the child process so it doesn't become an orphan
+        let _ = child.kill().await;
+        // Consume the exit status after kill
+        let _ = child.wait().await;
+        return Ok(EvalRun {
+            commit_sha: String::new(),
+            prompt: config.prompt,
+            diff: extract_diff_from_repo(&config.repo_path).unwrap_or_default(),
+            judging: Default::default(),
+            cost_usd: 0.0,
+            duration_ms: start.elapsed().as_millis() as u64,
+            error: Some("Timed out waiting for jcode subprocess".to_owned()),
+        });
     }
 
     let status = child
diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs
index 2e54b50e7..160b84d26 100644
--- a/evals/jbench/src/bin/jbench.rs
+++ b/evals/jbench/src/bin/jbench.rs
@@ -7,8 +7,9 @@ use std::path::PathBuf;
 use anyhow::{Context, Result};
 use clap::{Parser, Subcommand};
 
+#[cfg(feature = "agent-runner")]
+use jcode_jbench::agent_runner::AgentRunConfig;
 use jcode_jbench::{
-    agent_runner::AgentRunConfig,
     judge::{JudgeConfig, judge_with_three_models},
     lessons::{LessonsConfig, append_lessons_to_file, extract_lessons},
     types::{AgentEvalResults, EvalDataV2, EvalRun},
@@ -119,15 +120,20 @@ async fn main() -> Result<()> {
             max_turns,
             timeout_secs,
         } => {
-            run_impl(
-                &eval_file,
-                &agent_id,
-                &output_dir,
-                jcode_binary.as_ref(),
-                max_turns,
-                timeout_secs,
-            )
-            .await?;
+            #[cfg(feature = "agent-runner")]
+            {
+                run_impl(
+                    &eval_file,
+                    &agent_id,
+                    &output_dir,
+                    jcode_binary.as_ref(),
+                    max_turns,
+                    timeout_secs,
+                )
+                .await?;
+            }
+            #[cfg(not(feature = "agent-runner"))]
+            anyhow::bail!("'jbench run' requires the 'agent-runner' feature. Enable with: cargo build --features agent-runner");
         }
         Command::Judge {
             runs_dir,
@@ -156,6 +162,7 @@ async fn gen_evals_impl(_input: &PathBuf, _output: &PathBuf) -> Result<()> {
     todo_step("Phase 5.2: read commit list, fetch each SHA, render EvalDataV2 JSON")
 }
 
+#[cfg(feature = "agent-runner")]
 async fn run_impl(
     eval_file: &PathBuf,
     agent_id: &str,
@@ -182,7 +189,7 @@ async fn run_impl(
         let config = AgentRunConfig {
             agent_id: agent_id.to_owned(),
             prompt: commit.prompt.clone(),
-            repo_path: output_dir.join(&commit.id), // per-commit working dir
+            repo_path: output_dir.join(&commit.id),
             max_turns,
             timeout_secs,
             env: eval_data.env.clone(),
@@ -190,15 +197,23 @@ async fn run_impl(
             ..Default::default()
         };
 
-        let result = tk_timeout(
+        let result = match tk_timeout(
             Duration::from_secs(timeout_secs),
             jcode_jbench::agent_runner::run_agent_in_repo(config),
         )
         .await
-        .into_iter()
-        .next()
-        .unwrap_or_else(|| {
-            Ok(jcode_jbench::types::EvalRun {
+        {
+            Ok(Ok(run)) => run,
+            Ok(Err(err)) => EvalRun {
+                commit_sha: commit.sha.clone(),
+                prompt: commit.prompt.clone(),
+                diff: String::new(),
+                judging: Default::default(),
+                cost_usd: 0.0,
+                duration_ms: 0,
+                error: Some(format!("Agent error: {err:#}")),
+            },
+            Err(_elapsed) => EvalRun {
                 commit_sha: commit.sha.clone(),
                 prompt: commit.prompt.clone(),
                 diff: String::new(),
@@ -206,8 +221,8 @@ async fn run_impl(
                 cost_usd: 0.0,
                 duration_ms: 0,
                 error: Some("Timed out waiting for run_agent_in_repo".to_owned()),
-            })
-        })?;
+            },
+        };
 
         let run_file = output_dir.join(format!("{}.run.json", commit.id));
         let json = serde_json::to_string_pretty(&result).context("failed to serialize EvalRun")?;
@@ -262,7 +277,9 @@ async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Resu
         .sum::<f64>()
         / all_runs.len() as f64;
     let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::<f64>() / all_runs.len() as f64;
-    let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::<u64>() / all_runs.len() as u64;
+    let avg_duration = (all_runs.iter().map(|r| r.duration_ms as f64).sum::<f64>()
+        / all_runs.len() as f64)
+        .round() as u64;
 
     let summary = AgentEvalResults {
         agent_id: "unknown".to_owned(),
diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs
index 8f9437f47..0d4d36f72 100644
--- a/evals/jbench/src/judge.rs
+++ b/evals/jbench/src/judge.rs
@@ -375,29 +375,37 @@ pub async fn judge_with_three_models(
 
     let timeout_duration = Duration::from_secs(config.timeout_secs.unwrap_or(JUDGE_TIMEOUT_SECS));
 
+    // Each judge gets its own timeout so a slow model doesn't starve the others.
     let judge_futures: Vec<_> = config
         .models
         .iter()
         .map(|model| {
-            run_single_judge(
-                model,
-                &prompt,
-                &config.api_base,
-                &config.api_key,
-                config.anthropic_api_base.as_deref(),
-                config.anthropic_api_key.as_deref(),
-                http,
-            )
+            let http = http.clone();
+            let prompt = prompt.clone();
+            async move {
+                timeout(
+                    timeout_duration,
+                    run_single_judge(
+                        model,
+                        &prompt,
+                        &config.api_base,
+                        &config.api_key,
+                        config.anthropic_api_base.as_deref(),
+                        config.anthropic_api_key.as_deref(),
+                        &http,
+                    ),
+                )
+                .await
+                .ok()
+                .and_then(|r| r.ok())
+            }
         })
         .collect();
 
-    // Run all three judges in parallel with an overall timeout
-    let valid: Vec<Scorecard> = timeout(timeout_duration, futures::future::join_all(judge_futures))
+    let valid: Vec<Scorecard> = futures::future::join_all(judge_futures)
         .await
-        .ok()
-        .into_iter() // IntoIterator<Item = Vec<Result<Scorecard>>>
-        .flatten() // Iterator<Item = Result<Scorecard>>
-        .filter_map(|r| r.ok())
+        .into_iter()
+        .filter_map(|r| r)
         .collect();
 
     if valid.len() < MIN_JUDGE_SUCCESS_COUNT {
@@ -417,7 +425,7 @@ pub async fn judge_with_three_models(
 
     // Median analysis — sort by overall_score and pick the middle
     let mut sorted = valid.clone();
-    sorted.sort_by(|a, b| a.overall_score.partial_cmp(&b.overall_score).unwrap());
+    sorted.sort_by(|a, b| a.overall_score.partial_cmp(&b.overall_score).unwrap_or(std::cmp::Ordering::Equal));
     let median_idx = sorted.len() / 2;
     let median = &sorted[median_idx];
 
diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs
index 36363dc5b..48860cdcb 100644
--- a/evals/jbench/src/lib.rs
+++ b/evals/jbench/src/lib.rs
@@ -13,11 +13,13 @@
 
 #![forbid(unsafe_code)]
 
+#[cfg(feature = "agent-runner")]
 pub mod agent_runner;
 pub mod judge;
 pub mod lessons;
 pub mod types;
 
+#[cfg(feature = "agent-runner")]
 pub use agent_runner::AgentRunConfig;
 pub use judge::JudgeConfig;
 pub use lessons::LessonsConfig;
diff --git a/src/lib.rs b/src/lib.rs
index e3039b5c8..dad287a05 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -31,7 +31,6 @@ pub mod model_failover;
 pub mod model_routing;
 pub mod orchestration_api;
 pub mod prefix_cache_stable;
-<<<<<<< HEAD
 pub mod process_memory;
 pub mod process_title;
 pub mod prompt;
diff --git a/src/prompt_placeholders.rs b/src/prompt_placeholders.rs
index 635beb8cc..386201dae 100644
--- a/src/prompt_placeholders.rs
+++ b/src/prompt_placeholders.rs
@@ -11,7 +11,7 @@
 //!
 //! - `{{FILE_TREE_SMALL}}`   — truncated project tree, max 2500 chars.
 //! - `{{FILE_TREE}}`         — fuller project tree, max 10000 chars.
-//! - `{{KNOWLEDGE_FILES}}`   — concatenated knowledge / context files (no limit).
+//! - `{{KNOWLEDGE_FILES}}`   — concatenated knowledge / context files, max 100000 chars.
 //! - `{{GIT_CHANGES}}`       — `git diff` / status summary, max 30000 chars.
 //! - `{{CURRENT_DATE}}`      — ISO `YYYY-MM-DD` date string.
 //! - `{{REMAINING_STEPS}}`   — remaining-step counter (u32, decimal).
@@ -32,6 +32,9 @@ pub const FILE_TREE_MAX_CHARS: usize = 10_000;
 /// Maximum char count retained for [`PlaceholderContext::git_changes`].
 pub const GIT_CHANGES_MAX_CHARS: usize = 30_000;
 
+/// Maximum char count retained for [`PlaceholderContext::knowledge_files`].
+pub const KNOWLEDGE_FILES_MAX_CHARS: usize = 100_000;
+
 /// Container for values that can be substituted into prompt templates.
 ///
 /// All `String` fields default to empty and `remaining_steps` defaults to 0.
@@ -45,7 +48,8 @@ pub struct PlaceholderContext {
     /// Fuller project file tree. Truncated to [`FILE_TREE_MAX_CHARS`] chars
     /// during substitution.
     pub file_tree: String,
-    /// Concatenated knowledge/context files. No length limit is applied.
+    /// Concatenated knowledge/context files. Truncated to [`KNOWLEDGE_FILES_MAX_CHARS`]
+    /// chars during substitution.
     pub knowledge_files: String,
     /// Git diff / status summary. Truncated to [`GIT_CHANGES_MAX_CHARS`]
     /// chars during substitution.
@@ -89,6 +93,7 @@ pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) -
 
     let file_tree_small = truncate_chars(&ctx.file_tree_small, FILE_TREE_SMALL_MAX_CHARS);
     let file_tree = truncate_chars(&ctx.file_tree, FILE_TREE_MAX_CHARS);
+    let knowledge_files = truncate_chars(&ctx.knowledge_files, KNOWLEDGE_FILES_MAX_CHARS);
     let git_changes = truncate_chars(&ctx.git_changes, GIT_CHANGES_MAX_CHARS);
     let remaining_steps = if ctx.remaining_steps == 0 {
         String::new()
@@ -101,7 +106,7 @@ pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) -
     let replacements: [(&str, &str); 7] = [
         ("{{FILE_TREE_SMALL}}", file_tree_small.as_str()),
         ("{{FILE_TREE}}", file_tree.as_str()),
-        ("{{KNOWLEDGE_FILES}}", ctx.knowledge_files.as_str()),
+        ("{{KNOWLEDGE_FILES}}", knowledge_files.as_str()),
         ("{{GIT_CHANGES}}", git_changes.as_str()),
         ("{{CURRENT_DATE}}", ctx.current_date.as_str()),
         ("{{REMAINING_STEPS}}", remaining_steps.as_str()),
@@ -199,4 +204,17 @@ mod tests {
         assert!(out.starts_with('['));
         assert!(out.ends_with(']'));
     }
+
+    #[test]
+    fn knowledge_files_truncated_when_exceeds_cap() {
+        // Input exceeds the cap by 5000 chars; the expected length is the cap plus the two literal brackets.
+        let ctx = PlaceholderContext {
+            knowledge_files: "k".repeat(KNOWLEDGE_FILES_MAX_CHARS + 5000),
+            ..Default::default()
+        };
+        let out = substitute_context_placeholders("[{{KNOWLEDGE_FILES}}]", &ctx);
+        assert_eq!(out.chars().count(), KNOWLEDGE_FILES_MAX_CHARS + 2);
+        assert!(out.starts_with('['));
+        assert!(out.ends_with(']'));
+    }
 }

From 60a61f0b7d6e6b9ca7c9b3398f5f16740b250f7e Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 00:20:12 +0700
Subject: [PATCH 09/22] fix(merge): reconcile src/lib.rs with master layout

- Revert src/lib.rs to master (remove stale 36-module list)
- Move prompt_placeholders.rs from src/ into crates/jcode-app-core/src/
- Add pub mod prompt_placeholders to jcode-app-core/src/lib.rs
- Resolve Cargo.lock merge conflict (hyper/hyper-rustls versions)

Build verified: cargo check --bin jcode passes.
Tests: jcode-agent-runtime 55 pass, jcode-jbench 3 pass.
---
 Cargo.lock                                    | 123 +++++++++++++++++-
 crates/jcode-app-core/src/lib.rs              |   1 +
 .../src}/prompt_placeholders.rs               |   0
 src/lib.rs                                    |  21 ---
 4 files changed, 121 insertions(+), 24 deletions(-)
 rename {src => crates/jcode-app-core/src}/prompt_placeholders.rs (100%)

diff --git a/Cargo.lock b/Cargo.lock
index be5c1ef76..2a9a22c9c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1468,7 +1468,7 @@ dependencies = [
  "bitflags 1.3.2",
  "core-foundation 0.9.4",
  "core-graphics-types",
- "foreign-types",
+ "foreign-types 0.5.0",
  "libc",
 ]
 
@@ -2842,6 +2842,15 @@ dependencies = [
  "ttf-parser 0.25.1",
 ]
 
+[[package]]
+name = "foreign-types"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
+dependencies = [
+ "foreign-types-shared 0.1.1",
+]
+
 [[package]]
 name = "foreign-types"
 version = "0.5.0"
@@ -2849,7 +2858,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965"
 dependencies = [
  "foreign-types-macros",
- "foreign-types-shared",
+ "foreign-types-shared 0.3.1",
 ]
 
 [[package]]
@@ -2863,6 +2872,12 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "foreign-types-shared"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
+
 [[package]]
 name = "foreign-types-shared"
 version = "0.3.1"
@@ -4369,6 +4384,22 @@ dependencies = [
  "webpki-roots",
 ]
 
+[[package]]
+name = "hyper-tls"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
+dependencies = [
+ "bytes",
+ "http-body-util",
+ "hyper 1.10.1",
+ "hyper-util",
+ "native-tls",
+ "tokio",
+ "tokio-native-tls",
+ "tower-service",
+]
+
 [[package]]
 name = "hyper-util"
 version = "0.1.20"
@@ -4903,8 +4934,12 @@ dependencies = [
 name = "jcode-agent-runtime"
 version = "0.1.0"
 dependencies = [
+ "anyhow",
+ "serde",
+ "serde_json",
  "thiserror 1.0.69",
  "tokio",
+ "toml",
 ]
 
 [[package]]
@@ -5250,6 +5285,21 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "jcode-jbench"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "futures",
+ "jcode-agent-runtime",
+ "reqwest 0.12.28",
+ "serde",
+ "serde_json",
+ "tempfile",
+ "tokio",
+]
+
 [[package]]
 name = "jcode-logging"
 version = "0.1.0"
@@ -6360,7 +6410,7 @@ dependencies = [
  "bitflags 2.11.1",
  "block",
  "core-graphics-types",
- "foreign-types",
+ "foreign-types 0.5.0",
  "log",
  "objc",
  "paste",
@@ -6464,6 +6514,23 @@ dependencies = [
  "unicode-xid",
 ]
 
+[[package]]
+name = "native-tls"
+version = "0.2.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2"
+dependencies = [
+ "libc",
+ "log",
+ "openssl",
+ "openssl-probe 0.2.1",
+ "openssl-sys",
+ "schannel",
+ "security-framework 3.7.0",
+ "security-framework-sys",
+ "tempfile",
+]
+
 [[package]]
 name = "ndarray"
 version = "0.16.1"
@@ -6874,6 +6941,31 @@ dependencies = [
  "pathdiff",
 ]
 
+[[package]]
+name = "openssl"
+version = "0.10.80"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967"
+dependencies = [
+ "bitflags 2.11.1",
+ "cfg-if",
+ "foreign-types 0.3.2",
+ "libc",
+ "openssl-macros",
+ "openssl-sys",
+]
+
+[[package]]
+name = "openssl-macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.117",
+]
+
 [[package]]
 name = "openssl-probe"
 version = "0.1.6"
@@ -6886,6 +6978,18 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
 
+[[package]]
+name = "openssl-sys"
+version = "0.9.116"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4"
+dependencies = [
+ "cc",
+ "libc",
+ "pkg-config",
+ "vcpkg",
+]
+
 [[package]]
 name = "option-ext"
 version = "0.2.0"
@@ -8026,10 +8130,12 @@ dependencies = [
  "http-body-util",
  "hyper 1.10.1",
  "hyper-rustls 0.27.9",
+ "hyper-tls",
  "hyper-util",
  "js-sys",
  "log",
  "mime",
+ "native-tls",
  "percent-encoding",
  "pin-project-lite",
  "quinn",
@@ -8040,6 +8146,7 @@ dependencies = [
  "serde_urlencoded",
  "sync_wrapper",
  "tokio",
+ "tokio-native-tls",
  "tokio-rustls 0.26.4",
  "tokio-util",
  "tower",
@@ -9560,6 +9667,16 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "tokio-native-tls"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
+dependencies = [
+ "native-tls",
+ "tokio",
+]
+
 [[package]]
 name = "tokio-rustls"
 version = "0.24.1"
diff --git a/crates/jcode-app-core/src/lib.rs b/crates/jcode-app-core/src/lib.rs
index b4cb41d24..1e23d83ee 100644
--- a/crates/jcode-app-core/src/lib.rs
+++ b/crates/jcode-app-core/src/lib.rs
@@ -40,6 +40,7 @@ pub mod notifications;
 pub mod overnight;
 pub mod perf;
 pub mod prompt_templates;
+pub mod prompt_placeholders;
 pub mod replay;
 pub mod restart_snapshot;
 pub mod sandbox;
diff --git a/src/prompt_placeholders.rs b/crates/jcode-app-core/src/prompt_placeholders.rs
similarity index 100%
rename from src/prompt_placeholders.rs
rename to crates/jcode-app-core/src/prompt_placeholders.rs
diff --git a/src/lib.rs b/src/lib.rs
index dad287a05..101cfdade 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -31,27 +31,6 @@ pub mod model_failover;
 pub mod model_routing;
 pub mod orchestration_api;
 pub mod prefix_cache_stable;
-pub mod process_memory;
-pub mod process_title;
-pub mod prompt;
-pub mod prompt_placeholders;
-pub mod prompt_templates;
-pub mod protocol;
-pub mod provider;
-pub mod provider_catalog;
-pub mod registry;
-pub mod replay;
-pub mod restart_snapshot;
-pub mod runtime_memory_log;
-pub mod safety;
-pub mod sandbox;
-pub mod scoped_models;
-pub mod server;
-pub mod session;
-pub mod setup_hints;
-pub mod side_panel;
-pub mod sidecar;
-pub mod skill;
 pub mod skill_disable;
 pub mod skill_distillation;
 pub mod theme;

From d294249891d7f140576da0df625af94a896bf4e1 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 07:04:30 +0700
Subject: [PATCH 10/22] =?UTF-8?q?docs(review):=20comprehensive=20PR=20#313?=
 =?UTF-8?q?=20review=20=E2=80=94=20jcode=20vs=209=20repos=20comparison=20t?=
 =?UTF-8?q?ables=20+=20roadmap?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

feature-planning skill analysis across codebuff, codex, claude-code,
opencode, oh-my-pi, oh-my-openagent, oh-my-claudecode, pi-agent-rust,
oh-my-codex.

Includes:
- 9 per-dimension comparison tables (schema, registry, routing, lifecycle,
  permission, tool, eval, prompt, session)
- Top 5 gaps ranked by ROI
- Wire-up plan for SafetySystem + AgentDefinition.permissionMode
- Phase roadmap (Phase 1 → Phase 5)
- 5 actionable issues with severity and fix suggestions
---
 .omo/plans/pr-313-review.md | 235 ++++++++++++++++++++++++++++++++++++
 1 file changed, 235 insertions(+)
 create mode 100644 .omo/plans/pr-313-review.md

diff --git a/.omo/plans/pr-313-review.md b/.omo/plans/pr-313-review.md
new file mode 100644
index 000000000..725144ef0
--- /dev/null
+++ b/.omo/plans/pr-313-review.md
@@ -0,0 +1,235 @@
+# PR #313 Review: jcode Multi-Agent Foundation vs 9 Reference Repos
+
+> **Date**: 2026-06-05
+> **Reviewer**: Claude Opus 4.8 (feature-planning skill)
+> **PR**: #313 — `experimental/multi-agent-foundation` → `master`
+> **Scope**: +5775 / -94 lines, 28 files, 7 commits
+
+---
+
+## 1. Per-Dimension Comparison Tables
+
+### 1A. Agent Definition Schema
+
+| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Format** | TOML | TS imperative + `handleSteps` | N/A (TUI) | Markdown + YAML frontmatter | Markdown + YAML | TS imperative | Markdown + YAML | Markdown + YAML | Rust runtime | N/A |
+| **Schema validation** | `serde(deny_unknown_fields)` | Zod runtime | TS types | Zod (lazy) | Effect `Schema.Class` | TS types | YAML parse | YAML parse | serde derive | N/A |
+| **`model` field** | optional (`model_override` + `prefer_tier`) | **required** | N/A | optional (`inherit`) | optional | **required** | optional | optional | N/A | env var stack |
+| **`reasoning`/`effort`** | `ReasoningEffort` enum (4 levels) | `reasoningOptions.effort` (5 levels) + `max_tokens` | N/A | `effort` enum + integer | `variant` per-model | `Effort` enum | `ModelV2.VariantID` | N/A | N/A | N/A |
+| **`outputMode`** | `last_message`/`all_messages`/`structured_output` | identical | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| **`tool_names`** | whitelist (deny-by-default) | whitelist + MCP servers | built-in list | `tools` + `disallowedTools` | optional from registry | `loadMode` + `tier` | tool registry | tool allowlist | optional | N/A |
+| **`spawnable_agents`** | whitelist | `publisher/agent@version` | N/A | N/A (model drives) | N/A | N/A | N/A | N/A | N/A | N/A |
+| **`inherit_parent_system_prompt`** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| **`include_message_history`** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| **`handleSteps`** | N/A (Phase 2) | ✅ Generator | N/A | N/A | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A |
+| **`permissionMode`** | N/A | N/A | N/A | ✅ per-agent | ✅ per-agent | `ToolTier` per-tool | N/A | N/A | N/A | N/A |
+| **`maxTurns`** | N/A | N/A | N/A | ✅ per-agent | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A |
+| **`isolation`** | N/A | N/A | N/A | `worktree`/`remote` | N/A | N/A | N/A | `worktree` (git) | N/A | N/A |
+| **`mcpServers`** | N/A | ✅ per-agent | N/A | ✅ per-agent | N/A | N/A | N/A | ✅ MCP server | N/A | N/A |
+| **`hooks`** | N/A | N/A | N/A | ✅ per-agent | N/A | N/A | N/A | N/A | N/A | N/A |
+| **`memory` scope** | N/A | N/A | N/A | `user`/`project`/`local` | N/A | N/A | N/A | N/A | N/A | N/A |
+
+---
+
+### 1B. Agent Registry / Discovery
+
+| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Discovery paths** | 3-tier: project > user > builtin | `.agents/` local | N/A | `.claude/agents/*.md` + settings | `.opencode/agents/*.md` + `modes/` | N/A | N/A | N/A | N/A | N/A |
+| **Priority order** | project > user > builtin | built-in first | N/A | built-in first | primary source glob | N/A | N/A | N/A | N/A | N/A |
+| **Filename == id check** | ✅ enforced | ❌ | N/A | ❌ | ❌ | N/A | N/A | N/A | N/A | N/A |
+| **Non-fatal errors** | ✅ collected for `doctor` | throws | N/A | log + skip | throws | N/A | N/A | N/A | N/A | N/A |
+| **On-disk format** | TOML | TS | N/A | Markdown | Markdown | N/A | N/A | N/A | N/A | N/A |
+| **Reload at runtime** | not yet | no | N/A | cache + plugin invalidation | `update` API | N/A | N/A | N/A | N/A | N/A |
+
+---
+
+### 1C. Model Routing / Tier
+
+| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Approach** | env-var slots + session inherit | OpenRouter catalog | `JCODE_ROUTING_*` env vars | `inherit` | `ModelV2.parse` | dynamic `ModelV2` | `ModelResolutionPipeline` (5 stages) | via Claude session | direct | env var stack |
+| **Slot/tier concept** | `Routine`/`Thinking` | no (literal model id) | `ROUTINE`+`THINKING`+`THRESHOLD` | no | variant per-provider | model string | catalog aliases | no | no | default + fallback |
+| **Fallback chain** | 3-level: override > env > session | OpenRouter routing | N/A | N/A | provider fallback | `resolveModelWithFallback` | 5-stage pipeline | N/A | per-provider | 2-tier fallback |
+| **Predefined catalog** | **no** (intentional) | yes (100+ models) | no | no | yes (`models-dev.ts`) | no | yes (60+ models) | no | no | no |
+| **Provider abstraction** | no (single OAuth) | OpenRouter | multi-provider | Anthropic | multi-provider | 40+ providers | multi-provider | Anthropic | 15+ providers | Codex only |
+
+---
+
+### 1D. Agent Lifecycle / Spawn
+
+| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Agent tree** | N/A | N/A | ✅ `AgentPath` + `ThreadSpawnEdgeStatus` | `team_name` (1:1 TaskList) | `mode: subagent/primary/all` | runtime | `boulder-state` (worktrees) | `team jobs` | session tree | N/A |
+| **Spawn tool** | N/A (schema only) | `spawn_agents` | `SpawnAgent`/`WaitAgent`/`CloseAgent`/`SendMessage`/`AssignAgentTask` | `Agent` tool + `TeamCreate` | delegation via tools | N/A | `delegate_task` | `omc_team_start` CLI | N/A | N/A |
+| **Message bus** | N/A | output return | `InterAgentCommunication` + delivery edges | `SendMessage` tool | N/A | N/A | `shared-state.ts` | `omc-team-state.ts` | N/A | N/A |
+| **Parallel execution** | N/A | `Promise.all` | DAG traversal | concurrent teammates | concurrent | DAG wave | sequential | sequential | N/A | N/A |
+| **Worktree isolation** | N/A | N/A | N/A | ✅ `isolation: worktree/remote` | N/A | N/A | N/A | ✅ git worktree cleanup | N/A | N/A |
+| **`maxTurns`** | N/A | N/A | N/A | ✅ per-agent | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A |
+| **Job persistence** | N/A | N/A | ✅ SQLite `agent_jobs` | team config JSON | N/A | N/A | `boulder-state` file | `OMC_JOBS_DIR` artifacts | session JSONL | N/A |
+
+---
+
+### 1E. Permission / Safety
+
+| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Permission system** | **existing** `SafetySystem` + `ActionTier` | none | sandbox | `PermissionMode` per-agent (default/auto/ask/deny) | `PermissionV2.Ruleset` (allow/deny/ask) per-agent | `ToolTier` (read/write/exec) + approval modes | MCP allowlist | plugin/team scopes | none | `OMX_*` env controls |
+| **Per-agent policy** | **gap** — tool whitelist only | tool whitelist | N/A | ✅ `permissionMode` field | ✅ `permissions` array | ✅ `tier` on each tool | N/A | N/A | N/A | N/A |
+| **Classification levels** | 2 (auto/permission) | N/A | N/A | 4 (default/auto/ask/deny) | 3 (allow/deny/ask) | 3 (read/write/exec) | N/A | N/A | N/A | N/A |
+| **Auto-approve for sub-agents** | **not wired** | via `handleSteps` | N/A | via `permissionMode` | N/A | tool-tier-based | N/A | N/A | N/A | N/A |
+| **TUI permission flow** | ✅ `PermissionsApp` (existing) | none | none | none (CLI only) | N/A | N/A | N/A | N/A | N/A | N/A |
+| **`disallowedTools`** | N/A | N/A | N/A | ✅ | N/A | `hidden` field | N/A | N/A | N/A | N/A |
+
+---
+
+### 1F. Tool Execution
+
+| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Tool registry** | whitelist strings in TOML | typed `ToolName` union | hard-coded | `getTools()` config | `ToolsProvider` | `AgentTool<TParams>` interface | tool discovery | MCP servers | typed `Tool` trait | sparkshell bridge |
+| **Concurrency control** | N/A | N/A | N/A | N/A | N/A | ✅ `shared`/`exclusive` | N/A | N/A | N/A | N/A |
+| **`loadMode`** | N/A | N/A | N/A | N/A | N/A | ✅ `essential`/`discoverable` | N/A | N/A | N/A | N/A |
+| **`deferrable`** | N/A | ✅ | N/A | N/A | N/A | ✅ | N/A | N/A | N/A | N/A |
+| **`nonAbortable`** | N/A | N/A | N/A | N/A | N/A | ✅ | N/A | N/A | N/A | N/A |
+| **Validation** | runtime (registry) | Zod args | sandbox | Zod | Effect Schema | Zod (`zodToWireSchema`) | Zod | Zod | typed Rust | typed Rust |
+| **`beforeToolCall` hook** | N/A | N/A | N/A | N/A | N/A | ✅ (block/transform) | N/A | N/A | N/A | N/A |
+| **`afterToolCall` hook** | N/A | N/A | N/A | N/A | N/A | ✅ (override) | N/A | N/A | N/A | N/A |
+| **Structured output** | ✅ `OutputMode::StructuredOutput` | ✅ `set_output` + `outputSchema` | N/A | N/A | N/A | `set_output` | N/A | N/A | N/A | N/A |
+
+---
+
+### 1G. Eval / Benchmark
+
+| Aspect | **jcode PR #313** | codebuff (BuffBench) | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|---------------------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Approach** | git-commit reconstruction (scaffold) | git-commit reconstruction (production) | e2e + bench scripts | N/A | N/A | LSP+DAP benchmarks | smoke tests | integration tests | N/A | sparkshell benchmark |
+| **Multi-judge** | ✅ 3 judges + per-model timeout | 2 judges (20 min shared) | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| **Median scoring** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| **Lessons extractor** | ✅ scaffold | ✅ production | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| **`meta-analyze`** | ✅ implemented | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| **Feature flag** | ✅ `agent-runner` gate | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+
+---
+
+### 1H. Prompt Utilities
+
+| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Placeholder substitution** | ✅ `prompt_placeholders.rs` (pure utility) | `PLACEHOLDER` constants | N/A | prompt templates | mode prompts | atlas prompts | `prompts-core` package | `atlas-prompts.ts` | N/A | `build_summary_prompt()` |
+| **Supported tokens** | 7 tokens (4 of them length-capped) | `PLACEHOLDER` enum | N/A | env vars + dynamic | template engine | context-based | variant resolver | markdown | N/A | shell output |
+| **Length caps** | ✅ 2500/10k/30k/100k chars | `FILE_TREE_PROMPT` only | N/A | N/A | N/A | provider-specific | model caps | N/A | N/A | N/A |
+| **System reminder wrap** | ✅ `wrap_as_system_reminder()` | `<system_reminder>` tags | N/A | injection | N/A | N/A | prompt-injection.ts | prompt-injection.ts | N/A | N/A |
+| **Frontmatter parse** | N/A (TOML) | N/A | N/A | ✅ `parseAgentToolsFromFrontmatter` | ✅ `ConfigMarkdown.parseOption` | N/A | `shared/frontmatter.ts` | N/A | N/A | N/A |
+
+---
+
+### 1I. Session / Persistence
+
+| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex |
+|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------|
+| **Session format** | N/A (existing) | in-memory | SQLite + JSONL | config JSON | SQLite (Effect) | runtime state | `boulder-state` file | `OMC_JOBS_DIR` JSON | **JSONL + SHA-256 chain** | N/A |
+| **Branching/history** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ tree structure | N/A |
+| **Indexed search** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ `SessionIndex` | N/A |
+| **Chain integrity** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ SHA-256 per-entry | N/A |
+
+---
+
+## 2. Top 5 Gaps (ROI-ranked)
+
+| Rank | Gap | Effort | Impact | Source repos | Concrete action |
+|------|-----|--------|--------|--------------|-----------------|
+| **1** | `permissionMode` per-agent — wire `SafetySystem` into `AgentDefinition` | 2-3 days | 🔴 Critical (security) | claude-code (`PermissionMode`), opencode (`allow/deny/ask` per action+resource) | Add `permission_mode: Option<PermissionMode>` to `AgentDefinition`; during tool execution, call `SafetySystem.classify()` then check agent's override; default = inherit from parent |
+| **2** | `Agent` tool — model-driven spawn | 1-2 weeks | 🔴 Critical (core feature) | codex (`SpawnAgent`/`WaitAgent`), claude-code (`AgentTool` + `TeamCreateTool`), codebuff (`spawn_agents`) | Phase 2: add `agent` tool that LLM calls; wire `spawnable_agents` whitelist; implement `AgentPath` tree from codex |
+| **3** | `maxTurns` per-agent | 1 day | 🟡 Important (runaway prevention) | claude-code, opencode | Add `max_turns: Option<u32>` to `AgentDefinition`; runtime checks after each turn |
+| **4** | `handleSteps` — programmatic agents | 1 week | 🟡 Important (flexibility) | codebuff (`handleSteps` Generator), oh-my-pi (`beforeToolCall`/`afterToolCall`) | Phase 2: add optional `handle_steps` field with Rust async generator or callback approach |
+| **5** | Tool concurrency (`shared`/`exclusive`) | 2-3 days | 🟢 Nice-to-have (perf) | oh-my-pi (`AgentTool.concurrency`) | Add `concurrency` field to tool definition; runtime scheduler respects exclusive locks |
+
+---
+
+## 3. Wire-up Plan: SafetySystem + AgentDefinition.permissionMode
+
+### Current state
+- `SafetySystem` (crates/jcode-base/src/safety.rs): `ActionTier` = `AutoAllowed | RequiresPermission`
+- `AgentDefinition` (crates/jcode-agent-runtime/src/definition.rs): `tool_names` whitelist only
+- `PermissionsApp` (crates/jcode-tui/src/tui/permissions.rs): TUI approval flow exists
+
+### Proposed addition
+
+```rust
+// crates/jcode-agent-runtime/src/definition.rs
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum PermissionMode {
+    /// Inherit approval from parent agent (default for sub-agents).
+    Inherit,
+    /// Auto-approve all tool calls for this agent.
+    AutoApprove,
+    /// Always ask user for permission.
+    Ask,
+    /// Deny all tool calls (read-only agent).
+    Deny,
+}
+
+impl Default for PermissionMode {
+    fn default() -> Self { PermissionMode::Inherit }
+}
+
+// Add to AgentDefinition:
+// pub permission_mode: Option<PermissionMode>,
+```
+
+### Resolution algorithm (runtime)
+
+```
+fn resolve_permission(action, tool_name, agent_def, parent_approval):
+    mode = agent_def.permission_mode.unwrap_or(Inherit)
+    match mode:
+        Deny → block
+        AutoApprove → approve
+        Ask → prompt user via PermissionsApp
+        Inherit → use parent_approval (or session-level classify)
+```
+
+### Migration path
+- Default `None` = `Inherit` = existing behavior unchanged
+- TOML agents opt-in: `permission_mode = "auto_approve"` for leaf agents
+- Phase 2: auto-wire `bash` tool in `basher.toml` with `permission_mode = "auto_approve"`
+
+---
+
+## 4. Roadmap: Phases After PR #313
+
+| Phase | Scope | Dependencies | Estimated |
+|-------|-------|--------------|-----------|
+| **Phase 1** (this PR) | AgentDefinition + tier + registry + JBench scaffold | — | ✅ Done |
+| **Phase 1.5** | `permissionMode` wire-up (SafetySystem + AgentDefinition) | Phase 1 | 2-3 days |
+| **Phase 2** | Agent runtime engine: spawn, parent-child tree, `Agent` tool, `AgentPath` | Phase 1 | 2-3 weeks |
+| **Phase 2.5** | `handleSteps` (programmatic agents), tool concurrency | Phase 2 | 1-2 weeks |
+| **Phase 3** | Team pipeline (claude-code-style `TeamCreateTool`) | Phase 2 | 1 week |
+| **Phase 4** | JBench production (full `pick-commits` → `gen-evals` → `run` → `judge` → `lessons` pipeline) | Phase 2 | 1-2 weeks |
+| **Phase 5** | Multi-provider support (extend tier to per-provider catalogs) | Phase 2 | 1 week |
+
+---
+
+## 5. PR #313 Strengths
+
+1. **Best-in-class agent discovery** — 3-tier priority, filename==id enforcement, non-fatal error collection
+2. **Correct model routing philosophy** — slots not catalog, matches single-OAuth reality
+3. **JBench exceeds BuffBench** — 3 judges with per-model timeout (vs BuffBench's shared 20-min timeout)
+4. **Rust-idiomatic crate structure** — feature gates, clean separation, `serde(deny_unknown_fields)`
+5. **Comprehensive documentation** — every module has a doc comment explaining WHY, not just WHAT
+
+---
+
+## 6. PR #313 Actionable Issues
+
+| # | Issue | Severity | File | Fix |
+|---|-------|----------|------|-----|
+| 1 | `extract_diff_from_repo` uses sync `std::process::Command` in async fn | Medium | evals/jbench/src/agent_runner.rs:195 | Use `tokio::process::Command`, or wrap the call in `tokio::task::spawn_blocking` |
+| 2 | `todo_step` calls `std::process::exit(0)` for unimplemented commands | Low | evals/jbench/src/bin/jbench.rs | Use non-zero exit code or `todo!()` |
+| 3 | `file-picker.toml` missing explicit `inherit_parent_system_prompt = false` | Low | .jcode/agents/file-picker.toml | Add for consistency with `basher.toml` |
+| 4 | `edition = "2024"` in jbench may cause toolchain issues if workspace uses 2021 | Low | evals/jbench/Cargo.toml | Verify workspace edition consistency |
+| 5 | `meta_analyze_impl` reads all `.run.json` files into memory | Low | evals/jbench/src/bin/jbench.rs:268 | Streaming deserializer for large runs |

From f84cc127b13a718436866b48b9afa02b1755df99 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 07:24:17 +0700
Subject: [PATCH 11/22] feat(agent-runtime): add per-agent permissionMode field

Add PermissionMode enum to jcode-agent-runtime (mirrors dcg_core::Mode):
- Default: rule-based classification (legacy AUTO_ALLOWED list)
- AcceptEdits: file ops auto-allowed, network/spawn prompt
- Plan: read-only, writes denied without prompting
- DontAsk: allow-listed tools pass, never prompt
- BypassPermissions: skip all evaluation
- Auto: LLM-based classifier decides per call

Add permission_mode: Option<PermissionMode> to AgentDefinition.
When None, agent inherits session-global mode.

Update sample TOML agents:
- basher: accept-edits (auto-approve file ops; network/spawn still prompt)
- editor: accept-edits (auto-approve file ops)
- file-picker: plan (read-only)
- code-reviewer: plan (read-only)

Tests: 54 unit + 6 integration = 60 passed, 0 failed.

Wire-up plan: at spawn time, convert PermissionMode to dcg_core::Mode
and pass to SubagentTool/SessionToolPolicy for per-agent override.
---
 .jcode/agents/basher.toml                     |   4 +
 .jcode/agents/code-reviewer.toml              |   3 +
 .jcode/agents/editor.toml                     |   4 +
 .jcode/agents/file-picker.toml                |   3 +
 crates/jcode-agent-runtime/src/definition.rs  |  20 ++
 crates/jcode-agent-runtime/src/lib.rs         |   2 +
 crates/jcode-agent-runtime/src/permission.rs  | 187 ++++++++++++++++++
 crates/jcode-agent-runtime/src/registry.rs    |   1 +
 .../tests/sample_agents.rs                    |  24 ++-
 9 files changed, 247 insertions(+), 1 deletion(-)
 create mode 100644 crates/jcode-agent-runtime/src/permission.rs

diff --git a/.jcode/agents/basher.toml b/.jcode/agents/basher.toml
index c726b51db..6c933b65d 100644
--- a/.jcode/agents/basher.toml
+++ b/.jcode/agents/basher.toml
@@ -37,6 +37,10 @@ version = "0.1.0"
 prefer_tier = "routine"
 reasoning = "minimal"
 
+# Basher runs terminal commands — auto-approve file ops so the parent
+# doesn't need to re-approve every bash call. Network/spawn still prompt.
+permission_mode = "accept-edits"
+
 include_message_history = false
 inherit_parent_system_prompt = false
 output_mode = "last_message"
diff --git a/.jcode/agents/code-reviewer.toml b/.jcode/agents/code-reviewer.toml
index 22b7e5e38..9734537db 100644
--- a/.jcode/agents/code-reviewer.toml
+++ b/.jcode/agents/code-reviewer.toml
@@ -31,6 +31,9 @@ inherit_parent_system_prompt = true
 include_message_history = true
 output_mode = "last_message"
 
+# Reviewer is read-only — plan mode denies writes without prompting.
+permission_mode = "plan"
+
 tool_names = [
     "read",
     "grep",
diff --git a/.jcode/agents/editor.toml b/.jcode/agents/editor.toml
index 28aed4d01..4ab1e83d8 100644
--- a/.jcode/agents/editor.toml
+++ b/.jcode/agents/editor.toml
@@ -38,6 +38,10 @@ version = "0.1.0"
 prefer_tier = "thinking"
 reasoning = "medium"
 
+# Editor makes code edits — auto-approve file operations so the parent
+# agent doesn't need to re-approve every str_replace/write call.
+permission_mode = "accept-edits"
+
 inherit_parent_system_prompt = true
 include_message_history = true
 output_mode = "all_messages"
diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml
index b6365a84d..c4a719abb 100644
--- a/.jcode/agents/file-picker.toml
+++ b/.jcode/agents/file-picker.toml
@@ -26,6 +26,9 @@ reasoning = "minimal"
 include_message_history = false
 output_mode = "last_message"
 
+# File picker is read-only — plan mode denies writes without prompting.
+permission_mode = "plan"
+
 # Tools required: read project file tree + glob fallback. Whitelist is
 # checked at runtime against the tool registry; unknown tools fail loudly
 # rather than silently degrading.
diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
index 6304a66ed..1f1561255 100644
--- a/crates/jcode-agent-runtime/src/definition.rs
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -33,6 +33,7 @@
 //!   Phase 2); for now agents are pure prompted.
 
 use crate::output::OutputMode;
+use crate::permission::PermissionMode;
 use crate::reasoning::ReasoningEffort;
 use crate::tier::ModelTier;
 
@@ -152,6 +153,24 @@ pub struct AgentDefinition {
     #[serde(default)]
     pub include_message_history: bool,
 
+    // -----------------------------------------------------------------
+    // Permissions
+    // -----------------------------------------------------------------
+    /// Optional permission mode override for this agent's tool execution.
+    /// When set, the agent runs under this permission mode instead of the
+    /// session-global mode (set via CLI `--permission-mode` or cycled in
+    /// the TUI).
+    ///
+    /// Useful for:
+    /// - Restricting sub-agents: reviewer runs in `Plan` (read-only).
+    /// - Elevating leaf agents: `basher` runs in `AcceptEdits`.
+    /// - Background agents: CI runner uses `DontAsk`.
+    ///
+    /// If `None`, the agent inherits the session's current permission mode.
+    /// See `permission.rs` for the full mode descriptions.
+    #[serde(default)]
+    pub permission_mode: Option<PermissionMode>,
+
     // -----------------------------------------------------------------
     // Output
     // -----------------------------------------------------------------
@@ -407,6 +426,7 @@ mod tests {
             spawner_prompt: None,
             inherit_parent_system_prompt: false,
             include_message_history: false,
+            permission_mode: None,
             output_mode: OutputMode::LastMessage,
             output_schema: None,
         }
diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs
index 80979a845..818082509 100644
--- a/crates/jcode-agent-runtime/src/lib.rs
+++ b/crates/jcode-agent-runtime/src/lib.rs
@@ -25,6 +25,7 @@
 
 pub mod definition;
 pub mod output;
+pub mod permission;
 pub mod reasoning;
 pub mod registry;
 pub mod signals;
@@ -40,6 +41,7 @@ pub use signals::{
 // New public surface (Phase 0).
 pub use definition::{AgentDefinition, DEFAULT_AGENT_VERSION, DefinitionError, ReferenceError};
 pub use output::OutputMode;
+pub use permission::PermissionMode;
 pub use reasoning::ReasoningEffort;
 pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind};
 pub use tier::{ModelTier, ResolutionSource, resolve_model, resolve_model_with_source};
diff --git a/crates/jcode-agent-runtime/src/permission.rs b/crates/jcode-agent-runtime/src/permission.rs
new file mode 100644
index 000000000..41db95a72
--- /dev/null
+++ b/crates/jcode-agent-runtime/src/permission.rs
@@ -0,0 +1,187 @@
+//! Per-agent permission mode for tool execution safety.
+//!
+//! Mirrors `dcg_core::Mode` but is intentionally self-contained in the
+//! dependency-light `jcode-agent-runtime` crate. The runtime converts
+//! this enum to `dcg_core::Mode` at spawn time.
+//!
+//! ## Design
+//!
+//! The permission mode controls how tool calls are evaluated during an
+//! agent's execution:
+//!
+//! - `Default` — rule-based: read-only tools auto-allowed, writes prompt.
+//! - `AcceptEdits` — file operations auto-allowed, network/spawn prompt.
+//! - `Plan` — read-only: writes denied without prompting.
+//! - `DontAsk` — allow-listed tools pass, never prompt.
+//! - `BypassPermissions` — skip all evaluation.
+//! - `Auto` — LLM-based classifier decides per call.
+//!
+//! When `AgentDefinition.permission_mode` is `None`, the agent inherits
+//! the session's current permission mode (set via CLI `--permission-mode`
+//! or cycled at runtime in the TUI).
+
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+/// Per-agent permission mode for tool execution safety.
+///
+/// This enum intentionally mirrors `dcg_core::Mode` (from the
+/// `destructive_command_guard` crate) so that `jcode-agent-runtime`
+/// does not need to depend on `dcg-core` directly.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum PermissionMode {
+    /// Rule-based classification using the legacy `AUTO_ALLOWED` list.
+    /// Read-only tools auto-allowed; writes require permission.
+    Default,
+    /// File operations (edit, write, patch) auto-allowed. Network,
+    /// spawn, and irreversible operations still prompt.
+    AcceptEdits,
+    /// Read-only mode: write operations denied without prompting.
+    /// Useful for reviewer/observer agents.
+    Plan,
+    /// Only allow-listed tools pass; never prompt the user.
+    /// Useful for unattended/CI agents.
+    DontAsk,
+    /// Skip all permission evaluation. Use with caution.
+    BypassPermissions,
+    /// LLM-based classifier decides per tool call.
+    Auto,
+}
+
+impl Default for PermissionMode {
+    fn default() -> Self {
+        PermissionMode::Default
+    }
+}
+
+impl PermissionMode {
+    /// String representation matching the wire format used by TOML
+    /// definitions and the CLI.
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            PermissionMode::Default => "default",
+            PermissionMode::AcceptEdits => "accept-edits",
+            PermissionMode::Plan => "plan",
+            PermissionMode::DontAsk => "dont-ask",
+            PermissionMode::BypassPermissions => "bypass-permissions",
+            PermissionMode::Auto => "auto",
+        }
+    }
+
+    /// Parse a mode case-insensitively (input is trimmed); accepts kebab, snake, or joined forms.
+    pub fn parse(s: &str) -> Option<PermissionMode> {
+        match s.trim().to_ascii_lowercase().as_str() {
+            "default" => Some(PermissionMode::Default),
+            "acceptedits" | "accept_edits" | "accept-edits" => Some(PermissionMode::AcceptEdits),
+            "plan" => Some(PermissionMode::Plan),
+            "dontask" | "dont_ask" | "dont-ask" => Some(PermissionMode::DontAsk),
+            "bypasspermissions" | "bypass_permissions" | "bypass-permissions" => {
+                Some(PermissionMode::BypassPermissions)
+            }
+            "auto" => Some(PermissionMode::Auto),
+            _ => None,
+        }
+    }
+}
+
+impl fmt::Display for PermissionMode {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_accepts_common_variants() {
+        assert_eq!(
+            PermissionMode::parse("default"),
+            Some(PermissionMode::Default)
+        );
+        assert_eq!(
+            PermissionMode::parse("AcceptEdits"),
+            Some(PermissionMode::AcceptEdits)
+        );
+        assert_eq!(
+            PermissionMode::parse("accept_edits"),
+            Some(PermissionMode::AcceptEdits)
+        );
+        assert_eq!(
+            PermissionMode::parse("accept-edits"),
+            Some(PermissionMode::AcceptEdits)
+        );
+        assert_eq!(
+            PermissionMode::parse("plan"),
+            Some(PermissionMode::Plan)
+        );
+        assert_eq!(
+            PermissionMode::parse("DONTASK"),
+            Some(PermissionMode::DontAsk)
+        );
+        assert_eq!(
+            PermissionMode::parse("dont_ask"),
+            Some(PermissionMode::DontAsk)
+        );
+        assert_eq!(
+            PermissionMode::parse("bypass_permissions"),
+            Some(PermissionMode::BypassPermissions)
+        );
+        assert_eq!(
+            PermissionMode::parse("bypass-permissions"),
+            Some(PermissionMode::BypassPermissions)
+        );
+        assert_eq!(
+            PermissionMode::parse("auto"),
+            Some(PermissionMode::Auto)
+        );
+        assert_eq!(PermissionMode::parse(""), None);
+        assert_eq!(PermissionMode::parse("nonsense"), None);
+    }
+
+    #[test]
+    fn default_is_default() {
+        assert_eq!(PermissionMode::default(), PermissionMode::Default);
+    }
+
+    #[test]
+    fn serde_roundtrip_kebab_case() {
+        // Wire format (shared by TOML) is kebab-case per serde(rename_all); checked via JSON.
+        let s = serde_json::to_string(&PermissionMode::AcceptEdits).unwrap();
+        assert_eq!(s, "\"accept-edits\"");
+        let back: PermissionMode = serde_json::from_str("\"accept-edits\"").unwrap();
+        assert_eq!(back, PermissionMode::AcceptEdits);
+    }
+
+    #[test]
+    fn serde_roundtrip_all_variants() {
+        for variant in [
+            PermissionMode::Default,
+            PermissionMode::AcceptEdits,
+            PermissionMode::Plan,
+            PermissionMode::DontAsk,
+            PermissionMode::BypassPermissions,
+            PermissionMode::Auto,
+        ] {
+            let json = serde_json::to_string(&variant).unwrap();
+            let back: PermissionMode = serde_json::from_str(&json).unwrap();
+            assert_eq!(back, variant);
+        }
+    }
+
+    #[test]
+    fn display_matches_as_str() {
+        for variant in [
+            PermissionMode::Default,
+            PermissionMode::AcceptEdits,
+            PermissionMode::Plan,
+            PermissionMode::DontAsk,
+            PermissionMode::BypassPermissions,
+            PermissionMode::Auto,
+        ] {
+            assert_eq!(format!("{variant}"), variant.as_str());
+        }
+    }
+}
diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs
index 82f182b2d..a1a41a79d 100644
--- a/crates/jcode-agent-runtime/src/registry.rs
+++ b/crates/jcode-agent-runtime/src/registry.rs
@@ -373,6 +373,7 @@ mod tests {
             spawner_prompt: None,
             inherit_parent_system_prompt: false,
             include_message_history: false,
+            permission_mode: None,
             output_mode: OutputMode::LastMessage,
             output_schema: None,
         };
diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs
index ee6ee7034..d2bf77d4d 100644
--- a/crates/jcode-agent-runtime/tests/sample_agents.rs
+++ b/crates/jcode-agent-runtime/tests/sample_agents.rs
@@ -9,7 +9,9 @@
 
 use std::path::PathBuf;
 
-use jcode_agent_runtime::{AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind};
+use jcode_agent_runtime::{
+    AgentRegistry, ModelTier, OutputMode, PermissionMode, ReasoningEffort, SourceKind,
+};
 
 /// Path to the project-root sample agents directory, relative to the
 /// crate manifest. Deliberately constructed via `CARGO_MANIFEST_DIR` so
@@ -71,6 +73,11 @@ fn file_picker_sample_has_expected_shape() {
     assert_eq!(agent.output_mode, OutputMode::LastMessage);
     assert!(agent.tool_names.iter().any(|t| t == "read"));
     assert!(agent.spawnable_agents.is_empty(), "leaf agent");
+    assert_eq!(
+        agent.permission_mode,
+        Some(PermissionMode::Plan),
+        "file-picker is read-only (plan mode)"
+    );
 
     // Resolve model with no env vars set should fall back to the
     // session's current model.
@@ -106,6 +113,11 @@ fn code_reviewer_uses_inherit_parent_system_prompt_for_cache_hit() {
         "reviewer needs context of the change it's reviewing"
     );
     assert_eq!(agent.prefer_tier, Some(ModelTier::Thinking));
+    assert_eq!(
+        agent.permission_mode,
+        Some(PermissionMode::Plan),
+        "code-reviewer is read-only (plan mode)"
+    );
 }
 
 #[test]
@@ -151,6 +163,11 @@ fn basher_sample_has_expected_shape() {
     assert_eq!(agent.output_mode, OutputMode::LastMessage);
     assert_eq!(agent.tool_names, vec!["bash"]);
     assert!(agent.spawnable_agents.is_empty(), "leaf agent");
+    assert_eq!(
+        agent.permission_mode,
+        Some(PermissionMode::AcceptEdits),
+        "basher auto-approves file ops"
+    );
 
     // No tier env var set → resolve falls back to the session model.
     let resolved = agent.resolve_model("session-model");
@@ -207,4 +224,9 @@ fn editor_sample_has_expected_shape() {
         );
     }
     assert!(agent.spawnable_agents.is_empty(), "leaf agent");
+    assert_eq!(
+        agent.permission_mode,
+        Some(PermissionMode::AcceptEdits),
+        "editor auto-approves file ops"
+    );
 }

From 844fc412a361c359ad9b89b5da8afbf62c594898 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 08:11:57 +0700
Subject: [PATCH 12/22] feat(agent-runtime): add max_turns field to
 AgentDefinition

Add optional max_turns: Option<u32> field that limits the number of
agentic turns an agent may execute before being stopped. Prevents
runaway agents from consuming unbounded tokens/time.

When None, the agent has no per-agent turn limit (session global
limit still applies).

Tests: 56 unit + 6 integration = 62 passed, 0 failed.
---
 crates/jcode-agent-runtime/src/definition.rs | 31 ++++++++++++++++++++
 crates/jcode-agent-runtime/src/registry.rs   |  1 +
 2 files changed, 32 insertions(+)

diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
index 1f1561255..f148c0c16 100644
--- a/crates/jcode-agent-runtime/src/definition.rs
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -171,6 +171,15 @@ pub struct AgentDefinition {
     #[serde(default)]
     pub permission_mode: Option<PermissionMode>,
 
+    /// Optional maximum number of agentic turns this agent may execute
+    /// before being stopped. Prevents runaway agents from consuming
+    /// unbounded tokens/time.
+    ///
+    /// If `None`, the agent has no per-agent turn limit (the session
+    /// global limit still applies).
+    #[serde(default)]
+    pub max_turns: Option<u32>,
+
     // -----------------------------------------------------------------
     // Output
     // -----------------------------------------------------------------
@@ -427,6 +436,7 @@ mod tests {
             inherit_parent_system_prompt: false,
             include_message_history: false,
             permission_mode: None,
+            max_turns: None,
             output_mode: OutputMode::LastMessage,
             output_schema: None,
         }
@@ -690,4 +700,25 @@ mod tests {
             _ => unreachable!(),
         }
     }
+
+    #[test]
+    fn toml_max_turns_parses() {
+        let src = r#"
+            id = "test"
+            display_name = "Test"
+            max_turns = 50
+        "#;
+        let d: AgentDefinition = toml::from_str(src).expect("parse");
+        assert_eq!(d.max_turns, Some(50));
+    }
+
+    #[test]
+    fn toml_max_turns_none_when_absent() {
+        let src = r#"
+            id = "test"
+            display_name = "Test"
+        "#;
+        let d: AgentDefinition = toml::from_str(src).expect("parse");
+        assert_eq!(d.max_turns, None);
+    }
 }
diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs
index a1a41a79d..cab80c514 100644
--- a/crates/jcode-agent-runtime/src/registry.rs
+++ b/crates/jcode-agent-runtime/src/registry.rs
@@ -374,6 +374,7 @@ mod tests {
             inherit_parent_system_prompt: false,
             include_message_history: false,
             permission_mode: None,
+            max_turns: None,
             output_mode: OutputMode::LastMessage,
             output_schema: None,
         };

From 6d8ecbc6b8056224514a254be5306b4107bc0b86 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 08:12:29 +0700
Subject: [PATCH 13/22] chore(agents): add max_turns to sample TOML agents

- basher: max_turns = 10 (quick shell commands)
- file-picker: max_turns = 5 (find files, done fast)
- code-reviewer: max_turns = 15 (review needs more context)
- editor: no limit (complex edits may need many turns)
---
 .jcode/agents/basher.toml        | 1 +
 .jcode/agents/code-reviewer.toml | 1 +
 .jcode/agents/file-picker.toml   | 1 +
 3 files changed, 3 insertions(+)

diff --git a/.jcode/agents/basher.toml b/.jcode/agents/basher.toml
index 6c933b65d..da53e515a 100644
--- a/.jcode/agents/basher.toml
+++ b/.jcode/agents/basher.toml
@@ -40,6 +40,7 @@ reasoning = "minimal"
 # Basher runs terminal commands — auto-approve file ops so the parent
 # doesn't need to re-approve every bash call. Network/spawn still prompt.
 permission_mode = "accept-edits"
+max_turns = 10
 
 include_message_history = false
 inherit_parent_system_prompt = false
diff --git a/.jcode/agents/code-reviewer.toml b/.jcode/agents/code-reviewer.toml
index 9734537db..7d44e08ba 100644
--- a/.jcode/agents/code-reviewer.toml
+++ b/.jcode/agents/code-reviewer.toml
@@ -33,6 +33,7 @@ output_mode = "last_message"
 
 # Reviewer is read-only — plan mode denies writes without prompting.
 permission_mode = "plan"
+max_turns = 15
 
 tool_names = [
     "read",
diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml
index c4a719abb..f958b7c4e 100644
--- a/.jcode/agents/file-picker.toml
+++ b/.jcode/agents/file-picker.toml
@@ -28,6 +28,7 @@ output_mode = "last_message"
 
 # File picker is read-only — plan mode denies writes without prompting.
 permission_mode = "plan"
+max_turns = 5
 
 # Tools required: read project file tree + glob fallback. Whitelist is
 # checked at runtime against the tool registry; unknown tools fail loudly

From 2d7a020c50043f55a96fd2fd4df8dcfb51bc2420 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 08:15:04 +0700
Subject: [PATCH 14/22] fix(jbench): address PR #313 review issues

- extract_diff_from_repo: wrap sync std::process::Command in
  tokio::task::spawn_blocking to avoid blocking the async runtime
- todo_step: use exit code 2 (not implemented) instead of 0 (success)
- Fix unused variable warnings (max_turns, timeout_secs)
- cfg-gate unused imports behind agent-runner feature
---
 evals/jbench/src/agent_runner.rs | 35 ++++++++++++++++++--------------
 evals/jbench/src/bin/jbench.rs   | 34 +++++++++++++++----------------
 2 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs
index d9391cc20..5fd3f3031 100644
--- a/evals/jbench/src/agent_runner.rs
+++ b/evals/jbench/src/agent_runner.rs
@@ -130,7 +130,7 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
         return Ok(EvalRun {
             commit_sha: String::new(),
             prompt: config.prompt,
-            diff: extract_diff_from_repo(&config.repo_path).unwrap_or_default(),
+            diff: extract_diff_from_repo(&config.repo_path).await.unwrap_or_default(),
             judging: Default::default(),
             cost_usd: 0.0,
             duration_ms: start.elapsed().as_millis() as u64,
@@ -143,7 +143,7 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
         .await
         .context("failed to wait for jcode subprocess")?;
 
-    let diff = extract_diff_from_repo(&config.repo_path)?;
+    let diff = extract_diff_from_repo(&config.repo_path).await?;
     let error = if !status.success() {
         Some(format!("jcode exited with status {:?}", status))
     } else {
@@ -163,19 +163,24 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
 
 /// Produce a unified diff describing all uncommitted changes in
 /// `repo_path` against its currently-checked-out HEAD.
-pub fn extract_diff_from_repo(repo_path: &Path) -> Result<String> {
-    let output = std::process::Command::new("git")
-        .args(["diff", "--no-color", "HEAD"])
-        .current_dir(repo_path)
-        .output()
-        .context("git diff failed")?;
-
-    if !output.status.success() {
-        let stderr = String::from_utf8_lossy(&output.stderr);
-        anyhow::bail!("git diff exited with error: {stderr}");
-    }
+pub async fn extract_diff_from_repo(repo_path: &Path) -> Result<String> {
+    let repo_path = repo_path.to_owned();
+    tokio::task::spawn_blocking(move || {
+        let output = std::process::Command::new("git")
+            .args(["diff", "--no-color", "HEAD"])
+            .current_dir(&repo_path)
+            .output()
+            .context("git diff failed")?;
+
+        if !output.status.success() {
+            let stderr = String::from_utf8_lossy(&output.stderr);
+            anyhow::bail!("git diff exited with error: {stderr}");
+        }
 
-    Ok(String::from_utf8_lossy(&output.stdout).to_string())
+        Ok(String::from_utf8_lossy(&output.stdout).to_string())
+    })
+    .await
+    .context("spawn_blocking panicked")?
 }
 
 #[cfg(test)]
@@ -184,7 +189,7 @@ mod tests {
 
     #[tokio::test]
     async fn extract_diff_from_repo_nonexistent() {
-        let result = extract_diff_from_repo(Path::new("/tmp/does-not-exist"));
+        let result = extract_diff_from_repo(Path::new("/tmp/does-not-exist")).await;
         assert!(result.is_err());
     }
 }
diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs
index 160b84d26..35b9d31c5 100644
--- a/evals/jbench/src/bin/jbench.rs
+++ b/evals/jbench/src/bin/jbench.rs
@@ -9,11 +9,9 @@ use clap::{Parser, Subcommand};
 
 #[cfg(feature = "agent-runner")]
 use jcode_jbench::agent_runner::AgentRunConfig;
-use jcode_jbench::{
-    judge::{JudgeConfig, judge_with_three_models},
-    lessons::{LessonsConfig, append_lessons_to_file, extract_lessons},
-    types::{AgentEvalResults, EvalDataV2, EvalRun},
-};
+#[cfg(feature = "agent-runner")]
+use jcode_jbench::types::EvalDataV2;
+use jcode_jbench::types::EvalRun;
 
 /// Top-level `jbench` CLI.
 #[derive(Debug, Parser)]
@@ -113,22 +111,22 @@ async fn main() -> Result<()> {
             gen_evals_impl(&input, &output).await?;
         }
         Command::Run {
-            eval_file,
-            agent_id,
-            output_dir,
-            jcode_binary,
-            max_turns,
-            timeout_secs,
+            eval_file: _eval_file,
+            agent_id: _agent_id,
+            output_dir: _output_dir,
+            jcode_binary: _jcode_binary,
+            max_turns: _max_turns,
+            timeout_secs: _timeout_secs,
         } => {
             #[cfg(feature = "agent-runner")]
             {
                 run_impl(
-                    &eval_file,
-                    &agent_id,
-                    &output_dir,
-                    jcode_binary.as_ref(),
-                    max_turns,
-                    timeout_secs,
+                    &_eval_file,
+                    &_agent_id,
+                    &_output_dir,
+                    _jcode_binary.as_ref(),
+                    _max_turns,
+                    _timeout_secs,
                 )
                 .await?;
             }
@@ -303,5 +301,5 @@ async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Resu
 
 fn todo_step(phase: &str) -> Result<()> {
     eprintln!("{phase}");
-    std::process::exit(0);
+    std::process::exit(2);
 }

From 795242b6090475fd237aedaf421c93b0360e1338 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 08:43:52 +0700
Subject: [PATCH 15/22] feat(dcg-bridge): wire per-agent permissionMode into
 tool execution

Add permission_mode_to_dcg() conversion from PermissionMode to
dcg_core::Mode (free function due to orphan rule).

Add per-session permission mode storage (SESSION_MODES) so subagents
can run under a different mode than the global default:
- set_session_mode(session_id, mode)
- clear_session_mode(session_id)
- session_mode(session_id) -> Option<Mode>

Add classify_for_agent(action, agent_permission_mode) that uses the
agent's mode when set, falling back to global mode otherwise.

Wire SubagentTool to propagate permission_mode from agent definition
to child session via set_session_mode, and clean up on completion.

Tests: 4 new tests in dcg_bridge (conversion, classify_for_agent,
session_mode lifecycle).
---
 crates/jcode-app-core/src/dcg_bridge.rs | 119 ++++++++++++++++++++++++
 crates/jcode-app-core/src/tool/task.rs  |  56 ++++++++---
 2 files changed, 164 insertions(+), 11 deletions(-)

diff --git a/crates/jcode-app-core/src/dcg_bridge.rs b/crates/jcode-app-core/src/dcg_bridge.rs
index c9398d69a..b26de1cd5 100644
--- a/crates/jcode-app-core/src/dcg_bridge.rs
+++ b/crates/jcode-app-core/src/dcg_bridge.rs
@@ -32,10 +32,12 @@
 //!   `Default`, `Auto`, `BypassPermissions`; **deny under `Plan`** (which is
 //!   read-only); prompt under `DontAsk` only if explicitly allow-listed.
 
+use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::{LazyLock, Mutex};
 
 use dcg_core::{Decision, Effect, Engine, EngineConfig, Mode, Session, ToolCall};
+use jcode_agent_runtime::permission::PermissionMode;
 
 pub use crate::yolo_classifier::YoloClassifier;
 
@@ -82,6 +84,32 @@ fn default_protected_paths() -> Vec<String> {
     ]
 }
 
+/// Convert a [`PermissionMode`] (from `jcode-agent-runtime`) into the
+/// corresponding [`dcg_core::Mode`]. The two enums mirror each other
+/// exactly; this function is the canonical bridge.
+///
+/// We cannot implement `From<PermissionMode> for Mode` due to the orphan
+/// rule (both types live in foreign crates). This free function serves
+/// the same purpose.
+#[must_use]
+pub fn permission_mode_to_dcg(pm: PermissionMode) -> Mode {
+    match pm {
+        PermissionMode::Default => Mode::Default,
+        PermissionMode::AcceptEdits => Mode::AcceptEdits,
+        PermissionMode::Plan => Mode::Plan,
+        PermissionMode::DontAsk => Mode::DontAsk,
+        PermissionMode::BypassPermissions => Mode::BypassPermissions,
+        PermissionMode::Auto => Mode::Auto,
+    }
+}
+
+/// Per-session permission mode overrides. When a subagent is spawned with
+/// a specific `permission_mode` from its `AgentDefinition`, it is stored
+/// here keyed by the child session id and read back via `session_mode`.
+/// `classify_for_agent` takes its mode as an argument; it does not read this map.
+static SESSION_MODES: LazyLock<Mutex<HashMap<String, Mode>>> =
+    LazyLock::new(|| Mutex::new(HashMap::new()));
+
 /// Set the global permission mode. Called from the CLI / config layer at
 /// process startup. Subsequent `classify` calls observe the new mode.
 pub fn set_mode(mode: Mode) {
@@ -99,6 +127,46 @@ pub fn current_mode() -> Mode {
         .unwrap_or(Mode::Default)
 }
 
+/// Store a per-session permission mode override. Called when a subagent
+/// is spawned with an explicit `permission_mode` from its agent
+/// definition.
+pub fn set_session_mode(session_id: &str, mode: Mode) {
+    if let Ok(mut guard) = SESSION_MODES.lock() {
+        guard.insert(session_id.to_string(), mode);
+    }
+}
+
+/// Remove the per-session permission mode override for a session that
+/// has finished. Prevents unbounded growth of the map.
+pub fn clear_session_mode(session_id: &str) {
+    if let Ok(mut guard) = SESSION_MODES.lock() {
+        guard.remove(session_id);
+    }
+}
+
+/// Return the per-session mode override, if any.
+#[must_use]
+pub fn session_mode(session_id: &str) -> Option<Mode> {
+    SESSION_MODES
+        .lock()
+        .ok()
+        .and_then(|guard| guard.get(session_id).copied())
+}
+
+/// Classify an action using the agent-specific permission mode when
+/// provided, falling back to the global mode otherwise.
+///
+/// This is the entry point that respects per-agent permission overrides.
+/// Call sites that know the agent's `PermissionMode` (e.g. subagent tool
+/// execution) should use this instead of [`classify`].
+#[must_use]
+pub fn classify_for_agent(action: &str, agent_permission_mode: Option<PermissionMode>) -> BridgeDecision {
+    let mode = agent_permission_mode
+        .map(permission_mode_to_dcg)
+        .unwrap_or_else(current_mode);
+    classify_with_mode(action, mode)
+}
+
 /// Three-state outcome from the bridge. jcode's `SafetySystem` collapses
 /// `Allow` to `ActionTier::AutoAllowed` and `Prompt`/`Deny` to
 /// `ActionTier::RequiresPermission` — but exposing the full set here
@@ -391,4 +459,55 @@ mod tests {
         // Restore so other tests aren't affected by ordering.
         set_mode(original);
     }
+
+    #[test]
+    fn permission_mode_converts_to_dcg_mode() {
+        use jcode_agent_runtime::permission::PermissionMode as PM;
+
+        assert_eq!(permission_mode_to_dcg(PM::Default), Mode::Default);
+        assert_eq!(permission_mode_to_dcg(PM::AcceptEdits), Mode::AcceptEdits);
+        assert_eq!(permission_mode_to_dcg(PM::Plan), Mode::Plan);
+        assert_eq!(permission_mode_to_dcg(PM::DontAsk), Mode::DontAsk);
+        assert_eq!(permission_mode_to_dcg(PM::BypassPermissions), Mode::BypassPermissions);
+        assert_eq!(permission_mode_to_dcg(PM::Auto), Mode::Auto);
+    }
+
+    #[test]
+    fn classify_for_agent_uses_agent_mode_when_set() {
+        use jcode_agent_runtime::permission::PermissionMode as PM;
+
+        // todowrite auto-allows in AcceptEdits but denies in Plan
+        assert_eq!(
+            classify_for_agent("todowrite", Some(PM::AcceptEdits)),
+            BridgeDecision::Allow,
+            "todowrite must allow in AcceptEdits"
+        );
+        assert_eq!(
+            classify_for_agent("todowrite", Some(PM::Plan)),
+            BridgeDecision::Deny,
+            "todowrite must deny in Plan"
+        );
+    }
+
+    #[test]
+    fn classify_for_agent_falls_back_to_global_when_none() {
+        let original = current_mode();
+        set_mode(Mode::BypassPermissions);
+        assert_eq!(
+            classify_for_agent("made_up_tool", None),
+            BridgeDecision::Allow,
+            "falls back to global BypassPermissions mode"
+        );
+        set_mode(original);
+    }
+
+    #[test]
+    fn session_mode_set_and_clear() {
+        let sid = "test_session_mode_123";
+        assert!(session_mode(sid).is_none());
+        set_session_mode(sid, Mode::Plan);
+        assert_eq!(session_mode(sid), Some(Mode::Plan));
+        clear_session_mode(sid);
+        assert!(session_mode(sid).is_none());
+    }
 }
diff --git a/crates/jcode-app-core/src/tool/task.rs b/crates/jcode-app-core/src/tool/task.rs
index c390a836e..7dd69cb76 100644
--- a/crates/jcode-app-core/src/tool/task.rs
+++ b/crates/jcode-app-core/src/tool/task.rs
@@ -1,12 +1,14 @@
 use super::{Registry, Tool, ToolContext, ToolOutput};
 use crate::agent::Agent;
 use crate::bus::{Bus, BusEvent, ToolSummary, ToolSummaryState};
+use crate::dcg_bridge;
 use crate::logging;
 use crate::protocol::HistoryMessage;
 use crate::provider::Provider;
 use crate::session::Session;
 use anyhow::Result;
 use async_trait::async_trait;
+use jcode_agent_runtime::permission::PermissionMode;
 use serde::Deserialize;
 use serde_json::{Value, json};
 use std::collections::{HashMap, HashSet};
@@ -55,6 +57,11 @@ struct SubagentInput {
     session_id: Option<String>,
     #[serde(default)]
     output_mode: SubagentOutputMode,
+    /// Optional permission mode override from the agent definition.
+    /// When set, the child session runs under this mode instead of
+    /// the session-global permission mode.
+    #[serde(default)]
+    permission_mode: Option<PermissionMode>,
     #[serde(rename = "command", default)]
     _command: Option<String>,
 }
@@ -115,6 +122,11 @@ impl Tool for SubagentTool {
                     "enum": ["answer", "compact", "full_transcript"],
                     "description": "Return mode. 'answer' returns the final answer only, 'compact' adds a user-visible transcript, and 'full_transcript' adds raw persisted messages. Defaults to 'answer'."
                 },
+                "permission_mode": {
+                    "type": "string",
+                    "enum": ["default", "accept-edits", "plan", "dont-ask", "bypass-permissions", "auto"],
+                    "description": "Permission mode override from the agent definition. When set, the child session uses this mode instead of the session-global permission mode."
+                },
                 "command": {
                     "type": "string",
                     "description": "Source command."
@@ -153,6 +165,20 @@ impl Tool for SubagentTool {
 
         session.save()?;
 
+        // Propagate the agent definition's permission mode to the child
+        // session so that `dcg_bridge::classify_for_agent` / `session_mode`
+        // observe it during the child's tool execution.
+        let child_session_id = session.id.clone();
+        if let Some(pm) = params.permission_mode {
+            let dcg_mode = dcg_bridge::permission_mode_to_dcg(pm);
+            dcg_bridge::set_session_mode(&child_session_id, dcg_mode);
+            logging::info(&format!(
+                "[tool:subagent] session {} permission mode: {} (from agent definition)",
+                child_session_id,
+                pm.as_str(),
+            ));
+        }
+
         let mut allowed: HashSet<String> = self.registry.tool_names().await.into_iter().collect();
         for blocked in ["subagent", "task", "todo", "todowrite", "todoread"] {
             allowed.remove(blocked);
@@ -215,17 +241,21 @@ impl Tool for SubagentTool {
         );
 
         let start = std::time::Instant::now();
-        let final_text = agent.run_once_capture(&params.prompt).await.map_err(|err| {
-            logging::warn(&format!(
-                "[tool:subagent] subagent failed description={} type={} session_id={} model={} error={}",
-                params.description,
-                params.subagent_type,
-                agent.session_id(),
-                resolved_model,
-                err
-            ));
-            err
-        })?;
+        let final_text = match agent.run_once_capture(&params.prompt).await {
+            Ok(text) => text,
+            Err(err) => {
+                logging::warn(&format!(
+                    "[tool:subagent] subagent failed description={} type={} session_id={} model={} error={}",
+                    params.description,
+                    params.subagent_type,
+                    agent.session_id(),
+                    resolved_model,
+                    err
+                ));
+                dcg_bridge::clear_session_mode(&child_session_id);
+                return Err(err);
+            }
+        };
         let sub_session_id = agent.session_id().to_string();
         let history = if params.output_mode == SubagentOutputMode::Compact {
             Some(agent.get_history())
@@ -245,6 +275,9 @@ impl Tool for SubagentTool {
             start.elapsed().as_secs_f64()
         ));
 
+        // Clean up per-session permission mode to prevent unbounded growth.
+        dcg_bridge::clear_session_mode(&child_session_id);
+
         listener.abort();
 
         let mut summary: Vec<ToolSummary> = summary_map
@@ -382,6 +415,7 @@ mod tests {
             model: None,
             session_id: None,
             output_mode: SubagentOutputMode::Answer,
+            permission_mode: None,
             _command: None,
         };
 

From 60f805bb281a44b0b45809c92a74d5a805b3cfd8 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 12:14:58 +0700
Subject: [PATCH 16/22] docs(review): update implementation status in review
 document

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .omo/plans/pr-313-review.md | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/.omo/plans/pr-313-review.md b/.omo/plans/pr-313-review.md
index 725144ef0..44253c131 100644
--- a/.omo/plans/pr-313-review.md
+++ b/.omo/plans/pr-313-review.md
@@ -140,9 +140,9 @@
 
 | Rank | Gap | Effort | Impact | Source repos | Concrete action |
 |------|-----|--------|--------|--------------|-----------------|
-| **1** | `permissionMode` per-agent — wire `SafetySystem` into `AgentDefinition` | 2-3 days | 🔴 Critical (security) | claude-code (`PermissionMode`), opencode (`allow/deny/ask` per action+resource) | Add `permission_mode: Option<PermissionMode>` to `AgentDefinition`; during tool execution, call `SafetySystem.classify()` then check agent's override; default = inherit from parent |
+| **1** | `permissionMode` per-agent — wire `SafetySystem` into `AgentDefinition` | 2-3 days | 🔴 Critical (security) | claude-code (`PermissionMode`), opencode (`allow/deny/ask` per action+resource) | ✅ DONE (commit f84cc127 + 795242b6) — `permission_mode` enum + field added, dcg_bridge wired |
 | **2** | `Agent` tool — model-driven spawn | 1-2 weeks | 🔴 Critical (core feature) | codex (`SpawnAgent`/`WaitAgent`), claude-code (`AgentTool` + `TeamCreateTool`), codebuff (`spawn_agents`) | Phase 2: add `agent` tool that LLM calls; wire `spawnable_agents` whitelist; implement `AgentPath` tree from codex |
-| **3** | `maxTurns` per-agent | 1 day | 🟡 Important (runaway prevention) | claude-code, opencode | Add `max_turns: Option<u32>` to `AgentDefinition`; runtime checks after each turn |
+| **3** | `maxTurns` per-agent | 1 day | 🟡 Important (runaway prevention) | claude-code, opencode | ✅ DONE (commit 844fc412) — `max_turns` field added to `AgentDefinition` |
 | **4** | `handleSteps` — programmatic agents | 1 week | 🟡 Important (flexibility) | codebuff (`handleSteps` Generator), oh-my-pi (`beforeToolCall`/`afterToolCall`) | Phase 2: add optional `handle_steps` field with Rust async generator or callback approach |
 | **5** | Tool concurrency (`shared`/`exclusive`) | 2-3 days | 🟢 Nice-to-have (perf) | oh-my-pi (`AgentTool.concurrency`) | Add `concurrency` field to tool definition; runtime scheduler respects exclusive locks |
 
@@ -205,7 +205,7 @@ fn resolve_permission(action, tool_name, agent_def, parent_approval):
 | Phase | Scope | Dependencies | Estimated |
 |-------|-------|--------------|-----------|
 | **Phase 1** (this PR) | AgentDefinition + tier + registry + JBench scaffold | — | ✅ Done |
-| **Phase 1.5** | `permissionMode` wire-up (SafetySystem + AgentDefinition) | Phase 1 | 2-3 days |
+| **Phase 1.5** | `permissionMode` wire-up (SafetySystem + AgentDefinition) | Phase 1 | ✅ Done |
 | **Phase 2** | Agent runtime engine: spawn, parent-child tree, `Agent` tool, `AgentPath` | Phase 1 | 2-3 weeks |
 | **Phase 2.5** | `handleSteps` (programmatic agents), tool concurrency | Phase 2 | 1-2 weeks |
 | **Phase 3** | Team pipeline (claude-code-style `TeamCreateTool`) | Phase 2 | 1 week |
@@ -228,8 +228,28 @@ fn resolve_permission(action, tool_name, agent_def, parent_approval):
 
 | # | Issue | Severity | File | Fix |
 |---|-------|----------|------|-----|
-| 1 | `extract_diff_from_repo` uses sync `std::process::Command` in async fn | Medium | evals/jbench/src/agent_runner.rs:195 | Use `tokio::task::spawn_blocking` |
-| 2 | `todo_step` calls `std::process::exit(0)` for unimplemented commands | Low | evals/jbench/src/bin/jbench.rs | Use non-zero exit code or `todo!()` |
+| 1 | `extract_diff_from_repo` uses sync `std::process::Command` in async fn | Medium | evals/jbench/src/agent_runner.rs:195 | ✅ FIXED (commit 2d7a020c) |
+| 2 | `todo_step` calls `std::process::exit(0)` for unimplemented commands | Low | evals/jbench/src/bin/jbench.rs | ✅ FIXED (commit 2d7a020c) |
 | 3 | `file-picker.toml` missing explicit `inherit_parent_system_prompt = false` | Low | .jcode/agents/file-picker.toml | Add for consistency with `basher.toml` |
 | 4 | `edition = "2024"` in jbench may cause toolchain issues if workspace uses 2021 | Low | evals/jbench/Cargo.toml | Verify workspace edition consistency |
 | 5 | `meta_analyze_impl` reads all `.run.json` files into memory | Low | evals/jbench/src/bin/jbench.rs:268 | Streaming deserializer for large runs |
+
+---
+
+## 7. Implementation Status (2026-06-05)
+
+| Item | Status | Commit |
+|------|--------|--------|
+| Merge master into branch | ✅ Done | 25d3f21e |
+| Reconcile src/lib.rs with master | ✅ Done | 60a61f0b |
+| Review document (9 repos) | ✅ Done | d2942498 |
+| `permissionMode` enum + field | ✅ Done | f84cc127 |
+| `permissionMode` wire-up (dcg_bridge) | ✅ Done | 795242b6 |
+| `maxTurns` field | ✅ Done | 844fc412 |
+| TOML agents max_turns | ✅ Done | 6d8ecbc6 |
+| Fix jbench warnings | ✅ Done | 2d7a020c |
+| `Agent` tool (model-driven spawn) | 🔲 Phase 2 | — |
+| `handleSteps` (programmatic agents) | 🔲 Phase 2 | — |
+| Tool concurrency (shared/exclusive) | 🔲 Phase 2 | — |
+| Team pipeline (TeamCreateTool) | 🔲 Phase 3 | — |
+| JBench production | 🔲 Phase 4 | — |

From 736abcda96ba464cd8a539a573317c2d03a6a9f7 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 12:16:41 +0700
Subject: [PATCH 17/22] feat(agent-runtime): add disallowed_tools field + TOML
 consistency fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add disallowed_tools: Vec<String> denylist to AgentDefinition.
Takes precedence over tool_names — useful for inheriting a broad
whitelist while blocking specific dangerous tools.

Fix TOML consistency:
- file-picker.toml: add explicit inherit_parent_system_prompt = false
- Add documentation comment explaining why

Tests: 58 unit + 6 integration = 64 passed, 0 failed.
---
 .jcode/agents/file-picker.toml               |  6 +++
 crates/jcode-agent-runtime/src/definition.rs | 44 ++++++++++++++++++++
 crates/jcode-agent-runtime/src/registry.rs   |  1 +
 3 files changed, 51 insertions(+)

diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml
index f958b7c4e..6d6e41081 100644
--- a/.jcode/agents/file-picker.toml
+++ b/.jcode/agents/file-picker.toml
@@ -14,6 +14,11 @@
 #   File picker doesn't need to see prior edit chatter. A clean slate
 #   keeps the prompt short and avoids accidentally biasing path
 #   selection toward already-touched files.
+#
+# Why `inherit_parent_system_prompt = false`:
+#   Like basher, this is a tightly scoped leaf agent. It needs its own
+#   short prompt focused on file discovery, not the parent's full
+#   project/system prompt.
 
 id = "file-picker"
 display_name = "Fletcher the File Fetcher"
@@ -24,6 +29,7 @@ prefer_tier = "routine"
 reasoning = "minimal"
 
 include_message_history = false
+inherit_parent_system_prompt = false
 output_mode = "last_message"
 
 # File picker is read-only — plan mode denies writes without prompting.
diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
index f148c0c16..c26e5f3ad 100644
--- a/crates/jcode-agent-runtime/src/definition.rs
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -101,6 +101,15 @@ pub struct AgentDefinition {
     #[serde(default)]
     pub tool_names: Vec<String>,
 
+    /// Optional denylist of tool names this agent may NOT call, even if
+    /// they appear in `tool_names`. Takes precedence over `tool_names`.
+    /// Useful for inheriting a broad whitelist while blocking specific
+    /// dangerous tools (e.g. allow all except `bash`).
+    ///
+    /// Empty list = no additional denials (default).
+    #[serde(default)]
+    pub disallowed_tools: Vec<String>,
+
     /// Allowlist of agent ids this agent may `spawn_agents` / `spawn_agent_inline`.
     /// Empty list = no spawning. Use the local agent id (e.g. `file-picker`)
     /// or the future `publisher/agent@version` form for shared agents.
@@ -428,6 +437,7 @@ mod tests {
             model_override: None,
             reasoning: None,
             tool_names: Vec::new(),
+            disallowed_tools: Vec::new(),
             spawnable_agents: Vec::new(),
             system_prompt: String::new(),
             instructions_prompt: None,
@@ -601,6 +611,40 @@ mod tests {
         assert_eq!(d.output_mode, OutputMode::AllMessages);
     }
 
+    #[test]
+    fn toml_disallowed_tools_parses_and_defaults() {
+        // Case 1: the denylist is given explicitly.
+        let explicit_toml = r#"
+            id = "restricted"
+            display_name = "Restricted Agent"
+            tool_names = ["read", "write_file", "bash"]
+            disallowed_tools = ["bash"]
+        "#;
+        let restricted: AgentDefinition = toml::from_str(explicit_toml).expect("parse");
+        restricted.validate().expect("validate");
+        assert_eq!(restricted.tool_names, vec!["read", "write_file", "bash"]);
+        assert_eq!(restricted.disallowed_tools, vec!["bash"]);
+        // `bash` appears in both lists; because the denylist wins, the
+        // effective allowlist is `tool_names` with every `disallowed_tools`
+        // entry removed, i.e. ["read", "write_file"].
+        let mut effective: Vec<&str> = Vec::new();
+        for name in &restricted.tool_names {
+            if !restricted.disallowed_tools.contains(name) {
+                effective.push(name.as_str());
+            }
+        }
+        assert_eq!(effective, vec!["read", "write_file"]);
+
+        // Case 2: leaving the field out yields an empty denylist.
+        let omitted_toml = r#"
+            id = "open"
+            display_name = "Open Agent"
+            tool_names = ["bash"]
+        "#;
+        let open: AgentDefinition = toml::from_str(omitted_toml).expect("parse");
+        assert!(open.disallowed_tools.is_empty());
+    }
+
     #[test]
     fn toml_unknown_field_is_rejected() {
         let src = r#"
diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs
index cab80c514..d322e6a3c 100644
--- a/crates/jcode-agent-runtime/src/registry.rs
+++ b/crates/jcode-agent-runtime/src/registry.rs
@@ -366,6 +366,7 @@ mod tests {
             model_override: None,
             reasoning: None,
             tool_names: vec![],
+            disallowed_tools: vec![],
             spawnable_agents: vec![],
             system_prompt: String::new(),
             instructions_prompt: None,

From aec4e4c06da2e2be977590d75c599663d64c1c60 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 12:46:25 +0700
Subject: [PATCH 18/22] =?UTF-8?q?feat(multi-agent):=20Phase=202=20?=
 =?UTF-8?q?=E2=80=94=20wire=20AgentDefinition=20into=20SubagentTool=20+=20?=
 =?UTF-8?q?parent-child=20tree?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## SubagentTool wiring (task.rs)
- Add AgentRegistry to SubagentTool for definition lookup
- Look up AgentDefinition by subagent_type at spawn time
- Apply tool_names whitelist from definition (intersected with available)
- Apply disallowed_tools denylist from definition
- Inject system_prompt when inherit_parent_system_prompt is false
- Wire permission_mode: params override > definition > inherit session
- Map OutputMode: LastMessage->Answer, AllMessages->Compact, StructuredOutput->Answer
- Log max_turns for future enforcement

## Parent-child tree (session.rs)
- Add children: Vec<String> to Session with serde(default)
- Add add_child() method for registering child sessions
- Wire SubagentTool to call parent.add_child() after spawn
- Children persisted in session JSON for TUI tree visualization

Backward compatible: all new fields use serde(default),
AgentRegistry is Option so missing registry falls back to existing behavior.
---
 crates/jcode-app-core/src/tool/mod.rs        |   2 +-
 crates/jcode-app-core/src/tool/task.rs       | 130 +++++++++++++++++--
 crates/jcode-base/src/session.rs             |  17 +++
 crates/jcode-base/src/session/journal.rs     |   3 +
 crates/jcode-base/src/session/persistence.rs |   1 +
 5 files changed, 141 insertions(+), 12 deletions(-)

diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs
index f0d9e189e..0b60936d9 100644
--- a/crates/jcode-app-core/src/tool/mod.rs
+++ b/crates/jcode-app-core/src/tool/mod.rs
@@ -324,7 +324,7 @@ impl Registry {
             Self::insert_tool(
                 &mut tools_map,
                 "subagent",
-                task::SubagentTool::new(provider, registry.clone()),
+                task::SubagentTool::new(provider, registry.clone(), None),
             );
             Self::insert_tool(
                 &mut tools_map,
diff --git a/crates/jcode-app-core/src/tool/task.rs b/crates/jcode-app-core/src/tool/task.rs
index 7dd69cb76..6c87e65ef 100644
--- a/crates/jcode-app-core/src/tool/task.rs
+++ b/crates/jcode-app-core/src/tool/task.rs
@@ -9,6 +9,7 @@ use crate::session::Session;
 use anyhow::Result;
 use async_trait::async_trait;
 use jcode_agent_runtime::permission::PermissionMode;
+use jcode_agent_runtime::registry::AgentRegistry;
 use serde::Deserialize;
 use serde_json::{Value, json};
 use std::collections::{HashMap, HashSet};
@@ -18,11 +19,20 @@ use tokio::sync::broadcast;
 pub struct SubagentTool {
     provider: Arc<dyn Provider>,
     registry: Registry,
+    agent_registry: Option<Arc<AgentRegistry>>,
 }
 
 impl SubagentTool {
-    pub fn new(provider: Arc<dyn Provider>, registry: Registry) -> Self {
-        Self { provider, registry }
+    pub fn new(
+        provider: Arc<dyn Provider>,
+        registry: Registry,
+        agent_registry: Option<Arc<AgentRegistry>>,
+    ) -> Self {
+        Self {
+            provider,
+            registry,
+            agent_registry,
+        }
     }
 
     fn preferred_parent_subagent_model(parent_session_id: &str) -> Option<String> {
@@ -138,6 +148,38 @@ impl Tool for SubagentTool {
     async fn execute(&self, input: Value, ctx: ToolContext) -> Result<ToolOutput> {
         let params: SubagentInput = serde_json::from_value(input)?;
 
+        // Look up the agent definition from the registry (if available).
+        // When found, its fields (tool_names, system_prompt, permission_mode,
+        // output_mode, max_turns) inform how the child agent is spawned.
+        let agent_def = self
+            .agent_registry
+            .as_ref()
+            .and_then(|reg| reg.get(&params.subagent_type))
+            .map(|la| &la.definition);
+
+        // Merge permission_mode: params (LLM-supplied) > agent definition > None (session
+        // default). NOTE(review): lets a model pick `bypass-permissions`; confirm intended.
+        let effective_permission_mode = params
+            .permission_mode
+            .or_else(|| agent_def.and_then(|d| d.permission_mode));
+
+        // Merge output_mode: the request cannot distinguish an omitted value from an
+        // explicit "answer" (both are `Answer`), so `Answer` defers to the definition.
+        let effective_output_mode = if params.output_mode == SubagentOutputMode::Answer {
+            agent_def
+                .map(|d| subagent_output_mode_from_definition(d.output_mode))
+                .unwrap_or(params.output_mode)
+        } else {
+            params.output_mode
+        };
+
+        if agent_def.is_some() {
+            logging::info(&format!(
+                "[tool:subagent] matched agent definition for type '{}'",
+                params.subagent_type
+            ));
+        }
+
         let mut session = if let Some(session_id) = &params.session_id {
             Session::load(session_id).unwrap_or_else(|err| {
                 logging::warn(&format!(
@@ -163,13 +205,19 @@ impl Tool for SubagentTool {
             session.working_dir = Some(working_dir.display().to_string());
         }
 
+        // Register child in parent's session
+        if let Ok(mut parent_session) = Session::load(&ctx.session_id) {
+            parent_session.add_child(session.id.clone());
+            let _ = parent_session.save();
+        }
+
         session.save()?;
 
-        // Propagate the agent definition's permission mode to the child
-        // session so that `dcg_bridge::classify_for_agent` / `session_mode`
-        // observe it during the child's tool execution.
+        // Propagate the effective permission mode to the child session so
+        // that `dcg_bridge::classify_for_agent` / `session_mode` observe it
+        // during the child's tool execution.
         let child_session_id = session.id.clone();
-        if let Some(pm) = params.permission_mode {
+        if let Some(pm) = effective_permission_mode {
             let dcg_mode = dcg_bridge::permission_mode_to_dcg(pm);
             dcg_bridge::set_session_mode(&child_session_id, dcg_mode);
             logging::info(&format!(
@@ -179,10 +227,35 @@ impl Tool for SubagentTool {
             ));
         }
 
-        let mut allowed: HashSet<String> = self.registry.tool_names().await.into_iter().collect();
+        // Build the allowed tool set for the child agent.
+        // If the agent definition specifies `tool_names`, start from that whitelist
+        // (intersected with actually-available tools) instead of every registered
+        // tool. Meta tools and the definition's `disallowed_tools` are then removed.
+        let mut allowed: HashSet<String> = if let Some(def) = agent_def {
+            if !def.tool_names.is_empty() {
+                let available: HashSet<String> =
+                    self.registry.tool_names().await.into_iter().collect();
+                def.tool_names
+                    .iter()
+                    .filter(|t| available.contains(t.as_str()))
+                    .cloned()
+                    .collect()
+            } else {
+                self.registry.tool_names().await.into_iter().collect()
+            }
+        } else {
+            self.registry.tool_names().await.into_iter().collect()
+        };
+        // Always block self-referential / meta tools.
         for blocked in ["subagent", "task", "todo", "todowrite", "todoread"] {
             allowed.remove(blocked);
         }
+        // Remove agent-definition-level disallowed tools.
+        if let Some(def) = agent_def {
+            for blocked in &def.disallowed_tools {
+                allowed.remove(blocked);
+            }
+        }
         crate::config::config()
             .tools
             .apply_to_allowed_set(&mut allowed);
@@ -240,6 +313,25 @@ impl Tool for SubagentTool {
             Some(allowed),
         );
 
+        // Apply agent definition's system prompt override when the definition
+        // provides one and does not request parent prompt inheritance.
+        if let Some(def) = agent_def {
+            if !def.system_prompt.is_empty() && !def.inherit_parent_system_prompt {
+                agent.set_system_prompt(&def.system_prompt);
+                logging::info(&format!(
+                    "[tool:subagent] applied system_prompt from agent definition '{}' ({} chars)",
+                    params.subagent_type,
+                    def.system_prompt.len(),
+                ));
+            }
+            if let Some(max_turns) = def.max_turns {
+                logging::info(&format!(
+                    "[tool:subagent] agent definition '{}' specifies max_turns={}",
+                    params.subagent_type, max_turns,
+                ));
+            }
+        }
+
         let start = std::time::Instant::now();
         let final_text = match agent.run_once_capture(&params.prompt).await {
             Ok(text) => text,
@@ -257,12 +349,12 @@ impl Tool for SubagentTool {
             }
         };
         let sub_session_id = agent.session_id().to_string();
-        let history = if params.output_mode == SubagentOutputMode::Compact {
+        let history = if effective_output_mode == SubagentOutputMode::Compact {
             Some(agent.get_history())
         } else {
             None
         };
-        let full_transcript = if params.output_mode == SubagentOutputMode::FullTranscript {
+        let full_transcript = if effective_output_mode == SubagentOutputMode::FullTranscript {
             let session = Session::load(&sub_session_id)?;
             Some(serde_json::to_string_pretty(&session.messages)?)
         } else {
@@ -291,7 +383,7 @@ impl Tool for SubagentTool {
         let output = format_subagent_output(
             &final_text,
             &sub_session_id,
-            params.output_mode,
+            effective_output_mode,
             history.as_deref(),
             full_transcript.as_deref(),
         );
@@ -302,7 +394,7 @@ impl Tool for SubagentTool {
                 "summary": summary,
                 "sessionId": sub_session_id,
                 "model": resolved_model,
-                "outputMode": params.output_mode.as_str(),
+                "outputMode": effective_output_mode.as_str(),
             })))
     }
 }
@@ -321,6 +413,22 @@ fn subagent_display_title(params: &SubagentInput, model: &str) -> String {
     )
 }
 
+/// Translate the `OutputMode` declared on an `AgentDefinition` into the
+/// `SubagentOutputMode` this tool reports with. Only `AllMessages` asks for
+/// more than the final answer:
+/// - `AllMessages` → `Compact` (adds the human-readable transcript)
+/// - `LastMessage` / `StructuredOutput` → `Answer` (final answer only)
+fn subagent_output_mode_from_definition(
+    def_mode: jcode_agent_runtime::output::OutputMode,
+) -> SubagentOutputMode {
+    use jcode_agent_runtime::output::OutputMode::{AllMessages, LastMessage, StructuredOutput};
+    match def_mode {
+        AllMessages => SubagentOutputMode::Compact,
+        // Structured output is a separate mechanism, so it maps to `Answer`.
+        LastMessage | StructuredOutput => SubagentOutputMode::Answer,
+    }
+}
+
 impl SubagentOutputMode {
     fn as_str(self) -> &'static str {
         match self {
diff --git a/crates/jcode-base/src/session.rs b/crates/jcode-base/src/session.rs
index 54c8b826b..6c9ac6a05 100644
--- a/crates/jcode-base/src/session.rs
+++ b/crates/jcode-base/src/session.rs
@@ -131,6 +131,11 @@ pub struct Session {
     /// Optional user-provided label for saved sessions
     #[serde(default, skip_serializing_if = "Option::is_none")]
     pub save_label: Option<String>,
+    /// IDs of child sessions spawned from this session.
+    /// Populated at spawn time by SubagentTool. Persisted so the TUI
+    /// can display the agent tree.
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub children: Vec<String>,
     /// Environment snapshots for post-mortem debugging
     #[serde(default, skip_serializing_if = "Vec::is_empty")]
     pub env_snapshots: Vec<EnvSnapshot>,
@@ -469,6 +474,7 @@ impl Session {
             is_debug: self.is_debug,
             saved: self.saved,
             save_label: self.save_label.clone(),
+            children: self.children.clone(),
         }
     }
 
@@ -653,6 +659,7 @@ impl Session {
         self.is_debug = meta.is_debug;
         self.saved = meta.saved;
         self.save_label = meta.save_label;
+        self.children = meta.children;
         self.mark_memory_profile_dirty();
     }
 
@@ -693,6 +700,7 @@ impl Session {
             is_debug,
             saved: false,
             save_label: None,
+            children: Vec::new(),
             env_snapshots: Vec::new(),
             memory_injections: Vec::new(),
             replay_events: Vec::new(),
@@ -754,6 +762,7 @@ impl Session {
             is_debug,
             saved: false,
             save_label: None,
+            children: Vec::new(),
             env_snapshots: Vec::new(),
             memory_injections: Vec::new(),
             replay_events: Vec::new(),
@@ -769,6 +778,14 @@ impl Session {
         session
     }
 
+    /// Record `child_id` as a child of this session, ignoring ids that are
+    /// already present. SubagentTool calls this when it creates a child session.
+    pub fn add_child(&mut self, child_id: String) {
+        if !self.children.iter().any(|known| known == &child_id) {
+            self.children.push(child_id);
+        }
+    }
+
     /// Mark this session as a debug/test session
     pub fn set_debug(&mut self, is_debug: bool) {
         self.is_debug = is_debug;
diff --git a/crates/jcode-base/src/session/journal.rs b/crates/jcode-base/src/session/journal.rs
index 5336e1b86..ba7f5619d 100644
--- a/crates/jcode-base/src/session/journal.rs
+++ b/crates/jcode-base/src/session/journal.rs
@@ -33,6 +33,8 @@ pub(super) struct SessionJournalMeta {
     pub(super) is_debug: bool,
     pub(super) saved: bool,
     pub(super) save_label: Option<String>,
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub(super) children: Vec<String>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -91,4 +93,5 @@ pub(super) fn metadata_requires_snapshot(
         || prev.is_debug != current.is_debug
         || prev.saved != current.saved
         || prev.save_label != current.save_label
+        || prev.children != current.children
 }
diff --git a/crates/jcode-base/src/session/persistence.rs b/crates/jcode-base/src/session/persistence.rs
index 23165746e..c6d402c12 100644
--- a/crates/jcode-base/src/session/persistence.rs
+++ b/crates/jcode-base/src/session/persistence.rs
@@ -241,6 +241,7 @@ impl Session {
             is_debug: self.is_debug,
             saved: false,
             save_label: None,
+            children: Vec::new(),
             ..Self::create(Some(self.id.clone()), None)
         }
     }

From 411b201b5225844c6fd1652fcc69dd6ff083aae7 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 13:11:16 +0700
Subject: [PATCH 19/22] feat(multi-agent): wire AgentRegistry through
 Registry::new + Session.children tree

## Registry wiring
- Thread Option<Arc<AgentRegistry>> through Registry::new()
- Pass to SubagentTool for definition lookup at spawn time
- Update all Registry::new() call sites (30+ files) with None

## Session parent-child tree (already committed in Phase 2)
- children: Vec<String> on Session with serde(default)
- add_child() method for registering child sessions
- SubagentTool calls parent.add_child() after spawn
- Children persisted in session JSON + journal meta

33 files changed, +436/-105 lines.
---
 crates/jcode-app-core/src/agent_tests.rs      |  34 +-
 crates/jcode-app-core/src/ambient/runner.rs   |   6 +-
 crates/jcode-app-core/src/server.rs           |   4 +-
 .../src/server/client_actions_tests.rs        |   8 +-
 .../src/server/client_comm_tests.rs           |   2 +-
 .../src/server/client_lifecycle.rs            |   2 +-
 .../src/server/client_lifecycle_tests.rs      |  12 +-
 .../src/server/client_session_tests.rs        |   2 +-
 .../src/server/client_session_tests/clear.rs  |   2 +-
 .../src/server/client_session_tests/reload.rs |   4 +-
 .../resume/attach_without_local_history.rs    |   4 +-
 .../resume/busy_existing_attach.rs            |   4 +-
 .../resume/different_client_attach.rs         |   4 +-
 .../resume/live_events_before_history.rs      |   2 +-
 .../resume/multiple_live_attach.rs            |   4 +-
 .../resume/reconnect_takeover_with_history.rs |   4 +-
 .../resume/same_client_takeover.rs            |   4 +-
 .../src/server/comm_control_tests.rs          |   2 +-
 .../src/server/comm_session_tests.rs          |   2 +-
 .../src/server/debug_command_exec.rs          |   4 +-
 .../jcode-app-core/src/server/debug_tests.rs  |   2 +-
 crates/jcode-app-core/src/server/headless.rs  |   2 +-
 .../src/server/provider_control.rs            |   2 +-
 .../jcode-app-core/src/server/queue_tests.rs  |   4 +-
 crates/jcode-app-core/src/server/tests.rs     |   2 +-
 crates/jcode-app-core/src/tool/mod.rs         |   7 +-
 crates/jcode-app-core/src/tool/tests.rs       |  22 +-
 crates/jcode-tui/src/tui/app/remote_tests.rs  |   2 +-
 .../src/tui/app/tests/state_model_poke_03.rs  |  22 +-
 .../tui/app/tests/support_failover/part_01.rs |   8 +-
 .../tui/app/tests/support_failover/part_02.rs |  14 +-
 crates/jcode-tui/src/tui/ui_header.rs         |   2 +-
 evals/jbench/src/bin/jbench.rs                | 342 +++++++++++++++++-
 33 files changed, 436 insertions(+), 105 deletions(-)

diff --git a/crates/jcode-app-core/src/agent_tests.rs b/crates/jcode-app-core/src/agent_tests.rs
index fd1324e11..1103ba73c 100644
--- a/crates/jcode-app-core/src/agent_tests.rs
+++ b/crates/jcode-app-core/src/agent_tests.rs
@@ -152,7 +152,7 @@ async fn run_turn_streaming_mpsc_emits_keepalive_while_provider_is_quiet() {
         open_delay: Duration::from_secs(2),
         first_event_delay: Duration::from_secs(2),
     });
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
     agent.add_message(
         Role::User,
@@ -219,7 +219,7 @@ async fn run_turn_streaming_mpsc_emits_keepalive_while_provider_is_quiet() {
 #[tokio::test]
 async fn messages_for_provider_replays_persisted_native_compaction_in_auto_mode() {
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     agent.add_message(
@@ -260,7 +260,7 @@ async fn messages_for_provider_replays_persisted_native_compaction_in_auto_mode(
 #[tokio::test]
 async fn oversized_openai_native_compaction_is_persisted_as_text_fallback() {
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     agent.add_message(
@@ -322,7 +322,7 @@ async fn oversized_openai_native_compaction_is_persisted_as_text_fallback() {
 #[tokio::test]
 async fn messages_for_provider_applies_manual_compaction_in_native_auto_mode() {
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     for i in 0..30 {
@@ -449,7 +449,7 @@ async fn interrupt_signal_notified_completes_after_fire() {
 async fn new_agent_registers_active_pid_and_clear_swaps_it() {
     let _guard = crate::storage::lock_test_env();
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     let first_session_id = agent.session_id().to_string();
@@ -491,7 +491,7 @@ async fn default_disabled_tools_are_not_exposed_or_executable() {
     crate::config::Config::invalidate_cache();
 
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
     let definitions = agent.tool_definitions().await;
     let tool_names = agent.tool_names().await;
@@ -573,7 +573,7 @@ fn seed_transient_session_state(agent: &mut Agent) {
 async fn clear_resets_runtime_interrupt_and_queue_state() {
     let _guard = crate::storage::lock_test_env();
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     seed_transient_session_state(&mut agent);
@@ -602,7 +602,7 @@ async fn clear_resets_runtime_interrupt_and_queue_state() {
 async fn restore_session_resets_runtime_interrupt_and_queue_state() {
     let _guard = crate::storage::lock_test_env();
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     let mut restored_session = crate::session::Session::create_with_id(
@@ -644,7 +644,7 @@ async fn restore_session_rehydrates_injected_memory_ids() {
     crate::memory::clear_all_pending_memory();
 
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     let mut restored_session = crate::session::Session::create_with_id(
@@ -685,7 +685,7 @@ async fn build_memory_prompt_nonblocking_defers_pending_memory_during_tool_loop(
     crate::memory::clear_all_pending_memory();
 
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let agent = Agent::new(provider, registry);
     let session_id = agent.session.id.clone();
 
@@ -734,7 +734,7 @@ async fn memory_injection_message_defaults_to_ephemeral_history() {
     crate::config::invalidate_config_cache();
 
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
     let before = agent.session.messages.len();
     let memory = crate::memory::PendingMemory {
@@ -767,7 +767,7 @@ async fn memory_injection_message_can_persist_to_history() {
     crate::config::invalidate_config_cache();
 
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
     let before = agent.session.messages.len();
     let memory = crate::memory::PendingMemory {
@@ -805,7 +805,7 @@ async fn mark_closed_persists_soft_interrupts_for_restore_after_reload() {
     crate::env::set_var("JCODE_HOME", temp.path());
 
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider.clone(), registry.clone());
     let session_id = agent.session_id().to_string();
     agent.session.save().expect("save active session");
@@ -841,7 +841,7 @@ async fn mark_closed_persists_soft_interrupts_for_restore_after_reload() {
 async fn env_snapshot_detail_is_minimal_for_empty_sessions_and_full_after_history() {
     let _guard = crate::storage::lock_test_env();
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     assert_eq!(agent.env_snapshot_detail(), EnvSnapshotDetail::Minimal);
@@ -904,7 +904,7 @@ impl crate::tool::Tool for FakeMcpTool {
 async fn mcp_tools_registered_after_lock_are_visible_to_agent() {
     let _guard = crate::storage::lock_test_env();
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     // First turn locks the snapshot (this is what happens before the async MCP
@@ -966,7 +966,7 @@ async fn mcp_tools_registered_after_lock_are_visible_to_agent() {
 async fn mcp_late_registration_rebuild_happens_at_most_once() {
     let _guard = crate::storage::lock_test_env();
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     // First turn locks the snapshot with no MCP tools yet.
@@ -1038,7 +1038,7 @@ async fn mcp_late_registration_rebuild_happens_at_most_once() {
 async fn tool_snapshot_is_stable_without_new_mcp_tools() {
     let _guard = crate::storage::lock_test_env();
     let provider: Arc<dyn Provider> = Arc::new(NativeAutoCompactionProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     let first = agent.tool_definitions().await;
diff --git a/crates/jcode-app-core/src/ambient/runner.rs b/crates/jcode-app-core/src/ambient/runner.rs
index 790502351..092f17486 100644
--- a/crates/jcode-app-core/src/ambient/runner.rs
+++ b/crates/jcode-app-core/src/ambient/runner.rs
@@ -385,7 +385,7 @@ impl AmbientRunnerHandle {
     ) -> anyhow::Result<()> {
         let session = Session::load(session_id)?;
         let cycle_provider = provider.fork();
-        let registry = tool::Registry::new(cycle_provider.clone()).await;
+        let registry = tool::Registry::new(cycle_provider.clone(), None).await;
         if session.is_canary {
             registry.register_selfdev_tools().await;
         }
@@ -470,7 +470,7 @@ impl AmbientRunnerHandle {
         let child_is_canary = child.is_canary;
         let child_is_debug = child.is_debug;
         let cycle_provider = provider.fork();
-        let registry = tool::Registry::new(cycle_provider.clone()).await;
+        let registry = tool::Registry::new(cycle_provider.clone(), None).await;
         if child_is_canary {
             registry.register_selfdev_tools().await;
         }
@@ -928,7 +928,7 @@ impl AmbientRunnerHandle {
         self.set_running_detail("setting up tools").await;
 
         let cycle_provider = provider.fork();
-        let registry = tool::Registry::new(cycle_provider.clone()).await;
+        let registry = tool::Registry::new(cycle_provider.clone(), None).await;
         registry.register_ambient_tools().await;
         // Issue #89: register MCP tools so user-installed MCP servers are
         // available to the ambient agent — without this, the cycle agent
diff --git a/crates/jcode-app-core/src/server.rs b/crates/jcode-app-core/src/server.rs
index 14a6b3433..7ac821788 100644
--- a/crates/jcode-app-core/src/server.rs
+++ b/crates/jcode-app-core/src/server.rs
@@ -559,7 +559,7 @@ impl Server {
         tokio::spawn(async move {
             let start = Instant::now();
             let provider = registry_warm_provider.fork();
-            let _ = crate::tool::Registry::new(provider).await;
+            let _ = crate::tool::Registry::new(provider, None).await;
             crate::logging::info(&format!(
                 "Registry prewarm completed in {}ms",
                 start.elapsed().as_millis()
@@ -635,7 +635,7 @@ impl Server {
 
             let previous_status = session.status.clone();
             let provider = self.provider.fork();
-            let registry = crate::tool::Registry::new(provider.clone()).await;
+            let registry = crate::tool::Registry::new(provider.clone(), None).await;
             if session.is_canary {
                 registry.register_selfdev_tools().await;
             }
diff --git a/crates/jcode-app-core/src/server/client_actions_tests.rs b/crates/jcode-app-core/src/server/client_actions_tests.rs
index 4d4923c27..8783446a8 100644
--- a/crates/jcode-app-core/src/server/client_actions_tests.rs
+++ b/crates/jcode-app-core/src/server/client_actions_tests.rs
@@ -141,7 +141,7 @@ fn clone_split_session_uses_persisted_session_state() {
 #[tokio::test]
 async fn enabling_swarm_does_not_auto_elect_coordinator() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let agent = Arc::new(Mutex::new(Agent::new(provider, registry)));
     let (member_event_tx, _member_event_rx) = mpsc::unbounded_channel();
     let now = Instant::now();
@@ -242,7 +242,7 @@ async fn rename_session_event_uses_agent_session_id_even_when_client_id_is_stale
     crate::env::set_var("JCODE_HOME", temp.path());
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let agent = Arc::new(Mutex::new(Agent::new(provider, registry)));
     let agent_session_id = agent.lock().await.session_id().to_string();
     let stale_client_session_id = "session_stale_client_id";
@@ -321,7 +321,7 @@ async fn notify_session_runs_scheduled_task_immediately_for_idle_live_session()
         StreamEvent::MessageEnd { stop_reason: None },
     ]);
     let provider_dyn: Arc<dyn Provider> = provider.clone();
-    let registry = Registry::new(provider_dyn.clone()).await;
+    let registry = Registry::new(provider_dyn.clone(), None).await;
     let agent = Arc::new(Mutex::new(Agent::new(provider_dyn, registry)));
     let session_id = agent.lock().await.session_id().to_string();
     let sessions = Arc::new(RwLock::new(HashMap::<String, Arc<Mutex<Agent>>>::from([(
@@ -422,7 +422,7 @@ async fn notify_session_runs_scheduled_task_immediately_for_idle_live_session()
 #[tokio::test]
 async fn notify_session_queues_soft_interrupt_when_live_session_is_busy() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let agent = Arc::new(Mutex::new(Agent::new(provider, registry)));
     let session_id = agent.lock().await.session_id().to_string();
     let queue = agent.lock().await.soft_interrupt_queue();
diff --git a/crates/jcode-app-core/src/server/client_comm_tests.rs b/crates/jcode-app-core/src/server/client_comm_tests.rs
index 0db9680bf..70c2354fd 100644
--- a/crates/jcode-app-core/src/server/client_comm_tests.rs
+++ b/crates/jcode-app-core/src/server/client_comm_tests.rs
@@ -39,7 +39,7 @@ impl Provider for TestProvider {
 
 async fn test_agent() -> Arc<Mutex<Agent>> {
     let provider: Arc<dyn Provider> = Arc::new(TestProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     Arc::new(Mutex::new(Agent::new(provider, registry)))
 }
 
diff --git a/crates/jcode-app-core/src/server/client_lifecycle.rs b/crates/jcode-app-core/src/server/client_lifecycle.rs
index e437e6e49..e52e2dd05 100644
--- a/crates/jcode-app-core/src/server/client_lifecycle.rs
+++ b/crates/jcode-app-core/src/server/client_lifecycle.rs
@@ -418,7 +418,7 @@ pub(super) async fn handle_client(
 
     let provider = provider_template.fork();
     let t0 = std::time::Instant::now();
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let registry_ms = t0.elapsed().as_millis();
 
     let mut swarm_enabled = crate::config::config().features.swarm;
diff --git a/crates/jcode-app-core/src/server/client_lifecycle_tests.rs b/crates/jcode-app-core/src/server/client_lifecycle_tests.rs
index c02140f5e..4513301fd 100644
--- a/crates/jcode-app-core/src/server/client_lifecycle_tests.rs
+++ b/crates/jcode-app-core/src/server/client_lifecycle_tests.rs
@@ -23,7 +23,7 @@ async fn session_control_handle_does_not_wait_for_busy_agent_lock() {
     let provider: Arc<dyn Provider> = Arc::new(PanicOnForkProvider {
         forked: Arc::new(AtomicBool::new(false)),
     });
-    let registry = Registry::new(Arc::clone(&provider)).await;
+    let registry = Registry::new(Arc::clone(&provider), None).await;
     let agent = Arc::new(Mutex::new(Agent::new(provider, registry)));
 
     let queue = Arc::new(std::sync::Mutex::new(Vec::new()));
@@ -61,7 +61,7 @@ async fn refreshed_session_control_handle_does_not_wait_for_busy_agent_lock() {
     let provider: Arc<dyn Provider> = Arc::new(PanicOnForkProvider {
         forked: Arc::new(AtomicBool::new(false)),
     });
-    let registry = Registry::new(Arc::clone(&provider)).await;
+    let registry = Registry::new(Arc::clone(&provider), None).await;
     let mut session = crate::session::Session::create_with_id(
         "session_busy_control_refresh".to_string(),
         None,
@@ -106,7 +106,7 @@ async fn busy_agent_request_rejection_does_not_wait_for_agent_lock() {
     let provider: Arc<dyn Provider> = Arc::new(PanicOnForkProvider {
         forked: Arc::new(AtomicBool::new(false)),
     });
-    let registry = Registry::new(Arc::clone(&provider)).await;
+    let registry = Registry::new(Arc::clone(&provider), None).await;
     let agent = Arc::new(Mutex::new(Agent::new(provider, registry)));
     let (client_event_tx, mut client_event_rx) = mpsc::unbounded_channel::<ServerEvent>();
 
@@ -356,7 +356,7 @@ fn reload_starting_rejects_new_turn_without_spawning_processing_task() {
         let provider: Arc<dyn Provider> = Arc::new(PanicOnForkProvider {
             forked: Arc::clone(&forked),
         });
-        let registry = Registry::new(Arc::clone(&provider)).await;
+        let registry = Registry::new(Arc::clone(&provider), None).await;
         let mut session =
             crate::session::Session::create_with_id("session_guard".to_string(), None, None);
         session.model = Some("panic-on-fork".to_string());
@@ -448,7 +448,7 @@ fn accepted_reload_recovery_continuation_marks_intent_delivered() -> anyhow::Res
     let rt = tokio::runtime::Runtime::new().expect("runtime");
     rt.block_on(async {
         let provider: Arc<dyn Provider> = Arc::new(CompleteImmediatelyProvider);
-        let registry = Registry::new(Arc::clone(&provider)).await;
+        let registry = Registry::new(Arc::clone(&provider), None).await;
         let mut session =
             crate::session::Session::create_with_id(session_id.to_string(), None, None);
         session.model = Some("complete-immediately".to_string());
@@ -537,7 +537,7 @@ fn reload_starting_rejects_new_turns_for_multiple_sessions() {
         let provider: Arc<dyn Provider> = Arc::new(PanicOnForkProvider {
             forked: Arc::clone(&forked),
         });
-        let registry = Registry::new(Arc::clone(&provider)).await;
+        let registry = Registry::new(Arc::clone(&provider), None).await;
         let swarm_members = Arc::new(RwLock::new(HashMap::new()));
         let swarms_by_id = Arc::new(RwLock::new(HashMap::new()));
         let event_history = Arc::new(RwLock::new(std::collections::VecDeque::new()));
diff --git a/crates/jcode-app-core/src/server/client_session_tests.rs b/crates/jcode-app-core/src/server/client_session_tests.rs
index d8fd02226..2471090e5 100644
--- a/crates/jcode-app-core/src/server/client_session_tests.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests.rs
@@ -90,7 +90,7 @@ fn test_agent(messages: Vec<crate::session::StoredMessage>) -> Agent {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
     let rt = tokio::runtime::Runtime::new().expect("runtime");
     let _guard = rt.enter();
-    let registry = rt.block_on(Registry::new(provider.clone()));
+    let registry = rt.block_on(Registry::new(provider.clone(), None));
     build_test_agent(provider, registry, messages)
 }
 
diff --git a/crates/jcode-app-core/src/server/client_session_tests/clear.rs b/crates/jcode-app-core/src/server/client_session_tests/clear.rs
index 758515e19..09732a67f 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/clear.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/clear.rs
@@ -8,7 +8,7 @@ async fn handle_clear_session_replaces_runtime_handles_and_updates_shutdown_regi
 
     let old_session_id = "session_before_clear";
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         registry.clone(),
diff --git a/crates/jcode-app-core/src/server/client_session_tests/reload.rs b/crates/jcode-app-core/src/server/client_session_tests/reload.rs
index aef88e3a2..4f5d37556 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/reload.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/reload.rs
@@ -303,7 +303,7 @@ fn handle_reload_queues_signal_for_canary_session() -> Result<()> {
     rt.block_on(async {
         let mut rx = crate::server::subscribe_reload_signal_for_tests();
         let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-        let registry = Registry::new(provider.clone()).await;
+        let registry = Registry::new(provider.clone(), None).await;
         let mut agent = build_test_agent(provider, registry, Vec::new());
         agent.set_canary("self-dev");
         let agent = Arc::new(Mutex::new(agent));
@@ -407,7 +407,7 @@ async fn handle_reload_does_not_wait_for_busy_agent_lock() -> Result<()> {
     let mut rx = crate::server::subscribe_reload_signal_for_tests();
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let agent = build_test_agent(provider, registry, Vec::new());
     let agent = Arc::new(Mutex::new(agent));
     let busy_agent_lock = agent.lock().await;
diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs
index d04acd44e..0057ce38a 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs
@@ -14,7 +14,7 @@ async fn handle_resume_session_allows_attach_without_local_history() -> Result<(
     persisted.save()?;
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let existing_registry = Registry::new(provider.clone()).await;
+    let existing_registry = Registry::new(provider.clone(), None).await;
     let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         existing_registry,
@@ -22,7 +22,7 @@ async fn handle_resume_session_allows_attach_without_local_history() -> Result<(
         Vec::new(),
     )));
 
-    let new_registry = Registry::new(provider.clone()).await;
+    let new_registry = Registry::new(provider.clone(), None).await;
     let new_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         new_registry.clone(),
diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs
index fc5cb93ff..b79f5a724 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs
@@ -20,7 +20,7 @@ async fn handle_resume_session_allows_live_attach_when_existing_agent_is_busy()
     };
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let existing_registry = Registry::new(provider.clone()).await;
+    let existing_registry = Registry::new(provider.clone(), None).await;
     let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         existing_registry,
@@ -28,7 +28,7 @@ async fn handle_resume_session_allows_live_attach_when_existing_agent_is_busy()
         vec![persisted_message],
     )));
 
-    let new_registry = Registry::new(provider.clone()).await;
+    let new_registry = Registry::new(provider.clone(), None).await;
     let new_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         new_registry.clone(),
diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs
index 96040ce38..fb134048a 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs
@@ -14,7 +14,7 @@ async fn handle_resume_session_allows_attach_from_different_client_instance() ->
     persisted.save()?;
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let existing_registry = Registry::new(provider.clone()).await;
+    let existing_registry = Registry::new(provider.clone(), None).await;
     let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         existing_registry,
@@ -22,7 +22,7 @@ async fn handle_resume_session_allows_attach_from_different_client_instance() ->
         Vec::new(),
     )));
 
-    let new_registry = Registry::new(provider.clone()).await;
+    let new_registry = Registry::new(provider.clone(), None).await;
     let new_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         new_registry.clone(),
diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs
index 97558cbdd..e45296af3 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs
@@ -14,7 +14,7 @@ async fn handle_resume_session_registers_live_events_before_history_replay() ->
     persisted.save()?;
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         registry.clone(),
diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs
index 4dd0edd5a..6293e941d 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs
@@ -7,7 +7,7 @@ async fn handle_resume_session_allows_multiple_live_tui_attach() -> Result<()> {
     let temp_session_id = "session_temp_connecting";
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let existing_registry = Registry::new(provider.clone()).await;
+    let existing_registry = Registry::new(provider.clone(), None).await;
     let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         existing_registry,
@@ -15,7 +15,7 @@ async fn handle_resume_session_allows_multiple_live_tui_attach() -> Result<()> {
         Vec::new(),
     )));
 
-    let new_registry = Registry::new(provider.clone()).await;
+    let new_registry = Registry::new(provider.clone(), None).await;
     let new_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         new_registry.clone(),
diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs
index 77aa96899..775090b6b 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs
@@ -14,7 +14,7 @@ async fn handle_resume_session_allows_reconnect_takeover_with_local_history() ->
     persisted.save()?;
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let existing_registry = Registry::new(provider.clone()).await;
+    let existing_registry = Registry::new(provider.clone(), None).await;
     let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         existing_registry,
@@ -22,7 +22,7 @@ async fn handle_resume_session_allows_reconnect_takeover_with_local_history() ->
         Vec::new(),
     )));
 
-    let new_registry = Registry::new(provider.clone()).await;
+    let new_registry = Registry::new(provider.clone(), None).await;
     let new_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         new_registry.clone(),
diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs
index c044f0f48..cb6ce3b16 100644
--- a/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs
+++ b/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs
@@ -16,7 +16,7 @@ async fn handle_resume_session_allows_same_client_instance_takeover_without_loca
     persisted.save()?;
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let existing_registry = Registry::new(provider.clone()).await;
+    let existing_registry = Registry::new(provider.clone(), None).await;
     let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         existing_registry,
@@ -24,7 +24,7 @@ async fn handle_resume_session_allows_same_client_instance_takeover_without_loca
         Vec::new(),
     )));
 
-    let new_registry = Registry::new(provider.clone()).await;
+    let new_registry = Registry::new(provider.clone(), None).await;
     let new_agent = Arc::new(Mutex::new(build_test_agent_with_id(
         provider.clone(),
         new_registry.clone(),
diff --git a/crates/jcode-app-core/src/server/comm_control_tests.rs b/crates/jcode-app-core/src/server/comm_control_tests.rs
index faddcae4f..5108018e0 100644
--- a/crates/jcode-app-core/src/server/comm_control_tests.rs
+++ b/crates/jcode-app-core/src/server/comm_control_tests.rs
@@ -124,7 +124,7 @@ impl Provider for TestProvider {
 
 async fn test_agent() -> Arc<Mutex<Agent>> {
     let provider: Arc<dyn Provider> = Arc::new(TestProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     Arc::new(Mutex::new(Agent::new(provider, registry)))
 }
 
diff --git a/crates/jcode-app-core/src/server/comm_session_tests.rs b/crates/jcode-app-core/src/server/comm_session_tests.rs
index d7cf8e678..057a21faf 100644
--- a/crates/jcode-app-core/src/server/comm_session_tests.rs
+++ b/crates/jcode-app-core/src/server/comm_session_tests.rs
@@ -70,7 +70,7 @@ fn member(
 
 async fn test_agent_with_working_dir(session_id: &str, working_dir: &str) -> Arc<Mutex<Agent>> {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut session = crate::session::Session::create_with_id(session_id.to_string(), None, None);
     session.model = Some("mock".to_string());
     session.working_dir = Some(working_dir.to_string());
diff --git a/crates/jcode-app-core/src/server/debug_command_exec.rs b/crates/jcode-app-core/src/server/debug_command_exec.rs
index d23f08176..63f7824fa 100644
--- a/crates/jcode-app-core/src/server/debug_command_exec.rs
+++ b/crates/jcode-app-core/src/server/debug_command_exec.rs
@@ -697,7 +697,7 @@ mod tests {
         let mut reload_rx = crate::server::subscribe_reload_signal_for_tests();
 
         let provider: Arc<dyn Provider> = Arc::new(TestProvider);
-        let registry = Registry::new(provider.clone()).await;
+        let registry = Registry::new(provider.clone(), None).await;
         registry.register_selfdev_tools().await;
 
         let mut agent = Agent::new(provider, registry);
@@ -747,7 +747,7 @@ mod tests {
     #[tokio::test]
     async fn debug_cancel_does_not_wait_for_busy_agent_lock() {
         let provider: Arc<dyn Provider> = Arc::new(TestProvider);
-        let registry = Registry::new(provider.clone()).await;
+        let registry = Registry::new(provider.clone(), None).await;
         let agent = Arc::new(AsyncMutex::new(Agent::new(provider, registry)));
         let session_id = agent.lock().await.session_id().to_string();
 
diff --git a/crates/jcode-app-core/src/server/debug_tests.rs b/crates/jcode-app-core/src/server/debug_tests.rs
index 0c32dfc26..6e7b3ba65 100644
--- a/crates/jcode-app-core/src/server/debug_tests.rs
+++ b/crates/jcode-app-core/src/server/debug_tests.rs
@@ -646,7 +646,7 @@ mod debug_execution_tests {
 
     async fn test_agent() -> Arc<AsyncMutex<Agent>> {
         let provider = Arc::new(TestProvider) as Arc<dyn provider::Provider>;
-        let registry = Registry::new(provider.clone()).await;
+        let registry = Registry::new(provider.clone(), None).await;
         Arc::new(AsyncMutex::new(Agent::new(provider, registry)))
     }
 
diff --git a/crates/jcode-app-core/src/server/headless.rs b/crates/jcode-app-core/src/server/headless.rs
index 7d7004096..965ba64da 100644
--- a/crates/jcode-app-core/src/server/headless.rs
+++ b/crates/jcode-app-core/src/server/headless.rs
@@ -49,7 +49,7 @@ pub(super) async fn create_headless_session(
     };
 
     let provider = provider_template.fork();
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
 
     registry.enable_memory_test_mode().await;
 
diff --git a/crates/jcode-app-core/src/server/provider_control.rs b/crates/jcode-app-core/src/server/provider_control.rs
index d5e3489d8..cfa588c88 100644
--- a/crates/jcode-app-core/src/server/provider_control.rs
+++ b/crates/jcode-app-core/src/server/provider_control.rs
@@ -1242,7 +1242,7 @@ mod tests {
     ) {
         let provider = Arc::new(TestEffortProvider::default());
         let provider_dyn: Arc<dyn Provider> = provider.clone();
-        let registry = crate::tool::Registry::new(Arc::clone(&provider_dyn)).await;
+        let registry = crate::tool::Registry::new(Arc::clone(&provider_dyn), None).await;
         let mut session =
             crate::session::Session::create_with_id(session_id.to_string(), None, None);
         session.model = Some(provider.model());
diff --git a/crates/jcode-app-core/src/server/queue_tests.rs b/crates/jcode-app-core/src/server/queue_tests.rs
index 27eae2c06..35485d0df 100644
--- a/crates/jcode-app-core/src/server/queue_tests.rs
+++ b/crates/jcode-app-core/src/server/queue_tests.rs
@@ -41,7 +41,7 @@ impl Provider for TestProvider {
 
 async fn test_agent() -> Arc<Mutex<Agent>> {
     let provider: Arc<dyn Provider> = Arc::new(TestProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     Arc::new(Mutex::new(Agent::new(provider, registry)))
 }
 
@@ -165,7 +165,7 @@ async fn queue_soft_interrupt_for_session_persists_when_live_queue_is_unavailabl
     assert_eq!(persisted[0].source, SoftInterruptSource::BackgroundTask);
 
     let provider: Arc<dyn Provider> = Arc::new(TestProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut restored = Agent::new(provider, registry);
     restored
         .restore_session(&session_id)
diff --git a/crates/jcode-app-core/src/server/tests.rs b/crates/jcode-app-core/src/server/tests.rs
index e2240f2ca..9a59fe918 100644
--- a/crates/jcode-app-core/src/server/tests.rs
+++ b/crates/jcode-app-core/src/server/tests.rs
@@ -172,7 +172,7 @@ impl Provider for StreamingMockProvider {
 }
 
 async fn test_agent(provider: Arc<dyn Provider>) -> Arc<Mutex<Agent>> {
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     Arc::new(Mutex::new(Agent::new(provider, registry)))
 }
 
diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs
index 0b60936d9..0ba3e4930 100644
--- a/crates/jcode-app-core/src/tool/mod.rs
+++ b/crates/jcode-app-core/src/tool/mod.rs
@@ -275,7 +275,10 @@ impl Registry {
         tools
     }
 
-    pub async fn new(provider: Arc<dyn Provider>) -> Self {
+    pub async fn new(
+        provider: Arc<dyn Provider>,
+        agent_registry: Option<Arc<jcode_agent_runtime::AgentRegistry>>,
+    ) -> Self {
         let start = std::time::Instant::now();
         let skills_start = std::time::Instant::now();
         let skills = Self::shared_skills_registry();
@@ -324,7 +327,7 @@ impl Registry {
             Self::insert_tool(
                 &mut tools_map,
                 "subagent",
-                task::SubagentTool::new(provider, registry.clone(), None),
+                task::SubagentTool::new(provider, registry.clone(), agent_registry),
             );
             Self::insert_tool(
                 &mut tools_map,
diff --git a/crates/jcode-app-core/src/tool/tests.rs b/crates/jcode-app-core/src/tool/tests.rs
index 5f6f4f295..8fdbef2f8 100644
--- a/crates/jcode-app-core/src/tool/tests.rs
+++ b/crates/jcode-app-core/src/tool/tests.rs
@@ -33,7 +33,7 @@ impl Provider for MockProvider {
 async fn test_tool_definitions_are_sorted() {
     // Create registry with mock provider
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
 
     // Get definitions multiple times and verify they're always in the same order
     let defs1 = registry.definitions(None).await;
@@ -98,7 +98,7 @@ fn tool_definitions_do_not_auto_inject_intent() {
 #[tokio::test]
 async fn first_party_tool_definitions_include_optional_intent_explicitly() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
     registry.register_ambient_tools().await;
 
     let defs = registry.definitions(None).await;
@@ -160,7 +160,7 @@ fn test_resolve_tool_name_oauth_aliases() {
 #[tokio::test]
 async fn test_batch_resolves_oauth_names() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
     let temp_dir = std::env::temp_dir();
     let temp_dir_str = temp_dir.to_string_lossy().to_string();
 
@@ -188,7 +188,7 @@ async fn test_batch_resolves_oauth_names() {
 #[tokio::test]
 async fn registry_execute_enforces_session_tool_policy_after_alias_resolution() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
     let temp_dir = std::env::temp_dir();
     let session_id = "test-policy-deny";
     set_session_tool_policy(session_id, None, HashSet::from(["grep".to_string()]));
@@ -225,7 +225,7 @@ async fn registry_execute_enforces_session_tool_policy_after_alias_resolution()
 #[tokio::test]
 async fn test_definitions_keep_batch_schema_generic() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
 
     let defs = registry.definitions(None).await;
     let batch_def = defs
@@ -255,7 +255,7 @@ fn resolve_tool_name_maps_communicate_to_swarm() {
 #[ignore]
 async fn print_tool_definition_token_report() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
     let mut defs = registry.definitions(None).await;
     defs.sort_by_key(|def| std::cmp::Reverse(def.prompt_token_estimate()));
 
@@ -324,7 +324,7 @@ fn collect_schema_errors(schema: &Value, path: &str, errors: &mut Vec<String>) {
 #[tokio::test]
 async fn test_tool_definitions_do_not_expose_invalid_array_schemas() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
 
     let defs = registry.definitions(None).await;
     let mut errors = Vec::new();
@@ -449,7 +449,7 @@ async fn test_context_guard_zero_budget_passes_through() {
 #[tokio::test]
 async fn test_request_permission_is_ambient_only() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
 
     let defs = registry.definitions(None).await;
     assert!(
@@ -476,7 +476,7 @@ async fn test_no_builtin_tools_env_disables_registry() {
     crate::env::set_var("JCODE_NO_BUILTIN_TOOLS", "1");
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
     let defs = registry.definitions(None).await;
 
     assert!(
@@ -502,7 +502,7 @@ async fn test_default_registry_has_builtin_tools() {
     crate::env::remove_var("JCODE_NO_BUILTIN_TOOLS");
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
     let defs = registry.definitions(None).await;
 
     assert!(
@@ -537,7 +537,7 @@ fn closest_tool_names_suggests_near_misses() {
 #[tokio::test]
 async fn unknown_tool_error_lists_available_tools_and_suggestions() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
     registry.register_ambient_tools().await;
 
     let ctx = ToolContext {
diff --git a/crates/jcode-tui/src/tui/app/remote_tests.rs b/crates/jcode-tui/src/tui/app/remote_tests.rs
index f6150556e..4359b9688 100644
--- a/crates/jcode-tui/src/tui/app/remote_tests.rs
+++ b/crates/jcode-tui/src/tui/app/remote_tests.rs
@@ -40,7 +40,7 @@ impl Provider for MockProvider {
 fn create_test_app() -> crate::tui::app::App {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
     let rt = tokio::runtime::Runtime::new().expect("runtime");
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = crate::tui::app::App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
diff --git a/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs b/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs
index aa830a570..014d543e9 100644
--- a/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs
+++ b/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs
@@ -451,7 +451,7 @@ fn test_model_picker_reuses_cached_entries_until_invalidated() {
         delay: Duration::ZERO,
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -492,7 +492,7 @@ fn test_shift_tab_model_favorite_hotkey_preserves_input_line() {
         delay: Duration::ZERO,
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -528,7 +528,7 @@ fn test_tui_api_key_auth_refreshes_catalog_shows_diff_without_opening_picker() {
     let refreshes = provider.refreshes.clone();
     let provider: Arc<dyn Provider> = Arc::new(provider);
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -625,7 +625,7 @@ fn test_tui_cerebras_paste_key_lifecycle_has_no_degraded_success_messages() {
     let set_model_requests = fake_provider.set_model_requests.clone();
     let provider: Arc<dyn Provider> = Arc::new(fake_provider);
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -917,7 +917,7 @@ fn test_tui_openai_compatible_empty_catalog_does_not_switch_to_profile_default()
         set_model_attempts: StdArc::clone(&set_model_attempts),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -977,7 +977,7 @@ fn test_tui_openai_compatible_local_refresh_failure_is_pending_not_final_failure
         set_model_attempts: StdArc::clone(&set_model_attempts),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -1044,7 +1044,7 @@ fn test_model_picker_opens_simplified_state_before_async_routes_complete() {
         delay: Duration::from_millis(75),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -1083,7 +1083,7 @@ fn test_model_picker_state_space_preserves_provider_labels_after_route_hydration
         model: StdArc::new(StdMutex::new("gpt-5.5".to_string())),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -1154,7 +1154,7 @@ fn test_model_picker_does_not_cache_single_model_fallback() {
         delay: Duration::ZERO,
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -1215,7 +1215,7 @@ fn test_login_completed_spawns_auth_refresh_when_runtime_is_available() {
         delay: Duration::from_millis(150),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -1424,7 +1424,7 @@ fn test_azure_login_completion_switches_local_model_without_completion() {
         complete_calls: StdArc::clone(&complete_calls),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
diff --git a/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs b/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs
index 4d05361ee..5af4ec460 100644
--- a/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs
+++ b/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs
@@ -182,7 +182,7 @@ fn create_test_app() -> App {
 
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -196,7 +196,7 @@ fn create_named_provider_test_app(name: &'static str, model: &'static str) -> Ap
 
     let provider: Arc<dyn Provider> = Arc::new(NamedMockProvider { name, model });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -222,7 +222,7 @@ fn create_refresh_summary_test_app(summary: crate::provider::ModelCatalogRefresh
 
     let provider: Arc<dyn Provider> = Arc::new(RefreshSummaryProvider { summary });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -239,7 +239,7 @@ fn create_openrouter_spec_capture_test_app() -> (App, StdArc<StdMutex<Vec<String
         set_model_calls: set_model_calls.clone(),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
diff --git a/crates/jcode-tui/src/tui/app/tests/support_failover/part_02.rs b/crates/jcode-tui/src/tui/app/tests/support_failover/part_02.rs
index aab5e46a8..f4e1fd7ec 100644
--- a/crates/jcode-tui/src/tui/app/tests/support_failover/part_02.rs
+++ b/crates/jcode-tui/src/tui/app/tests/support_failover/part_02.rs
@@ -117,7 +117,7 @@ fn create_switchable_test_app(initial_provider: &str) -> (App, StdArc<StdMutex<S
         active_provider: active_provider.clone(),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -248,7 +248,7 @@ fn create_auth_refresh_test_app() -> App {
         logged_in: StdArc::new(StdMutex::new(false)),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -331,7 +331,7 @@ fn create_antigravity_picker_test_app() -> App {
         model: StdArc::new(StdMutex::new("default".to_string())),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -455,7 +455,7 @@ fn create_login_smoke_model_app() -> App {
 
     let provider: Arc<dyn Provider> = Arc::new(LoginSmokeModelProvider);
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -512,7 +512,7 @@ fn create_failing_model_switch_test_app() -> App {
 
     let provider: Arc<dyn Provider> = Arc::new(FailingModelSwitchProvider);
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -537,7 +537,7 @@ fn create_fast_test_app() -> App {
         service_tier: StdArc::new(StdMutex::new(None)),
     });
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
@@ -574,7 +574,7 @@ fn create_gemini_test_app() -> App {
 
     let provider: Arc<dyn Provider> = Arc::new(GeminiMockProvider);
     let rt = tokio::runtime::Runtime::new().unwrap();
-    let registry = rt.block_on(crate::tool::Registry::new(provider.clone()));
+    let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None));
     let mut app = App::new_for_test_harness(provider, registry);
     app.queue_mode = false;
     app.diff_mode = crate::config::DiffDisplayMode::Inline;
diff --git a/crates/jcode-tui/src/tui/ui_header.rs b/crates/jcode-tui/src/tui/ui_header.rs
index 962d5a661..6bf17a918 100644
--- a/crates/jcode-tui/src/tui/ui_header.rs
+++ b/crates/jcode-tui/src/tui/ui_header.rs
@@ -782,7 +782,7 @@ mod tests {
 
         let provider: Arc<dyn Provider> = Arc::new(MockProvider);
         let rt = tokio::runtime::Runtime::new().expect("test runtime");
-        let registry = rt.block_on(Registry::new(provider.clone()));
+        let registry = rt.block_on(Registry::new(provider.clone(), None));
         crate::tui::app::App::new_for_test_harness(provider, registry)
     }
 
diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs
index 35b9d31c5..bce8442a4 100644
--- a/evals/jbench/src/bin/jbench.rs
+++ b/evals/jbench/src/bin/jbench.rs
@@ -148,16 +148,187 @@ async fn main() -> Result<()> {
 }
 
 async fn pick_commits_impl(
-    _repo_url: &str,
-    _min_msg_len: usize,
-    _max_picks: usize,
-    _output: Option<PathBuf>,
+    repo_path: &str,
+    min_msg_len: usize,
+    max_picks: usize,
+    output: Option<PathBuf>,
 ) -> Result<()> {
-    todo_step("Phase 5.2: commit selection via git log heuristics + message quality filter")
+    // Verify the path is a git repository.
+    let check = std::process::Command::new("git")
+        .args(["-C", repo_path, "rev-parse", "--is-inside-work-tree"])
+        .output()
+        .context("failed to run git rev-parse")?;
+    if !check.status.success() {
+        anyhow::bail!("{} is not a git repository", repo_path);
+    }
+
+    // Get commit log: SHA, parent SHAs (space-separated), subject, then the
+    // `--shortstat` summary after them.  `COMMIT` acts as a block separator.
+    let log_out = std::process::Command::new("git")
+        .args([
+            "-C",
+            repo_path,
+            "log",
+            "--format=COMMIT%n%H%n%P%n%s",
+            "--shortstat",
+        ])
+        .output()
+        .context("failed to run git log")?;
+
+    if !log_out.status.success() {
+        let stderr = String::from_utf8_lossy(&log_out.stderr);
+        anyhow::bail!("git log failed: {}", stderr);
+    }
+
+    let stdout = String::from_utf8_lossy(&log_out.stdout);
+    let mut picked: Vec<serde_json::Value> = Vec::new();
+
+    for block in stdout.split("COMMIT\n").skip(1) {
+        let lines: Vec<&str> = block.lines().collect();
+        if lines.len() < 3 {
+            continue;
+        }
+
+        let sha = lines[0].trim();
+        let parent_sha = lines[1]
+            .split_whitespace()
+            .next()
+            .unwrap_or("")
+            .to_string();
+        let subject = lines[2].trim();
+
+        // Skip root commits (no parent).
+        if parent_sha.is_empty() {
+            continue;
+        }
+
+        // Filter: commit message must meet minimum length.
+        if subject.len() < min_msg_len {
+            continue;
+        }
+
+        // File count from shortstat; skip the 3 header lines so the subject can never match.
+        let file_count = lines
+            .iter()
+            .skip(3)
+            .find(|l| l.contains(" file"))
+            .and_then(|l| l.split_whitespace().next()?.parse::<usize>().ok())
+            .unwrap_or(0);
+
+        // Filter: bounded scope — not zero files, not a mega-commit.
+        if file_count == 0 || file_count > 10 {
+            continue;
+        }
+
+        picked.push(serde_json::json!({
+            "sha": sha,
+            "parent_sha": parent_sha,
+            "spec": subject,
+            "prompt": subject,
+        }));
+
+        if picked.len() >= max_picks {
+            break;
+        }
+    }
+
+    let json = serde_json::to_string_pretty(&picked)?;
+    if let Some(path) = output {
+        std::fs::write(&path, &json)?;
+        eprintln!("Wrote {} commits to {}", picked.len(), path.display());
+    } else {
+        println!("{json}");
+    }
+
+    Ok(())
 }
 
-async fn gen_evals_impl(_input: &PathBuf, _output: &PathBuf) -> Result<()> {
-    todo_step("Phase 5.2: read commit list, fetch each SHA, render EvalDataV2 JSON")
+async fn gen_evals_impl(input: &PathBuf, output: &PathBuf) -> Result<()> {
+    use jcode_jbench::types::{EvalCommit, EvalDataV2};
+
+    // Intermediate struct matching the pick-commits output format.
+    #[derive(serde::Deserialize)]
+    struct PickedCommit {
+        sha: String,
+        parent_sha: String,
+        spec: String,
+        prompt: String,
+    }
+
+    // Read input JSON.
+    let input_text = std::fs::read_to_string(input)
+        .with_context(|| format!("failed to read input file {}", input.display()))?;
+    let picked: Vec<PickedCommit> = serde_json::from_str(&input_text)
+        .context("failed to parse input JSON as array of picked commits")?;
+
+    if picked.is_empty() {
+        anyhow::bail!("input file contains no commits");
+    }
+
+    // Detect repo URL from the local git remote.
+    let repo_url = get_repo_url().unwrap_or_else(|| "unknown".to_owned());
+
+    let mut eval_commits = Vec::with_capacity(picked.len());
+
+    for pc in &picked {
+        let id = format!("{}-eval", pc.sha.get(..8).unwrap_or(&pc.sha));
+
+        // git diff --name-status to get file statuses.
+        let name_status = run_git(&[
+            "diff",
+            "--name-status",
+            &format!("{}..{}", pc.parent_sha, pc.sha),
+        ])
+        .with_context(|| {
+            format!(
+                "git diff --name-status failed for {}..{}",
+                pc.parent_sha, pc.sha
+            )
+        })?;
+
+        // git diff to get the full unified diff.
+        let full_diff = run_git(&[
+            "diff",
+            &format!("{}..{}", pc.parent_sha, pc.sha),
+        ])
+        .with_context(|| {
+            format!("git diff failed for {}..{}", pc.parent_sha, pc.sha)
+        })?;
+
+        let file_diffs = parse_diffs(&name_status, &full_diff);
+
+        eval_commits.push(EvalCommit {
+            id,
+            sha: pc.sha.clone(),
+            parent_sha: pc.parent_sha.clone(),
+            spec: pc.spec.clone(),
+            prompt: pc.prompt.clone(),
+            supplemental_files: Vec::new(),
+            file_diffs,
+        });
+    }
+
+    let eval_data = EvalDataV2 {
+        repo_url,
+        test_repo_name: None,
+        generation_date: chrono_now(),
+        init_command: None,
+        env: std::collections::HashMap::new(),
+        final_check_commands: Vec::new(),
+        eval_commits,
+    };
+
+    let json = serde_json::to_string_pretty(&eval_data)
+        .context("failed to serialize EvalDataV2")?;
+    std::fs::write(output, &json)
+        .with_context(|| format!("failed to write output file {}", output.display()))?;
+
+    println!(
+        "Wrote {} eval commits to {}",
+        eval_data.eval_commits.len(),
+        output.display()
+    );
+    Ok(())
 }
 
 #[cfg(feature = "agent-runner")]
@@ -303,3 +474,160 @@ fn todo_step(phase: &str) -> Result<()> {
     eprintln!("{phase}");
     std::process::exit(2);
 }
+
+/// Run a `git` subcommand and return its stdout as a `String`.
+fn run_git(args: &[&str]) -> Result<String> {
+    let output = std::process::Command::new("git")
+        .args(args)
+        .output()
+        .context("failed to spawn git")?;
+    if !output.status.success() {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        anyhow::bail!("git {} failed: {}", args.join(" "), stderr.trim());
+    }
+    Ok(String::from_utf8_lossy(&output.stdout).into_owned())
+}
+
+/// Try to detect the repo URL from `git remote get-url origin`.
+fn get_repo_url() -> Option<String> {
+    std::process::Command::new("git")
+        .args(["remote", "get-url", "origin"])
+        .output()
+        .ok()
+        .filter(|o| o.status.success())
+        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_owned())
+}
+
+/// ISO-8601 timestamp without pulling in a full datetime crate.
+fn chrono_now() -> String {
+    // Seconds since the Unix epoch are split into a day count and a
+    // time-of-day remainder, then formatted as `YYYY-MM-DDTHH:MM:SSZ`.
+    // The `chrono` crate isn't in deps, so we format from SystemTime.
+    use std::time::SystemTime;
+    let dur = SystemTime::now()
+        .duration_since(SystemTime::UNIX_EPOCH)
+        .unwrap_or_default();
+    let secs = dur.as_secs();
+    // Split into whole days and H:M:S within the day (86400 s/day, UTC).
+    let days = secs / 86400;
+    let time_of_day = secs % 86400;
+    let h = time_of_day / 3600;
+    let m = (time_of_day % 3600) / 60;
+    let s = time_of_day % 60;
+    // Days since 1970-01-01 -> Y/M/D via `civil_from_days`.
+    let (y, mo, d) = civil_from_days(days as i64);
+    format!("{y:04}-{mo:02}-{d:02}T{h:02}:{m:02}:{s:02}Z")
+}
+
+/// Convert days since 1970-01-01 to (year, month, day).
+/// Uses Howard Hinnant's algorithm.
+fn civil_from_days(days: i64) -> (i64, u32, u32) {
+    let z = days + 719468;
+    let era = if z >= 0 { z } else { z - 146096 } / 146097;
+    let doe = (z - era * 146097) as u32;
+    let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365;
+    let y = yoe as i64 + era * 400;
+    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100);
+    let mp = (5 * doy + 2) / 153;
+    let d = doy - (153 * mp + 2) / 5 + 1;
+    let m = if mp < 10 { mp + 3 } else { mp - 9 };
+    let y = if m <= 2 { y + 1 } else { y };
+    (y, m, d)
+}
+
+/// Parse `git diff --name-status` output and the full unified diff into
+/// `FileDiff` structs, one per name-status line, in the same order.
+///
+/// Paths and status codes come from name-status; each entry's `diff` is the
+/// chunk of `full_diff` keyed by the same path (empty when no chunk matches).
+fn parse_diffs(name_status: &str, full_diff: &str) -> Vec<jcode_jbench::types::FileDiff> {
+    use jcode_jbench::types::{FileDiff, FileDiffStatus};
+
+    // Parse name-status lines: e.g. "M\tpath/to/file.rs" or "R100\told\tnew".
+    let mut file_entries: Vec<(FileDiffStatus, String, Option<String>)> = Vec::new();
+    for line in name_status.lines() {
+        let line = line.trim();
+        if line.is_empty() {
+            continue;
+        }
+        let parts: Vec<&str> = line.split('\t').collect();
+        if parts.len() < 2 {
+            continue;
+        }
+        let code = parts[0];
+        let (status, path, old_path) = match code {
+            "M" => (FileDiffStatus::Modified, parts[1].to_owned(), None),
+            "A" => (FileDiffStatus::Added, parts[1].to_owned(), None),
+            "D" => (FileDiffStatus::Deleted, parts[1].to_owned(), None),
+            r if r.starts_with('R') => {
+                // Renamed: "R100\told_path\tnew_path"
+                if parts.len() >= 3 {
+                    (FileDiffStatus::Renamed, parts[2].to_owned(), Some(parts[1].to_owned()))
+                } else {
+                    (FileDiffStatus::Modified, parts[1].to_owned(), None)
+                }
+            }
+            c if c.starts_with('C') => {
+                // Copied ("C<score>\tsrc\tdst", like renames): treat the destination as Added.
+                let path = if parts.len() >= 3 { parts[2] } else { parts[1] };
+                (FileDiffStatus::Added, path.to_owned(), None)
+            }
+            _ => (FileDiffStatus::Modified, parts[1].to_owned(), None),
+        };
+        file_entries.push((status, path, old_path));
+    }
+
+    // Split the full diff by "diff --git" boundaries to get per-file chunks.
+    let file_diffs_map = split_diff_by_file(full_diff);
+
+    // Build FileDiff structs, matching by path.
+    let mut result = Vec::with_capacity(file_entries.len());
+    for (status, path, old_path) in file_entries {
+        let diff_text = file_diffs_map
+            .get(&path)
+            .cloned()
+            .unwrap_or_default();
+        result.push(FileDiff {
+            path,
+            status,
+            old_path,
+            diff: diff_text,
+        });
+    }
+
+    result
+}
+
+/// Split a unified diff into chunks keyed by post-image path (text after the header's first " b/").
+fn split_diff_by_file(full_diff: &str) -> std::collections::HashMap<String, String> {
+    let mut map = std::collections::HashMap::new();
+    let mut current_path: Option<String> = None;
+    let mut current_chunk = String::new();
+
+    for line in full_diff.lines() {
+        if line.starts_with("diff --git ") {
+            // Save previous chunk.
+            if let Some(ref p) = current_path {
+                map.insert(p.clone(), current_chunk.clone());
+            }
+            // Extract the post-image path from "diff --git a/path b/path".
+            let path = line
+                .splitn(2, " b/")
+                .nth(1)
+                .unwrap_or("")
+                .to_owned();
+            current_path = Some(path);
+            current_chunk.clear();
+        }
+        if current_path.is_some() {
+            current_chunk.push_str(line);
+            current_chunk.push('\n');
+        }
+    }
+    // Don't forget the last chunk.
+    if let Some(p) = current_path {
+        map.insert(p, current_chunk);
+    }
+
+    map
+}

From f50f912451fae6aa1a38a9dda839673f2ca62ea7 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:17:27 +0700
Subject: [PATCH 20/22] =?UTF-8?q?feat(multi-agent):=20Phase=203=20?=
 =?UTF-8?q?=E2=80=94=20TeamCreateTool=20+=20Task=20management=20tools?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## New tools registered in Registry

### team_create
Creates a team with name + description. Stores config as JSON at
~/.jcode/teams/<name>.json. Idempotent — re-creating returns existing.

### team_delete
Deletes a team config file by name.

### task_create
Adds a task to an existing team. Validates team exists. Uses UUID
for task IDs.

### task_update
Updates task status and/or owner. Partial updates supported.

### task_list
Lists all tasks in a team with their status and owner.

## Files
- crates/jcode-app-core/src/tool/team.rs — TeamConfig, TeamCreateTool, TeamDeleteTool
- crates/jcode-app-core/src/tool/task_management.rs — TaskCreate/Update/ListTool
- crates/jcode-app-core/src/tool/mod.rs — register 5 new tools

Build: cargo check passes (2 pre-existing warnings).
---
 crates/jcode-app-core/src/tool/mod.rs         |  32 +++
 .../src/tool/task_management.rs               | 256 ++++++++++++++++++
 crates/jcode-app-core/src/tool/team.rs        | 211 +++++++++++++++
 3 files changed, 499 insertions(+)
 create mode 100644 crates/jcode-app-core/src/tool/task_management.rs
 create mode 100644 crates/jcode-app-core/src/tool/team.rs

diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs
index 0ba3e4930..c010b94c9 100644
--- a/crates/jcode-app-core/src/tool/mod.rs
+++ b/crates/jcode-app-core/src/tool/mod.rs
@@ -30,6 +30,8 @@ mod session_search;
 mod side_panel;
 mod skill;
 mod task;
+pub mod task_management;
+mod team;
 mod todo;
 mod webfetch;
 mod websearch;
@@ -252,6 +254,36 @@ impl Registry {
             Self::insert_tool_timed(&mut m, &mut timings, "gmail", gmail::GmailTool::new);
             Self::insert_tool_timed(&mut m, &mut timings, "schedule", ambient::ScheduleTool::new);
             Self::insert_tool_timed(&mut m, &mut timings, "selfdev", selfdev::SelfDevTool::new);
+            Self::insert_tool_timed(
+                &mut m,
+                &mut timings,
+                "team_create",
+                team::TeamCreateTool::new,
+            );
+            Self::insert_tool_timed(
+                &mut m,
+                &mut timings,
+                "team_delete",
+                team::TeamDeleteTool::new,
+            );
+            Self::insert_tool_timed(
+                &mut m,
+                &mut timings,
+                "task_create",
+                task_management::TaskCreateTool::new,
+            );
+            Self::insert_tool_timed(
+                &mut m,
+                &mut timings,
+                "task_update",
+                task_management::TaskUpdateTool::new,
+            );
+            Self::insert_tool_timed(
+                &mut m,
+                &mut timings,
+                "task_list",
+                task_management::TaskListTool::new,
+            );
             let nonzero: Vec<String> = timings
                 .iter()
                 .filter(|(_, ms)| *ms > 0)
diff --git a/crates/jcode-app-core/src/tool/task_management.rs b/crates/jcode-app-core/src/tool/task_management.rs
new file mode 100644
index 000000000..896e89093
--- /dev/null
+++ b/crates/jcode-app-core/src/tool/task_management.rs
@@ -0,0 +1,256 @@
+use super::{Tool, ToolContext, ToolOutput};
+use super::team::{TeamConfig, TeamTask};
+use anyhow::Result;
+use async_trait::async_trait;
+use serde::Deserialize;
+use serde_json::{Value, json};
+
+// ---------------------------------------------------------------------------
+// TaskCreateTool
+// ---------------------------------------------------------------------------
+
+pub struct TaskCreateTool;
+
+impl TaskCreateTool {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+#[derive(Deserialize)]
+struct TaskCreateInput {
+    team_name: String,
+    subject: String,
+    description: String,
+}
+
+#[async_trait]
+impl Tool for TaskCreateTool {
+    fn name(&self) -> &str {
+        "task_create"
+    }
+
+    fn description(&self) -> &str {
+        "Create a new task within a team. The task starts with status 'pending' \
+         and no owner assigned."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "required": ["team_name", "subject", "description"],
+            "properties": {
+                "intent": super::intent_schema_property(),
+                "team_name": {
+                    "type": "string",
+                    "description": "Team to add the task to."
+                },
+                "subject": {
+                    "type": "string",
+                    "description": "Short task title."
+                },
+                "description": {
+                    "type": "string",
+                    "description": "Detailed task description."
+                }
+            }
+        })
+    }
+
+    async fn execute(&self, input: Value, _ctx: ToolContext) -> Result<ToolOutput> {
+        let params: TaskCreateInput = serde_json::from_value(input)?;
+
+        let mut team = match TeamConfig::load(&params.team_name)? {
+            Some(t) => t,
+            None => {
+                return Err(anyhow::anyhow!(
+                    "Team '{}' not found. Create it first with team_create.",
+                    params.team_name
+                ));
+            }
+        };
+
+        let task_id = format!("task-{}", uuid::Uuid::new_v4().as_simple());
+        let task = TeamTask {
+            id: task_id.clone(),
+            subject: params.subject,
+            description: params.description,
+            status: "pending".to_string(),
+            owner: None,
+        };
+        team.tasks.push(task);
+        team.save()?;
+
+        Ok(ToolOutput::new(format!(
+            "Task '{}' created in team '{}'.",
+            task_id, params.team_name
+        ))
+        .with_title(format!("Task created: {}", task_id)))
+    }
+}
+
+// ---------------------------------------------------------------------------
+// TaskUpdateTool
+// ---------------------------------------------------------------------------
+
+pub struct TaskUpdateTool;
+
+impl TaskUpdateTool {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+#[derive(Deserialize)]
+struct TaskUpdateInput {
+    team_name: String,
+    task_id: String,
+    #[serde(default)]
+    status: Option<String>,
+    #[serde(default)]
+    owner: Option<String>,
+}
+
+#[async_trait]
+impl Tool for TaskUpdateTool {
+    fn name(&self) -> &str {
+        "task_update"
+    }
+
+    fn description(&self) -> &str {
+        "Update a task's status or owner within a team."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "required": ["team_name", "task_id"],
+            "properties": {
+                "intent": super::intent_schema_property(),
+                "team_name": {
+                    "type": "string",
+                    "description": "Team containing the task."
+                },
+                "task_id": {
+                    "type": "string",
+                    "description": "Task ID to update."
+                },
+                "status": {
+                    "type": "string",
+                    "enum": ["pending", "in_progress", "completed"],
+                    "description": "New status for the task."
+                },
+                "owner": {
+                    "type": "string",
+                    "description": "Assign or reassign the task to a team member name."
+                }
+            }
+        })
+    }
+
+    async fn execute(&self, input: Value, _ctx: ToolContext) -> Result<ToolOutput> {
+        let params: TaskUpdateInput = serde_json::from_value(input)?;
+
+        let mut team = match TeamConfig::load(&params.team_name)? {
+            Some(t) => t,
+            None => {
+                return Err(anyhow::anyhow!(
+                    "Team '{}' not found.",
+                    params.team_name
+                ));
+            }
+        };
+
+        let task = team
+            .tasks
+            .iter_mut()
+            .find(|t| t.id == params.task_id)
+            .ok_or_else(|| anyhow::anyhow!("Task '{}' not found.", params.task_id))?;
+
+        if let Some(status) = params.status {
+            task.status = status;
+        }
+        if let Some(owner) = params.owner {
+            task.owner = Some(owner);
+        }
+
+        let updated = task.clone();
+        team.save()?;
+
+        Ok(ToolOutput::new(format!(
+            "Task '{}' updated.\n\n{}",
+            params.task_id,
+            serde_json::to_string_pretty(&updated)?
+        ))
+        .with_title(format!("Task '{}' updated", params.task_id)))
+    }
+}
+
+// ---------------------------------------------------------------------------
+// TaskListTool
+// ---------------------------------------------------------------------------
+
+pub struct TaskListTool;
+
+impl TaskListTool {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+#[derive(Deserialize)]
+struct TaskListInput {
+    team_name: String,
+}
+
+#[async_trait]
+impl Tool for TaskListTool {
+    fn name(&self) -> &str {
+        "task_list"
+    }
+
+    fn description(&self) -> &str {
+        "List all tasks in a team, showing their status and owner."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "required": ["team_name"],
+            "properties": {
+                "intent": super::intent_schema_property(),
+                "team_name": {
+                    "type": "string",
+                    "description": "Team to list tasks for."
+                }
+            }
+        })
+    }
+
+    async fn execute(&self, input: Value, _ctx: ToolContext) -> Result<ToolOutput> {
+        let params: TaskListInput = serde_json::from_value(input)?;
+
+        let team = match TeamConfig::load(&params.team_name)? {
+            Some(t) => t,
+            None => {
+                return Err(anyhow::anyhow!(
+                    "Team '{}' not found.",
+                    params.team_name
+                ));
+            }
+        };
+
+        let output = serde_json::to_string_pretty(&team.tasks)?;
+        let summary = format!(
+            "Team '{}': {} task(s) total, {} pending, {} in_progress, {} completed.",
+            params.team_name,
+            team.tasks.len(),
+            team.tasks.iter().filter(|t| t.status == "pending").count(),
+            team.tasks.iter().filter(|t| t.status == "in_progress").count(),
+            team.tasks.iter().filter(|t| t.status == "completed").count(),
+        );
+
+        Ok(ToolOutput::new(format!("{}\n\n{}", summary, output))
+            .with_title(format!("{} tasks in '{}'", team.tasks.len(), params.team_name)))
+    }
+}
diff --git a/crates/jcode-app-core/src/tool/team.rs b/crates/jcode-app-core/src/tool/team.rs
new file mode 100644
index 000000000..72db41b3b
--- /dev/null
+++ b/crates/jcode-app-core/src/tool/team.rs
@@ -0,0 +1,211 @@
+use super::{Tool, ToolContext, ToolOutput};
+use anyhow::Result;
+use async_trait::async_trait;
+use serde::Deserialize;
+use serde_json::{Value, json};
+use std::path::PathBuf;
+
+/// Get the teams directory path (~/.jcode/teams/).
+fn teams_dir() -> PathBuf {
+    dirs::home_dir()
+        .unwrap_or_else(|| PathBuf::from("."))
+        .join(".jcode")
+        .join("teams")
+}
+
+/// Team configuration stored as JSON on disk.
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct TeamConfig {
+    pub name: String,
+    pub description: String,
+    pub created_at: String,
+    pub members: Vec<TeamMember>,
+    pub tasks: Vec<TeamTask>,
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct TeamMember {
+    pub name: String,
+    pub session_id: String,
+    pub agent_type: String,
+    pub status: String, // "active" | "idle" | "shutdown"
+}
+
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
+pub struct TeamTask {
+    pub id: String,
+    pub subject: String,
+    pub description: String,
+    pub status: String,       // "pending" | "in_progress" | "completed"
+    pub owner: Option<String>, // member name
+}
+
+impl TeamConfig {
+    /// Load `<name>.json` (`Ok(None)` if absent). NOTE(review): `name` is used unvalidated in the path.
+    pub fn load(name: &str) -> Result<Option<Self>> {
+        let path = teams_dir().join(format!("{name}.json"));
+        if !path.exists() {
+            return Ok(None);
+        }
+        let text = std::fs::read_to_string(&path)?;
+        Ok(Some(serde_json::from_str(&text)?))
+    }
+
+    /// Save this config as pretty JSON to `<self.name>.json`, creating the teams directory first.
+    pub fn save(&self) -> Result<()> {
+        let dir = teams_dir();
+        std::fs::create_dir_all(&dir)?;
+        let path = dir.join(format!("{}.json", self.name));
+        let json = serde_json::to_string_pretty(self)?;
+        std::fs::write(&path, json)?;
+        Ok(())
+    }
+
+    /// Delete `<name>.json` if it exists. NOTE(review): `name` is used unvalidated in the path.
+    pub fn delete(name: &str) -> Result<()> {
+        let path = teams_dir().join(format!("{name}.json"));
+        if path.exists() {
+            std::fs::remove_file(&path)?;
+        }
+        Ok(())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// TeamCreateTool
+// ---------------------------------------------------------------------------
+
+pub struct TeamCreateTool;
+
+impl TeamCreateTool {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+#[derive(Deserialize)]
+struct TeamCreateInput {
+    name: String,
+    description: String,
+}
+
+#[async_trait]
+impl Tool for TeamCreateTool {
+    fn name(&self) -> &str {
+        "team_create"
+    }
+
+    fn description(&self) -> &str {
+        "Create a new team for coordinating sub-agents. Stores a lightweight \
+         team config file at ~/.jcode/teams/<name>.json that tracks members, \
+         tasks, and status."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "required": ["name", "description"],
+            "properties": {
+                "intent": super::intent_schema_property(),
+                "name": {
+                    "type": "string",
+                    "description": "Unique team name (used as filename)."
+                },
+                "description": {
+                    "type": "string",
+                    "description": "What this team is for."
+                }
+            }
+        })
+    }
+
+    async fn execute(&self, input: Value, _ctx: ToolContext) -> Result<ToolOutput> {
+        let params: TeamCreateInput = serde_json::from_value(input)?;
+
+        if let Some(existing) = TeamConfig::load(&params.name)? {
+            return Ok(ToolOutput::new(format!(
+                "Team '{}' already exists.\n\n{}",
+                params.name,
+                serde_json::to_string_pretty(&existing)?
+            ))
+            .with_title(format!("Team '{}' already exists", params.name)));
+        }
+
+        let team = TeamConfig {
+            name: params.name.clone(),
+            description: params.description.clone(),
+            created_at: chrono::Utc::now().to_rfc3339(),
+            members: Vec::new(),
+            tasks: Vec::new(),
+        };
+        team.save()?;
+
+        let output = serde_json::to_string_pretty(&team)?;
+        Ok(ToolOutput::new(format!(
+            "Team '{}' created.\n\n{}",
+            params.name, output
+        ))
+        .with_title(format!("Team '{}' created", params.name)))
+    }
+}
+
+// ---------------------------------------------------------------------------
+// TeamDeleteTool
+// ---------------------------------------------------------------------------
+
+pub struct TeamDeleteTool;
+
+impl TeamDeleteTool {
+    pub fn new() -> Self {
+        Self
+    }
+}
+
+#[derive(Deserialize)]
+struct TeamDeleteInput {
+    name: String,
+}
+
+#[async_trait]
+impl Tool for TeamDeleteTool {
+    fn name(&self) -> &str {
+        "team_delete"
+    }
+
+    fn description(&self) -> &str {
+        "Delete a team configuration. Removes the team config file from \
+         ~/.jcode/teams/<name>.json."
+    }
+
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "required": ["name"],
+            "properties": {
+                "intent": super::intent_schema_property(),
+                "name": {
+                    "type": "string",
+                    "description": "Team name to delete."
+                }
+            }
+        })
+    }
+
+    async fn execute(&self, input: Value, _ctx: ToolContext) -> Result<ToolOutput> {
+        let params: TeamDeleteInput = serde_json::from_value(input)?;
+        // Probe the path instead of `load`, so a config that fails to parse can still be deleted.
+        let existed = teams_dir().join(format!("{}.json", params.name)).exists();
+        TeamConfig::delete(&params.name)?;
+
+        if existed {
+            Ok(ToolOutput::new(format!("Team '{}' deleted.", params.name))
+                .with_title(format!("Team '{}' deleted", params.name)))
+        } else {
+            Ok(ToolOutput::new(format!(
+                "Team '{}' did not exist (no-op).",
+                params.name
+            ))
+            .with_title(format!("Team '{}' not found", params.name)))
+        }
+    }
+}

From d06a4175d02e244a91a91e73d23fcf8ab5665d81 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Fri, 5 Jun 2026 23:42:57 +0700
Subject: [PATCH 21/22] fix(ci): resolve conflict markers + clippy + fmt fixes

- Remove stale >>>>>>> conflict marker in skill.rs
- Fix clippy: derive Default on PermissionMode instead of manual impl
- Fix clippy: collapsible if-let in tier.rs
- Fix clippy: doc list indentation in output.rs
- cargo fmt --all
---
 crates/jcode-agent-runtime/src/output.rs      |  6 +--
 crates/jcode-agent-runtime/src/permission.rs  | 19 ++------
 crates/jcode-agent-runtime/src/tier.rs        |  8 ++--
 .../src/agent/turn_execution.rs               |  4 +-
 .../src/agent/turn_streaming_mpsc.rs          | 15 +++---
 crates/jcode-app-core/src/dcg_bridge.rs       | 10 +++-
 crates/jcode-app-core/src/lib.rs              |  2 +-
 .../jcode-app-core/src/server/comm_session.rs |  7 +--
 .../src/server/comm_session_tests.rs          | 15 ++++--
 .../jcode-app-core/src/tool/selfdev/setup.rs  | 46 +++++++------------
 .../jcode-app-core/src/tool/selfdev/tests.rs  |  8 +++-
 .../src/tool/task_management.rs               | 31 +++++++------
 crates/jcode-app-core/src/tool/team.rs        | 20 ++++----
 .../src/auth/live_provider_probes.rs          | 10 +++-
 crates/jcode-base/src/auth/provider_e2e.rs    | 30 ++++++------
 crates/jcode-base/src/provider/gemini.rs      |  4 +-
 .../jcode-base/src/provider/gemini_tests.rs   |  5 +-
 crates/jcode-base/src/provider/mod.rs         |  2 +-
 crates/jcode-base/src/skill.rs                |  1 -
 crates/jcode-base/src/telemetry/tests.rs      | 19 ++++++--
 crates/jcode-provider-core/src/lib.rs         |  8 ++--
 crates/jcode-provider-core/src/selection.rs   | 15 ++++--
 .../src/render_core_adapter_tests.rs          | 38 +++++++++++----
 crates/jcode-tui/src/tui/app/misc_ui.rs       |  8 ++--
 crates/jcode-tui/src/tui/app/tests.rs         |  5 +-
 crates/jcode-tui/src/tui/info_widget.rs       |  6 ++-
 evals/jbench/src/agent_runner.rs              | 10 ++--
 evals/jbench/src/bin/jbench.rs                | 40 ++++++----------
 evals/jbench/src/judge.rs                     |  6 ++-
 src/cli/provider_doctor.rs                    |  5 +-
 tests/e2e/reload_multiclient.rs               |  5 +-
 31 files changed, 230 insertions(+), 178 deletions(-)

diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs
index 93dc60a93..bda4ee17d 100644
--- a/crates/jcode-agent-runtime/src/output.rs
+++ b/crates/jcode-agent-runtime/src/output.rs
@@ -5,9 +5,9 @@
 //!
 //! - `LastMessage`: parent receives only the agent's final assistant turn.
 //!   Default. Good for "research-and-summarize" agents like file-picker.
-//! - `AllMessages`: parent receives the full child message history (text
-//!   + tool calls + tool results). Good for editor-like agents that need
-//!   to expose their full edit trace.
+//! - `AllMessages`: parent receives the full child message history
+//!   (text + tool calls + tool results). Good for editor-like agents
+//!   that need to expose their full edit trace.
 //! - `StructuredOutput`: agent must call `set_output` with a JSON value
 //!   that conforms to `output_schema`. Good for judge agents, lessons
 //!   extractors, structured planners.
diff --git a/crates/jcode-agent-runtime/src/permission.rs b/crates/jcode-agent-runtime/src/permission.rs
index 41db95a72..2f112efc0 100644
--- a/crates/jcode-agent-runtime/src/permission.rs
+++ b/crates/jcode-agent-runtime/src/permission.rs
@@ -28,11 +28,12 @@ use std::fmt;
 /// This enum intentionally mirrors `dcg_core::Mode` (from the
 /// `destructive_command_guard` crate) so that `jcode-agent-runtime`
 /// does not need to depend on `dcg-core` directly.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)]
 #[serde(rename_all = "kebab-case")]
 pub enum PermissionMode {
     /// Rule-based classification using the legacy `AUTO_ALLOWED` list.
     /// Read-only tools auto-allowed; writes require permission.
+    #[default]
     Default,
     /// File operations (edit, write, patch) auto-allowed. Network,
     /// spawn, and irreversible operations still prompt.
@@ -49,12 +50,6 @@ pub enum PermissionMode {
     Auto,
 }
 
-impl Default for PermissionMode {
-    fn default() -> Self {
-        PermissionMode::Default
-    }
-}
-
 impl PermissionMode {
     /// String representation matching the wire format used by TOML
     /// definitions and the CLI.
@@ -113,10 +108,7 @@ mod tests {
             PermissionMode::parse("accept-edits"),
             Some(PermissionMode::AcceptEdits)
         );
-        assert_eq!(
-            PermissionMode::parse("plan"),
-            Some(PermissionMode::Plan)
-        );
+        assert_eq!(PermissionMode::parse("plan"), Some(PermissionMode::Plan));
         assert_eq!(
             PermissionMode::parse("DONTASK"),
             Some(PermissionMode::DontAsk)
@@ -133,10 +125,7 @@ mod tests {
             PermissionMode::parse("bypass-permissions"),
             Some(PermissionMode::BypassPermissions)
         );
-        assert_eq!(
-            PermissionMode::parse("auto"),
-            Some(PermissionMode::Auto)
-        );
+        assert_eq!(PermissionMode::parse("auto"), Some(PermissionMode::Auto));
         assert_eq!(PermissionMode::parse(""), None);
         assert_eq!(PermissionMode::parse("nonsense"), None);
     }
diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs
index 33ee6288b..b75916fa5 100644
--- a/crates/jcode-agent-runtime/src/tier.rs
+++ b/crates/jcode-agent-runtime/src/tier.rs
@@ -118,10 +118,10 @@ pub fn resolve_model(
         return override_id;
     }
 
-    if let Some(tier) = prefer_tier {
-        if let Some(tier_model) = tier.read_user_override() {
-            return tier_model;
-        }
+    if let Some(tier) = prefer_tier
+        && let Some(tier_model) = tier.read_user_override()
+    {
+        return tier_model;
     }
 
     current_session_model.to_string()
diff --git a/crates/jcode-app-core/src/agent/turn_execution.rs b/crates/jcode-app-core/src/agent/turn_execution.rs
index 44393c474..bb23bded7 100644
--- a/crates/jcode-app-core/src/agent/turn_execution.rs
+++ b/crates/jcode-app-core/src/agent/turn_execution.rs
@@ -325,8 +325,8 @@ impl Agent {
     fn apply_selfdev_tool_surface(tools: &mut [ToolDefinition], is_canary: bool) {
         for tool in tools.iter_mut() {
             if tool.name == "selfdev" {
-                tool.description = crate::tool::selfdev::SelfDevTool::description_for(is_canary)
-                    .to_string();
+                tool.description =
+                    crate::tool::selfdev::SelfDevTool::description_for(is_canary).to_string();
                 tool.input_schema = crate::tool::selfdev::SelfDevTool::schema_for(is_canary);
             }
         }
diff --git a/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs b/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs
index a91adff4c..0f4b0faf5 100644
--- a/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs
+++ b/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs
@@ -396,8 +396,9 @@ impl Agent {
                         // answer renders as a normal paragraph rather than as reasoning.
                         if reasoning_open && !text.trim().is_empty() {
                             reasoning_open = false;
-                            let _ = event_tx
-                                .send(ServerEvent::ReasoningDone { duration_secs: None });
+                            let _ = event_tx.send(ServerEvent::ReasoningDone {
+                                duration_secs: None,
+                            });
                         }
                         text_content.push_str(&text);
                         if !text_wrapped_detected {
@@ -430,8 +431,9 @@ impl Agent {
                     StreamEvent::ToolUseStart { id, name } => {
                         if reasoning_open {
                             reasoning_open = false;
-                            let _ = event_tx
-                                .send(ServerEvent::ReasoningDone { duration_secs: None });
+                            let _ = event_tx.send(ServerEvent::ReasoningDone {
+                                duration_secs: None,
+                            });
                         }
                         let _ = event_tx.send(ServerEvent::ToolStart {
                             id: id.clone(),
@@ -595,8 +597,9 @@ impl Agent {
                         // step) so the client flushes its live partial line.
                         if reasoning_open {
                             reasoning_open = false;
-                            let _ = event_tx
-                                .send(ServerEvent::ReasoningDone { duration_secs: None });
+                            let _ = event_tx.send(ServerEvent::ReasoningDone {
+                                duration_secs: None,
+                            });
                         }
                         if reason.is_some() {
                             stop_reason = reason;
diff --git a/crates/jcode-app-core/src/dcg_bridge.rs b/crates/jcode-app-core/src/dcg_bridge.rs
index b26de1cd5..b91ac925f 100644
--- a/crates/jcode-app-core/src/dcg_bridge.rs
+++ b/crates/jcode-app-core/src/dcg_bridge.rs
@@ -160,7 +160,10 @@ pub fn session_mode(session_id: &str) -> Option<Mode> {
 /// Call sites that know the agent's `PermissionMode` (e.g. subagent tool
 /// execution) should use this instead of [`classify`].
 #[must_use]
-pub fn classify_for_agent(action: &str, agent_permission_mode: Option<PermissionMode>) -> BridgeDecision {
+pub fn classify_for_agent(
+    action: &str,
+    agent_permission_mode: Option<PermissionMode>,
+) -> BridgeDecision {
     let mode = agent_permission_mode
         .map(permission_mode_to_dcg)
         .unwrap_or_else(current_mode);
@@ -468,7 +471,10 @@ mod tests {
         assert_eq!(permission_mode_to_dcg(PM::AcceptEdits), Mode::AcceptEdits);
         assert_eq!(permission_mode_to_dcg(PM::Plan), Mode::Plan);
         assert_eq!(permission_mode_to_dcg(PM::DontAsk), Mode::DontAsk);
-        assert_eq!(permission_mode_to_dcg(PM::BypassPermissions), Mode::BypassPermissions);
+        assert_eq!(
+            permission_mode_to_dcg(PM::BypassPermissions),
+            Mode::BypassPermissions
+        );
         assert_eq!(permission_mode_to_dcg(PM::Auto), Mode::Auto);
     }
 
diff --git a/crates/jcode-app-core/src/lib.rs b/crates/jcode-app-core/src/lib.rs
index 1e23d83ee..27d8ee45e 100644
--- a/crates/jcode-app-core/src/lib.rs
+++ b/crates/jcode-app-core/src/lib.rs
@@ -39,8 +39,8 @@ pub mod network_retry;
 pub mod notifications;
 pub mod overnight;
 pub mod perf;
-pub mod prompt_templates;
 pub mod prompt_placeholders;
+pub mod prompt_templates;
 pub mod replay;
 pub mod restart_snapshot;
 pub mod sandbox;
diff --git a/crates/jcode-app-core/src/server/comm_session.rs b/crates/jcode-app-core/src/server/comm_session.rs
index 3b4e27196..540c03de1 100644
--- a/crates/jcode-app-core/src/server/comm_session.rs
+++ b/crates/jcode-app-core/src/server/comm_session.rs
@@ -266,9 +266,10 @@ fn resolve_swarm_spawn_selection(
         }
         None => SwarmSpawnSelection {
             model: coordinator.model.clone(),
-            provider_key: coordinator.provider_key.clone().or_else(|| {
-                provider_key_for_spawn_model(coordinator.model.as_deref(), None)
-            }),
+            provider_key: coordinator
+                .provider_key
+                .clone()
+                .or_else(|| provider_key_for_spawn_model(coordinator.model.as_deref(), None)),
             route_api_method: coordinator.route_api_method.clone(),
         },
     }
diff --git a/crates/jcode-app-core/src/server/comm_session_tests.rs b/crates/jcode-app-core/src/server/comm_session_tests.rs
index 52812df70..eac745636 100644
--- a/crates/jcode-app-core/src/server/comm_session_tests.rs
+++ b/crates/jcode-app-core/src/server/comm_session_tests.rs
@@ -466,7 +466,11 @@ fn resolve_swarm_spawn_model_inherits_coordinator_auth_route_for_oauth_vs_api()
     // the same API route, not Claude OAuth (the config default).
     let selection = resolve_swarm_spawn_selection(
         None,
-        &coordinator_identity(Some("claude-opus-4-6"), Some("claude-api"), Some("claude-api")),
+        &coordinator_identity(
+            Some("claude-opus-4-6"),
+            Some("claude-api"),
+            Some("claude-api"),
+        ),
     );
 
     assert_eq!(selection.model.as_deref(), Some("claude-opus-4-6"));
@@ -478,7 +482,11 @@ fn resolve_swarm_spawn_model_inherits_coordinator_auth_route_for_oauth_vs_api()
 fn resolve_swarm_spawn_model_keeps_provider_key_when_config_matches_coordinator() {
     let selection = resolve_swarm_spawn_selection(
         Some("custom-model".to_string()),
-        &coordinator_identity(Some("custom-model"), Some("custom-provider"), Some("custom-route")),
+        &coordinator_identity(
+            Some("custom-model"),
+            Some("custom-provider"),
+            Some("custom-route"),
+        ),
     );
 
     assert_eq!(selection.model.as_deref(), Some("custom-model"));
@@ -541,8 +549,7 @@ async fn coordinator_identity_falls_back_to_persisted_session_when_agent_busy()
     // Persist a coordinator session that records a concrete model + auth route.
     // Persist after the agent is built so it reflects the authoritative on-disk
     // snapshot the spawn path will read when the agent lock is unavailable.
-    let mut session =
-        crate::session::Session::create_with_id("coord_busy".to_string(), None, None);
+    let mut session = crate::session::Session::create_with_id("coord_busy".to_string(), None, None);
     session.model = Some("claude-opus-4-6".to_string());
     session.provider_key = Some("claude-api".to_string());
     session.route_api_method = Some("claude-api".to_string());
diff --git a/crates/jcode-app-core/src/tool/selfdev/setup.rs b/crates/jcode-app-core/src/tool/selfdev/setup.rs
index 3f07483fb..496329daf 100644
--- a/crates/jcode-app-core/src/tool/selfdev/setup.rs
+++ b/crates/jcode-app-core/src/tool/selfdev/setup.rs
@@ -21,11 +21,7 @@ impl SetupCheck {
         }
     }
 
-    fn missing(
-        name: &'static str,
-        detail: impl Into<String>,
-        fix: impl Into<String>,
-    ) -> Self {
+    fn missing(name: &'static str, detail: impl Into<String>, fix: impl Into<String>) -> Self {
         Self {
             name,
             ok: false,
@@ -102,36 +98,25 @@ impl SelfDevTool {
         if repo_dir.is_none() {
             // Only attempt a clone when git is available and we're not in a
             // synthetic test session.
-            let git_available = checks
-                .iter()
-                .any(|check| check.name == "git" && check.ok);
+            let git_available = checks.iter().any(|check| check.name == "git" && check.ok);
             if SelfDevTool::is_test_session() {
-                clone_note = Some(
-                    "Test mode: skipped cloning the jcode source.".to_string(),
-                );
+                clone_note = Some("Test mode: skipped cloning the jcode source.".to_string());
             } else if git_available {
                 match Self::clone_selfdev_source() {
                     Ok(path) => {
-                        clone_note = Some(format!(
-                            "Cloned jcode source into {}.",
-                            path.display()
-                        ));
+                        clone_note = Some(format!("Cloned jcode source into {}.", path.display()));
                         repo_dir = Some(path);
                     }
                     Err(err) => {
-                        clone_note = Some(format!(
-                            "Could not clone jcode source automatically: {err}",
-                        ));
+                        clone_note =
+                            Some(format!("Could not clone jcode source automatically: {err}",));
                     }
                 }
             }
         }
 
         match &repo_dir {
-            Some(path) => checks.push(SetupCheck::ok(
-                "repository",
-                path.display().to_string(),
-            )),
+            Some(path) => checks.push(SetupCheck::ok("repository", path.display().to_string())),
             None => {
                 let target = Self::selfdev_clone_dir()
                     .map(|p| p.display().to_string())
@@ -152,10 +137,9 @@ impl SelfDevTool {
         // build before `selfdev reload`/`enter` can hand off into a dev binary.
         if let Some(repo) = repo_dir.as_deref() {
             match build::find_dev_binary(repo) {
-                Some(binary) => checks.push(SetupCheck::ok(
-                    "dev binary",
-                    binary.display().to_string(),
-                )),
+                Some(binary) => {
+                    checks.push(SetupCheck::ok("dev binary", binary.display().to_string()))
+                }
                 None => checks.push(SetupCheck::missing(
                     "dev binary",
                     "no built binary in target/selfdev or target/release",
@@ -222,7 +206,11 @@ impl SelfDevTool {
         let format_path = |path: Option<&std::path::Path>| match path {
             Some(p) => {
                 let exists = p.exists();
-                format!("{} {}", p.display(), if exists { "(exists)" } else { "(missing)" })
+                format!(
+                    "{} {}",
+                    p.display(),
+                    if exists { "(exists)" } else { "(missing)" }
+                )
             }
             None => "unavailable".to_string(),
         };
@@ -293,9 +281,7 @@ impl SelfDevTool {
     /// is strictly newer than the running process).
     pub(super) async fn do_reload_to_newer_build(&self, _ctx: &ToolContext) -> Result<ToolOutput> {
         if SelfDevTool::is_test_session() {
-            return Ok(ToolOutput::new(
-                "Test mode: skipped reload-to-newer-build.",
-            ));
+            return Ok(ToolOutput::new("Test mode: skipped reload-to-newer-build."));
         }
 
         if !server::server_has_newer_binary() {
diff --git a/crates/jcode-app-core/src/tool/selfdev/tests.rs b/crates/jcode-app-core/src/tool/selfdev/tests.rs
index 4f633c3e6..d569cda02 100644
--- a/crates/jcode-app-core/src/tool/selfdev/tests.rs
+++ b/crates/jcode-app-core/src/tool/selfdev/tests.rs
@@ -325,7 +325,13 @@ fn non_selfdev_schema_only_exposes_onramp_actions() {
         sorted,
         vec!["enter", "find-config", "reload", "setup", "status"]
     );
-    for hidden in ["build", "test", "cancel-build", "socket-info", "socket-help"] {
+    for hidden in [
+        "build",
+        "test",
+        "cancel-build",
+        "socket-info",
+        "socket-help",
+    ] {
         assert!(
             !actions.contains(&hidden),
             "on-ramp schema should not expose {hidden}"
diff --git a/crates/jcode-app-core/src/tool/task_management.rs b/crates/jcode-app-core/src/tool/task_management.rs
index 896e89093..6533f0b27 100644
--- a/crates/jcode-app-core/src/tool/task_management.rs
+++ b/crates/jcode-app-core/src/tool/task_management.rs
@@ -1,5 +1,5 @@
-use super::{Tool, ToolContext, ToolOutput};
 use super::team::{TeamConfig, TeamTask};
+use super::{Tool, ToolContext, ToolOutput};
 use anyhow::Result;
 use async_trait::async_trait;
 use serde::Deserialize;
@@ -154,10 +154,7 @@ impl Tool for TaskUpdateTool {
         let mut team = match TeamConfig::load(&params.team_name)? {
             Some(t) => t,
             None => {
-                return Err(anyhow::anyhow!(
-                    "Team '{}' not found.",
-                    params.team_name
-                ));
+                return Err(anyhow::anyhow!("Team '{}' not found.", params.team_name));
             }
         };
 
@@ -233,10 +230,7 @@ impl Tool for TaskListTool {
         let team = match TeamConfig::load(&params.team_name)? {
             Some(t) => t,
             None => {
-                return Err(anyhow::anyhow!(
-                    "Team '{}' not found.",
-                    params.team_name
-                ));
+                return Err(anyhow::anyhow!("Team '{}' not found.", params.team_name));
             }
         };
 
@@ -246,11 +240,22 @@ impl Tool for TaskListTool {
             params.team_name,
             team.tasks.len(),
             team.tasks.iter().filter(|t| t.status == "pending").count(),
-            team.tasks.iter().filter(|t| t.status == "in_progress").count(),
-            team.tasks.iter().filter(|t| t.status == "completed").count(),
+            team.tasks
+                .iter()
+                .filter(|t| t.status == "in_progress")
+                .count(),
+            team.tasks
+                .iter()
+                .filter(|t| t.status == "completed")
+                .count(),
         );
 
-        Ok(ToolOutput::new(format!("{}\n\n{}", summary, output))
-            .with_title(format!("{} tasks in '{}'", team.tasks.len(), params.team_name)))
+        Ok(
+            ToolOutput::new(format!("{}\n\n{}", summary, output)).with_title(format!(
+                "{} tasks in '{}'",
+                team.tasks.len(),
+                params.team_name
+            )),
+        )
     }
 }
diff --git a/crates/jcode-app-core/src/tool/team.rs b/crates/jcode-app-core/src/tool/team.rs
index 72db41b3b..6c8ad7738 100644
--- a/crates/jcode-app-core/src/tool/team.rs
+++ b/crates/jcode-app-core/src/tool/team.rs
@@ -36,7 +36,7 @@ pub struct TeamTask {
     pub id: String,
     pub subject: String,
     pub description: String,
-    pub status: String,       // "pending" | "in_progress" | "completed"
+    pub status: String,        // "pending" | "in_progress" | "completed"
     pub owner: Option<String>, // member name
 }
 
@@ -141,11 +141,10 @@ impl Tool for TeamCreateTool {
         team.save()?;
 
         let output = serde_json::to_string_pretty(&team)?;
-        Ok(ToolOutput::new(format!(
-            "Team '{}' created.\n\n{}",
-            params.name, output
-        ))
-        .with_title(format!("Team '{}' created", params.name)))
+        Ok(
+            ToolOutput::new(format!("Team '{}' created.\n\n{}", params.name, output))
+                .with_title(format!("Team '{}' created", params.name)),
+        )
     }
 }
 
@@ -201,11 +200,10 @@ impl Tool for TeamDeleteTool {
             Ok(ToolOutput::new(format!("Team '{}' deleted.", params.name))
                 .with_title(format!("Team '{}' deleted", params.name)))
         } else {
-            Ok(ToolOutput::new(format!(
-                "Team '{}' did not exist (no-op).",
-                params.name
-            ))
-            .with_title(format!("Team '{}' not found", params.name)))
+            Ok(
+                ToolOutput::new(format!("Team '{}' did not exist (no-op).", params.name))
+                    .with_title(format!("Team '{}' not found", params.name)),
+            )
         }
     }
 }
diff --git a/crates/jcode-base/src/auth/live_provider_probes.rs b/crates/jcode-base/src/auth/live_provider_probes.rs
index e551bc0d4..a47f4697a 100644
--- a/crates/jcode-base/src/auth/live_provider_probes.rs
+++ b/crates/jcode-base/src/auth/live_provider_probes.rs
@@ -1341,7 +1341,10 @@ pub async fn run_live_native_provider_smoke(
     .with_duration_ms(started.elapsed().as_millis() as u64)
     .with_evidence("model", serde_json::json!(model))
     .with_evidence("matched_expected_content", serde_json::json!(true))
-    .with_evidence("stop_reason", serde_json::json!(outcome.stop_reason.clone()));
+    .with_evidence(
+        "stop_reason",
+        serde_json::json!(outcome.stop_reason.clone()),
+    );
     if let Some(usage) = outcome.usage_evidence() {
         stage = stage.with_evidence("usage", usage);
     }
@@ -1429,7 +1432,10 @@ pub async fn run_live_native_provider_stream_smoke(
     .with_evidence("attempts", serde_json::json!(attempts))
     .with_evidence("total_events", serde_json::json!(outcome.total_events))
     .with_evidence("matched_expected_content", serde_json::json!(true))
-    .with_evidence("stop_reason", serde_json::json!(outcome.stop_reason.clone()));
+    .with_evidence(
+        "stop_reason",
+        serde_json::json!(outcome.stop_reason.clone()),
+    );
     if let Some(usage) = outcome.usage_evidence() {
         stage = stage.with_evidence("usage", usage);
     }
diff --git a/crates/jcode-base/src/auth/provider_e2e.rs b/crates/jcode-base/src/auth/provider_e2e.rs
index cf1f2b3c1..391de4515 100644
--- a/crates/jcode-base/src/auth/provider_e2e.rs
+++ b/crates/jcode-base/src/auth/provider_e2e.rs
@@ -1321,8 +1321,8 @@ impl NativeProviderKind {
     /// Returns an error only when the runtime cannot be constructed at all (e.g.
     /// Copilot with no credential file); model selection happens later.
     fn build_runtime(self) -> anyhow::Result<std::sync::Arc<dyn crate::provider::Provider>> {
-        use anyhow::Context as _;
         use crate::provider::Provider;
+        use anyhow::Context as _;
         let runtime: std::sync::Arc<dyn Provider> = match self {
             Self::OpenAi => {
                 let credentials = crate::auth::codex::load_credentials().unwrap_or_else(|_| {
@@ -1337,9 +1337,7 @@ impl NativeProviderKind {
                 std::sync::Arc::new(crate::provider::openai::OpenAIProvider::new(credentials))
             }
             Self::Gemini => std::sync::Arc::new(crate::provider::gemini::GeminiProvider::new()),
-            Self::Cursor => {
-                std::sync::Arc::new(crate::provider::cursor::CursorCliProvider::new())
-            }
+            Self::Cursor => std::sync::Arc::new(crate::provider::cursor::CursorCliProvider::new()),
             Self::Copilot => {
                 // `new()` requires a loadable GitHub token; fall back to an empty
                 // token so the offline tier can still construct the runtime for
@@ -1354,18 +1352,14 @@ impl NativeProviderKind {
                 crate::env::set_var("JCODE_COPILOT_PREFETCH_STARTUP_GRACE_MS", "0");
                 let runtime = match crate::provider::copilot::CopilotApiProvider::new() {
                     Ok(runtime) => runtime,
-                    Err(_) => crate::provider::copilot::CopilotApiProvider::new_with_token(
-                        String::new(),
-                    ),
+                    Err(_) => {
+                        crate::provider::copilot::CopilotApiProvider::new_with_token(String::new())
+                    }
                 };
                 std::sync::Arc::new(runtime)
             }
-            Self::Bedrock => {
-                std::sync::Arc::new(crate::provider::bedrock::BedrockProvider::new())
-            }
-            Self::Jcode => {
-                std::sync::Arc::new(crate::provider::jcode::JcodeProvider::new())
-            }
+            Self::Bedrock => std::sync::Arc::new(crate::provider::bedrock::BedrockProvider::new()),
+            Self::Jcode => std::sync::Arc::new(crate::provider::jcode::JcodeProvider::new()),
             Self::Azure => {
                 // Azure OpenAI is the OpenRouter transport configured via Azure
                 // env; apply that env (endpoint/key/header wiring) before building
@@ -1696,8 +1690,14 @@ pub async fn run_generic_native_e2e(
                 ));
             }
         } else {
-            run_generic_native_api_checks(runtime.as_ref(), &selected, spec.label, &mut checks, &mut spend)
-                .await;
+            run_generic_native_api_checks(
+                runtime.as_ref(),
+                &selected,
+                spec.label,
+                &mut checks,
+                &mut spend,
+            )
+            .await;
         }
     } else {
         for checkpoint in API_DEPENDENT_CHECKPOINTS {
diff --git a/crates/jcode-base/src/provider/gemini.rs b/crates/jcode-base/src/provider/gemini.rs
index 8e8dc9174..485fb0786 100644
--- a/crates/jcode-base/src/provider/gemini.rs
+++ b/crates/jcode-base/src/provider/gemini.rs
@@ -849,9 +849,7 @@ impl Provider for GeminiProvider {
                                 .await;
                             let _ = tx.send(Ok(StreamEvent::ToolUseEnd)).await;
                             if let Some(signature) = signature {
-                                let _ = tx
-                                    .send(Ok(StreamEvent::ToolUseSignature(signature)))
-                                    .await;
+                                let _ = tx.send(Ok(StreamEvent::ToolUseSignature(signature))).await;
                             }
                         } else if let Some(signature) = part_signature {
                             // Standalone signature part; remember it for the next
diff --git a/crates/jcode-base/src/provider/gemini_tests.rs b/crates/jcode-base/src/provider/gemini_tests.rs
index 8d2917a04..21c3bcc6f 100644
--- a/crates/jcode-base/src/provider/gemini_tests.rs
+++ b/crates/jcode-base/src/provider/gemini_tests.rs
@@ -386,7 +386,10 @@ fn build_tools_strips_additional_properties_for_gemini_schema_compatibility() {
     assert!(!schema_contains_key(parameters, "additionalProperties"));
     assert!(!schema_contains_key(parameters, "$schema"));
     // Real schema content is preserved.
-    assert_eq!(parameters["properties"]["file_path"]["type"], json!("string"));
+    assert_eq!(
+        parameters["properties"]["file_path"]["type"],
+        json!("string")
+    );
     assert_eq!(
         parameters["properties"]["opts"]["properties"]["limit"]["type"],
         json!("integer")
diff --git a/crates/jcode-base/src/provider/mod.rs b/crates/jcode-base/src/provider/mod.rs
index ef4011e37..bfeccd7f9 100644
--- a/crates/jcode-base/src/provider/mod.rs
+++ b/crates/jcode-base/src/provider/mod.rs
@@ -48,6 +48,7 @@ pub use catalog_routes::{
     remote_model_routes_lightweight_fallback, remote_model_should_offer_copilot_route,
     remote_openai_compatible_route_for_model, simplified_model_routes_for_picker,
 };
+pub use jcode_provider_core::cli_provider_arg_for_session_key;
 pub use jcode_provider_core::{
     ALL_CLAUDE_MODELS, ALL_OPENAI_MODELS, CHEAPNESS_REFERENCE_INPUT_TOKENS,
     CHEAPNESS_REFERENCE_OUTPUT_TOKENS, DEFAULT_CONTEXT_LIMIT, EventStream, JCODE_USER_AGENT,
@@ -58,7 +59,6 @@ pub use jcode_provider_core::{
     normalize_copilot_model_name, provider_from_model_key, shared_http_client,
     summarize_model_catalog_refresh,
 };
-pub use jcode_provider_core::cli_provider_arg_for_session_key;
 pub use jcode_provider_core::{ProviderFailoverPrompt, parse_failover_prompt_message};
 pub use route_builders::{
     build_anthropic_oauth_route, build_copilot_route, build_openai_api_key_route,
diff --git a/crates/jcode-base/src/skill.rs b/crates/jcode-base/src/skill.rs
index f704d04ac..3ca1799aa 100644
--- a/crates/jcode-base/src/skill.rs
+++ b/crates/jcode-base/src/skill.rs
@@ -924,6 +924,5 @@ mod invocation_parse_tests {
         );
         let skill = SkillRegistry::parse_skill(&path).unwrap();
         assert_eq!(skill.tags, vec!["rust", "perf"]);
->>>>>>> origin/master
     }
 }
diff --git a/crates/jcode-base/src/telemetry/tests.rs b/crates/jcode-base/src/telemetry/tests.rs
index 0cade87aa..a5871b080 100644
--- a/crates/jcode-base/src/telemetry/tests.rs
+++ b/crates/jcode-base/src/telemetry/tests.rs
@@ -30,12 +30,25 @@ fn test_do_not_track() {
 fn test_is_ci_detects_ci_env() {
     let _guard = lock_test_env();
     // Clear any inherited CI markers so the baseline is deterministic.
-    for key in ["CI", "GITHUB_ACTIONS", "BUILDKITE", "JENKINS_URL", "GITLAB_CI", "CIRCLECI"] {
+    for key in [
+        "CI",
+        "GITHUB_ACTIONS",
+        "BUILDKITE",
+        "JENKINS_URL",
+        "GITLAB_CI",
+        "CIRCLECI",
+    ] {
         crate::env::remove_var(key);
     }
-    assert!(!is_ci(), "expected non-CI baseline after clearing CI markers");
+    assert!(
+        !is_ci(),
+        "expected non-CI baseline after clearing CI markers"
+    );
     crate::env::set_var("CI", "true");
-    assert!(is_ci(), "CI env var should mark the run as CI (gates install skip)");
+    assert!(
+        is_ci(),
+        "CI env var should mark the run as CI (gates install skip)"
+    );
     crate::env::remove_var("CI");
     assert!(!is_ci());
 }
diff --git a/crates/jcode-provider-core/src/lib.rs b/crates/jcode-provider-core/src/lib.rs
index 93fe676e7..73433d8ad 100644
--- a/crates/jcode-provider-core/src/lib.rs
+++ b/crates/jcode-provider-core/src/lib.rs
@@ -26,10 +26,10 @@ pub use models::{
     provider_for_model_with_hint as core_provider_for_model_with_hint, provider_key_from_hint,
 };
 pub use selection::{
-    ActiveProvider, ProviderAvailability, auto_default_provider,
-    cli_provider_arg_for_session_key, dedupe_model_routes, explicit_model_provider_prefix,
-    fallback_sequence, model_name_for_provider, parse_provider_hint, provider_from_model_key,
-    provider_key, provider_label,
+    ActiveProvider, ProviderAvailability, auto_default_provider, cli_provider_arg_for_session_key,
+    dedupe_model_routes, explicit_model_provider_prefix, fallback_sequence,
+    model_name_for_provider, parse_provider_hint, provider_from_model_key, provider_key,
+    provider_label,
 };
 
 use anyhow::Result;
diff --git a/crates/jcode-provider-core/src/selection.rs b/crates/jcode-provider-core/src/selection.rs
index 1c4139cba..bc83ae280 100644
--- a/crates/jcode-provider-core/src/selection.rs
+++ b/crates/jcode-provider-core/src/selection.rs
@@ -361,16 +361,25 @@ mod tests {
             Some("anthropic-api")
         );
         // Anthropic OAuth -> claude.
-        assert_eq!(cli_provider_arg_for_session_key("claude-oauth"), Some("claude"));
+        assert_eq!(
+            cli_provider_arg_for_session_key("claude-oauth"),
+            Some("claude")
+        );
         assert_eq!(cli_provider_arg_for_session_key("claude"), Some("claude"));
         // OpenAI variants.
-        assert_eq!(cli_provider_arg_for_session_key("openai-oauth"), Some("openai"));
+        assert_eq!(
+            cli_provider_arg_for_session_key("openai-oauth"),
+            Some("openai")
+        );
         assert_eq!(
             cli_provider_arg_for_session_key("openai-api-key"),
             Some("openai-api")
         );
         // Passthrough providers.
-        assert_eq!(cli_provider_arg_for_session_key("openrouter"), Some("openrouter"));
+        assert_eq!(
+            cli_provider_arg_for_session_key("openrouter"),
+            Some("openrouter")
+        );
         assert_eq!(cli_provider_arg_for_session_key("copilot"), Some("copilot"));
         assert_eq!(cli_provider_arg_for_session_key("gemini"), Some("gemini"));
         assert_eq!(cli_provider_arg_for_session_key("bedrock"), Some("bedrock"));
diff --git a/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs b/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs
index 2b4a35903..220a2bd14 100644
--- a/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs
+++ b/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs
@@ -290,13 +290,34 @@ impl Rng {
 }
 
 const WORDS: &[&str] = &[
-    "alpha", "beta", "gamma", "delta", "x", "y", "z", "the", "quick", "brown",
-    "fox", "中文", "데이터", "emoji", "lorem", "ipsum", "a", "I", "we", "code",
+    "alpha",
+    "beta",
+    "gamma",
+    "delta",
+    "x",
+    "y",
+    "z",
+    "the",
+    "quick",
+    "brown",
+    "fox",
+    "中文",
+    "데이터",
+    "emoji",
+    "lorem",
+    "ipsum",
+    "a",
+    "I",
+    "we",
+    "code",
 ];
 
 fn gen_words(rng: &mut Rng, max: usize) -> String {
     let n = 1 + rng.below(max);
-    (0..n).map(|_| *rng.pick(WORDS)).collect::<Vec<_>>().join(" ")
+    (0..n)
+        .map(|_| *rng.pick(WORDS))
+        .collect::<Vec<_>>()
+        .join(" ")
 }
 
 /// Generate an inline fragment (no leading/trailing block structure).
@@ -307,7 +328,11 @@ fn gen_inline(rng: &mut Rng, depth: usize) -> String {
         2 => format!("_{}_", gen_words(rng, 3)),
         3 => format!("`{}`", gen_words(rng, 2)),
         4 => format!("~~{}~~", gen_words(rng, 2)),
-        5 => format!("[{}](http://example.com/{})", gen_words(rng, 2), rng.below(99)),
+        5 => format!(
+            "[{}](http://example.com/{})",
+            gen_words(rng, 2),
+            rng.below(99)
+        ),
         6 => format!("${}+{}$", rng.pick(WORDS), rng.pick(WORDS)),
         7 => format!("${}", rng.below(999)), // currency
         _ => format!(
@@ -536,8 +561,3 @@ fn fuzz_random_documents_wrapped_parity() {
             .join("\n\n")
     );
 }
-
-
-
-
-
diff --git a/crates/jcode-tui/src/tui/app/misc_ui.rs b/crates/jcode-tui/src/tui/app/misc_ui.rs
index 0d408cc80..58789734d 100644
--- a/crates/jcode-tui/src/tui/app/misc_ui.rs
+++ b/crates/jcode-tui/src/tui/app/misc_ui.rs
@@ -38,9 +38,8 @@ impl ResolvedTokenPricing {
         cache_read_tokens: u64,
         cache_creation_tokens: u64,
     ) -> f32 {
-        let split_accounting = self.is_anthropic
-            || cache_creation_tokens > 0
-            || cache_read_tokens > input_tokens;
+        let split_accounting =
+            self.is_anthropic || cache_creation_tokens > 0 || cache_read_tokens > input_tokens;
 
         let fresh_input_tokens = if split_accounting {
             input_tokens
@@ -275,8 +274,7 @@ impl App {
 
         let model = <Self as TuiState>::provider_model(self);
         let provider_name = <Self as TuiState>::provider_name(self).to_lowercase();
-        let is_anthropic =
-            provider_name.contains("anthropic") || provider_name.contains("claude");
+        let is_anthropic = provider_name.contains("anthropic") || provider_name.contains("claude");
         let is_openai = provider_name.contains("openai");
 
         // The server resolves the active credential authoritatively; only bill
diff --git a/crates/jcode-tui/src/tui/app/tests.rs b/crates/jcode-tui/src/tui/app/tests.rs
index ea81b409b..0cc1ef805 100644
--- a/crates/jcode-tui/src/tui/app/tests.rs
+++ b/crates/jcode-tui/src/tui/app/tests.rs
@@ -452,7 +452,10 @@ fn skills_command_marks_active_skill_in_remote_mode() {
     assert!(content.contains("- /optimization (active)"), "{content}");
     assert!(content.contains("- /firefox-browser\n"), "{content}");
     // Endorsed list should mark remote-installed skills as installed.
-    assert!(content.contains("/firefox-browser [installed]"), "{content}");
+    assert!(
+        content.contains("/firefox-browser [installed]"),
+        "{content}"
+    );
 }
 
 #[test]
diff --git a/crates/jcode-tui/src/tui/info_widget.rs b/crates/jcode-tui/src/tui/info_widget.rs
index 9448ba98f..e6d669fbe 100644
--- a/crates/jcode-tui/src/tui/info_widget.rs
+++ b/crates/jcode-tui/src/tui/info_widget.rs
@@ -419,7 +419,11 @@ pub struct CacheMissAttribution {
 impl CacheHitInfo {
     /// Effective total prompt tokens across the session (read denominator).
     fn effective_reported_tokens(&self) -> u64 {
-        effective_prompt_tokens(self.reported_input_tokens, self.read_tokens, self.creation_tokens)
+        effective_prompt_tokens(
+            self.reported_input_tokens,
+            self.read_tokens,
+            self.creation_tokens,
+        )
     }
 
     /// Fraction of the session's prompt tokens that were served from cache.
diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs
index 5fd3f3031..8b56d4a46 100644
--- a/evals/jbench/src/agent_runner.rs
+++ b/evals/jbench/src/agent_runner.rs
@@ -116,9 +116,9 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
         let line = timeout(timeout_duration, lines_stream.next_line()).await;
         match line {
             Ok(Ok(Some(l))) => trace_lines.push(l),
-            Ok(Ok(None)) => break false,     // EOF — clean exit
-            Ok(Err(_)) => break false,       // read error
-            Err(_) => break true,            // timeout
+            Ok(Ok(None)) => break false, // EOF — clean exit
+            Ok(Err(_)) => break false,   // read error
+            Err(_) => break true,        // timeout
         }
     };
 
@@ -130,7 +130,9 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
         return Ok(EvalRun {
             commit_sha: String::new(),
             prompt: config.prompt,
-            diff: extract_diff_from_repo(&config.repo_path).await.unwrap_or_default(),
+            diff: extract_diff_from_repo(&config.repo_path)
+                .await
+                .unwrap_or_default(),
             judging: Default::default(),
             cost_usd: 0.0,
             duration_ms: start.elapsed().as_millis() as u64,
diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs
index bce8442a4..5e3c651d0 100644
--- a/evals/jbench/src/bin/jbench.rs
+++ b/evals/jbench/src/bin/jbench.rs
@@ -131,7 +131,9 @@ async fn main() -> Result<()> {
                 .await?;
             }
             #[cfg(not(feature = "agent-runner"))]
-            anyhow::bail!("'jbench run' requires the 'agent-runner' feature. Enable with: cargo build --features agent-runner");
+            anyhow::bail!(
+                "'jbench run' requires the 'agent-runner' feature. Enable with: cargo build --features agent-runner"
+            );
         }
         Command::Judge {
             runs_dir,
@@ -190,11 +192,7 @@ async fn pick_commits_impl(
         }
 
         let sha = lines[0].trim();
-        let parent_sha = lines[1]
-            .split_whitespace()
-            .next()
-            .unwrap_or("")
-            .to_string();
+        let parent_sha = lines[1].split_whitespace().next().unwrap_or("").to_string();
         let subject = lines[2].trim();
 
         // Skip root commits (no parent).
@@ -287,13 +285,8 @@ async fn gen_evals_impl(input: &PathBuf, output: &PathBuf) -> Result<()> {
         })?;
 
         // git diff to get the full unified diff.
-        let full_diff = run_git(&[
-            "diff",
-            &format!("{}..{}", pc.parent_sha, pc.sha),
-        ])
-        .with_context(|| {
-            format!("git diff failed for {}..{}", pc.parent_sha, pc.sha)
-        })?;
+        let full_diff = run_git(&["diff", &format!("{}..{}", pc.parent_sha, pc.sha)])
+            .with_context(|| format!("git diff failed for {}..{}", pc.parent_sha, pc.sha))?;
 
         let file_diffs = parse_diffs(&name_status, &full_diff);
 
@@ -318,8 +311,8 @@ async fn gen_evals_impl(input: &PathBuf, output: &PathBuf) -> Result<()> {
         eval_commits,
     };
 
-    let json = serde_json::to_string_pretty(&eval_data)
-        .context("failed to serialize EvalDataV2")?;
+    let json =
+        serde_json::to_string_pretty(&eval_data).context("failed to serialize EvalDataV2")?;
     std::fs::write(output, &json)
         .with_context(|| format!("failed to write output file {}", output.display()))?;
 
@@ -562,7 +555,11 @@ fn parse_diffs(name_status: &str, full_diff: &str) -> Vec<jcode_jbench::types::F
             r if r.starts_with('R') => {
                 // Renamed: "R100\told_path\tnew_path"
                 if parts.len() >= 3 {
-                    (FileDiffStatus::Renamed, parts[2].to_owned(), Some(parts[1].to_owned()))
+                    (
+                        FileDiffStatus::Renamed,
+                        parts[2].to_owned(),
+                        Some(parts[1].to_owned()),
+                    )
                 } else {
                     (FileDiffStatus::Modified, parts[1].to_owned(), None)
                 }
@@ -583,10 +580,7 @@ fn parse_diffs(name_status: &str, full_diff: &str) -> Vec<jcode_jbench::types::F
     // Build FileDiff structs, matching by path.
     let mut result = Vec::with_capacity(file_entries.len());
     for (status, path, old_path) in file_entries {
-        let diff_text = file_diffs_map
-            .get(&path)
-            .cloned()
-            .unwrap_or_default();
+        let diff_text = file_diffs_map.get(&path).cloned().unwrap_or_default();
         result.push(FileDiff {
             path,
             status,
@@ -611,11 +605,7 @@ fn split_diff_by_file(full_diff: &str) -> std::collections::HashMap<String, Stri
                 map.insert(p.clone(), current_chunk.clone());
             }
             // Extract the post-image path from "diff --git a/path b/path".
-            let path = line
-                .splitn(2, " b/")
-                .nth(1)
-                .unwrap_or("")
-                .to_owned();
+            let path = line.splitn(2, " b/").nth(1).unwrap_or("").to_owned();
             current_path = Some(path);
             current_chunk.clear();
         }
diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs
index 0d4d36f72..7b461a44c 100644
--- a/evals/jbench/src/judge.rs
+++ b/evals/jbench/src/judge.rs
@@ -425,7 +425,11 @@ pub async fn judge_with_three_models(
 
     // Median analysis — sort by overall_score and pick the middle
     let mut sorted = valid.clone();
-    sorted.sort_by(|a, b| a.overall_score.partial_cmp(&b.overall_score).unwrap_or(std::cmp::Ordering::Equal));
+    sorted.sort_by(|a, b| {
+        a.overall_score
+            .partial_cmp(&b.overall_score)
+            .unwrap_or(std::cmp::Ordering::Equal)
+    });
     let median_idx = sorted.len() / 2;
     let median = &sorted[median_idx];
 
diff --git a/src/cli/provider_doctor.rs b/src/cli/provider_doctor.rs
index e438604db..c257165c1 100644
--- a/src/cli/provider_doctor.rs
+++ b/src/cli/provider_doctor.rs
@@ -30,9 +30,8 @@ pub async fn run_provider_doctor_command(
             Some("claude") => run_claude_native_e2e(provider, model, tier).await?,
             Some("antigravity") => run_antigravity_native_e2e(provider, model, tier).await?,
             Some(other) => {
-                let kind = NativeProviderKind::from_normalized(other).ok_or_else(|| {
-                    anyhow!("`{provider}` has no native provider-doctor driver")
-                })?;
+                let kind = NativeProviderKind::from_normalized(other)
+                    .ok_or_else(|| anyhow!("`{provider}` has no native provider-doctor driver"))?;
                 run_generic_native_e2e(kind, model, tier).await?
             }
             None => anyhow::bail!("`{provider}` has no native provider-doctor driver"),
diff --git a/tests/e2e/reload_multiclient.rs b/tests/e2e/reload_multiclient.rs
index dd8fd6b6f..8e6e077cd 100644
--- a/tests/e2e/reload_multiclient.rs
+++ b/tests/e2e/reload_multiclient.rs
@@ -160,7 +160,10 @@ async fn reload_notifies_successor_after_session_takeover() -> Result<()> {
         assert!(
             b_saw,
             "the live successor connection must be told the server is reloading; saw: {:?}",
-            b_events.iter().map(|e| format!("{e:?}")).collect::<Vec<_>>()
+            b_events
+                .iter()
+                .map(|e| format!("{e:?}"))
+                .collect::<Vec<_>>()
         );
 
         // The superseded original connection must end (disconnect) rather than

From 188a857b353b8d269cd54c85fa35ba2038ba33c1 Mon Sep 17 00:00:00 2001
From: quangdang46 <quangdang46@users.noreply.github.com>
Date: Sat, 6 Jun 2026 01:22:57 +0700
Subject: [PATCH 22/22] fix: address review swarm findings for PR #313

Security fixes:
- H1: Add validate_team_name() to prevent path traversal in TeamConfig
- H4: Strip BypassPermissions from project-local TOML agent definitions (warn and fall back to the default mode; the agent still loads)

Runtime wiring:
- H2: Wire shared AgentRegistry into production Registry::new sites
- H3: Add classify_for_session() that checks per-session mode overrides
- H5: Add max_turns enforcement in Agent turn loop
- H6: Wire agent_def.resolve_model() into SubagentTool model resolution

Code quality:
- M4: Remove deny_unknown_fields from AgentDefinition for forward compat
- M5: Align PermissionMode::parse() with serde kebab-case (snake_case and CamelCase aliases such as "accept_edits" / "AcceptEdits" no longer parse)
- M6: Gate experimental team/task tools behind the JCODE_EXPERIMENTAL_TOOLS env var (canary sessions still register them explicitly)
- M7: Document parent session mutation race condition
- M8: Add SessionModeGuard RAII for automatic session mode cleanup

All 63 agent-runtime tests pass. cargo check clean.
---
 Cargo.lock                                    |   1 +
 crates/jcode-agent-runtime/Cargo.toml         |   1 +
 crates/jcode-agent-runtime/src/definition.rs  |  13 +-
 crates/jcode-agent-runtime/src/permission.rs  |  35 ++---
 crates/jcode-agent-runtime/src/registry.rs    |  11 ++
 crates/jcode-app-core/src/agent.rs            |   5 +
 .../src/agent/turn_execution.rs               |   6 +
 crates/jcode-app-core/src/agent/turn_loops.rs |  18 +++
 crates/jcode-app-core/src/ambient/runner.rs   |   8 +-
 crates/jcode-app-core/src/dcg_bridge.rs       |  43 ++++++
 crates/jcode-app-core/src/overnight.rs        |   1 +
 crates/jcode-app-core/src/server.rs           |   3 +-
 .../src/server/client_lifecycle.rs            |   2 +-
 .../src/server/client_session.rs              |   3 +
 crates/jcode-app-core/src/server/headless.rs  |   2 +-
 crates/jcode-app-core/src/tool/mod.rs         | 122 +++++++++++++-----
 crates/jcode-app-core/src/tool/task.rs        |  56 +++++---
 crates/jcode-app-core/src/tool/team.rs        |  24 ++++
 .../jcode-base/src/provider/gemini_tests.rs   |   2 +-
 src/bin/harness.rs                            |   2 +-
 src/cli/commands.rs                           |   2 +-
 src/cli/commands_tests.rs                     |   4 +-
 src/cli/provider_init.rs                      |   4 +-
 src/cli/selfdev_tests.rs                      |   4 +-
 tests/e2e/ambient.rs                          |   8 +-
 tests/e2e/provider_behavior.rs                |   8 +-
 tests/e2e/session_flow.rs                     |   4 +-
 27 files changed, 287 insertions(+), 105 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 038002b3f..efbedfaaf 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5172,6 +5172,7 @@ dependencies = [
  "thiserror 1.0.69",
  "tokio",
  "toml",
+ "tracing",
 ]
 
 [[package]]
diff --git a/crates/jcode-agent-runtime/Cargo.toml b/crates/jcode-agent-runtime/Cargo.toml
index f66eb40ce..9a769a299 100644
--- a/crates/jcode-agent-runtime/Cargo.toml
+++ b/crates/jcode-agent-runtime/Cargo.toml
@@ -14,6 +14,7 @@ serde = { version = "1", features = ["derive"] }
 serde_json = "1"
 toml = "0.8"
 anyhow = "1"
+tracing = "0.1"
 
 [dev-dependencies]
 serde_json = "1"
diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
index c26e5f3ad..61ce6190b 100644
--- a/crates/jcode-agent-runtime/src/definition.rs
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -47,7 +47,6 @@ pub const DEFAULT_AGENT_VERSION: &str = "0.1.0";
 /// Intentionally `Clone` so the runtime can hand each spawn its own copy
 /// without locking the registry. Definitions are small (a few KB at most).
 #[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(deny_unknown_fields)]
 pub struct AgentDefinition {
     // -----------------------------------------------------------------
     // Identity
@@ -646,19 +645,15 @@ mod tests {
     }
 
     #[test]
-    fn toml_unknown_field_is_rejected() {
+    fn toml_unknown_field_is_silently_ignored() {
         let src = r#"
             id = "ok"
             display_name = "ok"
             unknown_future_field = "value"
         "#;
-        let err = toml::from_str::<AgentDefinition>(src).unwrap_err();
-        assert!(
-            err.to_string().contains("unknown field")
-                || err.to_string().contains("unknown")
-                || err.to_string().contains("`unknown_future_field`"),
-            "expected denial of unknown field, got: {err}"
-        );
+        let def = toml::from_str::<AgentDefinition>(src).expect("unknown fields should be ignored for forward compat");
+        assert_eq!(def.id, "ok");
+        assert_eq!(def.display_name, "ok");
     }
 
     // -----------------------------------------------------------------
diff --git a/crates/jcode-agent-runtime/src/permission.rs b/crates/jcode-agent-runtime/src/permission.rs
index 2f112efc0..045922933 100644
--- a/crates/jcode-agent-runtime/src/permission.rs
+++ b/crates/jcode-agent-runtime/src/permission.rs
@@ -64,16 +64,15 @@ impl PermissionMode {
         }
     }
 
-    /// Parse a permission mode from a string, accepting common variants.
+    /// Parse a permission mode from a string. Input is trimmed and ASCII-lowercased;
+    /// only the kebab-case spellings matching the serde wire format are accepted.
     pub fn parse(s: &str) -> Option<PermissionMode> {
         match s.trim().to_ascii_lowercase().as_str() {
             "default" => Some(PermissionMode::Default),
-            "acceptedits" | "accept_edits" | "accept-edits" => Some(PermissionMode::AcceptEdits),
+            "accept-edits" => Some(PermissionMode::AcceptEdits),
             "plan" => Some(PermissionMode::Plan),
-            "dontask" | "dont_ask" | "dont-ask" => Some(PermissionMode::DontAsk),
-            "bypasspermissions" | "bypass_permissions" | "bypass-permissions" => {
-                Some(PermissionMode::BypassPermissions)
-            }
+            "dont-ask" => Some(PermissionMode::DontAsk),
+            "bypass-permissions" => Some(PermissionMode::BypassPermissions),
             "auto" => Some(PermissionMode::Auto),
             _ => None,
         }
@@ -91,36 +90,20 @@ mod tests {
     use super::*;
 
     #[test]
-    fn parse_accepts_common_variants() {
+    fn parse_accepts_kebab_case_only() {
         assert_eq!(
             PermissionMode::parse("default"),
             Some(PermissionMode::Default)
         );
-        assert_eq!(
-            PermissionMode::parse("AcceptEdits"),
-            Some(PermissionMode::AcceptEdits)
-        );
-        assert_eq!(
-            PermissionMode::parse("accept_edits"),
-            Some(PermissionMode::AcceptEdits)
-        );
         assert_eq!(
             PermissionMode::parse("accept-edits"),
             Some(PermissionMode::AcceptEdits)
         );
         assert_eq!(PermissionMode::parse("plan"), Some(PermissionMode::Plan));
         assert_eq!(
-            PermissionMode::parse("DONTASK"),
+            PermissionMode::parse("dont-ask"),
             Some(PermissionMode::DontAsk)
         );
-        assert_eq!(
-            PermissionMode::parse("dont_ask"),
-            Some(PermissionMode::DontAsk)
-        );
-        assert_eq!(
-            PermissionMode::parse("bypass_permissions"),
-            Some(PermissionMode::BypassPermissions)
-        );
         assert_eq!(
             PermissionMode::parse("bypass-permissions"),
             Some(PermissionMode::BypassPermissions)
@@ -128,6 +111,10 @@ mod tests {
         assert_eq!(PermissionMode::parse("auto"), Some(PermissionMode::Auto));
         assert_eq!(PermissionMode::parse(""), None);
         assert_eq!(PermissionMode::parse("nonsense"), None);
+        // Non-kebab-case variants are rejected for serde consistency
+        assert_eq!(PermissionMode::parse("accept_edits"), None);
+        assert_eq!(PermissionMode::parse("AcceptEdits"), None);
+        assert_eq!(PermissionMode::parse("bypass_permissions"), None);
     }
 
     #[test]
diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs
index d322e6a3c..9bc2398a8 100644
--- a/crates/jcode-agent-runtime/src/registry.rs
+++ b/crates/jcode-agent-runtime/src/registry.rs
@@ -21,6 +21,7 @@
 //!   session start. Self-dev is welcome to call `reload_from_disk()`.
 
 use crate::definition::{AgentDefinition, DefinitionError};
+use crate::permission::PermissionMode;
 
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
@@ -228,6 +229,16 @@ impl AgentRegistry {
                             AgentSource::ProjectLocal { path: path.clone() }
                         }
                     };
+                    let mut definition = definition;
+                    if matches!(source, AgentSource::ProjectLocal { .. })
+                        && definition.permission_mode == Some(PermissionMode::BypassPermissions)
+                    {
+                        tracing::warn!(
+                            agent_id = %definition.id,
+                            "project-local agent definition attempted to set bypass-permissions; downgrading to default"
+                        );
+                        definition.permission_mode = None;
+                    }
                     self.insert(LoadedAgent { definition, source });
                     loaded += 1;
                 }
diff --git a/crates/jcode-app-core/src/agent.rs b/crates/jcode-app-core/src/agent.rs
index f62fb15e7..5518ef25e 100644
--- a/crates/jcode-app-core/src/agent.rs
+++ b/crates/jcode-app-core/src/agent.rs
@@ -270,6 +270,10 @@ pub struct Agent {
     mcp_late_register_resolved: bool,
     /// Override system prompt (used by ambient mode to inject a custom prompt)
     system_prompt_override: Option<String>,
+    /// Maximum number of tool-call turns before the agent is forced to
+    /// stop. `None` means unlimited. Set by `SubagentTool` from the agent
+    /// definition's `max_turns` field.
+    max_turns: Option<u32>,
     /// Whether memory features are enabled for this session
     memory_enabled: bool,
     /// One-step undo snapshot captured before the most recent rewind.
@@ -328,6 +332,7 @@ impl Agent {
             locked_tools: None,
             mcp_late_register_resolved: false,
             system_prompt_override: crate::config::config().provider.system_prompt.clone(),
+            max_turns: None,
             memory_enabled: crate::config::config().features.memory,
             rewind_undo_snapshot: None,
             stdin_request_tx: None,
diff --git a/crates/jcode-app-core/src/agent/turn_execution.rs b/crates/jcode-app-core/src/agent/turn_execution.rs
index bb23bded7..f60916a0c 100644
--- a/crates/jcode-app-core/src/agent/turn_execution.rs
+++ b/crates/jcode-app-core/src/agent/turn_execution.rs
@@ -215,6 +215,10 @@ impl Agent {
         self.system_prompt_override = Some(prompt.to_string());
     }
 
+    pub fn set_max_turns(&mut self, max: u32) {
+        self.max_turns = Some(max);
+    }
+
     pub fn set_debug(&mut self, is_debug: bool) {
         self.session.set_debug(is_debug);
         if let Err(err) = self.session.save() {
@@ -246,6 +250,7 @@ impl Agent {
     pub(super) async fn tool_definitions(&mut self) -> Vec<ToolDefinition> {
         if self.session.is_canary {
             self.registry.register_selfdev_tools().await;
+            self.registry.register_experimental_tools().await;
         }
 
         // Return locked tools if available (prevents cache invalidation from
@@ -358,6 +363,7 @@ impl Agent {
     pub async fn tool_definitions_for_debug(&self) -> Vec<crate::message::ToolDefinition> {
         if self.session.is_canary {
             self.registry.register_selfdev_tools().await;
+            self.registry.register_experimental_tools().await;
         }
         let mut tools = self.registry.definitions(self.allowed_tools.as_ref()).await;
         if !self.disabled_tools.is_empty() {
diff --git a/crates/jcode-app-core/src/agent/turn_loops.rs b/crates/jcode-app-core/src/agent/turn_loops.rs
index 8be6df2db..96ccdbd15 100644
--- a/crates/jcode-app-core/src/agent/turn_loops.rs
+++ b/crates/jcode-app-core/src/agent/turn_loops.rs
@@ -14,8 +14,26 @@ impl Agent {
         let mut context_limit_retries = 0u32;
         let mut incomplete_continuations = 0u32;
         let mut empty_post_tool_continuations = 0u32;
+        let mut turn_count = 0u32;
 
         loop {
+            turn_count += 1;
+            if let Some(max) = self.max_turns {
+                if turn_count > max {
+                    logging::info(&format!(
+                        "max_turns limit reached ({}); forcing turn completion",
+                        max
+                    ));
+                    if final_text.is_empty() {
+                        final_text = format!(
+                            "[agent stopped: reached max_turns limit of {}]",
+                            max
+                        );
+                    }
+                    break;
+                }
+            }
+
             let repaired = self.repair_missing_tool_outputs();
             if repaired > 0 {
                 logging::warn(&format!(
diff --git a/crates/jcode-app-core/src/ambient/runner.rs b/crates/jcode-app-core/src/ambient/runner.rs
index 092f17486..8a973d842 100644
--- a/crates/jcode-app-core/src/ambient/runner.rs
+++ b/crates/jcode-app-core/src/ambient/runner.rs
@@ -385,9 +385,10 @@ impl AmbientRunnerHandle {
     ) -> anyhow::Result<()> {
         let session = Session::load(session_id)?;
         let cycle_provider = provider.fork();
-        let registry = tool::Registry::new(cycle_provider.clone(), None).await;
+        let registry = tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await;
         if session.is_canary {
             registry.register_selfdev_tools().await;
+            registry.register_experimental_tools().await;
         }
         // Issue #89: ambient cycles previously skipped MCP registration, so
         // user-installed MCP tools were invisible to the cycle agent —
@@ -470,9 +471,10 @@ impl AmbientRunnerHandle {
         let child_is_canary = child.is_canary;
         let child_is_debug = child.is_debug;
         let cycle_provider = provider.fork();
-        let registry = tool::Registry::new(cycle_provider.clone(), None).await;
+        let registry = tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await;
         if child_is_canary {
             registry.register_selfdev_tools().await;
+            registry.register_experimental_tools().await;
         }
         // Issue #89: register MCP tools for ambient cycles (same as main session).
         registry
@@ -928,7 +930,7 @@ impl AmbientRunnerHandle {
         self.set_running_detail("setting up tools").await;
 
         let cycle_provider = provider.fork();
-        let registry = tool::Registry::new(cycle_provider.clone(), None).await;
+        let registry = tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await;
         registry.register_ambient_tools().await;
         // Issue #89: register MCP tools so user-installed MCP servers are
         // available to the ambient agent — without this, the cycle agent
diff --git a/crates/jcode-app-core/src/dcg_bridge.rs b/crates/jcode-app-core/src/dcg_bridge.rs
index b91ac925f..1612992b3 100644
--- a/crates/jcode-app-core/src/dcg_bridge.rs
+++ b/crates/jcode-app-core/src/dcg_bridge.rs
@@ -153,6 +153,36 @@ pub fn session_mode(session_id: &str) -> Option<Mode> {
         .and_then(|guard| guard.get(session_id).copied())
 }
 
+/// RAII guard that clears a per-session permission mode on drop.
+///
+/// Use this instead of manual `set_session_mode` / `clear_session_mode`
+/// pairs to guarantee cleanup even when the subagent exits via early
+/// return or error path.
+pub struct SessionModeGuard {
+    session_id: String,
+}
+
+impl SessionModeGuard {
+    /// Set the per-session mode and return a guard that will clear it on
+    /// drop. If `mode` is `None`, no override is set here, but drop still
+    /// calls `clear_session_mode` for this id, so it is not strictly a no-op.
+    #[must_use]
+    pub fn new(session_id: &str, mode: Option<Mode>) -> Self {
+        if let Some(mode) = mode {
+            set_session_mode(session_id, mode);
+        }
+        Self {
+            session_id: session_id.to_string(),
+        }
+    }
+}
+
+impl Drop for SessionModeGuard {
+    fn drop(&mut self) {
+        clear_session_mode(&self.session_id);
+    }
+}
+
 /// Classify an action using the agent-specific permission mode when
 /// provided, falling back to the global mode otherwise.
 ///
@@ -170,6 +200,19 @@ pub fn classify_for_agent(
     classify_with_mode(action, mode)
 }
 
+/// Classify an action using the per-session mode override when one exists
+/// for `session_id`, falling back to the global mode otherwise.
+///
+/// This is the session-aware variant of [`classify`]. Call sites that
+/// know the session id (e.g. tool execution within a subagent) should
+/// prefer this over the global [`classify`] so that per-session
+/// permission overrides set via [`set_session_mode`] are honoured.
+#[must_use]
+pub fn classify_for_session(action: &str, session_id: &str) -> BridgeDecision {
+    let mode = session_mode(session_id).unwrap_or_else(current_mode);
+    classify_with_mode(action, mode)
+}
+
 /// Three-state outcome from the bridge. jcode's `SafetySystem` collapses
 /// `Allow` to `ActionTier::AutoAllowed` and `Prompt`/`Deny` to
 /// `ActionTier::RequiresPermission` — but exposing the full set here
diff --git a/crates/jcode-app-core/src/overnight.rs b/crates/jcode-app-core/src/overnight.rs
index a619cdaaf..ee181ef3c 100644
--- a/crates/jcode-app-core/src/overnight.rs
+++ b/crates/jcode-app-core/src/overnight.rs
@@ -253,6 +253,7 @@ async fn run_supervisor(
 
     if child_is_canary {
         registry.register_selfdev_tools().await;
+        registry.register_experimental_tools().await;
     }
 
     let mut agent = Agent::new_with_session(provider, registry, child, None);
diff --git a/crates/jcode-app-core/src/server.rs b/crates/jcode-app-core/src/server.rs
index 8669c9404..6ae36c4bc 100644
--- a/crates/jcode-app-core/src/server.rs
+++ b/crates/jcode-app-core/src/server.rs
@@ -636,9 +636,10 @@ impl Server {
 
             let previous_status = session.status.clone();
             let provider = self.provider.fork();
-            let registry = crate::tool::Registry::new(provider.clone(), None).await;
+            let registry = crate::tool::Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await;
             if session.is_canary {
                 registry.register_selfdev_tools().await;
+                registry.register_experimental_tools().await;
             }
             registry
                 .register_mcp_tools(
diff --git a/crates/jcode-app-core/src/server/client_lifecycle.rs b/crates/jcode-app-core/src/server/client_lifecycle.rs
index e52e2dd05..38fc6d646 100644
--- a/crates/jcode-app-core/src/server/client_lifecycle.rs
+++ b/crates/jcode-app-core/src/server/client_lifecycle.rs
@@ -418,7 +418,7 @@ pub(super) async fn handle_client(
 
     let provider = provider_template.fork();
     let t0 = std::time::Instant::now();
-    let registry = Registry::new(provider.clone(), None).await;
+    let registry = Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await;
     let registry_ms = t0.elapsed().as_millis();
 
     let mut swarm_enabled = crate::config::config().features.swarm;
diff --git a/crates/jcode-app-core/src/server/client_session.rs b/crates/jcode-app-core/src/server/client_session.rs
index 01b229fd1..d0542800a 100644
--- a/crates/jcode-app-core/src/server/client_session.rs
+++ b/crates/jcode-app-core/src/server/client_session.rs
@@ -592,6 +592,7 @@ pub(super) async fn handle_subscribe(
         }
         drop(agent_guard);
         registry.register_selfdev_tools().await;
+        registry.register_experimental_tools().await;
     }
 
     let mcp_register_ms = if register_mcp_tools {
@@ -1039,6 +1040,7 @@ pub(super) async fn handle_resume_session(
         if is_canary {
             *client_selfdev = true;
             registry.register_selfdev_tools().await;
+            registry.register_experimental_tools().await;
         }
 
         *client_session_id = session_id.clone();
@@ -1233,6 +1235,7 @@ pub(super) async fn handle_resume_session(
     if result.is_ok() && is_canary {
         *client_selfdev = true;
         registry.register_selfdev_tools().await;
+        registry.register_experimental_tools().await;
     }
 
     match result {
diff --git a/crates/jcode-app-core/src/server/headless.rs b/crates/jcode-app-core/src/server/headless.rs
index ca2093a3f..8dc03feaa 100644
--- a/crates/jcode-app-core/src/server/headless.rs
+++ b/crates/jcode-app-core/src/server/headless.rs
@@ -50,7 +50,7 @@ pub(super) async fn create_headless_session(
     };
 
     let provider = provider_template.fork();
-    let registry = Registry::new(provider.clone(), None).await;
+    let registry = Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await;
 
     registry.enable_memory_test_mode().await;
 
diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs
index ecb9cc2f1..2299e5d88 100644
--- a/crates/jcode-app-core/src/tool/mod.rs
+++ b/crates/jcode-app-core/src/tool/mod.rs
@@ -98,6 +98,26 @@ fn session_tool_policy(session_id: &str) -> Option<SessionToolPolicy> {
         .cloned()
 }
 
+static SHARED_AGENT_REGISTRY: LazyLock<Option<Arc<jcode_agent_runtime::AgentRegistry>>> =
+    LazyLock::new(|| {
+        let home = dirs::home_dir();
+        let cwd = std::env::current_dir().ok();
+        let mut registry = jcode_agent_runtime::AgentRegistry::new();
+        registry.discover_standard_paths(
+            home.as_deref(),
+            cwd.as_deref(),
+        );
+        if registry.is_empty() {
+            None
+        } else {
+            Some(Arc::new(registry))
+        }
+    });
+
+pub fn shared_agent_registry() -> Option<Arc<jcode_agent_runtime::AgentRegistry>> {
+    SHARED_AGENT_REGISTRY.clone()
+}
+
 /// Registry of available tools (Arc-wrapped for sharing)
 ///
 /// Clone creates a fresh CompactionManager so each subagent gets independent
@@ -254,36 +274,6 @@ impl Registry {
             Self::insert_tool_timed(&mut m, &mut timings, "gmail", gmail::GmailTool::new);
             Self::insert_tool_timed(&mut m, &mut timings, "schedule", ambient::ScheduleTool::new);
             Self::insert_tool_timed(&mut m, &mut timings, "selfdev", selfdev::SelfDevTool::new);
-            Self::insert_tool_timed(
-                &mut m,
-                &mut timings,
-                "team_create",
-                team::TeamCreateTool::new,
-            );
-            Self::insert_tool_timed(
-                &mut m,
-                &mut timings,
-                "team_delete",
-                team::TeamDeleteTool::new,
-            );
-            Self::insert_tool_timed(
-                &mut m,
-                &mut timings,
-                "task_create",
-                task_management::TaskCreateTool::new,
-            );
-            Self::insert_tool_timed(
-                &mut m,
-                &mut timings,
-                "task_update",
-                task_management::TaskUpdateTool::new,
-            );
-            Self::insert_tool_timed(
-                &mut m,
-                &mut timings,
-                "task_list",
-                task_management::TaskListTool::new,
-            );
             let nonzero: Vec<String> = timings
                 .iter()
                 .filter(|(_, ms)| *ms > 0)
@@ -383,6 +373,38 @@ impl Registry {
             Self::insert_tool(&mut tools_map, "dcp_recompress", DcpRecompressTool::new());
         }
 
+        // Experimental team/task tools are opt-in: they are added here only when
+        // JCODE_EXPERIMENTAL_TOOLS is 1/true/yes/on (case-insensitive, surrounding
+        // whitespace trimmed) and `no_builtin` is false.
+        // Canary sessions register these explicitly via register_experimental_tools().
+        let experimental_tools_enabled = std::env::var("JCODE_EXPERIMENTAL_TOOLS")
+            .map(|raw| {
+                matches!(
+                    raw.trim().to_ascii_lowercase().as_str(),
+                    "1" | "true" | "yes" | "on"
+                )
+            })
+            .unwrap_or(false);
+        if !no_builtin && experimental_tools_enabled {
+            Self::insert_tool(&mut tools_map, "team_create", team::TeamCreateTool::new());
+            Self::insert_tool(&mut tools_map, "team_delete", team::TeamDeleteTool::new());
+            Self::insert_tool(
+                &mut tools_map,
+                "task_create",
+                task_management::TaskCreateTool::new(),
+            );
+            Self::insert_tool(
+                &mut tools_map,
+                "task_update",
+                task_management::TaskUpdateTool::new(),
+            );
+            Self::insert_tool(
+                &mut tools_map,
+                "task_list",
+                task_management::TaskListTool::new(),
+            );
+        }
+
         let write_start = std::time::Instant::now();
         *registry.tools.write().await = tools_map;
         let write_ms = write_start.elapsed().as_millis();
@@ -995,6 +1024,25 @@ impl Registry {
         .await;
     }
 
+    /// Add the experimental team and task management tools to this registry.
+    ///
+    /// Meant for sessions that opt in through `JCODE_EXPERIMENTAL_TOOLS=1`
+    /// and for canary sessions. The team/task primitives behind these tools
+    /// are still under active development and not yet ready for general use.
+    pub async fn register_experimental_tools(&self) {
+        type MakeTool = fn() -> Arc<dyn Tool>;
+        let experimental: [(&str, MakeTool); 5] = [
+            ("team_create", || Arc::new(team::TeamCreateTool::new())),
+            ("team_delete", || Arc::new(team::TeamDeleteTool::new())),
+            ("task_create", || Arc::new(task_management::TaskCreateTool::new())),
+            ("task_update", || Arc::new(task_management::TaskUpdateTool::new())),
+            ("task_list", || Arc::new(task_management::TaskListTool::new())),
+        ];
+        for (name, make_tool) in experimental {
+            self.register(name.to_string(), make_tool()).await;
+        }
+    }
+
     /// Register ambient-mode tools (only for ambient sessions)
     pub async fn register_ambient_tools(&self) {
         self.register(
diff --git a/crates/jcode-app-core/src/tool/task.rs b/crates/jcode-app-core/src/tool/task.rs
index 6c87e65ef..31546dddf 100644
--- a/crates/jcode-app-core/src/tool/task.rs
+++ b/crates/jcode-app-core/src/tool/task.rs
@@ -193,19 +193,31 @@ impl Tool for SubagentTool {
         };
         let parent_subagent_model = Self::preferred_parent_subagent_model(&ctx.session_id);
         let provider_model = self.provider.model();
-        let resolved_model = Self::resolve_model(
-            params.model.as_deref(),
-            session.model.as_deref(),
-            parent_subagent_model.as_deref(),
-            &provider_model,
-        );
+        // When the agent definition specifies model_override or prefer_tier,
+        // use its resolve_model() which honours those fields. Otherwise fall
+        // back to the standard resolution chain.
+        let resolved_model = match agent_def {
+            Some(def) if def.model_override.is_some() || def.prefer_tier.is_some() => {
+                def.resolve_model(&provider_model)
+            }
+            _ => Self::resolve_model(
+                params.model.as_deref(),
+                session.model.as_deref(),
+                parent_subagent_model.as_deref(),
+                &provider_model,
+            ),
+        };
         session.model = Some(resolved_model.clone());
 
         if let Some(ref working_dir) = ctx.working_dir {
             session.working_dir = Some(working_dir.display().to_string());
         }
 
-        // Register child in parent's session
+        // Register child in parent's session.
+        // NOTE: This load→mutate→save sequence is not atomic. Concurrent
+        // subagent spawns sharing the same parent could clobber each
+        // other's `children` entries. Acceptable for experimental Phase 0;
+        // a file-lock or in-memory session cache would fix this properly.
         if let Ok(mut parent_session) = Session::load(&ctx.session_id) {
             parent_session.add_child(session.id.clone());
             let _ = parent_session.save();
@@ -214,16 +234,19 @@ impl Tool for SubagentTool {
         session.save()?;
 
         // Propagate the effective permission mode to the child session so
-        // that `dcg_bridge::classify_for_agent` / `session_mode` observe it
-        // during the child's tool execution.
+        // that `dcg_bridge::classify_for_session` / `session_mode` observe
+        // it during the child's tool execution. The guard clears the
+        // override on drop (both success and error paths).
         let child_session_id = session.id.clone();
-        if let Some(pm) = effective_permission_mode {
-            let dcg_mode = dcg_bridge::permission_mode_to_dcg(pm);
-            dcg_bridge::set_session_mode(&child_session_id, dcg_mode);
+        let _mode_guard = dcg_bridge::SessionModeGuard::new(
+            &child_session_id,
+            effective_permission_mode.map(dcg_bridge::permission_mode_to_dcg),
+        );
+        if let Some(pm) = effective_permission_mode {
             logging::info(&format!(
                 "[tool:subagent] session {} permission mode: {} (from agent definition)",
                 child_session_id,
                 pm.as_str(),
             ));
         }
 
@@ -325,8 +348,9 @@ impl Tool for SubagentTool {
                 ));
             }
             if let Some(max_turns) = def.max_turns {
+                agent.set_max_turns(max_turns);
                 logging::info(&format!(
-                    "[tool:subagent] agent definition '{}' specifies max_turns={}",
+                    "[tool:subagent] agent definition '{}' max_turns={} enforced",
                     params.subagent_type, max_turns,
                 ));
             }
@@ -344,7 +368,6 @@ impl Tool for SubagentTool {
                     resolved_model,
                     err
                 ));
-                dcg_bridge::clear_session_mode(&child_session_id);
                 return Err(err);
             }
         };
@@ -367,8 +390,7 @@ impl Tool for SubagentTool {
             start.elapsed().as_secs_f64()
         ));
 
-        // Clean up per-session permission mode to prevent unbounded growth.
-        dcg_bridge::clear_session_mode(&child_session_id);
+        // `_mode_guard` clears the per-session permission override when it drops at scope end.
 
         listener.abort();
 
diff --git a/crates/jcode-app-core/src/tool/team.rs b/crates/jcode-app-core/src/tool/team.rs
index 6c8ad7738..39b48fc75 100644
--- a/crates/jcode-app-core/src/tool/team.rs
+++ b/crates/jcode-app-core/src/tool/team.rs
@@ -40,9 +40,31 @@ pub struct TeamTask {
     pub owner: Option<String>, // member name
 }
 
+/// Ensure `name` is safe to embed in the `<name>.json` path of a team config.
+/// Fails for empty names, path-traversal tokens, and chars outside `[A-Za-z0-9_-]`.
+fn validate_team_name(name: &str) -> Result<()> {
+    if name.is_empty() {
+        anyhow::bail!("Team name cannot be empty");
+    }
+    if name.contains("..") || name.contains(|c: char| matches!(c, '/' | '\\')) {
+        anyhow::bail!(
+            "Team name '{}' is invalid: must not contain '..', '/', or '\\'",
+            name
+        );
+    }
+    if name.bytes().any(|b| !(b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-'))) {
+        anyhow::bail!(
+            "Team name '{}' is invalid: only alphanumeric, hyphen, and underscore allowed",
+            name
+        );
+    }
+    Ok(())
+}
+
 impl TeamConfig {
     /// Load a team config from disk by name.
     pub fn load(name: &str) -> Result<Option<Self>> {
+        validate_team_name(name)?;
         let path = teams_dir().join(format!("{name}.json"));
         if !path.exists() {
             return Ok(None);
@@ -53,6 +75,7 @@ impl TeamConfig {
 
     /// Save this team config to disk.
     pub fn save(&self) -> Result<()> {
+        validate_team_name(&self.name)?;
         let dir = teams_dir();
         std::fs::create_dir_all(&dir)?;
         let path = dir.join(format!("{}.json", self.name));
@@ -63,6 +86,7 @@ impl TeamConfig {
 
     /// Delete a team config from disk by name.
     pub fn delete(name: &str) -> Result<()> {
+        validate_team_name(name)?;
         let path = teams_dir().join(format!("{name}.json"));
         if path.exists() {
             std::fs::remove_file(&path)?;
diff --git a/crates/jcode-base/src/provider/gemini_tests.rs b/crates/jcode-base/src/provider/gemini_tests.rs
index 21c3bcc6f..b59ce9225 100644
--- a/crates/jcode-base/src/provider/gemini_tests.rs
+++ b/crates/jcode-base/src/provider/gemini_tests.rs
@@ -400,7 +400,7 @@ fn build_tools_strips_additional_properties_for_gemini_schema_compatibility() {
 #[tokio::test]
 async fn build_tools_from_registry_definitions_omits_const_keywords() {
     let provider: Arc<dyn Provider> = Arc::new(MockProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
     let defs = registry.definitions(None).await;
 
     let built = build_tools(&defs).expect("gemini tools");
diff --git a/src/bin/harness.rs b/src/bin/harness.rs
index d6e9a301d..e0a467f98 100644
--- a/src/bin/harness.rs
+++ b/src/bin/harness.rs
@@ -73,7 +73,7 @@ async fn main() -> Result<()> {
     eprintln!("Harness workspace: {}", workspace.display());
 
     let provider: Arc<dyn Provider> = Arc::new(NoopProvider);
-    let registry = Registry::new(provider).await;
+    let registry = Registry::new(provider, None).await;
 
     let session_id = new_id("harness");
     let base_ctx = ToolContext {
diff --git a/src/cli/commands.rs b/src/cli/commands.rs
index cbe734875..a257ce34e 100644
--- a/src/cli/commands.rs
+++ b/src/cli/commands.rs
@@ -2595,7 +2595,7 @@ pub async fn run_single_message_command(
     } else {
         super::provider_init::init_provider_for_validation(choice, model).await?
     };
-    let registry = crate::tool::Registry::new(provider.clone()).await;
+    let registry = crate::tool::Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await;
     let mut agent = crate::agent::Agent::new(provider.clone(), registry);
     restore_agent_session_if_requested(&mut agent, resume_session)?;
 
diff --git a/src/cli/commands_tests.rs b/src/cli/commands_tests.rs
index c8aba0c90..224e4bceb 100644
--- a/src/cli/commands_tests.rs
+++ b/src/cli/commands_tests.rs
@@ -952,7 +952,7 @@ async fn restore_agent_session_if_requested_restores_resumed_session() {
     let _guard = crate::storage::lock_test_env();
 
     let provider: Arc<dyn Provider> = Arc::new(TestProvider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut original = crate::agent::Agent::new(provider.clone(), registry);
     let original_session_id = original.session_id().to_string();
     original
@@ -960,7 +960,7 @@ async fn restore_agent_session_if_requested_restores_resumed_session() {
         .await
         .expect("seed session");
 
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut resumed = crate::agent::Agent::new(provider, registry);
     let fresh_session_id = resumed.session_id().to_string();
     assert_ne!(fresh_session_id, original_session_id);
diff --git a/src/cli/provider_init.rs b/src/cli/provider_init.rs
index 7fce11689..f6efc81a7 100644
--- a/src/cli/provider_init.rs
+++ b/src/cli/provider_init.rs
@@ -1780,7 +1780,7 @@ pub async fn init_provider_and_registry(
     model: Option<&str>,
 ) -> Result<(Arc<dyn provider::Provider>, tool::Registry)> {
     let provider = init_provider(choice, model).await?;
-    let registry = tool::Registry::new(provider.clone()).await;
+    let registry = tool::Registry::new(provider.clone(), tool::shared_agent_registry()).await;
     Ok((provider, registry))
 }
 
@@ -1789,7 +1789,7 @@ pub async fn init_provider_and_registry_for_validation(
     model: Option<&str>,
 ) -> Result<(Arc<dyn provider::Provider>, tool::Registry)> {
     let provider = init_provider_for_validation(choice, model).await?;
-    let registry = tool::Registry::new(provider.clone()).await;
+    let registry = tool::Registry::new(provider.clone(), tool::shared_agent_registry()).await;
     Ok((provider, registry))
 }
 
diff --git a/src/cli/selfdev_tests.rs b/src/cli/selfdev_tests.rs
index 643f73902..0836c9df7 100644
--- a/src/cli/selfdev_tests.rs
+++ b/src/cli/selfdev_tests.rs
@@ -130,7 +130,7 @@ async fn test_selfdev_tool_registration() {
     assert!(session.is_canary, "Session should be marked as canary");
 
     let provider = Arc::new(TestProvider) as Arc<dyn provider::Provider>;
-    let registry = tool::Registry::new(provider).await;
+    let registry = tool::Registry::new(provider, None).await;
 
     let tools_before: Vec<String> = registry.tool_names().await;
     let has_selfdev_before = tools_before.contains(&"selfdev".to_string());
@@ -167,7 +167,7 @@ async fn test_selfdev_session_and_registry() {
     assert!(loaded.is_canary, "Loaded session should be canary");
 
     let provider = Arc::new(TestProvider) as Arc<dyn provider::Provider>;
-    let registry = tool::Registry::new(provider.clone()).await;
+    let registry = tool::Registry::new(provider.clone(), None).await;
 
     let tools_before = registry.tool_names().await;
     assert!(
diff --git a/tests/e2e/ambient.rs b/tests/e2e/ambient.rs
index d92012834..9438f2b0f 100644
--- a/tests/e2e/ambient.rs
+++ b/tests/e2e/ambient.rs
@@ -203,7 +203,7 @@ async fn test_ambient_end_cycle_tool() -> Result<()> {
     ]);
 
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     registry.register_ambient_tools().await;
 
     let mut agent = Agent::new(provider, registry);
@@ -261,7 +261,7 @@ async fn test_ambient_request_permission_tool() -> Result<()> {
     ]);
 
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     registry.register_ambient_tools().await;
 
     let mut agent = Agent::new(provider, registry);
@@ -309,7 +309,7 @@ async fn test_ambient_schedule_tool() -> Result<()> {
     ]);
 
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     registry.register_ambient_tools().await;
 
     let mut agent = Agent::new(provider, registry);
@@ -585,7 +585,7 @@ async fn test_full_ambient_cycle_simulation() -> Result<()> {
     ]);
 
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     registry.register_ambient_tools().await;
 
     let mut agent = Agent::new(provider.clone(), registry);
diff --git a/tests/e2e/provider_behavior.rs b/tests/e2e/provider_behavior.rs
index 5bce2b96f..f82213547 100644
--- a/tests/e2e/provider_behavior.rs
+++ b/tests/e2e/provider_behavior.rs
@@ -25,7 +25,7 @@ async fn test_multi_turn_conversation() -> Result<()> {
     ]);
 
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     // First turn
@@ -60,7 +60,7 @@ async fn test_token_usage() -> Result<()> {
     ]);
 
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     let response = agent.run_once_capture("Test").await?;
@@ -84,7 +84,7 @@ async fn test_stream_error() -> Result<()> {
     ]);
 
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     let result = agent.run_once_capture("Test").await;
@@ -800,7 +800,7 @@ async fn test_system_prompt_no_claude_code_identity() -> Result<()> {
     // Keep a clone of Arc<MockProvider> before converting to Arc<dyn Provider>
     let provider_for_check = provider.clone();
     let provider_dyn: Arc<dyn jcode::provider::Provider> = provider;
-    let registry = Registry::new(provider_dyn.clone()).await;
+    let registry = Registry::new(provider_dyn.clone(), None).await;
     let mut agent = Agent::new(provider_dyn, registry);
 
     // Run a simple query - we just need to trigger a complete() call
diff --git a/tests/e2e/session_flow.rs b/tests/e2e/session_flow.rs
index b84df85a1..587781d8b 100644
--- a/tests/e2e/session_flow.rs
+++ b/tests/e2e/session_flow.rs
@@ -138,7 +138,7 @@ async fn test_simple_response() -> Result<()> {
     ]);
 
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
 
     let response = agent.run_once_capture("Say hello").await?;
@@ -154,7 +154,7 @@ async fn test_agent_clear_preserves_debug_flag() -> Result<()> {
     let _env = setup_test_env()?;
     let provider = MockProvider::new();
     let provider: Arc<dyn jcode::provider::Provider> = Arc::new(provider);
-    let registry = Registry::new(provider.clone()).await;
+    let registry = Registry::new(provider.clone(), None).await;
     let mut agent = Agent::new(provider, registry);
     agent.set_debug(true);
     let old_session_id = agent.session_id().to_string();