diff --git a/.agents/skills/feature-planning/SKILL.md b/.claude/.agents/skills/feature-planning/SKILL.md similarity index 100% rename from .agents/skills/feature-planning/SKILL.md rename to .claude/.agents/skills/feature-planning/SKILL.md diff --git a/.agents/skills/feature-planning/references/repo-summaries.md b/.claude/.agents/skills/feature-planning/references/repo-summaries.md similarity index 100% rename from .agents/skills/feature-planning/references/repo-summaries.md rename to .claude/.agents/skills/feature-planning/references/repo-summaries.md diff --git a/.jcode/agents/basher.toml b/.jcode/agents/basher.toml new file mode 100644 index 000000000..da53e515a --- /dev/null +++ b/.jcode/agents/basher.toml @@ -0,0 +1,80 @@ +# Basher agent. +# +# Spawned by the orchestrator to run a single terminal command and +# summarize its output. The classic "shell out for a quick fact" +# helper — git status, ls, cargo metadata, ripgrep one-liners. +# +# Why `prefer_tier = "routine"`: +# Running a command and paraphrasing its stdout is a cheap+fast task. +# A pay-per-token user with `JCODE_ROUTING_ROUTINE=` +# keeps the cost of these frequent leaf calls low. Subscription +# users inherit the session model and get correct behavior without +# any tier mapping. +# +# Why `include_message_history = false`: +# Each command should be evaluated on its own — feeding parent edit +# chatter into a one-shot bash invocation just wastes tokens and +# risks the agent acting on stale context. Clean slate per command. +# +# Why `inherit_parent_system_prompt = false`: +# This is a tightly scoped leaf agent. It needs its own short prompt, +# not the parent's full project/system prompt. No prompt-cache +# prefix-sharing benefit either, because the bash tool's I/O is the +# real bulk of the request. +# +# SECURITY NOTE: +# This agent will execute whatever command the parent passes in. 
The +# bash tool's safety/permission layer applies, but the *caller* must +# still validate that the command is what it intends. Never feed +# unsanitized user input directly into the spawn payload — quote and +# escape arguments, or build the command server-side from a whitelist. + +id = "basher" +display_name = "Basher" +publisher = "jcode" +version = "0.1.0" + +prefer_tier = "routine" +reasoning = "minimal" + +# Basher runs terminal commands — auto-approve file ops so the parent +# doesn't need to re-approve every bash call. Network/spawn still prompt. +permission_mode = "accept-edits" +max_turns = 10 + +include_message_history = false +inherit_parent_system_prompt = false +output_mode = "last_message" + +# Single tool: jcode's terminal command runner. +tool_names = ["bash"] + +# Leaf agent — does not spawn other agents. +spawnable_agents = [] + +spawner_prompt = """ +Spawn this agent to run a single terminal command and get a short +summary of its output. Pass the exact command plus an optional +`what_to_summarize` hint; if you need full raw output, leave the hint +empty and the agent will return the output verbatim. +""" + +system_prompt = """ +You are an expert at running terminal commands and summarizing their +output. + +Inputs you receive: +- the command to run (required). +- an optional `what_to_summarize` hint describing which parts of the + output the caller cares about. + +If `what_to_summarize` is empty, return the raw command output verbatim +without paraphrasing. +""" + +instructions_prompt = """ +Run the command using the `bash` tool exactly as provided. Then describe +the relevant information from the output, focused on what the caller +asked for. Be concise. Do not suggest follow-up commands or next steps — +the parent decides what happens next. 
+""" diff --git a/.jcode/agents/code-reviewer.toml b/.jcode/agents/code-reviewer.toml new file mode 100644 index 000000000..7d44e08ba --- /dev/null +++ b/.jcode/agents/code-reviewer.toml @@ -0,0 +1,80 @@ +# Code reviewer agent. +# +# Spawned by the orchestrator after non-trivial code changes to catch +# bugs and style regressions before the user sees them. Adapted from +# Codebuff's `code-reviewer`. +# +# Why `prefer_tier = "thinking"`: +# Review work benefits from reasoning. A pay-per-token user with +# `JCODE_ROUTING_THINKING=` gets the right model +# for the right job; subscription users inherit the session model. +# +# Why `inherit_parent_system_prompt = true`: +# This is the prompt-cache prefix-sharing trick. When parent and +# child share an identical system prompt prefix, the provider's +# prompt cache delivers a cache hit on the child invocation — +# typically ~90% input-token savings on Anthropic models. +# +# IMPORTANT: must leave `system_prompt` empty (validated). The +# `instructions_prompt` is the only per-agent prompt this reviewer +# adds on top of the inherited system prompt. + +id = "code-reviewer" +display_name = "Code Reviewer" +publisher = "jcode" +version = "0.1.0" + +prefer_tier = "thinking" +reasoning = "medium" + +inherit_parent_system_prompt = true +include_message_history = true +output_mode = "last_message" + +# Reviewer is read-only — plan mode denies writes without prompting. +permission_mode = "plan" +max_turns = 15 + +tool_names = [ + "read", + "grep", +] + +# Reviewers don't spawn other agents — they read, reason, and report. +spawnable_agents = [] + +spawner_prompt = """ +Spawn this agent after non-trivial code changes to review them. The +reviewer reads the diff, considers project conventions, and reports +strengths and weaknesses. Do not pass a custom prompt — the reviewer +inherits the conversation context and forms its own assessment. +""" + +# system_prompt MUST be empty when inherit_parent_system_prompt is true. 
+# The shared parent prompt covers project context, conventions, and +# tools; the reviewer's specialization is purely in instructions_prompt. + +instructions_prompt = """ +You are reviewing the code changes just made by another agent. + +Focus on: +- Correctness: does the code do what the user asked? +- Project conventions: imports, formatting, naming, error handling. +- Test coverage: are new code paths exercised? +- Edge cases: what could go wrong? What was missed? + +Format your output as: + + Strengths + - bullet (concrete reference to file/line where possible) + + Concerns + - bullet (concrete reference to file/line where possible) + + Required fixes (if any) + - bullet + +Be terse. Be specific. Do not restate code that's already in the diff. +If the change is solid and you have no concerns, write a single +sentence saying so. +""" diff --git a/.jcode/agents/editor.toml b/.jcode/agents/editor.toml new file mode 100644 index 000000000..4ab1e83d8 --- /dev/null +++ b/.jcode/agents/editor.toml @@ -0,0 +1,91 @@ +# Code editor agent. +# +# Spawned by the orchestrator to perform precise, reasoned code edits. +# Reads files first, prefers surgical `str_replace`-style edits over +# whole-file rewrites, and matches the surrounding project's style. +# +# Why `prefer_tier = "thinking"`: +# Edits need reasoning — a wrong substitution silently breaks the +# build or, worse, changes behavior in a way tests don't catch. A +# pay-per-token user with `JCODE_ROUTING_THINKING=` +# gets the right tool for the job; subscription users inherit the +# session model. +# +# Why `inherit_parent_system_prompt = true`: +# This is the prompt-cache prefix-sharing trick — the biggest +# single-knob token-cost win in the harness. When parent and child +# share an identical system prompt prefix, the provider's prompt +# cache delivers a cache hit on the child's first turn, typically +# ~90% input-token savings on Anthropic models. 
The editor is one +# of the most-spawned sub-agents, so this matters. +# +# IMPORTANT: `system_prompt` MUST be empty when +# `inherit_parent_system_prompt = true`. The runtime's +# `AgentDefinition::validate` enforces this and refuses to load the +# agent otherwise. Per-agent specialization lives in +# `instructions_prompt` only. +# +# Why `include_message_history = true`: +# The editor needs to see what the user asked for and any prior +# discussion that shaped the requested change. Without history it +# would re-derive context the parent already has. + +id = "editor" +display_name = "Code Editor" +publisher = "jcode" +version = "0.1.0" + +prefer_tier = "thinking" +reasoning = "medium" + +# Editor makes code edits — auto-approve file operations so the parent +# agent doesn't need to re-approve every str_replace/write call. +permission_mode = "accept-edits" + +inherit_parent_system_prompt = true +include_message_history = true +output_mode = "all_messages" + +# system_prompt MUST be empty when inherit_parent_system_prompt = true +# (validated at load time). Specialization is purely in +# instructions_prompt below. + +# Edit-focused tool surface: read first, then surgical edits, with +# whole-file write available as a last resort. +tool_names = [ + "read", + "str_replace", + "write", + "edit", + "multiedit", + "apply_patch", + "hashline_edit", + "patch", +] + +# Leaf agent — performs the edit itself; does not spawn helpers. +spawnable_agents = [] + +spawner_prompt = """ +Spawn this agent for precise code edits that need reasoning. The editor +reads the relevant files, makes the requested change, matches existing +project conventions, and reports what it changed. Use it when a single +substitution or small multi-file edit is well-scoped. +""" + +instructions_prompt = """ +You are an expert code editor. + +Make the requested edit: +1. Read the target file(s) first to confirm current contents. +2. 
Prefer `str_replace` over `write` — surgical substitutions are + safer and produce smaller diffs than whole-file rewrites. +3. Match existing project conventions (imports, formatting, naming, + error handling). Look at sibling code if unsure. +4. Do not introduce new dependencies. If the change appears to need + one, stop and report instead of adding it. + +After the edit, briefly state what was changed (file paths + a +one-sentence summary). Do not restate code already visible in the +edit's diff. +""" diff --git a/.jcode/agents/file-picker.toml b/.jcode/agents/file-picker.toml new file mode 100644 index 000000000..6d6e41081 --- /dev/null +++ b/.jcode/agents/file-picker.toml @@ -0,0 +1,75 @@ +# File picker agent. +# +# Spawned by the orchestrator to find files in the codebase that are +# relevant to a task. Adapted from Codebuff's `file-picker` agent. +# +# Why `prefer_tier = "routine"`: +# File picking is a fuzzy-search task — a smaller/cheaper model +# handles it well. Pay-per-token users who set +# `JCODE_ROUTING_ROUTINE=` save real money here. +# Subscription users (Claude Pro, ChatGPT Plus, ...) inherit the +# session model and get correctness without any tier mapping. +# +# Why `include_message_history = false`: +# File picker doesn't need to see prior edit chatter. A clean slate +# keeps the prompt short and avoids accidentally biasing path +# selection toward already-touched files. +# +# Why `inherit_parent_system_prompt = false`: +# Like basher, this is a tightly scoped leaf agent. It needs its own +# short prompt focused on file discovery, not the parent's full +# project/system prompt. + +id = "file-picker" +display_name = "Fletcher the File Fetcher" +publisher = "jcode" +version = "0.1.0" + +prefer_tier = "routine" +reasoning = "minimal" + +include_message_history = false +inherit_parent_system_prompt = false +output_mode = "last_message" + +# File picker is read-only — plan mode denies writes without prompting. 
+permission_mode = "plan" +max_turns = 5 + +# Tools required: read project file tree + glob fallback. Whitelist is +# checked at runtime against the tool registry; unknown tools fail loudly +# rather than silently degrading. +tool_names = [ + "ls", + "glob", + "read", +] + +# This agent is a leaf — it does not spawn other agents. +spawnable_agents = [] + +spawner_prompt = """ +Spawn this agent to find relevant files in the codebase. Provide a brief +description of what you're looking for. The agent will return up to ~12 +file paths with one-line summaries. It does fuzzy semantic search; for +exact-string searches, spawn a code searcher instead. +""" + +system_prompt = """ +You are an expert at finding relevant files in a codebase. You have the +project file tree and the user's request. Return the most relevant +files, one per line, prefixed with the path. After the list, write a +single short paragraph explaining how the files relate to the request. + +Do not read file contents — that is the parent agent's job. +Do not propose changes — that is the editor's job. +Stay focused on path discovery. +""" + +instructions_prompt = """ +Provide an extremely concise report: +1. List of relevant file paths (one per line). +2. One paragraph (<= 4 sentences) explaining the relevance. + +Do not exceed 12 paths unless the parent explicitly asks for more. +""" diff --git a/.omo/plans/multi-agent-master-plan.md b/.omo/plans/multi-agent-master-plan.md new file mode 100644 index 000000000..87e5ef7cd --- /dev/null +++ b/.omo/plans/multi-agent-master-plan.md @@ -0,0 +1,1145 @@ +# Implementation Plan: Multi-Agent System for jcode +> Generated from research across 9 repos + jcode codebase analysis +> Goal: Full multi-agent orchestration — model-driven delegation, team pipeline, DAG parallelism, agent tree lifecycle + +--- + +## 1. Executive Summary + +jcode currently has swarm visualization infrastructure (TUI, protocol, prompts) but **zero agent spawning/driving logic**. 
The LLM can talk about swarm helpers in prompts, but there's no actual `agent` tool, no agent tree, no sub-agent lifecycle, and no team pipeline. + +This plan builds a production-grade multi-agent system by synthesizing the best patterns from codex (AgentPath tree + mailbox, proven in Rust), Claude Code (tool-based delegation, the model drives everything), oh-my-pi (DAG wave parallelism), codebuff (LLM-derived pipeline + cost aggregation), and oh-my-claudecode (team lifecycle + file-based shared state). The result is a three-surface system: **model-driven delegation** (LLM calls `agent` tool), **team pipeline** (CLI-driven multi-step workflow), and **batch processing** (programmatic multi-agent jobs). + +--- + +## 2. Architecture Decision + +### Chosen Approach: Hybrid Tree + Tool + Wave + +``` +┌─────────────────────────────────────────────────────────┐ +│ AgentControl │ +│ (central registry: tree, threads, names, mailboxes) │ +│ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ /root │ │ /root/ │ │ /root/ │ │ +│ │ (user │ │ explorer │ │ worker │ │ +│ │ session) │ │ (read-only) │ │ (execute) │ │ +│ └──────┬───────┘ └──────────────┘ └──────────────┘ │ +│ │ │ +│ ┌──────┴───────┐ │ +│ │ /root/worker │ │ +│ │ /code-review │ │ +│ │ (sub-task) │ │ +│ └──────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +Three delegation modes, one agent tree: + +| Mode | Trigger | Use Case | Parallelism | +|------|---------|----------|-------------| +| **Tool-based** | LLM calls `agent` tool | Model decides to delegate | Sync/async/fork | +| **Team pipeline** | `jcode team` CLI | Plan→PRD→Exec→Verify→Fix | DAG wave | +| **Batch** | `jcode agent batch` CSV | Parallel research/review jobs | FuturesUnordered | + +### Alternatives Considered + +| Approach | Source Repo | Pros | Cons | Decision | +|----------|-------------|------|------|----------| +| AgentPath tree + mailbox | codex | Hierarchical addressing, async decoupling, Rust-native, 
production-tested | Higher initial complexity | **PRIMARY** — best fit for Rust codebase | +| Tool-based delegation | CC | Model drives everything, simple mental model, proven UX | No automated pipeline | **PRIMARY** — best UX for interactive use | +| DAG wave parallelism | oh-my-pi | Clean dependency resolution, parallel by default | Requires DAG definition upfront | **SECONDARY** — for team pipeline only | +| Centralized orchestrator | codebuff | LLM-pipeline means flexible | Spawning overhead per step | **SECONDARY** — for team pipeline | +| Tmux teams | oh-my-claudecode | Pragmatic, visible | OS-level coupling, fragile | **REFERENCE** — file-based state pattern | +| Single monolithic agent | pi-agent-rust | Simplest, zero overhead | No delegation at all | **REJECTED** — doesn't meet goal | +| Protocol-first | opencode | Clean abstraction | Over-engineered for our needs | **REJECTED** — too abstract | + +--- + +## 3. Data Structures & Types + +```rust +// === Core Agent Tree === + +/// Unique path in the agent tree. +/// Examples: "/root", "/root/explorer", "/root/worker/code-review" +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentPath(Arc); + +impl AgentPath { + pub fn root() -> Self { Self("/root".into()) } + pub fn parent(&self) -> Option; + pub fn child(&self, name: &str) -> AgentPath; + pub fn is_descendant_of(&self, ancestor: &AgentPath) -> bool; +} + +/// Agent identity — registered in AgentControl. +#[derive(Debug, Clone)] +pub struct AgentEntry { + pub id: AgentId, // UUID + pub path: AgentPath, // Tree position + pub name: String, // Human-readable nickname (unique pool) + pub role: AgentRole, + pub config: AgentConfig, + pub state: AgentState, + pub created_at: Instant, + pub ancestry: AgentAncestry, // parent_id, ancestor_ids + pub mailbox: Option, +} + +/// Role determines default model, tools, and permissions. 
+#[derive(Debug, Clone, Copy, Serialize, Deserialize)] +pub enum AgentRole { + /// General agent — full tool access, plans and executes + Default, + /// Read-only investigator — grep, read, glob, websearch only + Explorer, + /// Execute known plan — limited tools, no planning + Worker, + /// Orchestrator — delegates subtasks, synthesizes results + Orchestrator, +} + +/// Agent config bundle — inspired by opencode + codex role profiles. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentConfig { + pub model: Option, // None = inherit parent + pub system_prompt: Option, // None = inherit, Some = override + pub tools: AgentToolPolicy, + pub permissions: AgentPermissionBound, + pub max_turns: u32, // Hard stop + pub max_cost: Option, // Cost cap (USD) + pub timeout: Option, // Wall-clock timeout +} + +/// What tools this agent can use. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum AgentToolPolicy { + /// Inherit parent's tool policy + Inherit, + /// Explicit allow list + Allow(HashSet), + /// Inherit + add + Extend(HashSet), + /// No tools (chat-only) + None, +} + +/// Permission boundary — bubble model from CC. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentPermissionBound { + pub max_risk_level: RiskLevel, // Can't exceed this + pub allow_approve: bool, // Can approve own requests + pub pre_approved: Vec, // Always-ok tool calls +} + +// === Mailbox (from codex) === + +/// One-shot channel for agent communication. 
+type MailboxSender = tokio::sync::oneshot::Sender; +type MailboxReceiver = tokio::sync::oneshot::Receiver; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentMessage { + pub from: AgentPath, + pub kind: AgentMessageKind, + pub payload: serde_json::Value, + pub timestamp: Instant, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum AgentMessageKind { + /// "Do this subtask, report back" + Task { prompt: String, max_turns: u32 }, + /// "Here are the results" + Result { output: String, cost: Option }, + /// "I need more context" + RequestInfo { question: String }, + /// "Here's the info you requested" + Info { data: serde_json::Value }, + /// "Stop what you're doing" + Cancel, +} + +// === Agent spawn tool input/output === + +/// The `agent` tool that the LLM calls. +#[derive(Debug, Deserialize)] +pub struct AgentToolInput { + /// Role: "explorer", "worker", "orchestrator", or "default" + pub role: String, + /// What to do + pub prompt: String, + /// Sync (wait), async (fire-and-forget), fork (share prompt cache) + #[serde(default = "default_mode")] + pub mode: AgentSpawnMode, + /// Optional tools to add beyond role defaults + #[serde(default)] + pub extra_tools: Vec, + /// Optional max turns for this sub-agent + #[serde(default = "default_subagent_turns")] + pub max_turns: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub enum AgentSpawnMode { + #[default] + /// Wait for completion, return result + Sync, + /// Fire and forget — results logged but not returned + Async, + /// Spawn with current prompt cache — zero cold start + Fork, +} + +/// What the LLM sees after `agent` tool completes. +#[derive(Debug, Serialize)] +pub struct AgentToolOutput { + pub agent_id: String, + pub agent_path: String, + pub result: Option, // None for async + pub turn_count: u32, + pub cost: Option, + pub timed_out: bool, +} + +// === Agent tree registry === + +/// Central agent tree — thread-safe, tree-addressed. 
/// Limits on how far the agent tree may grow: nesting depth, number of
/// children under one parent, and total number of registered agents.
///
/// NOTE(review): the defaults listed on the fields below (5 / 10 / 50)
/// disagree with the `MAX_DEPTH` / `MAX_SIBLINGS` / `MAX_TOTAL` constants
/// (10 / 32 / 200) used by `impl Default for AgentThreadLimits` in the
/// `control.rs` listing of this plan — confirm which set is intended.
pub struct AgentThreadLimits {
    pub max_depth: u32, // Default: 5
    pub max_siblings: u32, // Default: 10
    pub max_total: u32, // Default: 50
}
Handle mode: + IF input.mode == Sync: + // Run sub-agent in same task, await result + result = RUN_AGENT_SESSION(config, input.prompt, parent_context) + AgentControl.complete(path) + FIRE_HOOK(SubagentStop { path, result }) + RETURN AgentToolOutput { result, ... } + + ELIF input.mode == Async: + // Spawn separate tokio task, no waiting + task = tokio::spawn(async { + result = RUN_AGENT_SESSION(config, input.prompt, parent_context) + AgentControl.complete(path) + FIRE_HOOK(SubagentStop { path, result }) + }) + RETURN AgentToolOutput { agent_id: path, result: None, ... } + + ELIF input.mode == Fork: + // Share parent's prompt cache, zero cold start + cached_prompt = parent_session.get_prompt_cache() + task = tokio::spawn(async { + result = RUN_AGENT_SESSION(config, input.prompt, + parent_context, cached_prompt) + AgentControl.complete(path) + FIRE_HOOK(SubagentStop { path, result }) + }) + RETURN AgentToolOutput { agent_id: path, result: None, ... } + + END +END +``` + +### 4b. Agent Turn Loop (Sub-Agent Runtime) + +``` +FUNCTION run_agent_session(config, prompt, parent_context, cached_prompt?): + // 1. Create isolated session context + session = AgentSession { + config, + context: parent_context.clone(), + prompt_cache: cached_prompt, + turn_count: 0, + accumulated_cost: 0.0, + mailbox: rx from spawn, + } + + // 2. Execute turn loop + WHILE session.turn_count < config.max_turns: + // Check mailbox for parent messages + IF session.mailbox has message: + IF message.kind == Cancel: + RETURN Result { output: "cancelled", ... 
} + ELIF message.kind == RequestInfo: + SEND response back via oneshot + CONTINUE + + // Normal LLM turn + response = LLM_CALL(session.context) + session.turn_count++ + session.accumulated_cost += response.cost + + // Process tool calls + FOR tool_call in response.tool_calls: + IF tool_call.name == "agent": + // Nested delegation — recursive spawn + sub_result = spawn_agent(session, tool_call.input) + ADD sub_result to session.context + ELSE: + result = EXECUTE_TOOL(tool_call) + ADD result to session.context + + // Check cost cap + IF config.max_cost && session.accumulated_cost > config.max_cost: + RETURN Result { output: "cost limit exceeded", ... } + + // Check if done (no tool calls = final answer) + IF response.tool_calls is empty: + RETURN Result { output: response.text, cost: session.accumulated_cost } + + RETURN Result { output: "max turns reached", ... } +END +``` + +### 4c. Team Pipeline (DAG Wave Execution) + +``` +FUNCTION execute_team_pipeline(steps: Vec): + // 1. Build DAG from depends_on edges + dag = BUILD_DAG(steps) // adjacency list + in-degree count + + // 2. Decompose into topological waves + waves = TOPOLOGICAL_WAVES(dag) + // Wave 0: steps with no dependencies + // Wave 1: steps whose deps are all in wave 0 + // ... + + // 3. Execute wave by wave + step_results = Map + + FOR wave in waves: + // Run all steps in this wave in parallel + handles = [] + FOR step in wave: + handle = tokio::spawn(async { + // Inherit context from parent + prev wave results + context = BUILD_CONTEXT(step, step_results) + result = spawn_agent(parent, { + role: step.agent_role, + prompt: step.prompt, + mode: Sync, + }) + // Store result for dependent steps + step_results[step.id] = result + }) + handles.push(handle) + + // Wait for entire wave (fail-one = fail-wave) + FOR handle in handles: + await handle + + // Fire wave-complete hook + FIRE_HOOK(WaveComplete { wave_index: wave.wave_index }) + + RETURN step_results +END +``` + +--- + +## 5. 
Implementation Code & Modules + +### New Cargo Crate: `jcode-agent-tree` + +``` +crates/jcode-agent-tree/ + Cargo.toml + src/ + lib.rs — re-exports + path.rs — AgentPath type + entry.rs — AgentEntry, AgentConfig, AgentRole + control.rs — AgentControl (registry, thread limits) + mailbox.rs — MailboxSender/Receiver, AgentMessage + serialization.rs — tree save/restore +``` + +### `path.rs` + +```rust +use std::sync::Arc; +use serde::{Serialize, Deserialize}; + +/// Tree-addressed agent path. +/// Always starts with "/root". Examples: +/// "/root" +/// "/root/explorer" +/// "/root/worker/code-review" +#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)] +pub struct AgentPath(Arc); + +impl AgentPath { + pub fn root() -> Self { + Self("/root".into()) + } + + /// Parse from string — validates format. + pub fn parse(s: &str) -> Result { + if !s.starts_with('/') { + return Err(AgentPathError::InvalidFormat); + } + if s == "/" { + return Err(AgentPathError::TooShort); + } + // Must not end with / + if s.ends_with('/') && s.len() > 1 { + return Err(AgentPathError::TrailingSlash); + } + Ok(Self(s.into())) + } + + /// Create child path: /root/foo + "bar" = /root/foo/bar + pub fn child(&self, name: &str) -> Self { + let parent = self.0.as_ref(); + if parent.ends_with('/') { + Self(format!("{}{}", parent, name).into()) + } else { + Self(format!("{}/{}", parent, name).into()) + } + } + + /// Parent path or None if root. + pub fn parent(&self) -> Option { + let s = self.0.as_ref(); + if s == "/root" { + return None; + } + let last_slash = s.rfind('/')?; + if last_slash == 0 { + return Some(Self("/root".into())); + } + Some(Self(s[..last_slash].into())) + } + + /// Depth: /root = 0, /root/explorer = 1 + pub fn depth(&self) -> usize { + self.0.chars().filter(|&c| c == '/').count().saturating_sub(1) + } + + /// Is this path a descendant of ancestor? 
+ pub fn is_descendant_of(&self, ancestor: &AgentPath) -> bool { + let self_s = self.0.as_ref(); + let anc_s = ancestor.0.as_ref(); + self_s.starts_with(anc_s) && self_s.len() > anc_s.len() + && self_s.as_bytes().get(anc_s.len()) == Some(&b'/') + } + + pub fn as_str(&self) -> &str { + self.0.as_ref() + } +} +``` + +### `control.rs` + +```rust +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::{RwLock, Mutex, oneshot}; +use std::time::Instant; + +use crate::path::AgentPath; +use crate::entry::{AgentEntry, AgentRole, AgentConfig, AgentState}; + +/// Maximum thread limits for safety. +const MAX_DEPTH: u32 = 10; +const MAX_SIBLINGS: u32 = 32; +const MAX_TOTAL: u32 = 200; + +/// Central agent tree — thread-safe singleton. +pub struct AgentControl { + inner: Arc>, + name_pool: Arc>, + limits: AgentThreadLimits, +} + +struct AgentTreeInner { + agents: HashMap, + parent_children: HashMap>, + next_id: u64, +} + +pub struct AgentThreadLimits { + pub max_depth: u32, + pub max_siblings: u32, + pub max_total: u32, +} + +impl Default for AgentThreadLimits { + fn default() -> Self { + Self { + max_depth: MAX_DEPTH, + max_siblings: MAX_SIBLINGS, + max_total: MAX_TOTAL, + } + } +} + +impl AgentControl { + pub fn new() -> Self { + let inner = AgentTreeInner { + agents: HashMap::new(), + parent_children: HashMap::new(), + next_id: 1, + }; + Self { + inner: Arc::new(RwLock::new(inner)), + name_pool: Arc::new(Mutex::new(NamePool::new())), + limits: AgentThreadLimits::default(), + } + } + + /// Register a new agent in the tree. + /// Returns error if thread limits would be exceeded. 
+ pub async fn register( + &self, + parent_path: &AgentPath, + name: &str, + role: AgentRole, + config: AgentConfig, + mailbox: oneshot::Sender<...>, + ) -> Result { + let mut inner = self.inner.write().await; + + // Check max total + if inner.agents.len() as u32 >= self.limits.max_total { + return Err(AgentControlError::MaxTotalAgents); + } + + // Check depth + let depth = parent_path.depth() + 1; + if depth > self.limits.max_depth { + return Err(AgentControlError::MaxDepth(depth)); + } + + // Check siblings + let siblings = inner.parent_children.get(parent_path) + .map(|v| v.len()) + .unwrap_or(0); + if siblings >= self.limits.max_siblings as usize { + return Err(AgentControlError::MaxSiblings(siblings)); + } + + // Generate unique name + let unique_name = self.name_pool.lock().unwrap() + .allocate(name); + + let path = parent_path.child(&unique_name); + let id = inner.next_id; + + let entry = AgentEntry { + id, + path: path.clone(), + name: unique_name.clone(), + role, + config, + state: AgentState::Spawning, + created_at: Instant::now(), + mailbox, + }; + + inner.agents.insert(path.clone(), entry); + inner.parent_children + .entry(parent_path.clone()) + .or_default() + .push(path.clone()); + inner.next_id += 1; + + Ok(path) + } + + /// Find agent by path. + pub async fn get(&self, path: &AgentPath) -> Option { + self.inner.read().await.agents.get(path).cloned() + } + + /// List children of a path. + pub async fn children(&self, path: &AgentPath) -> Vec { + self.inner.read().await + .parent_children.get(path) + .cloned() + .unwrap_or_default() + } + + /// Shutdown an agent and all its descendants (recursive). 
+ pub async fn shutdown_tree(&self, path: &AgentPath) { + let mut inner = self.inner.write().await; + let children = inner.parent_children.get(path).cloned().unwrap_or_default(); + + for child_path in &children { + if let Some(entry) = inner.agents.get(child_path) { + if let Some(tx) = &entry.mailbox { + let _ = tx.send(AgentMessage::shutdown()); + } + } + } + // Remove from parent's children list + if let Some(parent) = path.parent() { + if let Some(siblings) = inner.parent_children.get_mut(&parent) { + siblings.retain(|p| p != path); + } + } + inner.agents.remove(path); + } + + /// Complete an agent (success or failure) + pub async fn complete(&self, path: &AgentPath, state: AgentState) { + let mut inner = self.inner.write().await; + if let Some(entry) = inner.agents.get_mut(path) { + entry.state = state; + } + } + + /// Serialize the agent tree for display. + pub async fn snapshot(&self) -> Vec { + self.inner.read().await.agents.values().cloned().collect() + } +} + +// === Name pool (unique agent nicknames) === + +struct NamePool { + used: HashSet, + counters: HashMap, +} + +impl NamePool { + fn new() -> Self { + Self { + used: HashSet::new(), + counters: HashMap::new(), + } + } + + fn allocate(&mut self, base: &str) -> String { + let counter = self.counters.entry(base.to_string()).or_insert(0); + *counter += 1; + let name = format!("{}-{}", base, *counter); + self.used.insert(name.clone()); + name + } +} +``` + +### Modifications to Existing Files + +#### `crates/jcode-app-core/src/agent/mod.rs` — New `agent` tool + +```rust +/// The `agent` tool — lets the LLM spawn sub-agents. +pub struct AgentTool { + agent_control: Arc, + session_registry: Arc, +} + +#[async_trait] +impl Tool for AgentTool { + fn name(&self) -> &str { "agent" } + fn description(&self) -> &str { + "Spawn a sub-agent to work on a task. Use sync mode to get the result back, \ + async for fire-and-forget, fork to reuse the current prompt cache. 
\ + Roles: explorer (read-only), worker (execute), orchestrator (plan+delegate)." + } + + async fn execute(&self, input: Value, ctx: ToolContext) -> ToolOutput { + let input: AgentToolInput = serde_json::from_value(input)?; + // Validate role + let role = AgentRole::from_str(&input.role) + .map_err(|_| ToolError::InvalidParam("role"))?; + + // Build config from role defaults + overrides + let config = self.build_config(&ctx, role, &input); + + // Create mailbox + let (tx, rx) = oneshot::channel(); + + // Register in tree + let parent_path = ctx.agent_path(); // from session runtime + let path = self.agent_control.register( + &parent_path, &role.to_string(), role, config, tx + ).await?; + + // Fire hook + fire_hook(HookEvent::SubagentStart { + parent: parent_path.to_string(), + child: path.to_string(), + role: role.to_string(), + }).await; + + // ... spawn session and run ... + } +} +``` + +#### `src/cli/args.rs` — New subcommands + +```rust +pub(crate) enum Command { + // ... existing ... 
+ /// Multi-agent team orchestration + #[command(subcommand)] + Team(TeamCommand), + /// Sub-agent tree management + #[command(subcommand)] + Agent(AgentCommand), +} + +#[derive(Subcommand)] +pub(crate) enum TeamCommand { + /// Start a team pipeline from a plan file + Start { + /// Path to plan file (YAML/TOML) + plan: PathBuf, + /// Number of parallel workers + #[arg(long, default_value = "4")] + workers: u32, + }, + /// Show team status + Status, + /// Stop a running team + Stop { + /// Team ID (from `team start`) + team_id: String, + }, +} + +#[derive(Subcommand)] +pub(crate) enum AgentCommand { + /// List all sub-agents in tree + List, + /// Show agent tree + Tree, + /// Kill a sub-agent by path + Kill { + path: String, + }, + /// Get agent status + Status { + path: String, + }, +} +``` + +#### `src/cli/dispatch.rs` — Route new commands + +```rust +Command::Team(cmd) => { + match cmd { + TeamCommand::Start { plan, workers } => { + let plan = parse_plan_file(&plan)?; + runtime.execute_team_pipeline(plan, workers).await?; + } + TeamCommand::Status => { + let tree = runtime.agent_control().snapshot().await; + // Print formatted table + } + TeamCommand::Stop { team_id } => { + runtime.agent_control() + .shutdown_tree(&AgentPath::parse(&format!("/root/{}", team_id))?) 
+ .await; + } + } +} +``` + +#### Integration into Agent Turn Loop + +In `turn_streaming_mpsc.rs`, the existing soft-interrupt points already provide hooks for sub-agent injection: + +- **Point A (pre-API)**: Check sub-agent mailbox for incoming messages (Cancel, RequestInfo) +- **Point B (post-response)**: Process `agent` tool calls from the model +- **Point C (between tools)**: Check for sub-agent result availability +- **Point D (after all tools)**: Fire SubagentStop hooks, propagate results + +```rust +// In the agent turn loop, after tool call processing: +if tool_call.name == "agent" { + let input: AgentToolInput = serde_json::from_value(tool_call.input)?; + let result = AgentTool::execute(input, ctx).await; + // result goes back as a regular tool result + context.add_tool_result(tool_call.id, result); +} +``` + +--- + +## 6. Configuration & Wiring + +### `~/.jcode/config.toml` — Agent section + +```toml +[agents] +# Max sub-agents in the tree +max_total = 50 +# Max delegation depth +max_depth = 5 +# Max siblings per parent +max_siblings = 10 +# Default agent timeout +default_timeout = "300s" +# Default max turns +default_max_turns = 50 + +[agents.roles.explorer] +model = "claude-sonnet-4-20250514" +tools = ["read", "grep", "glob", "websearch", "web_fetch"] +max_turns = 20 +permissions = { max_risk_level = "read_only", allow_approve = false } + +[agents.roles.worker] +model = "claude-sonnet-4-20250514" +tools = ["read", "write", "edit", "bash", "grep", "glob"] +max_turns = 50 +permissions = { max_risk_level = "standard", allow_approve = false } + +[agents.roles.orchestrator] +model = "claude-opus-4-20250514" +tools = "*" # All available tools +max_turns = 30 +permissions = { max_risk_level = "elevated", allow_approve = true } +``` + +### Env Vars (in `disable-registry` style) + +| Env Var | Effect | +|---------|--------| +| `JCODE_DISABLE_AGENT_TREE=1` | Disable all multi-agent features | +| `JCODE_MAX_AGENTS=10` | Override max_total at process level | +| 
`JCODE_AGENT_TIMEOUT_MS=60000` | Per-agent timeout override | + +### Integration Points Checklist + +| File | Change | Priority | +|------|--------|----------| +| `Cargo.toml` (workspace) | Add `jcode-agent-tree` crate | P0 | +| `crates/jcode-agent-tree/src/lib.rs` | New crate — AgentPath, AgentTree, Mailbox | P0 | +| `crates/jcode-app-core/src/tool/mod.rs` | Register `AgentTool` | P0 | +| `crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs` | Handle `agent` tool calls in turn loop | P0 | +| `src/cli/args.rs` | Add `Team` + `Agent` subcommands | P1 | +| `src/cli/dispatch.rs` | Route team/agent commands | P1 | +| `crates/jcode-base/src/config.rs` | Add `[agents]` config section | P1 | +| `crates/jcode-protocol/src/wire.rs` | Add SubagentStart/Stop events | P1 | +| `crates/jcode-tui/src/tui/app.rs` | Display agent tree in side panel | P2 | +| `crates/jcode-tui/src/tui/ui.rs` | Agent tree widget | P2 | + +--- + +## 7. Repo References + +| Feature Aspect | Repo | File | Link | +|----------------|------|------|------| +| AgentPath tree | codex | cli/kernel/agents/agent_path.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/agent_path.rs | +| Mailbox | codex | cli/kernel/agents/mailbox.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/mailbox.rs | +| AgentControl | codex | cli/kernel/agents/agent_control.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/agent_control.rs | +| Batch CSV | codex | cli/kernel/agents/spawn.rs | https://github.com/openai/codex/blob/main/cli/kernel/agents/spawn.rs | +| Agent tool | CC | src/tools/agent.ts | https://github.com/claude-code-best/claude-code/blob/main/src/tools/agent.ts | +| Subagent hooks | CC | src/services/hooks.ts | https://github.com/claude-code-best/claude-code/blob/main/src/services/hooks.ts | +| DAG wave | oh-my-pi | src/agent/swarm/DAGSwarm.ts | https://github.com/can1357/oh-my-pi/blob/main/src/agent/swarm/DAGSwarm.ts | +| EventBus | oh-my-pi | src/agent/EventBus.ts | 
https://github.com/can1357/oh-my-pi/blob/main/src/agent/EventBus.ts | +| Pipeline orchestration | codebuff | src/orchestrator/Buffy.ts | https://github.com/CodebuffAI/codebuff/blob/main/src/orchestrator/Buffy.ts | +| Team pipeline | oh-my-claudecode | src/team/index.ts | https://github.com/Yeachan-Heo/oh-my-claudecode/blob/main/src/team/index.ts | +| Spawn agent | oh-my-openagent | src/agents/agentOrchestration.ts | https://github.com/code-yeongyu/oh-my-openagent/blob/main/src/agents/agentOrchestration.ts | +| Fork subagent | oh-my-claudecode | src/team/agents.ts | https://github.com/Yeachan-Heo/oh-my-claudecode/blob/main/src/team/agents.ts | +| Agent posture gating | oh-my-codex | src/orchestrator/posture.ts | https://github.com/Yeachan-Heo/oh-my-codex/blob/main/src/orchestrator/posture.ts | +| jcode existing swarm TUI | jcode | crates/jcode-tui/src/tui/app.rs | — | +| jcode existing orchestration API | jcode | src/orchestration_api.rs | — | + +--- + +## 8. Test Cases + +### Unit Tests + +```rust +// === AgentPath tests === +#[test] +fn test_agent_path_root() { + let root = AgentPath::root(); + assert_eq!(root.as_str(), "/root"); + assert_eq!(root.depth(), 0); + assert!(root.parent().is_none()); +} + +#[test] +fn test_agent_path_child() { + let root = AgentPath::root(); + let explorer = root.child("explorer"); + assert_eq!(explorer.as_str(), "/root/explorer"); + assert_eq!(explorer.depth(), 1); + assert_eq!(explorer.parent().unwrap().as_str(), "/root"); +} + +#[test] +fn test_agent_path_is_descendant() { + let root = AgentPath::root(); + let worker = root.child("worker"); + let task = worker.child("code-review"); + assert!(task.is_descendant_of(&root)); + assert!(task.is_descendant_of(&worker)); + assert!(!worker.is_descendant_of(&task)); +} + +#[test] +fn test_agent_path_parse_valid() { + let p = AgentPath::parse("/root/explorer").unwrap(); + assert_eq!(p.as_str(), "/root/explorer"); +} + +#[test] +fn test_agent_path_parse_invalid() { + 
assert!(AgentPath::parse("/").is_err()); + assert!(AgentPath::parse("root").is_err()); +} + +// === AgentControl tests === + +#[tokio::test] +async fn test_register_agent() { + let ctrl = AgentControl::new(); + let root = AgentPath::root(); + let (tx, _rx) = oneshot::channel(); + + let path = ctrl.register(&root, "explorer", AgentRole::Explorer, + AgentConfig::default(), tx).await.unwrap(); + + assert!(path.as_str().starts_with("/root/explorer-")); + assert!(ctrl.get(&path).await.is_some()); +} + +#[tokio::test] +async fn test_max_depth_enforced() { + let ctrl = AgentControl::new(); + let mut path = AgentPath::root(); + for i in 0..12 { // max_depth = 10 + let (tx, _rx) = oneshot::channel(); + let result = ctrl.register(&path, "deep", AgentRole::Worker, + AgentConfig::default(), tx).await; + if i >= 10 { + assert!(result.is_err()); + } else { + path = result.unwrap(); + } + } +} + +#[tokio::test] +async fn test_shutdown_tree() { + let ctrl = AgentControl::new(); + let root = AgentPath::root(); + let (tx1, _rx1) = oneshot::channel(); + let (tx2, _rx2) = oneshot::channel(); + let p1 = ctrl.register(&root, "a", AgentRole::Explorer, + AgentConfig::default(), tx1).await.unwrap(); + let p2 = ctrl.register(&p1, "b", AgentRole::Worker, + AgentConfig::default(), tx2).await.unwrap(); + + ctrl.shutdown_tree(&root).await; + assert!(ctrl.get(&p1).await.is_none()); + assert!(ctrl.get(&p2).await.is_none()); +} + +// === AgentTool tests === + +#[tokio::test] +async fn test_agent_tool_spawn_sync() { + // Setup: create session, register AgentTool, call with input + let tool = AgentTool::new(agent_control, session_registry); + let input = serde_json::json!({ + "role": "explorer", + "prompt": "Check if Cargo.toml exists", + "mode": "sync" + }); + let ctx = ToolContext::test(); + let output = tool.execute(input, ctx).await; + assert!(output.result.is_some()); + assert!(output.turn_count > 0); +} + +#[tokio::test] +async fn test_agent_tool_invalid_role() { + let tool = 
AgentTool::new(agent_control, session_registry); + let input = serde_json::json!({ + "role": "superhero", // Invalid + "prompt": "Do something" + }); + let result = tool.execute(input, ToolContext::test()).await; + assert!(result.is_err()); +} +``` + +### Integration Tests + +```rust +#[tokio::test] +async fn test_subagent_result_propagates_to_parent() { + // 1. Start parent session via orchestration API + // 2. Parent calls `agent` tool with sync mode + // 3. Sub-agent runs, does some work, returns result + // 4. Verify parent's next turn includes sub-agent result + todo!("End-to-end: spawn parent → parent spawns child → child returns → parent sees result"); +} + +#[tokio::test] +async fn test_agent_tree_persistence() { + // 1. Create agent tree with multiple agents + // 2. Serialize to JSON + // 3. Deserialize + // 4. Verify all paths and entries match + todo!("Agent tree save/restore round-trip"); +} + +#[tokio::test] +async fn test_team_pipeline_dag_wave() { + // 1. Define 5-step DAG: step2 depends on step1, step3 on step1, step4 on step2+3 + // 2. Execute pipeline + // 3. Verify wave order: wave0=[step1], wave1=[step2,step3], wave2=[step4] + // 4. Verify all results present + todo!("DAG execution respects topological order"); +} +``` + +--- + +## 9. 
Benchmarks + +| Metric | Baseline (no multi-agent) | Target | How to Measure | +|--------|---------------------------|--------|----------------| +| Sub-agent spawn latency | N/A | < 100ms (in-process) | `time` before/after `register()` call | +| Sub-agent LLM first-token | N/A | Same as parent (fork) + 500ms (sync) | Measure TTFT of sub-agent vs parent | +| Memory per sub-agent | N/A | < 50MB baseline + 10MB per active agent | `alloc` profiling | +| Agent tree — 100 agents | N/A | Lookup < 1µs, register < 10µs | Criterion bench | +| DAG wave — 20 steps / 4 waves | N/A | Total < serial time / 3 | Integration timer | +| Cost tracking overhead | N/A | < 0.1% of total API cost | Differential measurement | + +--- + +## 10. Migration / Rollout + +**Phase 1 — Foundation (estimate: 1-2 weeks)** +- New crate `jcode-agent-tree` with AgentPath, AgentControl, Mailbox +- Unit tests for tree operations +- No agent tool yet — infrastructure only +- **Risk**: None (new crate, no existing code touched) + +**Phase 2 — Agent Tool (estimate: 1 week)** +- `AgentTool` implementation: sync + async + fork modes +- Integration into agent turn loop +- Wire hooks (SubagentStart/SubagentStop) to existing hook system +- **Risk**: Medium — turn loop changes must not break single-agent mode + +**Phase 3 — CLI + Config (estimate: 1 week)** +- `jcode agent list/tree/kill/status` commands +- `jcode team start/status/stop` commands +- `[agents]` config section in config.toml +- **Risk**: Low — CLI and config are additive + +**Phase 4 — Team Pipeline + Batch (estimate: 1 week)** +- DAG pipeline executor (plan file → waves → results) +- Batch CSV agent spawning +- TUI agent tree visualization +- **Risk**: Low — builds on Phase 1-3 foundation + +### Feature Flag +All multi-agent functionality gated behind `JCODE_DISABLE_AGENT_TREE` kill-switch (from disable-env system). When disabled, `agent` tool returns "multi-agent disabled" error, team CLI commands error out, and agent tree stays empty. 
+ +--- + +## 11. Known Limitations & Future Work + +- [ ] **Cross-process sub-agents**: Current design is in-process only. Future: sub-agents as separate `jcode` processes via the protocol layer. +- [ ] **Agent checkpoint/resume**: Sub-agents that survive parent restart — requires session persistence. +- [ ] **Prompt cache sharing (Fork)**: Full fork mode requires the LLM provider to support prompt cache snapshots. Phase 1 fork = copy context (not true cache sharing). +- [ ] **Inter-agent streaming**: Sub-agents can only communicate via mailbox messages (discrete), not streaming. Future: SSE-based streaming between agents. +- [ ] **Cost optimization**: No sub-agent cost optimization yet (e.g., cheaper model for explorer). +- [ ] **Agent governance**: No per-user agent quotas, no team-based agent pools. +- [ ] **Swarm replay export**: jcode already has `export_swarm_video()` in the TUI — tie this into agent tree history. + +--- + +## 12. Success Criteria Checklist + +- [ ] `AgentPath` type supports hierarchical addressing, parent/child traversal, depth checks +- [ ] `AgentControl` enforces thread limits (depth, siblings, total) +- [ ] Mailbox-based communication works: parent sends task, agent receives, agent sends result, parent receives +- [ ] `agent` tool call spawns a sub-agent with correct role defaults +- [ ] Sync mode: parent waits, gets result with turn count + cost +- [ ] Async mode: parent continues immediately, result logged +- [ ] SubagentStart/SubagentStop hooks fire correctly +- [ ] `jcode agent list` shows all active agents with paths +- [ ] `jcode agent kill /root/worker-1` terminates agent + children +- [ ] `jcode agent tree` prints hierarchical tree view +- [ ] `jcode team start` reads plan file, executes waves, reports results +- [ ] `jcode team stop <team_id>` cancels all running agents in the team +- [ ] DAG pipeline executes steps in correct topological wave order +- [ ] Cost aggregation: parent's cost includes all children's costs +- [ ] 
`JCODE_DISABLE_AGENT_TREE=1` disables all multi-agent features +- [ ] Existing single-agent behavior unchanged (regression test pass) +- [ ] 50 concurrent agents don't overwhelm the runtime diff --git a/.omo/plans/pr-313-review.md b/.omo/plans/pr-313-review.md new file mode 100644 index 000000000..44253c131 --- /dev/null +++ b/.omo/plans/pr-313-review.md @@ -0,0 +1,255 @@ +# PR #313 Review: jcode Multi-Agent Foundation vs 9 Reference Repos + +> **Date**: 2026-06-05 +> **Reviewer**: Claude Opus 4.8 (feature-planning skill) +> **PR**: #313 — `experimental/multi-agent-foundation` → `master` +> **Scope**: +5775 / -94 lines, 28 files, 7 commits + +--- + +## 1. Per-Dimension Comparison Tables + +### 1A. Agent Definition Schema + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Format** | TOML | TS imperative + `handleSteps` | N/A (TUI) | Markdown + YAML frontmatter | Markdown + YAML | TS imperative | Markdown + YAML | Markdown + YAML | Rust runtime | N/A | +| **Schema validation** | `serde(deny_unknown_fields)` | Zod runtime | TS types | Zod (lazy) | Effect `Schema.Class` | TS types | YAML parse | YAML parse | serde derive | N/A | +| **`model` field** | optional (`model_override` + `prefer_tier`) | **required** | N/A | optional (`inherit`) | optional | **required** | optional | optional | N/A | env var stack | +| **`reasoning`/`effort`** | `ReasoningEffort` enum (4 levels) | `reasoningOptions.effort` (5 levels) + `max_tokens` | N/A | `effort` enum + integer | `variant` per-model | `Effort` enum | `ModelV2.VariantID` | N/A | N/A | N/A | +| **`outputMode`** | `last_message`/`all_messages`/`structured_output` | identical | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **`tool_names`** | whitelist 
(deny-by-default) | whitelist + MCP servers | built-in list | `tools` + `disallowedTools` | optional from registry | `loadMode` + `tier` | tool registry | tool allowlist | optional | N/A | +| **`spawnable_agents`** | whitelist | `publisher/agent@version` | N/A | N/A (model drives) | N/A | N/A | N/A | N/A | N/A | N/A | +| **`inherit_parent_system_prompt`** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **`include_message_history`** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **`handleSteps`** | N/A (Phase 2) | ✅ Generator | N/A | N/A | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A | +| **`permissionMode`** | N/A | N/A | N/A | ✅ per-agent | ✅ per-agent | `ToolTier` per-tool | N/A | N/A | N/A | N/A | +| **`maxTurns`** | N/A | N/A | N/A | ✅ per-agent | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A | +| **`isolation`** | N/A | N/A | N/A | `worktree`/`remote` | N/A | N/A | N/A | `worktree` (git) | N/A | N/A | +| **`mcpServers`** | N/A | ✅ per-agent | N/A | ✅ per-agent | N/A | N/A | N/A | ✅ MCP server | N/A | N/A | +| **`hooks`** | N/A | N/A | N/A | ✅ per-agent | N/A | N/A | N/A | N/A | N/A | N/A | +| **`memory` scope** | N/A | N/A | N/A | `user`/`project`/`local` | N/A | N/A | N/A | N/A | N/A | N/A | + +--- + +### 1B. 
Agent Registry / Discovery + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Discovery paths** | 3-tier: project > user > builtin | `.agents/` local | N/A | `.claude/agents/*.md` + settings | `.opencode/agents/*.md` + `modes/` | N/A | N/A | N/A | N/A | N/A | +| **Priority order** | project > user > builtin | built-in first | N/A | built-in first | primary source glob | N/A | N/A | N/A | N/A | N/A | +| **Filename == id check** | ✅ enforced | ❌ | N/A | ❌ | ❌ | N/A | N/A | N/A | N/A | N/A | +| **Non-fatal errors** | ✅ collected for `doctor` | throws | N/A | log + skip | throws | N/A | N/A | N/A | N/A | N/A | +| **On-disk format** | TOML | TS | N/A | Markdown | Markdown | N/A | N/A | N/A | N/A | N/A | +| **Reload at runtime** | not yet | no | N/A | cache + plugin invalidation | `update` API | N/A | N/A | N/A | N/A | N/A | + +--- + +### 1C. 
Model Routing / Tier + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Approach** | env-var slots + session inherit | OpenRouter catalog | `JCODE_ROUTING_*` env vars | `inherit` | `ModelV2.parse` | dynamic `ModelV2` | `ModelResolutionPipeline` (5 stages) | via Claude session | direct | env var stack | +| **Slot/tier concept** | `Routine`/`Thinking` | no (literal model id) | `ROUTINE`+`THINKING`+`THRESHOLD` | no | variant per-provider | model string | catalog aliases | no | no | default + fallback | +| **Fallback chain** | 3-level: override > env > session | OpenRouter routing | N/A | N/A | provider fallback | `resolveModelWithFallback` | 5-stage pipeline | N/A | per-provider | 2-tier fallback | +| **Predefined catalog** | **no** (intentional) | yes (100+ models) | no | no | yes (`models-dev.ts`) | no | yes (60+ models) | no | no | no | +| **Provider abstraction** | no (single OAuth) | OpenRouter | multi-provider | Anthropic | multi-provider | 40+ providers | multi-provider | Anthropic | 15+ providers | Codex only | + +--- + +### 1D. 
Agent Lifecycle / Spawn + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Agent tree** | N/A | N/A | ✅ `AgentPath` + `ThreadSpawnEdgeStatus` | `team_name` (1:1 TaskList) | `mode: subagent/primary/all` | runtime | `boulder-state` (worktrees) | `team jobs` | session tree | N/A | +| **Spawn tool** | N/A (schema only) | `spawn_agents` | `SpawnAgent`/`WaitAgent`/`CloseAgent`/`SendMessage`/`AssignAgentTask` | `Agent` tool + `TeamCreate` | delegation via tools | N/A | `delegate_task` | `omc_team_start` CLI | N/A | N/A | +| **Message bus** | N/A | output return | `InterAgentCommunication` + delivery edges | `SendMessage` tool | N/A | N/A | `shared-state.ts` | `omc-team-state.ts` | N/A | N/A | +| **Parallel execution** | N/A | `Promise.all` | DAG traversal | concurrent teammates | concurrent | DAG wave | sequential | sequential | N/A | N/A | +| **Worktree isolation** | N/A | N/A | N/A | ✅ `isolation: worktree/remote` | N/A | N/A | N/A | ✅ git worktree cleanup | N/A | N/A | +| **`maxTurns`** | N/A | N/A | N/A | ✅ per-agent | `steps: PositiveInt` | N/A | N/A | N/A | N/A | N/A | +| **Job persistence** | N/A | N/A | ✅ SQLite `agent_jobs` | team config JSON | N/A | N/A | `boulder-state` file | `OMC_JOBS_DIR` artifacts | session JSONL | N/A | + +--- + +### 1E. 
Permission / Safety + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Permission system** | **existing** `SafetySystem` + `ActionTier` | none | sandbox | `PermissionMode` per-agent (default/auto/ask/deny) | `PermissionV2.Ruleset` (allow/deny/ask) per-agent | `ToolTier` (read/write/exec) + approval modes | MCP allowlist | plugin/team scopes | none | `OMX_*` env controls | +| **Per-agent policy** | **gap** — tool whitelist only | tool whitelist | N/A | ✅ `permissionMode` field | ✅ `permissions` array | ✅ `tier` on each tool | N/A | N/A | N/A | N/A | +| **Classification levels** | 2 (auto/permission) | N/A | N/A | 4 (default/auto/ask/deny) | 3 (allow/deny/ask) | 3 (read/write/exec) | N/A | N/A | N/A | N/A | +| **Auto-approve for sub-agents** | **not wired** | via `handleSteps` | N/A | via `permissionMode` | N/A | tool-tier-based | N/A | N/A | N/A | N/A | +| **TUI permission flow** | ✅ `PermissionsApp` (existing) | none | none | none (CLI only) | N/A | N/A | N/A | N/A | N/A | N/A | +| **`disallowedTools`** | N/A | N/A | N/A | ✅ | N/A | `hidden` field | N/A | N/A | N/A | N/A | + +--- + +### 1F. 
Tool Execution + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Tool registry** | whitelist strings in TOML | typed `ToolName` union | hard-coded | `getTools()` config | `ToolsProvider` | `AgentTool` interface | tool discovery | MCP servers | typed `Tool` trait | sparkshell bridge | +| **Concurrency control** | N/A | N/A | N/A | N/A | N/A | ✅ `shared`/`exclusive` | N/A | N/A | N/A | N/A | +| **`loadMode`** | N/A | N/A | N/A | N/A | N/A | ✅ `essential`/`discoverable` | N/A | N/A | N/A | N/A | +| **`deferrable`** | N/A | ✅ | N/A | N/A | N/A | ✅ | N/A | N/A | N/A | N/A | +| **`nonAbortable`** | N/A | N/A | N/A | N/A | N/A | ✅ | N/A | N/A | N/A | N/A | +| **Validation** | runtime (registry) | Zod args | sandbox | Zod | Effect Schema | Zod (`zodToWireSchema`) | Zod | Zod | typed Rust | typed Rust | +| **`beforeToolCall` hook** | N/A | N/A | N/A | N/A | N/A | ✅ (block/transform) | N/A | N/A | N/A | N/A | +| **`afterToolCall` hook** | N/A | N/A | N/A | N/A | N/A | ✅ (override) | N/A | N/A | N/A | N/A | +| **Structured output** | ✅ `OutputMode::StructuredOutput` | ✅ `set_output` + `outputSchema` | N/A | N/A | N/A | `set_output` | N/A | N/A | N/A | N/A | + +--- + +### 1G. 
Eval / Benchmark + +| Aspect | **jcode PR #313** | codebuff (BuffBench) | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|---------------------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Approach** | git-commit reconstruction (scaffold) | git-commit reconstruction (production) | e2e + bench scripts | N/A | N/A | LSP+DAP benchmarks | smoke tests | integration tests | N/A | sparkshell benchmark | +| **Multi-judge** | ✅ 3 judges + per-model timeout | 2 judges (20 min shared) | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **Median scoring** | ✅ | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **Lessons extractor** | ✅ scaffold | ✅ production | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **`meta-analyze`** | ✅ implemented | ✅ | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | +| **Feature flag** | ✅ `agent-runner` gate | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | + +--- + +### 1H. 
Prompt Utilities + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Placeholder substitution** | ✅ `prompt_placeholders.rs` (pure utility) | `PLACEHOLDER` constants | N/A | prompt templates | mode prompts | atlas prompts | `prompts-core` package | `atlas-prompts.ts` | N/A | `build_summary_prompt()` | +| **Supported tokens** | 7 tokens with length caps | `PLACEHOLDER` enum | N/A | env vars + dynamic | template engine | context-based | variant resolver | markdown | N/A | shell output | +| **Length caps** | ✅ 2500/10k/30k/100k chars | `FILE_TREE_PROMPT` only | N/A | N/A | N/A | provider-specific | model caps | N/A | N/A | N/A | +| **System reminder wrap** | ✅ `wrap_as_system_reminder()` | `` tags | N/A | injection | N/A | N/A | prompt-injection.ts | prompt-injection.ts | N/A | N/A | +| **Frontmatter parse** | N/A (TOML) | N/A | N/A | ✅ `parseAgentToolsFromFrontmatter` | ✅ `ConfigMarkdown.parseOption` | N/A | `shared/frontmatter.ts` | N/A | N/A | N/A | + +--- + +### 1I. 
Session / Persistence + +| Aspect | **jcode PR #313** | codebuff | codex | claude-code | opencode | oh-my-pi | oh-my-openagent | oh-my-claudecode | pi-agent-rust | oh-my-codex | +|--------|-------------------|----------|-------|-------------|----------|----------|-----------------|------------------|---------------|-------------| +| **Session format** | N/A (existing) | in-memory | SQLite + JSONL | config JSON | SQLite (Effect) | runtime state | `boulder-state` file | `OMC_JOBS_DIR` JSON | **JSONL + SHA-256 chain** | N/A | +| **Branching/history** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ tree structure | N/A | +| **Indexed search** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ `SessionIndex` | N/A | +| **Chain integrity** | N/A | N/A | N/A | N/A | N/A | N/A | N/A | N/A | ✅ SHA-256 per-entry | N/A | + +--- + +## 2. Top 5 Gaps (ROI-ranked) + +| Rank | Gap | Effort | Impact | Source repos | Concrete action | +|------|-----|--------|--------|--------------|-----------------| +| **1** | `permissionMode` per-agent — wire `SafetySystem` into `AgentDefinition` | 2-3 days | 🔴 Critical (security) | claude-code (`PermissionMode`), opencode (`allow/deny/ask` per action+resource) | ✅ DONE (commit f84cc127 + 795242b6) — `permission_mode` enum + field added, dcg_bridge wired | +| **2** | `Agent` tool — model-driven spawn | 1-2 weeks | 🔴 Critical (core feature) | codex (`SpawnAgent`/`WaitAgent`), claude-code (`AgentTool` + `TeamCreateTool`), codebuff (`spawn_agents`) | Phase 2: add `agent` tool that LLM calls; wire `spawnable_agents` whitelist; implement `AgentPath` tree from codex | +| **3** | `maxTurns` per-agent | 1 day | 🟡 Important (runaway prevention) | claude-code, opencode | ✅ DONE (commit 844fc412) — `max_turns` field added to `AgentDefinition` | +| **4** | `handleSteps` — programmatic agents | 1 week | 🟡 Important (flexibility) | codebuff (`handleSteps` Generator), oh-my-pi (`beforeToolCall`/`afterToolCall`) | Phase 2: add optional `handle_steps` field 
with Rust async generator or callback approach | +| **5** | Tool concurrency (`shared`/`exclusive`) | 2-3 days | 🟢 Nice-to-have (perf) | oh-my-pi (`AgentTool.concurrency`) | Add `concurrency` field to tool definition; runtime scheduler respects exclusive locks | + +--- + +## 3. Wire-up Plan: SafetySystem + AgentDefinition.permissionMode + +### Current state +- `SafetySystem` (crates/jcode-base/src/safety.rs): `ActionTier` = `AutoAllowed | RequiresPermission` +- `AgentDefinition` (crates/jcode-agent-runtime/src/definition.rs): `tool_names` whitelist only +- `PermissionsApp` (crates/jcode-tui/src/tui/permissions.rs): TUI approval flow exists + +### Proposed addition + +```rust +// crates/jcode-agent-runtime/src/definition.rs + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PermissionMode { + /// Inherit approval from parent agent (default for sub-agents). + Inherit, + /// Auto-approve all tool calls for this agent. + AutoApprove, + /// Always ask user for permission. + Ask, + /// Deny all tool calls (read-only agent). + Deny, +} + +impl Default for PermissionMode { + fn default() -> Self { PermissionMode::Inherit } +} + +// Add to AgentDefinition: +// pub permission_mode: Option, +``` + +### Resolution algorithm (runtime) + +``` +fn resolve_permission(action, tool_name, agent_def, parent_approval): + mode = agent_def.permission_mode.unwrap_or(Inherit) + match mode: + Deny → block + AutoApprove → approve + Ask → prompt user via PermissionsApp + Inherit → use parent_approval (or session-level classify) +``` + +### Migration path +- Default `None` = `Inherit` = existing behavior unchanged +- TOML agents opt-in: `permission_mode = "auto_approve"` for leaf agents +- Phase 2: auto-wire `bash` tool in `basher.toml` with `permission_mode = "auto_approve"` + +--- + +## 4. 
Roadmap: Phases After PR #313 + +| Phase | Scope | Dependencies | Estimated | +|-------|-------|--------------|-----------| +| **Phase 1** (this PR) | AgentDefinition + tier + registry + JBench scaffold | — | ✅ Done | +| **Phase 1.5** | `permissionMode` wire-up (SafetySystem + AgentDefinition) | Phase 1 | ✅ Done | +| **Phase 2** | Agent runtime engine: spawn, parent-child tree, `Agent` tool, `AgentPath` | Phase 1 | 2-3 weeks | +| **Phase 2.5** | `handleSteps` (programmatic agents), tool concurrency | Phase 2 | 1-2 weeks | +| **Phase 3** | Team pipeline (claude-code-style `TeamCreateTool`) | Phase 2 | 1 week | +| **Phase 4** | JBench production (full `pick-commits` → `gen-evals` → `run` → `judge` → `lessons` pipeline) | Phase 2 | 1-2 weeks | +| **Phase 5** | Multi-provider support (extend tier to per-provider catalogs) | Phase 2 | 1 week | + +--- + +## 5. PR #313 Strengths + +1. **Best-in-class agent discovery** — 3-tier priority, filename==id enforcement, non-fatal error collection +2. **Correct model routing philosophy** — slots not catalog, matches single-OAuth reality +3. **JBench exceeds BuffBench** — 3 judges with per-model timeout (vs BuffBench's shared 20-min timeout) +4. **Rust-idiomatic crate structure** — feature gates, clean separation, `serde(deny_unknown_fields)` +5. **Comprehensive documentation** — every module has a doc comment explaining WHY, not just WHAT + +--- + +## 6. 
PR #313 Actionable Issues + +| # | Issue | Severity | File | Fix | +|---|-------|----------|------|-----| +| 1 | `extract_diff_from_repo` uses sync `std::process::Command` in async fn | Medium | evals/jbench/src/agent_runner.rs:195 | ✅ FIXED (commit 2d7a020c) | +| 2 | `todo_step` calls `std::process::exit(0)` for unimplemented commands | Low | evals/jbench/src/bin/jbench.rs | ✅ FIXED (commit 2d7a020c) | +| 3 | `file-picker.toml` missing explicit `inherit_parent_system_prompt = false` | Low | .jcode/agents/file-picker.toml | Add for consistency with `basher.toml` | +| 4 | `edition = "2024"` in jbench may cause toolchain issues if workspace uses 2021 | Low | evals/jbench/Cargo.toml | Verify workspace edition consistency | +| 5 | `meta_analyze_impl` reads all `.run.json` files into memory | Low | evals/jbench/src/bin/jbench.rs:268 | Streaming deserializer for large runs | + +--- + +## 7. Implementation Status (2026-06-05) + +| Item | Status | Commit | +|------|--------|--------| +| Merge master into branch | ✅ Done | 25d3f21e | +| Reconcile src/lib.rs with master | ✅ Done | 60a61f0b | +| Review document (9 repos) | ✅ Done | d2942498 | +| `permissionMode` enum + field | ✅ Done | f84cc127 | +| `permissionMode` wire-up (dcg_bridge) | ✅ Done | 795242b6 | +| `maxTurns` field | ✅ Done | 844fc412 | +| TOML agents max_turns | ✅ Done | 6d8ecbc6 | +| Fix jbench warnings | ✅ Done | 2d7a020c | +| `Agent` tool (model-driven spawn) | 🔲 Phase 2 | — | +| `handleSteps` (programmatic agents) | 🔲 Phase 2 | — | +| Tool concurrency (shared/exclusive) | 🔲 Phase 2 | — | +| Team pipeline (TeamCreateTool) | 🔲 Phase 3 | — | +| JBench production | 🔲 Phase 4 | — | diff --git a/.omo/run-continuation/ses_16768f040ffejyGE92ednE0xRX.json b/.omo/run-continuation/ses_16768f040ffejyGE92ednE0xRX.json new file mode 100644 index 000000000..505da69ae --- /dev/null +++ b/.omo/run-continuation/ses_16768f040ffejyGE92ednE0xRX.json @@ -0,0 +1,10 @@ +{ + "sessionID": "ses_16768f040ffejyGE92ednE0xRX", + 
"updatedAt": "2026-06-05T16:50:33.226Z", + "sources": { + "background-task": { + "state": "idle", + "updatedAt": "2026-06-05T16:50:33.226Z" + } + } +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 1f5918154..efbedfaaf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1535,7 +1535,7 @@ dependencies = [ "bitflags 1.3.2", "core-foundation 0.9.4", "core-graphics-types", - "foreign-types", + "foreign-types 0.5.0", "libc", ] @@ -3037,6 +3037,15 @@ dependencies = [ "ttf-parser 0.25.1", ] +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared 0.1.1", +] + [[package]] name = "foreign-types" version = "0.5.0" @@ -3044,7 +3053,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d737d9aa519fb7b749cbc3b962edcf310a8dd1f4b67c91c4f83975dbdd17d965" dependencies = [ "foreign-types-macros", - "foreign-types-shared", + "foreign-types-shared 0.3.1", ] [[package]] @@ -3058,6 +3067,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "foreign-types-shared" version = "0.3.1" @@ -4583,6 +4598,22 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.10.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.20" @@ -5135,8 +5166,13 @@ dependencies = [ name = "jcode-agent-runtime" version = "0.1.0" dependencies = [ + "anyhow", + 
"serde", + "serde_json", "thiserror 1.0.69", "tokio", + "toml", + "tracing", ] [[package]] @@ -5483,6 +5519,21 @@ dependencies = [ "serde", ] +[[package]] +name = "jcode-jbench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "futures", + "jcode-agent-runtime", + "reqwest 0.12.28", + "serde", + "serde_json", + "tempfile", + "tokio", +] + [[package]] name = "jcode-logging" version = "0.1.0" @@ -6682,7 +6733,7 @@ dependencies = [ "bitflags 2.11.1", "block", "core-graphics-types", - "foreign-types", + "foreign-types 0.5.0", "log", "objc", "paste", @@ -6786,6 +6837,23 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "native-tls" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe 0.2.1", + "openssl-sys", + "schannel", + "security-framework 3.7.0", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -7231,6 +7299,31 @@ dependencies = [ "pathdiff", ] +[[package]] +name = "openssl" +version = "0.10.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a45fa2aa886c42762255da344f0a0d313e254066c46aad76f300c3d3da62d967" +dependencies = [ + "bitflags 2.11.1", + "cfg-if", + "foreign-types 0.3.2", + "libc", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "openssl-probe" version = "0.1.6" @@ -7243,6 +7336,18 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "openssl-sys" +version = "0.9.116" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28a22dc7140cda5f096e5e7724a6962ca81a7f8bfd2979f9b18c11af56318c4" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -8434,10 +8539,12 @@ dependencies = [ "http-body-util", "hyper 1.10.1", "hyper-rustls 0.27.9", + "hyper-tls", "hyper-util", "js-sys", "log", "mime", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -8449,6 +8556,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.4", "tokio-util", "tower", @@ -10121,6 +10229,16 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" diff --git a/Cargo.toml b/Cargo.toml index 967074981..600936731 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -67,6 +67,7 @@ members = [ "crates/jcode-desktop", "crates/jcode-mempalace-adapter", "crates/jcode-render-core", + "evals/jbench", ] # Local override: build against the fast_file_search main branch which diff --git a/crates/jcode-agent-runtime/Cargo.toml b/crates/jcode-agent-runtime/Cargo.toml index c475c51d8..9a769a299 100644 --- a/crates/jcode-agent-runtime/Cargo.toml +++ b/crates/jcode-agent-runtime/Cargo.toml @@ -10,3 +10,11 @@ path = "src/lib.rs" [dependencies] thiserror = "1" tokio = { version = "1", features = ["sync"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +toml = "0.8" +anyhow = "1" +tracing = "0.1" + +[dev-dependencies] +serde_json = "1" diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs new file mode 100644 index 000000000..61ce6190b --- /dev/null +++ 
b/crates/jcode-agent-runtime/src/definition.rs @@ -0,0 +1,763 @@ +//! Declarative agent definitions. +//! +//! An `AgentDefinition` is the schema that describes a sub-agent: its model +//! preferences, the tools it's allowed to call, the agents it can spawn, +//! the prompts it ships, and how its output flows back to its parent. +//! +//! Definitions are loaded from TOML files in three locations (highest +//! priority first): +//! +//! 1. `.jcode/agents/<id>.toml` (project-local, committed to repo) +//! 2. `~/.jcode/agents/<id>.toml` (user-global) +//! 3. Embedded built-in agents bundled with the binary +//! +//! ## Design constraints +//! +//! - Definitions are **declarative TOML**, not Rust code, so users can +//! add agents without recompiling the binary. +//! - `model` is **not required**: agents inherit the session's current +//! model unless they explicitly opt into tier slots or override. +//! - `tool_names` is a whitelist — agents start with NO tools by +//! default and must list every tool they need. This is a security +//! property: a poorly-defined agent can't escalate by accident. +//! - `spawnable_agents` is also a whitelist for the same reason. +//! +//! ## Adapted from Codebuff +//! +//! Field names track Codebuff's `AgentDefinition` (snake_case Rust → +//! camelCase TS) so prior art is reusable. Differences: +//! +//! - No `model` field as required string — replaced by tier + override. +//! - No `providerOptions` — jcode's session has a single provider. +//! - `handle_steps` is a future addition (programmatic agents arrive in +//! Phase 2); for now agents are pure prompted. + +use crate::output::OutputMode; +use crate::permission::PermissionMode; +use crate::reasoning::ReasoningEffort; +use crate::tier::ModelTier; + +use serde::{Deserialize, Serialize}; + +/// Default version assigned when a definition omits `version`. +pub const DEFAULT_AGENT_VERSION: &str = "0.1.0"; + +/// Declarative description of one agent. 
+/// +/// Intentionally `Clone` so the runtime can hand each spawn its own copy +/// without locking the registry. Definitions are small (a few KB at most). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentDefinition { + // ----------------------------------------------------------------- + // Identity + // ----------------------------------------------------------------- + /// Unique agent id. Lowercase letters, digits, hyphens. e.g. `file-picker`. + /// Must be unique within the registry — duplicate ids are a load error. + pub id: String, + + /// Human-readable name shown in TUI / logs. e.g. `"Fletcher the File Fetcher"`. + pub display_name: String, + + /// Publisher / namespace id when this agent is shared across projects. + /// Optional for local agents; required if the agent is published to a + /// future agent registry. + #[serde(default)] + pub publisher: Option, + + /// Semver-ish version. Defaults to `DEFAULT_AGENT_VERSION`. + #[serde(default = "default_version")] + pub version: String, + + // ----------------------------------------------------------------- + // Model selection + // ----------------------------------------------------------------- + /// Optional tier slot to prefer when running this agent. The slot is + /// resolved against `JCODE_ROUTING_` env vars at run time. + /// Falls back to the session's current model if unset. + /// + /// See `tier.rs` for the full resolution algorithm. + #[serde(default)] + pub prefer_tier: Option, + + /// Optional explicit model id override. Highest priority — beats + /// `prefer_tier` and the session default. Use sparingly; hardcoding + /// model ids makes the agent file non-portable across providers. + #[serde(default)] + pub model_override: Option, + + /// Optional reasoning effort to forward to the provider request. + /// Defaults are model-specific; runtime fills in a sensible default + /// when this field is `None`. 
+ #[serde(default)] + pub reasoning: Option, + + // ----------------------------------------------------------------- + // Tools and sub-agents + // ----------------------------------------------------------------- + /// Allowlist of tool names this agent may call. Empty list = no tools. + /// Whitelist semantics are deliberate — agents shouldn't have access + /// to tools they don't need. + #[serde(default)] + pub tool_names: Vec, + + /// Optional denylist of tool names this agent may NOT call, even if + /// they appear in `tool_names`. Takes precedence over `tool_names`. + /// Useful for inheriting a broad whitelist while blocking specific + /// dangerous tools (e.g. allow all except `bash`). + /// + /// Empty list = no additional denials (default). + #[serde(default)] + pub disallowed_tools: Vec, + + /// Allowlist of agent ids this agent may `spawn_agents` / `spawn_agent_inline`. + /// Empty list = no spawning. Use the local agent id (e.g. `file-picker`) + /// or the future `publisher/agent@version` form for shared agents. + #[serde(default)] + pub spawnable_agents: Vec, + + // ----------------------------------------------------------------- + // Prompts + // ----------------------------------------------------------------- + /// System prompt for this agent. Background, persona, mandates. + /// Mutually exclusive with `inherit_parent_system_prompt = true` + /// (which means "use the parent's system prompt instead, for cache + /// prefix sharing"). + #[serde(default)] + pub system_prompt: String, + + /// Instructions inserted after each user message. The most common + /// place to shape agent behavior — terser than `system_prompt`, + /// changes per turn allowed. + #[serde(default)] + pub instructions_prompt: Option, + + /// Optional reminder inserted at every agent step. Use sparingly — + /// strong models follow `instructions_prompt` reliably; this is for + /// weaker models or agents that need a per-step nudge. 
+ #[serde(default)] + pub step_prompt: Option, + + /// Spawner-side prompt: when and why a parent agent should spawn this + /// agent. Used in `spawn_agents` tool documentation so the parent's + /// LLM picks the right sub-agent. + #[serde(default)] + pub spawner_prompt: Option, + + // ----------------------------------------------------------------- + // Context / cache behavior + // ----------------------------------------------------------------- + /// When true, child agent uses the parent's `system_prompt` instead + /// of its own. This is the **prompt cache prefix-sharing trick** — + /// editor / reviewer agents typically set this to `true` so the + /// expensive system prompt is cache-hit rather than re-sent. + /// + /// Mutually exclusive with a non-empty `system_prompt`. + #[serde(default)] + pub inherit_parent_system_prompt: bool, + + /// When true, child agent receives the parent's full message history. + /// Default false — most sub-agents work better with a clean slate + /// (file-picker doesn't need to see edit chatter). + #[serde(default)] + pub include_message_history: bool, + + // ----------------------------------------------------------------- + // Permissions + // ----------------------------------------------------------------- + /// Optional permission mode override for this agent's tool execution. + /// When set, the agent runs under this permission mode instead of the + /// session-global mode (set via CLI `--permission-mode` or cycled in + /// the TUI). + /// + /// Useful for: + /// - Restricting sub-agents: reviewer runs in `Plan` (read-only). + /// - Elevating leaf agents: `basher` runs in `AcceptEdits`. + /// - Background agents: CI runner uses `DontAsk`. + /// + /// If `None`, the agent inherits the session's current permission mode. + /// See `permission.rs` for the full mode descriptions. 
+ #[serde(default)] + pub permission_mode: Option, + + /// Optional maximum number of agentic turns this agent may execute + /// before being stopped. Prevents runaway agents from consuming + /// unbounded tokens/time. + /// + /// If `None`, the agent has no per-agent turn limit (the session + /// global limit still applies). + #[serde(default)] + pub max_turns: Option, + + // ----------------------------------------------------------------- + // Output + // ----------------------------------------------------------------- + /// How the agent's output is delivered to the parent. Default + /// `LastMessage`. + #[serde(default)] + pub output_mode: OutputMode, + + /// JSON schema for `StructuredOutput` mode. Validated when the agent + /// calls `set_output`. Stored as raw JSON value because we don't + /// pull a JSON-schema crate yet — Phase 3 will add proper validation. + #[serde(default)] + pub output_schema: Option, +} + +fn default_version() -> String { + DEFAULT_AGENT_VERSION.to_string() +} + +/// Validation errors produced when an agent definition violates its +/// invariants. Displayed to users when a TOML file fails to load. +#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] +pub enum DefinitionError { + #[error("agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen")] + InvalidId(String), + + #[error( + "agent `{id}` has both `inherit_parent_system_prompt = true` and a non-empty `system_prompt`. Set one or the other." 
+ )] + SystemPromptConflict { id: String }, + + #[error("agent `{id}` has `output_mode = structured_output` but `output_schema` is missing")] + StructuredOutputMissingSchema { id: String }, + + #[error("agent `{id}` references itself in `spawnable_agents`")] + SelfSpawn { id: String }, + + #[error("agent `{id}` lists tool `{tool}` more than once in `tool_names`")] + DuplicateTool { id: String, tool: String }, + + #[error("agent `{id}` lists agent `{spawn}` more than once in `spawnable_agents`")] + DuplicateSpawnable { id: String, spawn: String }, +} + +/// Errors returned when cross-referencing an agent against the runtime +/// tool/agent universe (i.e. checking that `tool_names` actually exist). +/// +/// These are **separate from `DefinitionError`** because the runtime +/// universe isn't known at TOML-load time — it depends on feature flags, +/// MCP server connections, and the resolved agent registry. Callers +/// invoke `validate_tool_references` / `validate_spawn_references` at +/// agent spawn time. +#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] +pub enum ReferenceError { + #[error("agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}")] + UnknownTools { + id: String, + unknown: String, + available: String, + }, + + #[error( + "agent `{id}` references unknown sub-agent(s): {unknown}. Available agents: {available}" + )] + UnknownSpawnableAgents { + id: String, + unknown: String, + available: String, + }, +} + +impl AgentDefinition { + /// Validate id format + cross-field invariants. Returns `Ok(())` when + /// the definition is well-formed. + pub fn validate(&self) -> Result<(), DefinitionError> { + // 1. id format + if !is_valid_id(&self.id) { + return Err(DefinitionError::InvalidId(self.id.clone())); + } + + // 2. 
system_prompt vs inherit_parent_system_prompt mutual exclusion + if self.inherit_parent_system_prompt && !self.system_prompt.is_empty() { + return Err(DefinitionError::SystemPromptConflict { + id: self.id.clone(), + }); + } + + // 3. structured_output requires schema + if matches!(self.output_mode, OutputMode::StructuredOutput) && self.output_schema.is_none() + { + return Err(DefinitionError::StructuredOutputMissingSchema { + id: self.id.clone(), + }); + } + + // 4. cannot spawn self + if self.spawnable_agents.iter().any(|s| s == &self.id) { + return Err(DefinitionError::SelfSpawn { + id: self.id.clone(), + }); + } + + // 5. no duplicate tool names + let mut seen_tools = std::collections::HashSet::new(); + for tool in &self.tool_names { + if !seen_tools.insert(tool.clone()) { + return Err(DefinitionError::DuplicateTool { + id: self.id.clone(), + tool: tool.clone(), + }); + } + } + + // 6. no duplicate spawnable agent ids + let mut seen_spawn = std::collections::HashSet::new(); + for spawn in &self.spawnable_agents { + if !seen_spawn.insert(spawn.clone()) { + return Err(DefinitionError::DuplicateSpawnable { + id: self.id.clone(), + spawn: spawn.clone(), + }); + } + } + + Ok(()) + } + + /// Resolve the concrete model id to use for one invocation of this agent. + /// Convenience wrapper around `tier::resolve_model`. + pub fn resolve_model(&self, current_session_model: &str) -> String { + crate::tier::resolve_model( + self.model_override.as_deref(), + self.prefer_tier, + current_session_model, + ) + } + + /// Check that every entry in `tool_names` exists in the caller-provided + /// universe of tool names. Returns the list of unknown tools when any + /// fail. Caller decides whether unknown tools are fatal (likely yes + /// for production agents, no for under-development agents). + /// + /// Empty `tool_names` always validates — agents with no tools are + /// legal (e.g. pure-prompt summarizer). 
+ pub fn validate_tool_references(&self, available: I) -> Result<(), ReferenceError> + where + I: IntoIterator, + S: AsRef, + { + let available: std::collections::HashSet = available + .into_iter() + .map(|s| s.as_ref().to_string()) + .collect(); + let unknown: Vec<&String> = self + .tool_names + .iter() + .filter(|name| !available.contains(name.as_str())) + .collect(); + if unknown.is_empty() { + return Ok(()); + } + let mut sorted_unknown: Vec<&String> = unknown; + sorted_unknown.sort(); + let mut sorted_available: Vec<&String> = available.iter().collect(); + sorted_available.sort(); + Err(ReferenceError::UnknownTools { + id: self.id.clone(), + unknown: sorted_unknown + .iter() + .map(|s| s.as_str()) + .collect::>() + .join(", "), + available: sorted_available + .iter() + .map(|s| s.as_str()) + .collect::>() + .join(", "), + }) + } + + /// Check that every entry in `spawnable_agents` exists in the caller- + /// provided universe of agent ids. Returns unknown agents when any + /// fail. Same semantics as `validate_tool_references`. 
+ pub fn validate_spawn_references(&self, available: I) -> Result<(), ReferenceError> + where + I: IntoIterator, + S: AsRef, + { + let available: std::collections::HashSet = available + .into_iter() + .map(|s| s.as_ref().to_string()) + .collect(); + let unknown: Vec<&String> = self + .spawnable_agents + .iter() + .filter(|name| !available.contains(name.as_str())) + .collect(); + if unknown.is_empty() { + return Ok(()); + } + let mut sorted_unknown: Vec<&String> = unknown; + sorted_unknown.sort(); + let mut sorted_available: Vec<&String> = available.iter().collect(); + sorted_available.sort(); + Err(ReferenceError::UnknownSpawnableAgents { + id: self.id.clone(), + unknown: sorted_unknown + .iter() + .map(|s| s.as_str()) + .collect::>() + .join(", "), + available: sorted_available + .iter() + .map(|s| s.as_str()) + .collect::>() + .join(", "), + }) + } +} + +/// Agent ids are intentionally restrictive: lowercase ASCII letters, digits, +/// and hyphens. No leading/trailing hyphen. Mirrors Codebuff's id rule and +/// avoids cross-platform path issues when ids become file names. 
+fn is_valid_id(id: &str) -> bool { + if id.is_empty() { + return false; + } + if id.starts_with('-') || id.ends_with('-') { + return false; + } + id.chars() + .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-') +} + +#[cfg(test)] +mod tests { + use super::*; + + fn minimal_definition(id: &str) -> AgentDefinition { + AgentDefinition { + id: id.to_string(), + display_name: format!("Display for {id}"), + publisher: None, + version: DEFAULT_AGENT_VERSION.to_string(), + prefer_tier: None, + model_override: None, + reasoning: None, + tool_names: Vec::new(), + disallowed_tools: Vec::new(), + spawnable_agents: Vec::new(), + system_prompt: String::new(), + instructions_prompt: None, + step_prompt: None, + spawner_prompt: None, + inherit_parent_system_prompt: false, + include_message_history: false, + permission_mode: None, + max_turns: None, + output_mode: OutputMode::LastMessage, + output_schema: None, + } + } + + #[test] + fn id_validation_rejects_uppercase() { + let mut d = minimal_definition("File-Picker"); + d.id = "File-Picker".to_string(); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); + } + + #[test] + fn id_validation_rejects_underscore() { + let mut d = minimal_definition("file_picker"); + d.id = "file_picker".to_string(); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); + } + + #[test] + fn id_validation_rejects_leading_hyphen() { + let mut d = minimal_definition("ok"); + d.id = "-bad".to_string(); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); + } + + #[test] + fn id_validation_accepts_normal_kebab() { + let d = minimal_definition("file-picker-max"); + assert!(d.validate().is_ok()); + } + + #[test] + fn inherit_and_system_prompt_conflict() { + let mut d = minimal_definition("editor"); + d.inherit_parent_system_prompt = true; + d.system_prompt = "should be empty".to_string(); + assert!(matches!( + d.validate(), + Err(DefinitionError::SystemPromptConflict { .. 
}) + )); + } + + #[test] + fn inherit_alone_is_fine() { + let mut d = minimal_definition("editor"); + d.inherit_parent_system_prompt = true; + d.system_prompt = String::new(); + assert!(d.validate().is_ok()); + } + + #[test] + fn structured_output_requires_schema() { + let mut d = minimal_definition("judge"); + d.output_mode = OutputMode::StructuredOutput; + d.output_schema = None; + assert!(matches!( + d.validate(), + Err(DefinitionError::StructuredOutputMissingSchema { .. }) + )); + } + + #[test] + fn structured_output_with_schema_ok() { + let mut d = minimal_definition("judge"); + d.output_mode = OutputMode::StructuredOutput; + d.output_schema = Some(serde_json::json!({"type": "object"})); + assert!(d.validate().is_ok()); + } + + #[test] + fn self_spawn_detected() { + let mut d = minimal_definition("editor"); + d.spawnable_agents.push("editor".to_string()); + assert!(matches!( + d.validate(), + Err(DefinitionError::SelfSpawn { .. }) + )); + } + + #[test] + fn duplicate_tool_detected() { + let mut d = minimal_definition("editor"); + d.tool_names.push("read".to_string()); + d.tool_names.push("read".to_string()); + assert!(matches!( + d.validate(), + Err(DefinitionError::DuplicateTool { .. }) + )); + } + + #[test] + fn duplicate_spawnable_detected() { + let mut d = minimal_definition("editor"); + d.spawnable_agents.push("file-picker".to_string()); + d.spawnable_agents.push("file-picker".to_string()); + assert!(matches!( + d.validate(), + Err(DefinitionError::DuplicateSpawnable { .. 
}) + )); + } + + #[test] + fn resolve_model_uses_session_default_when_no_overrides() { + let d = minimal_definition("any"); + assert_eq!(d.resolve_model("claude-sonnet"), "claude-sonnet"); + } + + #[test] + fn resolve_model_uses_override() { + let mut d = minimal_definition("any"); + d.model_override = Some("forced-model".to_string()); + assert_eq!(d.resolve_model("ignored"), "forced-model"); + } + + // ----------------------------------------------------------------- + // TOML round-trip — exercises serde defaults and field coverage + // ----------------------------------------------------------------- + #[test] + fn toml_minimal_loads_with_defaults() { + let src = r#" + id = "file-picker" + display_name = "Fletcher" + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + d.validate().expect("validate"); + assert_eq!(d.id, "file-picker"); + assert_eq!(d.version, DEFAULT_AGENT_VERSION); + assert_eq!(d.output_mode, OutputMode::LastMessage); + assert!(d.tool_names.is_empty()); + assert!(d.spawnable_agents.is_empty()); + assert!(!d.inherit_parent_system_prompt); + } + + #[test] + fn toml_full_definition_loads() { + let src = r#" + id = "editor" + display_name = "Code Editor" + version = "1.2.0" + publisher = "jcode" + prefer_tier = "thinking" + reasoning = "high" + tool_names = ["str_replace", "write_file"] + spawnable_agents = ["file-picker"] + inherit_parent_system_prompt = true + include_message_history = true + output_mode = "all_messages" + instructions_prompt = "Implement the requested change." + step_prompt = "Continue editing." + spawner_prompt = "Use this agent for code edits." 
+ "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + d.validate().expect("validate"); + assert_eq!(d.id, "editor"); + assert_eq!(d.version, "1.2.0"); + assert_eq!(d.publisher.as_deref(), Some("jcode")); + assert_eq!(d.prefer_tier, Some(ModelTier::Thinking)); + assert_eq!(d.reasoning, Some(ReasoningEffort::High)); + assert_eq!(d.tool_names, vec!["str_replace", "write_file"]); + assert!(d.inherit_parent_system_prompt); + assert_eq!(d.output_mode, OutputMode::AllMessages); + } + + #[test] + fn toml_disallowed_tools_parses_and_defaults() { + // Explicit value + let src = r#" + id = "restricted" + display_name = "Restricted Agent" + tool_names = ["read", "write_file", "bash"] + disallowed_tools = ["bash"] + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + d.validate().expect("validate"); + assert_eq!(d.disallowed_tools, vec!["bash"]); + assert_eq!(d.tool_names, vec!["read", "write_file", "bash"]); + // disallowed_tools takes precedence: bash is listed in tool_names + // but also in disallowed_tools, so the effective allowlist is + // tool_names minus disallowed_tools = ["read", "write_file"]. 
+ let effective: Vec<&str> = d + .tool_names + .iter() + .filter(|t| !d.disallowed_tools.contains(t)) + .map(|s| s.as_str()) + .collect(); + assert_eq!(effective, vec!["read", "write_file"]); + + // Omitted field defaults to empty + let src2 = r#" + id = "open" + display_name = "Open Agent" + tool_names = ["bash"] + "#; + let d2: AgentDefinition = toml::from_str(src2).expect("parse"); + assert!(d2.disallowed_tools.is_empty()); + } + + #[test] + fn toml_unknown_field_is_silently_ignored() { + let src = r#" + id = "ok" + display_name = "ok" + unknown_future_field = "value" + "#; + let def = toml::from_str::(src).expect("unknown fields should be ignored for forward compat"); + assert_eq!(def.id, "ok"); + assert_eq!(def.display_name, "ok"); + } + + // ----------------------------------------------------------------- + // Cross-reference validation (Phase 0.4) + // ----------------------------------------------------------------- + #[test] + fn validate_tool_references_passes_when_all_known() { + let mut d = minimal_definition("editor"); + d.tool_names = vec!["read".to_string(), "write_file".to_string()]; + d.validate_tool_references(["read", "write_file", "str_replace"]) + .expect("all tools known"); + } + + #[test] + fn validate_tool_references_fails_with_unknown_tools() { + let mut d = minimal_definition("editor"); + d.tool_names = vec!["read".to_string(), "magic".to_string()]; + let err = d + .validate_tool_references(["read", "write_file"]) + .expect_err("magic is unknown"); + match err { + ReferenceError::UnknownTools { + id, + unknown, + available, + } => { + assert_eq!(id, "editor"); + assert_eq!(unknown, "magic"); + assert!(available.contains("read")); + assert!(available.contains("write_file")); + } + other => panic!("expected UnknownTools, got {:?}", other), + } + } + + #[test] + fn validate_tool_references_empty_tool_names_always_ok() { + let d = minimal_definition("ask"); + // tool_names is empty by default; supplying empty universe is also fine. 
+ d.validate_tool_references(Vec::::new()) + .expect("empty tool list always valid"); + } + + #[test] + fn validate_spawn_references_passes_when_all_known() { + let mut d = minimal_definition("base"); + d.spawnable_agents = vec!["file-picker".to_string(), "editor".to_string()]; + d.validate_spawn_references(["file-picker", "editor", "reviewer"]) + .expect("all known"); + } + + #[test] + fn validate_spawn_references_fails_with_unknown_agents() { + let mut d = minimal_definition("base"); + d.spawnable_agents = vec!["file-picker".to_string(), "ghost".to_string()]; + let err = d + .validate_spawn_references(["file-picker", "editor"]) + .expect_err("ghost unknown"); + match err { + ReferenceError::UnknownSpawnableAgents { + id, + unknown, + available: _, + } => { + assert_eq!(id, "base"); + assert_eq!(unknown, "ghost"); + } + other => panic!("expected UnknownSpawnableAgents, got {:?}", other), + } + } + + #[test] + fn validate_references_unknown_list_is_sorted_and_comma_joined() { + let mut d = minimal_definition("agent"); + d.tool_names = vec!["zeta".to_string(), "alpha".to_string(), "mid".to_string()]; + let err = d + .validate_tool_references(Vec::<&str>::new()) + .expect_err("none known"); + match err { + ReferenceError::UnknownTools { unknown, .. 
} => { + assert_eq!(unknown, "alpha, mid, zeta", "alphabetical order"); + } + _ => unreachable!(), + } + } + + #[test] + fn toml_max_turns_parses() { + let src = r#" + id = "test" + display_name = "Test" + max_turns = 50 + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + assert_eq!(d.max_turns, Some(50)); + } + + #[test] + fn toml_max_turns_none_when_absent() { + let src = r#" + id = "test" + display_name = "Test" + "#; + let d: AgentDefinition = toml::from_str(src).expect("parse"); + assert_eq!(d.max_turns, None); + } +} diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs index 70bf958d6..818082509 100644 --- a/crates/jcode-agent-runtime/src/lib.rs +++ b/crates/jcode-agent-runtime/src/lib.rs @@ -1,91 +1,47 @@ -use std::sync::Arc; - -/// A soft interrupt message queued for injection at the next safe point. -#[derive(Debug, Clone)] -pub struct SoftInterruptMessage { - pub content: String, - /// If true, can skip remaining tools when injected at point C. - pub urgent: bool, - pub source: SoftInterruptSource, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum SoftInterruptSource { - User, - System, - BackgroundTask, -} - -/// Thread-safe soft interrupt queue that can be accessed without holding the agent lock. -pub type SoftInterruptQueue = Arc>>; - -/// Signal to move the currently executing tool to background. -/// Uses std::sync so it can be set without async from outside the agent lock. -pub type BackgroundToolSignal = Arc; - -/// Signal to gracefully stop generation. -pub type GracefulShutdownSignal = Arc; - -/// Async-aware interrupt signal that combines AtomicBool (sync read) with -/// tokio::Notify (async wake). Eliminates spin-loops during tool execution. 
-#[derive(Clone)] -pub struct InterruptSignal { - flag: Arc, - notify: Arc, -} - -impl InterruptSignal { - pub fn new() -> Self { - Self { - flag: Arc::new(std::sync::atomic::AtomicBool::new(false)), - notify: Arc::new(tokio::sync::Notify::new()), - } - } - - pub fn fire(&self) { - self.flag.store(true, std::sync::atomic::Ordering::SeqCst); - self.notify.notify_waiters(); - } - - pub fn is_set(&self) -> bool { - self.flag.load(std::sync::atomic::Ordering::SeqCst) - } - - pub fn reset(&self) { - self.flag.store(false, std::sync::atomic::Ordering::SeqCst); - } - - pub async fn notified(&self) { - let notified = self.notify.notified(); - if self.is_set() { - return; - } - notified.await; - } - - pub fn as_atomic(&self) -> Arc { - Arc::clone(&self.flag) - } -} - -impl Default for InterruptSignal { - fn default() -> Self { - Self::new() - } -} - -#[derive(Debug, thiserror::Error)] -#[error("{message}")] -pub struct StreamError { - pub message: String, - pub retry_after_secs: Option, -} - -impl StreamError { - pub fn new(message: String, retry_after_secs: Option) -> Self { - Self { - message, - retry_after_secs, - } - } -} +//! Agent runtime primitives: signals, declarative agent definitions, and +//! tier-based model resolution. +//! +//! This crate intentionally stays small and dependency-light. Heavier +//! engine work (loop, programmatic steps, spawn management) lives in +//! `src/agent.rs` and will migrate here incrementally as Phase 0 → Phase 2 +//! land. +//! +//! ## Modules +//! +//! - [`signals`] — soft-interrupt + cancellation primitives shared with +//! the server runtime. +//! - [`definition`] — declarative `AgentDefinition` schema loaded from +//! `.jcode/agents/*.toml`. +//! - [`tier`] — user-defined model tier slot resolution (extends +//! `model_routing.rs` #100). +//! - [`output`] — `OutputMode` enum (last_message / all_messages / +//! structured_output). +//! - [`reasoning`] — `ReasoningEffort` enum (minimal / low / medium / high). +//! +//! 
## Re-exports +//! +//! All previous public types stay re-exported at the crate root so existing +//! consumers (`src/agent.rs`) compile unchanged. + +pub mod definition; +pub mod output; +pub mod permission; +pub mod reasoning; +pub mod registry; +pub mod signals; +pub mod tier; + +// Backwards-compatible re-exports for existing consumers. Do not remove +// without auditing `src/agent.rs` and other in-tree users. +pub use signals::{ + BackgroundToolSignal, GracefulShutdownSignal, InterruptSignal, SoftInterruptMessage, + SoftInterruptQueue, SoftInterruptSource, StreamError, +}; + +// New public surface (Phase 0). +pub use definition::{AgentDefinition, DEFAULT_AGENT_VERSION, DefinitionError, ReferenceError}; +pub use output::OutputMode; +pub use permission::PermissionMode; +pub use reasoning::ReasoningEffort; +pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind}; +pub use tier::{ModelTier, ResolutionSource, resolve_model, resolve_model_with_source}; diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs new file mode 100644 index 000000000..bda4ee17d --- /dev/null +++ b/crates/jcode-agent-runtime/src/output.rs @@ -0,0 +1,78 @@ +//! How an agent's output is delivered back to its parent. +//! +//! Adapted from Codebuff's `outputMode` field. Three modes cover the +//! useful cases: +//! +//! - `LastMessage`: parent receives only the agent's final assistant turn. +//! Default. Good for "research-and-summarize" agents like file-picker. +//! - `AllMessages`: parent receives the full child message history +//! (text + tool calls + tool results). Good for editor-like agents +//! that need to expose their full edit trace. +//! - `StructuredOutput`: agent must call `set_output` with a JSON value +//! that conforms to `output_schema`. Good for judge agents, lessons +//! extractors, structured planners. + +use serde::{Deserialize, Serialize}; + +/// Output delivery mode for a sub-agent. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum OutputMode { + /// Parent receives only the final assistant turn. (Default.) + #[default] + LastMessage, + /// Parent receives the full message history of the child agent. + AllMessages, + /// Agent must produce a JSON object conforming to its `output_schema`. + /// Validated on `set_output` tool call. + StructuredOutput, +} + +impl OutputMode { + pub fn as_str(&self) -> &'static str { + match self { + OutputMode::LastMessage => "last_message", + OutputMode::AllMessages => "all_messages", + OutputMode::StructuredOutput => "structured_output", + } + } + + pub fn parse(s: &str) -> Option { + match s.trim().to_ascii_lowercase().as_str() { + "last_message" | "lastmessage" | "last" => Some(OutputMode::LastMessage), + "all_messages" | "allmessages" | "all" => Some(OutputMode::AllMessages), + "structured_output" | "structured" | "json" => Some(OutputMode::StructuredOutput), + _ => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_accepts_aliases() { + assert_eq!( + OutputMode::parse("last_message"), + Some(OutputMode::LastMessage) + ); + assert_eq!(OutputMode::parse("all"), Some(OutputMode::AllMessages)); + assert_eq!( + OutputMode::parse("structured"), + Some(OutputMode::StructuredOutput) + ); + assert_eq!(OutputMode::parse("nonsense"), None); + } + + #[test] + fn default_is_last_message() { + assert_eq!(OutputMode::default(), OutputMode::LastMessage); + } + + #[test] + fn serde_uses_snake_case() { + let s = serde_json::to_string(&OutputMode::StructuredOutput).unwrap(); + assert_eq!(s, "\"structured_output\""); + } +} diff --git a/crates/jcode-agent-runtime/src/permission.rs b/crates/jcode-agent-runtime/src/permission.rs new file mode 100644 index 000000000..045922933 --- /dev/null +++ b/crates/jcode-agent-runtime/src/permission.rs @@ -0,0 +1,163 @@ +//! Per-agent permission mode for tool execution safety. +//! 
+//! Mirrors `dcg_core::Mode` but is intentionally self-contained in the +//! dependency-light `jcode-agent-runtime` crate. The runtime converts +//! this enum to `dcg_core::Mode` at spawn time. +//! +//! ## Design +//! +//! The permission mode controls how tool calls are evaluated during an +//! agent's execution: +//! +//! - `Default` — rule-based: read-only tools auto-allowed, writes prompt. +//! - `AcceptEdits` — file operations auto-allowed, network/spawn prompt. +//! - `Plan` — read-only: writes denied without prompting. +//! - `DontAsk` — allow-listed tools pass, never prompt. +//! - `BypassPermissions` — skip all evaluation. +//! - `Auto` — LLM-based classifier decides per call. +//! +//! When `AgentDefinition.permission_mode` is `None`, the agent inherits +//! the session's current permission mode (set via CLI `--permission-mode` +//! or cycled at runtime in the TUI). + +use serde::{Deserialize, Serialize}; +use std::fmt; + +/// Per-agent permission mode for tool execution safety. +/// +/// This enum intentionally mirrors `dcg_core::Mode` (from the +/// `destructive_command_guard` crate) so that `jcode-agent-runtime` +/// does not need to depend on `dcg-core` directly. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum PermissionMode { + /// Rule-based classification using the legacy `AUTO_ALLOWED` list. + /// Read-only tools auto-allowed; writes require permission. + #[default] + Default, + /// File operations (edit, write, patch) auto-allowed. Network, + /// spawn, and irreversible operations still prompt. + AcceptEdits, + /// Read-only mode: write operations denied without prompting. + /// Useful for reviewer/observer agents. + Plan, + /// Only allow-listed tools pass; never prompt the user. + /// Useful for unattended/CI agents. + DontAsk, + /// Skip all permission evaluation. Use with caution. + BypassPermissions, + /// LLM-based classifier decides per tool call. 
+ Auto, +} + +impl PermissionMode { + /// String representation matching the wire format used by TOML + /// definitions and the CLI. + pub fn as_str(&self) -> &'static str { + match self { + PermissionMode::Default => "default", + PermissionMode::AcceptEdits => "accept-edits", + PermissionMode::Plan => "plan", + PermissionMode::DontAsk => "dont-ask", + PermissionMode::BypassPermissions => "bypass-permissions", + PermissionMode::Auto => "auto", + } + } + + /// Parse a permission mode from a string. Only accepts kebab-case + /// variants matching the serde wire format for consistency. + pub fn parse(s: &str) -> Option { + match s.trim().to_ascii_lowercase().as_str() { + "default" => Some(PermissionMode::Default), + "accept-edits" => Some(PermissionMode::AcceptEdits), + "plan" => Some(PermissionMode::Plan), + "dont-ask" => Some(PermissionMode::DontAsk), + "bypass-permissions" => Some(PermissionMode::BypassPermissions), + "auto" => Some(PermissionMode::Auto), + _ => None, + } + } +} + +impl fmt::Display for PermissionMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_accepts_kebab_case_only() { + assert_eq!( + PermissionMode::parse("default"), + Some(PermissionMode::Default) + ); + assert_eq!( + PermissionMode::parse("accept-edits"), + Some(PermissionMode::AcceptEdits) + ); + assert_eq!(PermissionMode::parse("plan"), Some(PermissionMode::Plan)); + assert_eq!( + PermissionMode::parse("dont-ask"), + Some(PermissionMode::DontAsk) + ); + assert_eq!( + PermissionMode::parse("bypass-permissions"), + Some(PermissionMode::BypassPermissions) + ); + assert_eq!(PermissionMode::parse("auto"), Some(PermissionMode::Auto)); + assert_eq!(PermissionMode::parse(""), None); + assert_eq!(PermissionMode::parse("nonsense"), None); + // Non-kebab-case variants are rejected for serde consistency + assert_eq!(PermissionMode::parse("accept_edits"), None); + 
assert_eq!(PermissionMode::parse("AcceptEdits"), None); + assert_eq!(PermissionMode::parse("bypass_permissions"), None); + } + + #[test] + fn default_is_default() { + assert_eq!(PermissionMode::default(), PermissionMode::Default); + } + + #[test] + fn serde_roundtrip_kebab_case() { + // TOML wire format uses kebab-case per serde(rename_all) + let s = serde_json::to_string(&PermissionMode::AcceptEdits).unwrap(); + assert_eq!(s, "\"accept-edits\""); + let back: PermissionMode = serde_json::from_str("\"accept-edits\"").unwrap(); + assert_eq!(back, PermissionMode::AcceptEdits); + } + + #[test] + fn serde_roundtrip_all_variants() { + for variant in [ + PermissionMode::Default, + PermissionMode::AcceptEdits, + PermissionMode::Plan, + PermissionMode::DontAsk, + PermissionMode::BypassPermissions, + PermissionMode::Auto, + ] { + let json = serde_json::to_string(&variant).unwrap(); + let back: PermissionMode = serde_json::from_str(&json).unwrap(); + assert_eq!(back, variant); + } + } + + #[test] + fn display_matches_as_str() { + for variant in [ + PermissionMode::Default, + PermissionMode::AcceptEdits, + PermissionMode::Plan, + PermissionMode::DontAsk, + PermissionMode::BypassPermissions, + PermissionMode::Auto, + ] { + assert_eq!(format!("{variant}"), variant.as_str()); + } + } +} diff --git a/crates/jcode-agent-runtime/src/reasoning.rs b/crates/jcode-agent-runtime/src/reasoning.rs new file mode 100644 index 000000000..7cdf8d010 --- /dev/null +++ b/crates/jcode-agent-runtime/src/reasoning.rs @@ -0,0 +1,114 @@ +//! Reasoning effort levels for agents. +//! +//! Mirrors the OpenAI/Anthropic reasoning effort knobs. When an agent +//! definition specifies a reasoning effort, the agent runtime forwards it +//! to the provider request (where supported). Models that don't support +//! reasoning ignore the field. + +use serde::{Deserialize, Serialize}; +use std::fmt; + +/// How much reasoning the model should use for this agent. 
+/// +/// Maps roughly to: +/// - `Minimal` → `effort: "minimal"` (gpt-5 family) / no thinking budget (Claude) +/// - `Low` → `effort: "low"` / small thinking budget +/// - `Medium` → `effort: "medium"` / default thinking budget +/// - `High` → `effort: "high"` / large thinking budget (~32k tokens) +/// +/// Default is `Medium` because that matches most agents' baseline behavior. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ReasoningEffort { + Minimal, + Low, + #[default] + Medium, + High, +} + +impl ReasoningEffort { + /// String representation matching the wire format used by major providers + /// (OpenAI Responses API `reasoning.effort`, OpenRouter `reasoning.effort`). + pub fn as_str(&self) -> &'static str { + match self { + ReasoningEffort::Minimal => "minimal", + ReasoningEffort::Low => "low", + ReasoningEffort::Medium => "medium", + ReasoningEffort::High => "high", + } + } + + /// Numeric rank for threshold comparison (matches `model_routing.rs`). + /// Higher = more reasoning. + pub fn rank(&self) -> u8 { + match self { + ReasoningEffort::Minimal => 0, + ReasoningEffort::Low => 1, + ReasoningEffort::Medium => 2, + ReasoningEffort::High => 3, + } + } + + /// Parse a string value, accepting common aliases. Returns `None` for + /// unknown input so the caller can decide whether to error or default. 
+ pub fn parse(s: &str) -> Option { + match s.trim().to_ascii_lowercase().as_str() { + "minimal" | "none" | "off" => Some(ReasoningEffort::Minimal), + "low" => Some(ReasoningEffort::Low), + "medium" | "default" => Some(ReasoningEffort::Medium), + "high" | "max" => Some(ReasoningEffort::High), + _ => None, + } + } +} + +impl fmt::Display for ReasoningEffort { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_accepts_common_aliases() { + assert_eq!( + ReasoningEffort::parse("minimal"), + Some(ReasoningEffort::Minimal) + ); + assert_eq!( + ReasoningEffort::parse("OFF"), + Some(ReasoningEffort::Minimal) + ); + assert_eq!(ReasoningEffort::parse("max"), Some(ReasoningEffort::High)); + assert_eq!( + ReasoningEffort::parse("default"), + Some(ReasoningEffort::Medium) + ); + assert_eq!(ReasoningEffort::parse(""), None); + assert_eq!(ReasoningEffort::parse("absurd"), None); + } + + #[test] + fn rank_orders_efforts_correctly() { + assert!(ReasoningEffort::Minimal.rank() < ReasoningEffort::Low.rank()); + assert!(ReasoningEffort::Low.rank() < ReasoningEffort::Medium.rank()); + assert!(ReasoningEffort::Medium.rank() < ReasoningEffort::High.rank()); + } + + #[test] + fn default_is_medium() { + assert_eq!(ReasoningEffort::default(), ReasoningEffort::Medium); + } + + #[test] + fn serde_roundtrip_via_lowercase() { + let s = serde_json::to_string(&ReasoningEffort::High).unwrap(); + assert_eq!(s, "\"high\""); + let back: ReasoningEffort = serde_json::from_str("\"medium\"").unwrap(); + assert_eq!(back, ReasoningEffort::Medium); + } +} diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs new file mode 100644 index 000000000..9bc2398a8 --- /dev/null +++ b/crates/jcode-agent-runtime/src/registry.rs @@ -0,0 +1,589 @@ +//! Agent registry: discovery + loading of `AgentDefinition`s from disk. +//! +//! 
## Lookup paths (highest priority first) +//! +//! 1. **Project-local**: `<project_root>/.jcode/agents/*.toml` +//! 2. **User-global**: `~/.jcode/agents/*.toml` +//! 3. **Builtins** registered programmatically via [`AgentRegistry::register_builtin`] +//! +//! When the same id appears in multiple sources, the higher-priority one +//! wins. The registry tracks where each agent came from so `jcode doctor` +//! can show provenance. +//! +//! ## What this module does NOT do +//! +//! - It does not validate that `tool_names` exist in the tool registry +//! (Phase 0.4) or that `spawnable_agents` resolve to known agents +//! (cross-reference). Both are caller responsibilities done at agent +//! spawn time, not load time, because the tool/agent universe may be +//! feature-gated. +//! - It does not watch for file changes. Agents are loaded once at +//! session start. Self-dev can re-run `discover_standard_paths()` on a fresh registry. + +use crate::definition::{AgentDefinition, DefinitionError}; +use crate::permission::PermissionMode; + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +/// Where an agent definition was loaded from. Surfaced in `jcode doctor` +/// and conflict warnings. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub enum AgentSource { + /// Compiled into the binary by name. Lowest priority. + Builtin, + /// Loaded from `~/.jcode/agents/`. + UserGlobal { path: PathBuf }, + /// Loaded from `<project_root>/.jcode/agents/`. Highest priority. + ProjectLocal { path: PathBuf }, +} + +impl AgentSource { + fn priority(&self) -> u8 { + match self { + AgentSource::Builtin => 0, + AgentSource::UserGlobal { .. } => 1, + AgentSource::ProjectLocal { .. } => 2, + } + } + + /// Short human-readable label for `jcode doctor` output. 
+ pub fn short_label(&self) -> String { + match self { + AgentSource::Builtin => "builtin".to_string(), + AgentSource::UserGlobal { path } => format!("user:{}", path.display()), + AgentSource::ProjectLocal { path } => format!("project:{}", path.display()), + } + } +} + +/// One loaded agent: its definition plus where it came from. +#[derive(Debug, Clone)] +pub struct LoadedAgent { + pub definition: AgentDefinition, + pub source: AgentSource, +} + +/// Errors surfaced when loading an agent file. We distinguish I/O, +/// parse, and validation errors so the TUI can render actionable +/// messages. +#[derive(Debug, thiserror::Error)] +pub enum LoadError { + #[error("failed to read `{path}`: {source}")] + Io { + path: PathBuf, + #[source] + source: std::io::Error, + }, + + #[error("failed to parse `{path}`: {source}")] + Parse { + path: PathBuf, + #[source] + source: toml::de::Error, + }, + + #[error("invalid agent definition in `{path}`: {source}")] + Invalid { + path: PathBuf, + #[source] + source: DefinitionError, + }, + + #[error("filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`.")] + FileNameMismatch { path: PathBuf, id: String }, +} + +/// In-memory registry of loaded agent definitions. Wrap in `Arc` if you +/// need to share — `LoadError` contains `io::Error` so the registry itself +/// is not `Clone`. +#[derive(Debug, Default)] +pub struct AgentRegistry { + by_id: HashMap, + /// Non-fatal load errors collected during discovery. Surfaced by + /// `jcode doctor` so users can see why a malformed file was skipped. + load_errors: Vec, +} + +impl AgentRegistry { + pub fn new() -> Self { + Self::default() + } + + /// Total number of registered agents. + pub fn len(&self) -> usize { + self.by_id.len() + } + + /// True if no agents are registered. + pub fn is_empty(&self) -> bool { + self.by_id.is_empty() + } + + /// Look up an agent by id. 
+ pub fn get(&self, id: &str) -> Option<&LoadedAgent> { + self.by_id.get(id) + } + + /// Iterate over all agents in arbitrary order. + pub fn iter(&self) -> impl Iterator { + self.by_id.values() + } + + /// Sorted (by id) iteration — handy for stable doctor output. + pub fn iter_sorted(&self) -> Vec<&LoadedAgent> { + let mut v: Vec<_> = self.by_id.values().collect(); + v.sort_by(|a, b| a.definition.id.cmp(&b.definition.id)); + v + } + + /// Look up an agent referenced by a Skill MAS field (#94). + /// + /// `SKILL.md` front-matter has an optional `agent: <id>` field that + /// routes skill activation to a specific sub-agent rather than the + /// main agent. The id format is identical to `AgentDefinition::id`, + /// so this is functionally `get(id)` — the named alias exists to + /// document the integration point and keep future skill-routing + /// logic discoverable. + /// + /// Returns `None` if the skill references an unknown agent. The + /// caller (skill activation site) decides whether to log a warning + /// or fall back to the main agent. + pub fn lookup_for_skill_routing(&self, skill_agent_id: &str) -> Option<&LoadedAgent> { + self.get(skill_agent_id) + } + + /// Non-fatal errors accumulated during discovery. + pub fn load_errors(&self) -> &[LoadError] { + &self.load_errors + } + + /// Insert (or replace) an agent according to source priority. Returns `None` + /// for a new id, otherwise the losing entry (the old one, or `loaded` if outranked). + pub fn insert(&mut self, loaded: LoadedAgent) -> Option { + let id = loaded.definition.id.clone(); + match self.by_id.get(&id) { + Some(existing) if existing.source.priority() > loaded.source.priority() => { + // existing outranks the newcomer: keep it and hand the rejected entry back + Some(loaded) + } + _ => self.by_id.insert(id, loaded), + } + } + + /// Register a builtin agent. Builtins have the lowest priority and + /// are overridable by both user and project files of the same id. 
+ pub fn register_builtin(&mut self, definition: AgentDefinition) -> Result<(), DefinitionError> { + definition.validate()?; + self.insert(LoadedAgent { + definition, + source: AgentSource::Builtin, + }); + Ok(()) + } + + /// Discover and load all agent files from `dir`. Non-recursive. + /// Files that don't end in `.toml` are skipped silently. Bad files + /// are recorded in `load_errors()` and skipped. A missing `dir` yields `Ok(0)`. + /// + /// `source_kind` decides whether each loaded file is tagged as + /// `UserGlobal` or `ProjectLocal`. Project-local `bypass-permissions` is dropped. + pub fn load_directory( + &mut self, + dir: &Path, + source_kind: SourceKind, + ) -> Result { + if !dir.exists() { + return Ok(0); + } + let mut loaded = 0; + for entry in std::fs::read_dir(dir)? { + let entry = match entry { + Ok(e) => e, + Err(err) => { + self.load_errors.push(LoadError::Io { + path: dir.to_path_buf(), + source: err, + }); + continue; + } + }; + let path = entry.path(); + if path.extension().and_then(|s| s.to_str()) != Some("toml") { + continue; + } + match Self::load_file(&path) { + Ok(definition) => { + let expected_stem = path.file_stem().and_then(|s| s.to_str()).unwrap_or(""); + if !expected_stem.is_empty() && expected_stem != definition.id { + self.load_errors.push(LoadError::FileNameMismatch { + path: path.clone(), + id: definition.id.clone(), + }); + continue; + } + let source = match source_kind { + SourceKind::UserGlobal => AgentSource::UserGlobal { path: path.clone() }, + SourceKind::ProjectLocal => { + AgentSource::ProjectLocal { path: path.clone() } + } + }; + let mut definition = definition; + if matches!(source, AgentSource::ProjectLocal { .. 
}) + && definition.permission_mode == Some(PermissionMode::BypassPermissions) + { + tracing::warn!( + agent_id = %definition.id, + "project-local agent definition attempted to set bypass-permissions; ignoring it, agent will inherit the session permission mode" + ); + definition.permission_mode = None; + } + self.insert(LoadedAgent { definition, source }); + loaded += 1; + } + Err(err) => { + self.load_errors.push(err); + } + } + } + Ok(loaded) + } + + /// Read + parse + validate a single TOML file into an `AgentDefinition`. + pub fn load_file(path: &Path) -> Result { + let raw = std::fs::read_to_string(path).map_err(|source| LoadError::Io { + path: path.to_path_buf(), + source, + })?; + let definition: AgentDefinition = + toml::from_str(&raw).map_err(|source| LoadError::Parse { + path: path.to_path_buf(), + source, + })?; + definition.validate().map_err(|source| LoadError::Invalid { + path: path.to_path_buf(), + source, + })?; + Ok(definition) + } + + /// Convenience: discover both user-global and project-local agent + /// directories using standard jcode paths. A `None` argument skips that + /// source. Callers resolve `home_dir` themselves (e.g. via + /// `dirs::home_dir()`) so this crate stays dep-light and avoids `dirs`. + pub fn discover_standard_paths( + &mut self, + home_dir: Option<&Path>, + project_root: Option<&Path>, + ) { + if let Some(home) = home_dir { + let user_dir = home.join(".jcode").join("agents"); + if let Err(err) = self.load_directory(&user_dir, SourceKind::UserGlobal) { + self.load_errors.push(LoadError::Io { + path: user_dir, + source: err, + }); + } + } + if let Some(root) = project_root { + let project_dir = root.join(".jcode").join("agents"); + if let Err(err) = self.load_directory(&project_dir, SourceKind::ProjectLocal) { + self.load_errors.push(LoadError::Io { + path: project_dir, + source: err, + }); + } + } + } +} + +/// Tag for `load_directory` so the caller decides how loaded entries are +/// labeled. 
The function itself doesn't care about jcode's path convention. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SourceKind { + UserGlobal, + ProjectLocal, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::output::OutputMode; + use std::fs; + + fn write_toml(dir: &Path, name: &str, body: &str) { + let path = dir.join(name); + fs::write(&path, body).expect("write toml"); + } + + fn temp_dir(name: &str) -> PathBuf { + let base = std::env::temp_dir().join(format!( + "jcode-agent-registry-test-{}-{}-{}", + name, + std::process::id(), + // Use atomics for a per-process counter so concurrent tests don't collide. + COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed) + )); + let _ = fs::remove_dir_all(&base); + fs::create_dir_all(&base).unwrap(); + base + } + + static COUNTER: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0); + + #[test] + fn missing_dir_is_zero_load_not_error() { + let mut reg = AgentRegistry::new(); + let n = reg + .load_directory( + Path::new("/nonexistent/jcode-test-dir"), + SourceKind::UserGlobal, + ) + .unwrap(); + assert_eq!(n, 0); + assert!(reg.is_empty()); + } + + #[test] + fn loads_minimal_agent() { + let dir = temp_dir("minimal"); + write_toml( + &dir, + "file-picker.toml", + r#" + id = "file-picker" + display_name = "Fletcher" + "#, + ); + let mut reg = AgentRegistry::new(); + let n = reg.load_directory(&dir, SourceKind::ProjectLocal).unwrap(); + assert_eq!(n, 1); + let loaded = reg.get("file-picker").expect("registered"); + assert_eq!(loaded.definition.display_name, "Fletcher"); + assert!(matches!(loaded.source, AgentSource::ProjectLocal { .. 
})); + } + + #[test] + fn project_overrides_user_overrides_builtin() { + // Builtin + let mut reg = AgentRegistry::new(); + let mut builtin_def = AgentDefinition { + id: "editor".to_string(), + display_name: "Builtin Editor".to_string(), + publisher: None, + version: "0.1.0".to_string(), + prefer_tier: None, + model_override: None, + reasoning: None, + tool_names: vec![], + disallowed_tools: vec![], + spawnable_agents: vec![], + system_prompt: String::new(), + instructions_prompt: None, + step_prompt: None, + spawner_prompt: None, + inherit_parent_system_prompt: false, + include_message_history: false, + permission_mode: None, + max_turns: None, + output_mode: OutputMode::LastMessage, + output_schema: None, + }; + reg.register_builtin(builtin_def.clone()).unwrap(); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "Builtin Editor" + ); + + // User + let user_dir = temp_dir("user"); + write_toml( + &user_dir, + "editor.toml", + r#" + id = "editor" + display_name = "User Editor" + "#, + ); + reg.load_directory(&user_dir, SourceKind::UserGlobal) + .unwrap(); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "User Editor" + ); + + // Project + let proj_dir = temp_dir("proj"); + write_toml( + &proj_dir, + "editor.toml", + r#" + id = "editor" + display_name = "Project Editor" + "#, + ); + reg.load_directory(&proj_dir, SourceKind::ProjectLocal) + .unwrap(); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "Project Editor" + ); + + // Re-register builtin should NOT override the project entry. 
+ // (registers via the same `insert` priority path) + builtin_def.display_name = "Builtin Editor v2".to_string(); + reg.register_builtin(builtin_def).unwrap(); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "Project Editor", + "builtin should not override project-local" + ); + } + + #[test] + fn malformed_toml_collected_as_load_error() { + let dir = temp_dir("malformed"); + write_toml(&dir, "bad.toml", "id = \"this is missing close quote\n"); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + assert!(reg.is_empty(), "no agents registered"); + assert_eq!(reg.load_errors().len(), 1); + assert!(matches!(reg.load_errors()[0], LoadError::Parse { .. })); + } + + #[test] + fn invalid_id_collected_as_load_error() { + let dir = temp_dir("invalid-id"); + write_toml( + &dir, + "Bad_File.toml", + r#" + id = "Bad_Id" + display_name = "Bad" + "#, + ); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + assert!(reg.is_empty()); + assert_eq!(reg.load_errors().len(), 1); + assert!(matches!(reg.load_errors()[0], LoadError::Invalid { .. })); + } + + #[test] + fn filename_must_match_agent_id() { + let dir = temp_dir("name-mismatch"); + write_toml( + &dir, + "wrong-name.toml", + r#" + id = "right-name" + display_name = "X" + "#, + ); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + assert!(reg.is_empty()); + assert_eq!(reg.load_errors().len(), 1); + assert!(matches!( + reg.load_errors()[0], + LoadError::FileNameMismatch { .. 
} + )); + } + + #[test] + fn skips_non_toml_files() { + let dir = temp_dir("non-toml"); + fs::write(dir.join("README.md"), "not an agent").unwrap(); + fs::write(dir.join("config.json"), "{}").unwrap(); + write_toml( + &dir, + "valid.toml", + r#" + id = "valid" + display_name = "v" + "#, + ); + let mut reg = AgentRegistry::new(); + let n = reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + assert_eq!(n, 1); + assert_eq!(reg.len(), 1); + } + + #[test] + fn iter_sorted_is_deterministic() { + let dir = temp_dir("sort"); + for id in ["zeta", "alpha", "mid"] { + write_toml( + &dir, + &format!("{id}.toml"), + &format!( + r#"id = "{id}" +display_name = "{id}" +"# + ), + ); + } + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); + let ids: Vec<_> = reg + .iter_sorted() + .iter() + .map(|a| a.definition.id.clone()) + .collect(); + assert_eq!(ids, vec!["alpha", "mid", "zeta"]); + } + + #[test] + fn lookup_for_skill_routing_finds_agent() { + let dir = temp_dir("skill-mas-hit"); + write_toml( + &dir, + "code-reviewer.toml", + r#"id = "code-reviewer" +display_name = "Reviewer" +"#, + ); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal).unwrap(); + // Skill front-matter `agent: code-reviewer` → registry lookup. + let found = reg.lookup_for_skill_routing("code-reviewer"); + assert!(found.is_some()); + assert_eq!(found.unwrap().definition.id, "code-reviewer"); + } + + #[test] + fn lookup_for_skill_routing_returns_none_for_unknown_agent() { + let reg = AgentRegistry::new(); + // Caller (skill activation site) decides how to handle a missing + // routing target — we just report None. 
+ assert!(reg.lookup_for_skill_routing("nonexistent").is_none()); + } + + #[test] + fn discover_standard_paths_reads_both() { + let home = temp_dir("home"); + let proj = temp_dir("proj"); + fs::create_dir_all(home.join(".jcode/agents")).unwrap(); + fs::create_dir_all(proj.join(".jcode/agents")).unwrap(); + write_toml( + &home.join(".jcode/agents"), + "user-only.toml", + r#"id = "user-only" +display_name = "U" +"#, + ); + write_toml( + &proj.join(".jcode/agents"), + "project-only.toml", + r#"id = "project-only" +display_name = "P" +"#, + ); + let mut reg = AgentRegistry::new(); + reg.discover_standard_paths(Some(&home), Some(&proj)); + assert_eq!(reg.len(), 2); + assert!(reg.get("user-only").is_some()); + assert!(reg.get("project-only").is_some()); + } +} diff --git a/crates/jcode-agent-runtime/src/signals.rs b/crates/jcode-agent-runtime/src/signals.rs new file mode 100644 index 000000000..67acf5082 --- /dev/null +++ b/crates/jcode-agent-runtime/src/signals.rs @@ -0,0 +1,98 @@ +//! Soft-interrupt + cancellation signals for the agent loop. +//! +//! These primitives are shared between the agent runtime, the server +//! lifecycle, and any callers that need to drive interrupts without +//! holding the agent lock. Keep this module dependency-light — `tokio` +//! sync + `std::sync` only. + +use std::sync::Arc; + +/// A soft interrupt message queued for injection at the next safe point. +#[derive(Debug, Clone)] +pub struct SoftInterruptMessage { + pub content: String, + /// If true, can skip remaining tools when injected at point C. + pub urgent: bool, + pub source: SoftInterruptSource, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SoftInterruptSource { + User, + System, + BackgroundTask, +} + +/// Thread-safe soft interrupt queue that can be accessed without holding the agent lock. +pub type SoftInterruptQueue = Arc>>; + +/// Signal to move the currently executing tool to background. 
+/// Uses std::sync so it can be set without async from outside the agent lock. +pub type BackgroundToolSignal = Arc; + +/// Signal to gracefully stop generation. +pub type GracefulShutdownSignal = Arc; + +/// Async-aware interrupt signal that combines AtomicBool (sync read) with +/// tokio::Notify (async wake). Eliminates spin-loops during tool execution. +#[derive(Clone)] +pub struct InterruptSignal { + flag: Arc, + notify: Arc, +} + +impl InterruptSignal { + pub fn new() -> Self { + Self { + flag: Arc::new(std::sync::atomic::AtomicBool::new(false)), + notify: Arc::new(tokio::sync::Notify::new()), + } + } + + pub fn fire(&self) { + self.flag.store(true, std::sync::atomic::Ordering::SeqCst); + self.notify.notify_waiters(); + } + + pub fn is_set(&self) -> bool { + self.flag.load(std::sync::atomic::Ordering::SeqCst) + } + + pub fn reset(&self) { + self.flag.store(false, std::sync::atomic::Ordering::SeqCst); + } + + pub async fn notified(&self) { + let notified = self.notify.notified(); + if self.is_set() { + return; + } + notified.await; + } + + pub fn as_atomic(&self) -> Arc { + Arc::clone(&self.flag) + } +} + +impl Default for InterruptSignal { + fn default() -> Self { + Self::new() + } +} + +#[derive(Debug, thiserror::Error)] +#[error("{message}")] +pub struct StreamError { + pub message: String, + pub retry_after_secs: Option, +} + +impl StreamError { + pub fn new(message: String, retry_after_secs: Option) -> Self { + Self { + message, + retry_after_secs, + } + } +} diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs new file mode 100644 index 000000000..b75916fa5 --- /dev/null +++ b/crates/jcode-agent-runtime/src/tier.rs @@ -0,0 +1,324 @@ +//! Model tier abstraction. +//! +//! A "tier" is a **user-defined named slot** that maps to a concrete model id. +//! It is intentionally NOT an opinionated catalog — jcode does not maintain +//! per-provider tier defaults like Codebuff/OpenRouter does. +//! +//! 
## Why slots, not catalog? +//! +//! jcode users connect a single provider via OAuth (Claude Pro, ChatGPT Plus, +//! Gemini Advanced, etc.) and pay through that subscription. Auto-downgrading +//! to a "cheaper tier" without their consent is wrong — they already chose +//! the model they want. So the default is: agents inherit the session's +//! current model. +//! +//! Power users (pay-per-token API keys, multi-account setups) can opt in by +//! setting two env vars, exactly mirroring `model_routing.rs` (#100): +//! +//! ```bash +//! JCODE_ROUTING_ROUTINE=claude-haiku-4-5 +//! JCODE_ROUTING_THINKING=claude-opus-4-7 +//! ``` +//! +//! Agent definitions reference tiers by name: +//! +//! ```toml +//! [agent] +//! id = "file-picker" +//! prefer_tier = "routine" # uses JCODE_ROUTING_ROUTINE if set +//! ``` +//! +//! ## Resolution order +//! +//! 1. `agent.model_override` (explicit, highest priority) +//! 2. `agent.prefer_tier` + corresponding env var set +//! 3. Caller-provided `current_session_model` fallback +//! +//! No catalog. No magic. The only "magic" is reading the env var, which is +//! the existing #100 contract. + +use serde::{Deserialize, Serialize}; +use std::fmt; + +/// A user-defined tier slot. Currently only two are supported because that +/// matches `model_routing.rs` (#100). Adding tiers later is additive — the +/// env var name pattern is `JCODE_ROUTING_<TIER>`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ModelTier { + /// Cheap / fast / lower-effort work: file pickers, basher, + /// summarizers. Reads `JCODE_ROUTING_ROUTINE`. + Routine, + /// Premium / reasoning work: editor, reviewer, planner. + /// Reads `JCODE_ROUTING_THINKING`. + Thinking, +} + +impl ModelTier { + /// The env var name that backs this tier slot. Returns the same string + /// shape as `model_routing.rs` (#100) so the two systems stay aligned. 
+ pub fn env_var(&self) -> &'static str { + match self { + ModelTier::Routine => "JCODE_ROUTING_ROUTINE", + ModelTier::Thinking => "JCODE_ROUTING_THINKING", + } + } + + /// Read the user-configured model id for this tier from the environment. + /// Returns `None` when the env var is unset, blank, or whitespace-only — + /// callers should fall back to the session's current model. + pub fn read_user_override(&self) -> Option { + std::env::var(self.env_var()) + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + } + + /// Parse a tier name from a string, accepting common variants. + pub fn parse(s: &str) -> Option { + match s.trim().to_ascii_lowercase().as_str() { + "routine" | "fast" | "cheap" | "lite" => Some(ModelTier::Routine), + "thinking" | "reasoning" | "premium" | "deep" => Some(ModelTier::Thinking), + _ => None, + } + } +} + +impl fmt::Display for ModelTier { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ModelTier::Routine => f.write_str("routine"), + ModelTier::Thinking => f.write_str("thinking"), + } + } +} + +/// Resolve which model id to use for a given tier preference + override pair. +/// +/// Priority: +/// 1. `model_override` — explicit, highest priority. +/// 2. `prefer_tier` + corresponding env var set. +/// 3. `current_session_model` — caller-provided fallback. +/// +/// `current_session_model` is required because there's no other safe default: +/// the runtime doesn't know which provider/model the session is using. 
+pub fn resolve_model( + model_override: Option<&str>, + prefer_tier: Option, + current_session_model: &str, +) -> String { + if let Some(override_id) = model_override.and_then(|s| { + let trimmed = s.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_string()) + } + }) { + return override_id; + } + + if let Some(tier) = prefer_tier + && let Some(tier_model) = tier.read_user_override() + { + return tier_model; + } + + current_session_model.to_string() +} + +/// Diagnostic-friendly explanation of which slot was used. Useful for +/// `jcode doctor` output so users can see exactly why a given agent picked +/// the model it did. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ResolutionSource { + /// Used `agent.model_override` directly. + Override(String), + /// Used the env var backing `tier`. + Tier { tier: ModelTier, model: String }, + /// Tier was preferred but the env var was unset, so fell back to the + /// session's current model. + TierFallback { tier: ModelTier, model: String }, + /// No override or tier preference; using the session's current model. + SessionDefault(String), +} + +impl ResolutionSource { + pub fn model_id(&self) -> &str { + match self { + ResolutionSource::Override(m) + | ResolutionSource::Tier { model: m, .. } + | ResolutionSource::TierFallback { model: m, .. } + | ResolutionSource::SessionDefault(m) => m, + } + } +} + +/// Same as `resolve_model` but returns provenance information for diagnostics. 
+pub fn resolve_model_with_source( + model_override: Option<&str>, + prefer_tier: Option, + current_session_model: &str, +) -> ResolutionSource { + if let Some(override_id) = model_override.and_then(|s| { + let trimmed = s.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_string()) + } + }) { + return ResolutionSource::Override(override_id); + } + + if let Some(tier) = prefer_tier { + match tier.read_user_override() { + Some(model) => return ResolutionSource::Tier { tier, model }, + None => { + return ResolutionSource::TierFallback { + tier, + model: current_session_model.to_string(), + }; + } + } + } + + ResolutionSource::SessionDefault(current_session_model.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Mutex to serialize env-var manipulation across tests in this module. + /// Without this, `cargo test` runs tests in parallel and they trample + /// each other's `JCODE_ROUTING_*` state. + static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + + fn with_env_lock(f: F) { + let guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner()); + // Snapshot + restore env vars we mutate so test order is irrelevant. 
+ let saved_routine = std::env::var_os("JCODE_ROUTING_ROUTINE"); + let saved_thinking = std::env::var_os("JCODE_ROUTING_THINKING"); + unsafe { + std::env::remove_var("JCODE_ROUTING_ROUTINE"); + std::env::remove_var("JCODE_ROUTING_THINKING"); + } + f(); + unsafe { + match saved_routine { + Some(v) => std::env::set_var("JCODE_ROUTING_ROUTINE", v), + None => std::env::remove_var("JCODE_ROUTING_ROUTINE"), + } + match saved_thinking { + Some(v) => std::env::set_var("JCODE_ROUTING_THINKING", v), + None => std::env::remove_var("JCODE_ROUTING_THINKING"), + } + } + drop(guard); + } + + #[test] + fn parse_tier_accepts_aliases() { + assert_eq!(ModelTier::parse("routine"), Some(ModelTier::Routine)); + assert_eq!(ModelTier::parse("Routine"), Some(ModelTier::Routine)); + assert_eq!(ModelTier::parse("FAST"), Some(ModelTier::Routine)); + assert_eq!(ModelTier::parse("thinking"), Some(ModelTier::Thinking)); + assert_eq!(ModelTier::parse("reasoning"), Some(ModelTier::Thinking)); + assert_eq!(ModelTier::parse("deep"), Some(ModelTier::Thinking)); + assert_eq!(ModelTier::parse(""), None); + assert_eq!(ModelTier::parse("nonsense"), None); + } + + #[test] + fn override_wins_over_tier_and_session_default() { + with_env_lock(|| { + unsafe { + std::env::set_var("JCODE_ROUTING_THINKING", "should-be-ignored"); + } + let got = resolve_model( + Some("explicit-model"), + Some(ModelTier::Thinking), + "session-default", + ); + assert_eq!(got, "explicit-model"); + }); + } + + #[test] + fn tier_uses_env_var_when_set() { + with_env_lock(|| { + unsafe { + std::env::set_var("JCODE_ROUTING_ROUTINE", "haiku-4-5"); + } + let got = resolve_model(None, Some(ModelTier::Routine), "session-default"); + assert_eq!(got, "haiku-4-5"); + }); + } + + #[test] + fn tier_falls_back_when_env_unset() { + with_env_lock(|| { + // env var explicitly removed by lock setup + let got = resolve_model(None, Some(ModelTier::Thinking), "session-default"); + assert_eq!(got, "session-default"); + }); + } + + #[test] + fn 
no_tier_no_override_uses_session_default() { + with_env_lock(|| { + let got = resolve_model(None, None, "session-default"); + assert_eq!(got, "session-default"); + }); + } + + #[test] + fn empty_override_string_treated_as_unset() { + with_env_lock(|| { + let got = resolve_model(Some(" "), None, "session-default"); + assert_eq!(got, "session-default"); + }); + } + + #[test] + fn resolution_source_reports_override() { + with_env_lock(|| { + let src = resolve_model_with_source(Some("forced"), None, "session"); + assert!(matches!(src, ResolutionSource::Override(ref m) if m == "forced")); + assert_eq!(src.model_id(), "forced"); + }); + } + + #[test] + fn resolution_source_reports_tier_hit() { + with_env_lock(|| { + unsafe { + std::env::set_var("JCODE_ROUTING_THINKING", "opus-4-7"); + } + let src = resolve_model_with_source(None, Some(ModelTier::Thinking), "fallback"); + match src { + ResolutionSource::Tier { tier, model } => { + assert_eq!(tier, ModelTier::Thinking); + assert_eq!(model, "opus-4-7"); + } + other => panic!("expected Tier, got {:?}", other), + } + }); + } + + #[test] + fn resolution_source_reports_tier_fallback() { + with_env_lock(|| { + // env unset + let src = resolve_model_with_source(None, Some(ModelTier::Routine), "session"); + match src { + ResolutionSource::TierFallback { tier, model } => { + assert_eq!(tier, ModelTier::Routine); + assert_eq!(model, "session"); + } + other => panic!("expected TierFallback, got {:?}", other), + } + }); + } +} diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs new file mode 100644 index 000000000..d2bf77d4d --- /dev/null +++ b/crates/jcode-agent-runtime/tests/sample_agents.rs @@ -0,0 +1,232 @@ +//! Integration test: load the bundled sample agents in +//! `/.jcode/agents/` and assert the registry behaves as +//! documented. +//! +//! Lives in `tests/` so it exercises the public API the way real callers +//! 
will (the `jcode` binary, the future `cli/agents` module, etc.). +//! +//! If a future PR moves the sample agents elsewhere, update `samples_dir()`. + +use std::path::PathBuf; + +use jcode_agent_runtime::{ + AgentRegistry, ModelTier, OutputMode, PermissionMode, ReasoningEffort, SourceKind, +}; + +/// Path to the project-root sample agents directory, relative to the +/// crate manifest. Deliberately constructed via `CARGO_MANIFEST_DIR` so +/// `cargo test --workspace` works regardless of the cwd the runner +/// chooses. +fn samples_dir() -> PathBuf { + let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + // crates/jcode-agent-runtime → ../../.jcode/agents + crate_dir + .parent() + .unwrap() + .parent() + .unwrap() + .join(".jcode/agents") +} + +#[test] +fn loads_bundled_sample_agents() { + let dir = samples_dir(); + assert!( + dir.exists(), + "sample agents directory missing: {}", + dir.display(), + ); + + let mut reg = AgentRegistry::new(); + let n = reg + .load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + assert!(n >= 2, "expected at least 2 sample agents, got {n}"); + assert!( + reg.load_errors().is_empty(), + "load errors: {:?}", + reg.load_errors() + ); +} + +#[test] +fn file_picker_sample_has_expected_shape() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + let agent = reg + .get("file-picker") + .expect("file-picker registered") + .definition + .clone(); + + assert_eq!(agent.display_name, "Fletcher the File Fetcher"); + assert_eq!(agent.prefer_tier, Some(ModelTier::Routine)); + assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal)); + assert!( + !agent.include_message_history, + "file picker uses clean slate" + ); + assert!(!agent.inherit_parent_system_prompt); + assert_eq!(agent.output_mode, OutputMode::LastMessage); + assert!(agent.tool_names.iter().any(|t| t == "read")); + assert!(agent.spawnable_agents.is_empty(), 
"leaf agent"); + assert_eq!( + agent.permission_mode, + Some(PermissionMode::Plan), + "file-picker is read-only (plan mode)" + ); + + // Resolve model with no env vars set should fall back to the + // session's current model. + let resolved = agent.resolve_model("session-model"); + assert_eq!( + resolved, "session-model", + "no JCODE_ROUTING_ROUTINE → session default" + ); +} + +#[test] +fn code_reviewer_uses_inherit_parent_system_prompt_for_cache_hit() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + let agent = ® + .get("code-reviewer") + .expect("code-reviewer registered") + .definition; + + assert!( + agent.inherit_parent_system_prompt, + "reviewer must inherit parent system prompt for prompt-cache hits" + ); + assert!( + agent.system_prompt.is_empty(), + "system_prompt must be empty when inheriting (enforced by validation)" + ); + assert!( + agent.include_message_history, + "reviewer needs context of the change it's reviewing" + ); + assert_eq!(agent.prefer_tier, Some(ModelTier::Thinking)); + assert_eq!( + agent.permission_mode, + Some(PermissionMode::Plan), + "code-reviewer is read-only (plan mode)" + ); +} + +#[test] +fn sample_agents_validate_cleanly() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + for loaded in reg.iter() { + loaded + .definition + .validate() + .unwrap_or_else(|err| panic!("{} failed validation: {err}", loaded.definition.id)); + } +} + +#[test] +fn basher_sample_has_expected_shape() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + let agent = reg + .get("basher") + .expect("basher registered") + .definition + .clone(); + + assert_eq!(agent.id, "basher"); + assert_eq!(agent.display_name, "Basher"); + assert_eq!(agent.prefer_tier, 
Some(ModelTier::Routine)); + assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal)); + assert!( + !agent.include_message_history, + "basher uses a clean slate per command" + ); + assert!( + !agent.inherit_parent_system_prompt, + "basher has its own short system prompt" + ); + assert_eq!(agent.output_mode, OutputMode::LastMessage); + assert_eq!(agent.tool_names, vec!["bash"]); + assert!(agent.spawnable_agents.is_empty(), "leaf agent"); + assert_eq!( + agent.permission_mode, + Some(PermissionMode::AcceptEdits), + "basher auto-approves file ops" + ); + + // No tier env var set → resolve falls back to the session model. + let resolved = agent.resolve_model("session-model"); + assert_eq!( + resolved, "session-model", + "no JCODE_ROUTING_ROUTINE → session default" + ); +} + +#[test] +fn editor_sample_has_expected_shape() { + let dir = samples_dir(); + let mut reg = AgentRegistry::new(); + reg.load_directory(&dir, SourceKind::ProjectLocal) + .expect("load_directory"); + + let agent = reg + .get("editor") + .expect("editor registered") + .definition + .clone(); + + assert_eq!(agent.id, "editor"); + assert_eq!(agent.display_name, "Code Editor"); + assert_eq!(agent.prefer_tier, Some(ModelTier::Thinking)); + assert_eq!(agent.reasoning, Some(ReasoningEffort::Medium)); + assert!( + agent.include_message_history, + "editor needs to see what the user asked for" + ); + assert!( + agent.inherit_parent_system_prompt, + "editor must inherit parent system prompt for prompt-cache hits" + ); + assert!( + agent.system_prompt.is_empty(), + "system_prompt must be empty when inheriting (enforced by validation)" + ); + assert_eq!(agent.output_mode, OutputMode::AllMessages); + for expected in [ + "read", + "str_replace", + "write", + "edit", + "multiedit", + "apply_patch", + "hashline_edit", + "patch", + ] { + assert!( + agent.tool_names.iter().any(|t| t == expected), + "editor tool_names missing `{expected}`: {:?}", + agent.tool_names, + ); + } + 
assert!(agent.spawnable_agents.is_empty(), "leaf agent"); + assert_eq!( + agent.permission_mode, + Some(PermissionMode::AcceptEdits), + "editor auto-approves file ops" + ); +} diff --git a/crates/jcode-app-core/src/agent.rs b/crates/jcode-app-core/src/agent.rs index f62fb15e7..5518ef25e 100644 --- a/crates/jcode-app-core/src/agent.rs +++ b/crates/jcode-app-core/src/agent.rs @@ -270,6 +270,10 @@ pub struct Agent { mcp_late_register_resolved: bool, /// Override system prompt (used by ambient mode to inject a custom prompt) system_prompt_override: Option, + /// Maximum number of tool-call turns before the agent is forced to + /// stop. `None` means unlimited. Set by `SubagentTool` from the agent + /// definition's `max_turns` field. + max_turns: Option, /// Whether memory features are enabled for this session memory_enabled: bool, /// One-step undo snapshot captured before the most recent rewind. @@ -328,6 +332,7 @@ impl Agent { locked_tools: None, mcp_late_register_resolved: false, system_prompt_override: crate::config::config().provider.system_prompt.clone(), + max_turns: None, memory_enabled: crate::config::config().features.memory, rewind_undo_snapshot: None, stdin_request_tx: None, diff --git a/crates/jcode-app-core/src/agent/prompting.rs b/crates/jcode-app-core/src/agent/prompting.rs index 98cb794c3..0a9d67c41 100644 --- a/crates/jcode-app-core/src/agent/prompting.rs +++ b/crates/jcode-app-core/src/agent/prompting.rs @@ -137,6 +137,48 @@ impl Agent { } } +/// Wrap a step prompt body in `...` tags. +/// +/// Step prompts are emitted by the harness (not typed by the user), but they +/// arrive in the conversation transcript at the same position a user message +/// would. Without disambiguation, the LLM tends to treat them as a fresh user +/// turn — re-greeting, re-asking, or otherwise breaking flow. +/// +/// Wrapping the body in `` tags signals "this is harness +/// scaffolding, not the user speaking" and lets the model continue its +/// existing turn cleanly. 
Returns an empty string when `prompt` is empty so +/// callers don't end up emitting an empty tag pair. +/// +/// This helper is intentionally not yet wired into step-prompt emission; +/// integration will land alongside the Phase 1 `AgentDefinition.step_prompt` +/// changes. +pub fn wrap_as_system_reminder(prompt: &str) -> String { + if prompt.is_empty() { + String::new() + } else { + format!("{}", prompt) + } +} + +#[cfg(test)] +mod wrap_as_system_reminder_tests { + use super::wrap_as_system_reminder; + + #[test] + fn wrap_as_system_reminder_empty_input_returns_empty() { + assert_eq!(wrap_as_system_reminder(""), ""); + } + + #[test] + fn wrap_as_system_reminder_non_empty_input_wrapped_correctly() { + let body = "remaining steps: 3"; + assert_eq!( + wrap_as_system_reminder(body), + "remaining steps: 3" + ); + } +} + // ---- Issue #358: mempalace per-turn pipeline -------------------------- /// Check if the mempalace backend is configured via environment or config. diff --git a/crates/jcode-app-core/src/agent/turn_execution.rs b/crates/jcode-app-core/src/agent/turn_execution.rs index 44393c474..f60916a0c 100644 --- a/crates/jcode-app-core/src/agent/turn_execution.rs +++ b/crates/jcode-app-core/src/agent/turn_execution.rs @@ -215,6 +215,10 @@ impl Agent { self.system_prompt_override = Some(prompt.to_string()); } + pub fn set_max_turns(&mut self, max: u32) { + self.max_turns = Some(max); + } + pub fn set_debug(&mut self, is_debug: bool) { self.session.set_debug(is_debug); if let Err(err) = self.session.save() { @@ -246,6 +250,7 @@ impl Agent { pub(super) async fn tool_definitions(&mut self) -> Vec { if self.session.is_canary { self.registry.register_selfdev_tools().await; + self.registry.register_experimental_tools().await; } // Return locked tools if available (prevents cache invalidation from @@ -325,8 +330,8 @@ impl Agent { fn apply_selfdev_tool_surface(tools: &mut [ToolDefinition], is_canary: bool) { for tool in tools.iter_mut() { if tool.name == "selfdev" { - 
tool.description = crate::tool::selfdev::SelfDevTool::description_for(is_canary) - .to_string(); + tool.description = + crate::tool::selfdev::SelfDevTool::description_for(is_canary).to_string(); tool.input_schema = crate::tool::selfdev::SelfDevTool::schema_for(is_canary); } } @@ -358,6 +363,7 @@ impl Agent { pub async fn tool_definitions_for_debug(&self) -> Vec { if self.session.is_canary { self.registry.register_selfdev_tools().await; + self.registry.register_experimental_tools().await; } let mut tools = self.registry.definitions(self.allowed_tools.as_ref()).await; if !self.disabled_tools.is_empty() { diff --git a/crates/jcode-app-core/src/agent/turn_loops.rs b/crates/jcode-app-core/src/agent/turn_loops.rs index 8be6df2db..96ccdbd15 100644 --- a/crates/jcode-app-core/src/agent/turn_loops.rs +++ b/crates/jcode-app-core/src/agent/turn_loops.rs @@ -14,8 +14,26 @@ impl Agent { let mut context_limit_retries = 0u32; let mut incomplete_continuations = 0u32; let mut empty_post_tool_continuations = 0u32; + let mut turn_count = 0u32; loop { + turn_count += 1; + if let Some(max) = self.max_turns { + if turn_count > max { + logging::info(&format!( + "max_turns limit reached ({}); forcing turn completion", + max + )); + if final_text.is_empty() { + final_text = format!( + "[agent stopped: reached max_turns limit of {}]", + max + ); + } + break; + } + } + let repaired = self.repair_missing_tool_outputs(); if repaired > 0 { logging::warn(&format!( diff --git a/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs b/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs index a91adff4c..0f4b0faf5 100644 --- a/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs +++ b/crates/jcode-app-core/src/agent/turn_streaming_mpsc.rs @@ -396,8 +396,9 @@ impl Agent { // answer renders as a normal paragraph rather than as reasoning. 
if reasoning_open && !text.trim().is_empty() { reasoning_open = false; - let _ = event_tx - .send(ServerEvent::ReasoningDone { duration_secs: None }); + let _ = event_tx.send(ServerEvent::ReasoningDone { + duration_secs: None, + }); } text_content.push_str(&text); if !text_wrapped_detected { @@ -430,8 +431,9 @@ impl Agent { StreamEvent::ToolUseStart { id, name } => { if reasoning_open { reasoning_open = false; - let _ = event_tx - .send(ServerEvent::ReasoningDone { duration_secs: None }); + let _ = event_tx.send(ServerEvent::ReasoningDone { + duration_secs: None, + }); } let _ = event_tx.send(ServerEvent::ToolStart { id: id.clone(), @@ -595,8 +597,9 @@ impl Agent { // step) so the client flushes its live partial line. if reasoning_open { reasoning_open = false; - let _ = event_tx - .send(ServerEvent::ReasoningDone { duration_secs: None }); + let _ = event_tx.send(ServerEvent::ReasoningDone { + duration_secs: None, + }); } if reason.is_some() { stop_reason = reason; diff --git a/crates/jcode-app-core/src/agent_tests.rs b/crates/jcode-app-core/src/agent_tests.rs index 9f115a19a..3b3523f78 100644 --- a/crates/jcode-app-core/src/agent_tests.rs +++ b/crates/jcode-app-core/src/agent_tests.rs @@ -152,7 +152,7 @@ async fn run_turn_streaming_mpsc_emits_keepalive_while_provider_is_quiet() { open_delay: Duration::from_secs(2), first_event_delay: Duration::from_secs(2), }); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); agent.add_message( Role::User, @@ -219,7 +219,7 @@ async fn run_turn_streaming_mpsc_emits_keepalive_while_provider_is_quiet() { #[tokio::test] async fn messages_for_provider_replays_persisted_native_compaction_in_auto_mode() { let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, 
registry); agent.add_message( @@ -260,7 +260,7 @@ async fn messages_for_provider_replays_persisted_native_compaction_in_auto_mode( #[tokio::test] async fn oversized_openai_native_compaction_is_persisted_as_text_fallback() { let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); agent.add_message( @@ -322,7 +322,7 @@ async fn oversized_openai_native_compaction_is_persisted_as_text_fallback() { #[tokio::test] async fn messages_for_provider_applies_manual_compaction_in_native_auto_mode() { let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); for i in 0..30 { @@ -449,7 +449,7 @@ async fn interrupt_signal_notified_completes_after_fire() { async fn new_agent_registers_active_pid_and_clear_swaps_it() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let first_session_id = agent.session_id().to_string(); @@ -491,7 +491,7 @@ async fn default_disabled_tools_are_not_exposed_or_executable() { crate::config::Config::invalidate_cache(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let definitions = agent.tool_definitions().await; let tool_names = agent.tool_names().await; @@ -573,7 +573,7 @@ fn seed_transient_session_state(agent: &mut Agent) { async fn clear_resets_runtime_interrupt_and_queue_state() { let _guard = 
crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); seed_transient_session_state(&mut agent); @@ -602,7 +602,7 @@ async fn clear_resets_runtime_interrupt_and_queue_state() { async fn restore_session_resets_runtime_interrupt_and_queue_state() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let mut restored_session = crate::session::Session::create_with_id( @@ -644,7 +644,7 @@ async fn restore_session_rehydrates_injected_memory_ids() { crate::memory::clear_all_pending_memory(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let mut restored_session = crate::session::Session::create_with_id( @@ -685,7 +685,7 @@ async fn build_memory_prompt_nonblocking_defers_pending_memory_during_tool_loop( crate::memory::clear_all_pending_memory(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Agent::new(provider, registry); let session_id = agent.session.id.clone(); @@ -735,7 +735,7 @@ async fn memory_injection_message_defaults_to_ephemeral_history() { crate::config::invalidate_config_cache(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let before = 
agent.session.messages.len(); let memory = crate::memory::PendingMemory { @@ -768,7 +768,7 @@ async fn memory_injection_message_can_persist_to_history() { crate::config::invalidate_config_cache(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let before = agent.session.messages.len(); let memory = crate::memory::PendingMemory { @@ -806,7 +806,7 @@ async fn mark_closed_persists_soft_interrupts_for_restore_after_reload() { crate::env::set_var("JCODE_HOME", temp.path()); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider.clone(), registry.clone()); let session_id = agent.session_id().to_string(); agent.session.save().expect("save active session"); @@ -842,7 +842,7 @@ async fn mark_closed_persists_soft_interrupts_for_restore_after_reload() { async fn env_snapshot_detail_is_minimal_for_empty_sessions_and_full_after_history() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); assert_eq!(agent.env_snapshot_detail(), EnvSnapshotDetail::Minimal); @@ -905,7 +905,7 @@ impl crate::tool::Tool for FakeMcpTool { async fn mcp_tools_registered_after_lock_are_visible_to_agent() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); // First turn locks the snapshot (this is what happens before the async MCP @@ 
-967,7 +967,7 @@ async fn mcp_tools_registered_after_lock_are_visible_to_agent() { async fn mcp_late_registration_rebuild_happens_at_most_once() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); // First turn locks the snapshot with no MCP tools yet. @@ -1039,7 +1039,7 @@ async fn mcp_late_registration_rebuild_happens_at_most_once() { async fn tool_snapshot_is_stable_without_new_mcp_tools() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(NativeAutoCompactionProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let first = agent.tool_definitions().await; diff --git a/crates/jcode-app-core/src/ambient/runner.rs b/crates/jcode-app-core/src/ambient/runner.rs index 790502351..8a973d842 100644 --- a/crates/jcode-app-core/src/ambient/runner.rs +++ b/crates/jcode-app-core/src/ambient/runner.rs @@ -385,9 +385,10 @@ impl AmbientRunnerHandle { ) -> anyhow::Result<()> { let session = Session::load(session_id)?; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone()).await; + let registry = tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await; if session.is_canary { registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } // Issue #89: ambient cycles previously skipped MCP registration, so // user-installed MCP tools were invisible to the cycle agent — @@ -470,9 +471,10 @@ impl AmbientRunnerHandle { let child_is_canary = child.is_canary; let child_is_debug = child.is_debug; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone()).await; + let registry = 
tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await; if child_is_canary { registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } // Issue #89: register MCP tools for ambient cycles (same as main session). registry @@ -928,7 +930,7 @@ impl AmbientRunnerHandle { self.set_running_detail("setting up tools").await; let cycle_provider = provider.fork(); - let registry = tool::Registry::new(cycle_provider.clone()).await; + let registry = tool::Registry::new(cycle_provider.clone(), tool::shared_agent_registry()).await; registry.register_ambient_tools().await; // Issue #89: register MCP tools so user-installed MCP servers are // available to the ambient agent — without this, the cycle agent diff --git a/crates/jcode-app-core/src/dcg_bridge.rs b/crates/jcode-app-core/src/dcg_bridge.rs index c9398d69a..1612992b3 100644 --- a/crates/jcode-app-core/src/dcg_bridge.rs +++ b/crates/jcode-app-core/src/dcg_bridge.rs @@ -32,10 +32,12 @@ //! `Default`, `Auto`, `BypassPermissions`; **deny under `Plan`** (which is //! read-only); prompt under `DontAsk` only if explicitly allow-listed. +use std::collections::HashMap; use std::path::PathBuf; use std::sync::{LazyLock, Mutex}; use dcg_core::{Decision, Effect, Engine, EngineConfig, Mode, Session, ToolCall}; +use jcode_agent_runtime::permission::PermissionMode; pub use crate::yolo_classifier::YoloClassifier; @@ -82,6 +84,32 @@ fn default_protected_paths() -> Vec { ] } +/// Convert a [`PermissionMode`] (from `jcode-agent-runtime`) into the +/// corresponding [`dcg_core::Mode`]. The two enums mirror each other +/// exactly; this function is the canonical bridge. +/// +/// We cannot implement `From for Mode` due to the orphan +/// rule (both types live in foreign crates). This free function serves +/// the same purpose. 
+#[must_use] +pub fn permission_mode_to_dcg(pm: PermissionMode) -> Mode { + match pm { + PermissionMode::Default => Mode::Default, + PermissionMode::AcceptEdits => Mode::AcceptEdits, + PermissionMode::Plan => Mode::Plan, + PermissionMode::DontAsk => Mode::DontAsk, + PermissionMode::BypassPermissions => Mode::BypassPermissions, + PermissionMode::Auto => Mode::Auto, + } +} + +/// Per-session permission mode overrides. When a subagent is spawned with +/// a specific `permission_mode` from its `AgentDefinition`, it is stored +/// here keyed by the child session id. `classify_for_agent` checks this +/// map before falling back to the global mode. +static SESSION_MODES: LazyLock>> = + LazyLock::new(|| Mutex::new(HashMap::new())); + /// Set the global permission mode. Called from the CLI / config layer at /// process startup. Subsequent `classify` calls observe the new mode. pub fn set_mode(mode: Mode) { @@ -99,6 +127,92 @@ pub fn current_mode() -> Mode { .unwrap_or(Mode::Default) } +/// Store a per-session permission mode override. Called when a subagent +/// is spawned with an explicit `permission_mode` from its agent +/// definition. +pub fn set_session_mode(session_id: &str, mode: Mode) { + if let Ok(mut guard) = SESSION_MODES.lock() { + guard.insert(session_id.to_string(), mode); + } +} + +/// Remove the per-session permission mode override for a session that +/// has finished. Prevents unbounded growth of the map. +pub fn clear_session_mode(session_id: &str) { + if let Ok(mut guard) = SESSION_MODES.lock() { + guard.remove(session_id); + } +} + +/// Return the per-session mode override, if any. +#[must_use] +pub fn session_mode(session_id: &str) -> Option { + SESSION_MODES + .lock() + .ok() + .and_then(|guard| guard.get(session_id).copied()) +} + +/// RAII guard that clears a per-session permission mode on drop. 
+/// +/// Use this instead of manual `set_session_mode` / `clear_session_mode` +/// pairs to guarantee cleanup even when the subagent exits via early +/// return or error path. +pub struct SessionModeGuard { + session_id: String, +} + +impl SessionModeGuard { + /// Set the per-session mode and return a guard that will clear it on + /// drop. If `mode` is `None`, no override is set and the guard is a + /// no-op on drop (but still safe to hold). + #[must_use] + pub fn new(session_id: &str, mode: Option) -> Self { + if let Some(mode) = mode { + set_session_mode(session_id, mode); + } + Self { + session_id: session_id.to_string(), + } + } +} + +impl Drop for SessionModeGuard { + fn drop(&mut self) { + clear_session_mode(&self.session_id); + } +} + +/// Classify an action using the agent-specific permission mode when +/// provided, falling back to the global mode otherwise. +/// +/// This is the entry point that respects per-agent permission overrides. +/// Call sites that know the agent's `PermissionMode` (e.g. subagent tool +/// execution) should use this instead of [`classify`]. +#[must_use] +pub fn classify_for_agent( + action: &str, + agent_permission_mode: Option, +) -> BridgeDecision { + let mode = agent_permission_mode + .map(permission_mode_to_dcg) + .unwrap_or_else(current_mode); + classify_with_mode(action, mode) +} + +/// Classify an action using the per-session mode override when one exists +/// for `session_id`, falling back to the global mode otherwise. +/// +/// This is the session-aware variant of [`classify`]. Call sites that +/// know the session id (e.g. tool execution within a subagent) should +/// prefer this over the global [`classify`] so that per-session +/// permission overrides set via [`set_session_mode`] are honoured. 
+#[must_use] +pub fn classify_for_session(action: &str, session_id: &str) -> BridgeDecision { + let mode = session_mode(session_id).unwrap_or_else(current_mode); + classify_with_mode(action, mode) +} + /// Three-state outcome from the bridge. jcode's `SafetySystem` collapses /// `Allow` to `ActionTier::AutoAllowed` and `Prompt`/`Deny` to /// `ActionTier::RequiresPermission` — but exposing the full set here @@ -391,4 +505,58 @@ mod tests { // Restore so other tests aren't affected by ordering. set_mode(original); } + + #[test] + fn permission_mode_converts_to_dcg_mode() { + use jcode_agent_runtime::permission::PermissionMode as PM; + + assert_eq!(permission_mode_to_dcg(PM::Default), Mode::Default); + assert_eq!(permission_mode_to_dcg(PM::AcceptEdits), Mode::AcceptEdits); + assert_eq!(permission_mode_to_dcg(PM::Plan), Mode::Plan); + assert_eq!(permission_mode_to_dcg(PM::DontAsk), Mode::DontAsk); + assert_eq!( + permission_mode_to_dcg(PM::BypassPermissions), + Mode::BypassPermissions + ); + assert_eq!(permission_mode_to_dcg(PM::Auto), Mode::Auto); + } + + #[test] + fn classify_for_agent_uses_agent_mode_when_set() { + use jcode_agent_runtime::permission::PermissionMode as PM; + + // todowrite auto-allows in AcceptEdits but denies in Plan + assert_eq!( + classify_for_agent("todowrite", Some(PM::AcceptEdits)), + BridgeDecision::Allow, + "todowrite must allow in AcceptEdits" + ); + assert_eq!( + classify_for_agent("todowrite", Some(PM::Plan)), + BridgeDecision::Deny, + "todowrite must deny in Plan" + ); + } + + #[test] + fn classify_for_agent_falls_back_to_global_when_none() { + let original = current_mode(); + set_mode(Mode::BypassPermissions); + assert_eq!( + classify_for_agent("made_up_tool", None), + BridgeDecision::Allow, + "falls back to global BypassPermissions mode" + ); + set_mode(original); + } + + #[test] + fn session_mode_set_and_clear() { + let sid = "test_session_mode_123"; + assert!(session_mode(sid).is_none()); + set_session_mode(sid, Mode::Plan); + 
assert_eq!(session_mode(sid), Some(Mode::Plan)); + clear_session_mode(sid); + assert!(session_mode(sid).is_none()); + } } diff --git a/crates/jcode-app-core/src/lib.rs b/crates/jcode-app-core/src/lib.rs index b4cb41d24..27d8ee45e 100644 --- a/crates/jcode-app-core/src/lib.rs +++ b/crates/jcode-app-core/src/lib.rs @@ -39,6 +39,7 @@ pub mod network_retry; pub mod notifications; pub mod overnight; pub mod perf; +pub mod prompt_placeholders; pub mod prompt_templates; pub mod replay; pub mod restart_snapshot; diff --git a/crates/jcode-app-core/src/overnight.rs b/crates/jcode-app-core/src/overnight.rs index a619cdaaf..ee181ef3c 100644 --- a/crates/jcode-app-core/src/overnight.rs +++ b/crates/jcode-app-core/src/overnight.rs @@ -253,6 +253,7 @@ async fn run_supervisor( if child_is_canary { registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } let mut agent = Agent::new_with_session(provider, registry, child, None); diff --git a/crates/jcode-app-core/src/prompt_placeholders.rs b/crates/jcode-app-core/src/prompt_placeholders.rs new file mode 100644 index 000000000..386201dae --- /dev/null +++ b/crates/jcode-app-core/src/prompt_placeholders.rs @@ -0,0 +1,220 @@ +//! Phase 4 prompt placeholder substitution helper. +//! +//! Provides a small `String -> String` transformation that replaces a fixed +//! set of `{{PLACEHOLDER}}` tokens with values supplied through a +//! [`PlaceholderContext`]. Designed to be a pure utility: no I/O, no errors, +//! no global state. Callers are responsible for assembling the context and +//! choosing where to apply substitution (system prompt, step prompt, etc.). +//! +//! Supported tokens (case-sensitive, exact match including the surrounding +//! double curly braces): +//! +//! - `{{FILE_TREE_SMALL}}` — truncated project tree, max 2500 chars. +//! - `{{FILE_TREE}}` — fuller project tree, max 10000 chars. +//! - `{{KNOWLEDGE_FILES}}` — concatenated knowledge / context files, max 100000 chars. +//! 
/// Maximum char count retained for [`PlaceholderContext::file_tree_small`].
pub const FILE_TREE_SMALL_MAX_CHARS: usize = 2_500;

/// Maximum char count retained for [`PlaceholderContext::file_tree`].
pub const FILE_TREE_MAX_CHARS: usize = 10_000;

/// Maximum char count retained for [`PlaceholderContext::git_changes`].
pub const GIT_CHANGES_MAX_CHARS: usize = 30_000;

/// Maximum char count retained for [`PlaceholderContext::knowledge_files`].
pub const KNOWLEDGE_FILES_MAX_CHARS: usize = 100_000;

/// Container for values that can be substituted into prompt templates.
///
/// All `String` fields default to empty and `remaining_steps` defaults to 0.
/// Use [`PlaceholderContext::default`] and assign the fields you have data
/// for; missing fields will simply substitute as empty.
#[derive(Debug, Default, Clone)]
pub struct PlaceholderContext {
    /// Compact project file tree. Truncated to [`FILE_TREE_SMALL_MAX_CHARS`]
    /// chars during substitution.
    pub file_tree_small: String,
    /// Fuller project file tree. Truncated to [`FILE_TREE_MAX_CHARS`] chars
    /// during substitution.
    pub file_tree: String,
    /// Concatenated knowledge/context files. Truncated to
    /// [`KNOWLEDGE_FILES_MAX_CHARS`] chars during substitution.
    pub knowledge_files: String,
    /// Git diff / status summary. Truncated to [`GIT_CHANGES_MAX_CHARS`]
    /// chars during substitution.
    pub git_changes: String,
    /// Current date in ISO `YYYY-MM-DD` form.
    pub current_date: String,
    /// Remaining steps allowed for the current run/turn. Zero substitutes
    /// to an empty string.
    pub remaining_steps: u32,
    /// Free-form system info (OS / arch / shell).
    pub system_info: String,
}

/// Return at most `max_chars` characters from `s`, respecting char
/// boundaries. If `s` already fits within the limit it is returned
/// unchanged (cloned).
fn truncate_chars(s: &str, max_chars: usize) -> String {
    // `nth(max_chars)` yields the byte offset of the first character that
    // does not fit. It stops there, so an over-long input is not walked to
    // the end just to count it.
    match s.char_indices().nth(max_chars) {
        Some((cut, _)) => s[..cut].to_string(),
        None => s.to_string(),
    }
}

/// Replace every supported placeholder token in `prompt` with the matching
/// value from `ctx`. Unknown `{{TOKENS}}` are preserved verbatim. Empty
/// values (and `remaining_steps == 0`) replace the placeholder with an
/// empty string.
///
/// Supported tokens (case-sensitive, exact match including the braces):
///
/// - `{{FILE_TREE_SMALL}}` — capped at [`FILE_TREE_SMALL_MAX_CHARS`] chars.
/// - `{{FILE_TREE}}` — capped at [`FILE_TREE_MAX_CHARS`] chars.
/// - `{{KNOWLEDGE_FILES}}` — capped at [`KNOWLEDGE_FILES_MAX_CHARS`] chars.
/// - `{{GIT_CHANGES}}` — capped at [`GIT_CHANGES_MAX_CHARS`] chars.
/// - `{{CURRENT_DATE}}`, `{{REMAINING_STEPS}}`, `{{SYSTEM_INFO}}` — uncapped.
///
/// The caps are enforced here, so callers may pass un-truncated input and
/// trust each substituted value to be bounded.
///
/// Substitution is a single left-to-right pass over `prompt`: text that was
/// just inserted is never scanned again. A knowledge file or git diff that
/// happens to contain the literal text `{{CURRENT_DATE}}` is therefore
/// inserted as-is instead of being rewritten.
///
/// This is the **context-driven** substitution path used for built-in
/// Phase 4 placeholders. For user-supplied template bindings (arbitrary
/// `HashMap`), use
/// [`crate::prompt_templates::substitute_placeholders`] instead.
pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String {
    if prompt.is_empty() {
        return String::new();
    }

    let file_tree_small = truncate_chars(&ctx.file_tree_small, FILE_TREE_SMALL_MAX_CHARS);
    let file_tree = truncate_chars(&ctx.file_tree, FILE_TREE_MAX_CHARS);
    let knowledge_files = truncate_chars(&ctx.knowledge_files, KNOWLEDGE_FILES_MAX_CHARS);
    let git_changes = truncate_chars(&ctx.git_changes, GIT_CHANGES_MAX_CHARS);
    let remaining_steps = if ctx.remaining_steps == 0 {
        String::new()
    } else {
        ctx.remaining_steps.to_string()
    };

    // Each entry is (token, replacement). No token is a prefix of another
    // (the closing `}}` is part of the token), so at most one entry can
    // match at any position and the order here does not matter.
    let replacements: [(&str, &str); 7] = [
        ("{{FILE_TREE_SMALL}}", file_tree_small.as_str()),
        ("{{FILE_TREE}}", file_tree.as_str()),
        ("{{KNOWLEDGE_FILES}}", knowledge_files.as_str()),
        ("{{GIT_CHANGES}}", git_changes.as_str()),
        ("{{CURRENT_DATE}}", ctx.current_date.as_str()),
        ("{{REMAINING_STEPS}}", remaining_steps.as_str()),
        ("{{SYSTEM_INFO}}", ctx.system_info.as_str()),
    ];

    let mut out = String::with_capacity(prompt.len());
    let mut rest = prompt;
    while let Some(start) = rest.find("{{") {
        out.push_str(&rest[..start]);
        let candidate = &rest[start..];
        let hit = replacements
            .iter()
            .copied()
            .find(|&(token, _)| candidate.starts_with(token));
        match hit {
            Some((token, value)) => {
                out.push_str(value);
                rest = &candidate[token.len()..];
            }
            None => {
                // Not a supported token: emit one brace and retry from the
                // next byte, so `{{{FILE_TREE}}}` still matches at offset 1.
                // `{` is a single ASCII byte, so slicing at 1 is a valid
                // char boundary.
                out.push('{');
                rest = &candidate[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
+ let out2 = substitute_context_placeholders( + "date={{CURRENT_DATE}} steps={{REMAINING_STEPS}}", + &ctx, + ); + assert_eq!(out2, "date=2026-05-25 steps="); + } + + #[test] + fn multiple_placeholders_in_same_string_work() { + let ctx = PlaceholderContext { + file_tree_small: "src/\n lib.rs".to_string(), + knowledge_files: "AGENTS.md contents".to_string(), + current_date: "2026-05-25".to_string(), + remaining_steps: 7, + system_info: "linux x86_64".to_string(), + ..Default::default() + }; + let input = "## Tree\n{{FILE_TREE_SMALL}}\n\n## Knowledge\n\ + {{KNOWLEDGE_FILES}}\n\n## Meta\n\ + date={{CURRENT_DATE}} steps={{REMAINING_STEPS}} \ + sys={{SYSTEM_INFO}}"; + let out = substitute_context_placeholders(input, &ctx); + let expected = "## Tree\nsrc/\n lib.rs\n\n## Knowledge\n\ + AGENTS.md contents\n\n## Meta\n\ + date=2026-05-25 steps=7 sys=linux x86_64"; + assert_eq!(out, expected); + } + + #[test] + fn unknown_placeholder_text_remains_as_is() { + let ctx = PlaceholderContext { + current_date: "2026-05-25".to_string(), + ..Default::default() + }; + let input = "known={{CURRENT_DATE}} unknown={{NOT_A_REAL_TOKEN}} \ + other={{ALSO_BOGUS}}"; + let out = substitute_context_placeholders(input, &ctx); + assert_eq!( + out, + "known=2026-05-25 unknown={{NOT_A_REAL_TOKEN}} other={{ALSO_BOGUS}}" + ); + } + + #[test] + fn truncation_caps_long_inputs() { + // Build a string longer than the file-tree-small cap. + let big: String = "x".repeat(FILE_TREE_SMALL_MAX_CHARS + 1234); + let ctx = PlaceholderContext { + file_tree_small: big.clone(), + ..Default::default() + }; + let out = substitute_context_placeholders("[{{FILE_TREE_SMALL}}]", &ctx); + // Two bracket characters plus the cap. 
+ assert_eq!(out.chars().count(), FILE_TREE_SMALL_MAX_CHARS + 2); + assert!(out.starts_with('[')); + assert!(out.ends_with(']')); + } + + #[test] + fn knowledge_files_truncated_when_exceeds_cap() { + let big: String = "k".repeat(KNOWLEDGE_FILES_MAX_CHARS + 5000); + let ctx = PlaceholderContext { + knowledge_files: big.clone(), + ..Default::default() + }; + let out = substitute_context_placeholders("[{{KNOWLEDGE_FILES}}]", &ctx); + assert_eq!(out.chars().count(), KNOWLEDGE_FILES_MAX_CHARS + 2); + assert!(out.starts_with('[')); + assert!(out.ends_with(']')); + } +} diff --git a/crates/jcode-app-core/src/server.rs b/crates/jcode-app-core/src/server.rs index 5d8ee1043..6ae36c4bc 100644 --- a/crates/jcode-app-core/src/server.rs +++ b/crates/jcode-app-core/src/server.rs @@ -560,7 +560,7 @@ impl Server { tokio::spawn(async move { let start = Instant::now(); let provider = registry_warm_provider.fork(); - let _ = crate::tool::Registry::new(provider).await; + let _ = crate::tool::Registry::new(provider, None).await; crate::logging::info(&format!( "Registry prewarm completed in {}ms", start.elapsed().as_millis() @@ -636,9 +636,10 @@ impl Server { let previous_status = session.status.clone(); let provider = self.provider.fork(); - let registry = crate::tool::Registry::new(provider.clone()).await; + let registry = crate::tool::Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await; if session.is_canary { registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } registry .register_mcp_tools( diff --git a/crates/jcode-app-core/src/server/client_actions_tests.rs b/crates/jcode-app-core/src/server/client_actions_tests.rs index 4d4923c27..8783446a8 100644 --- a/crates/jcode-app-core/src/server/client_actions_tests.rs +++ b/crates/jcode-app-core/src/server/client_actions_tests.rs @@ -141,7 +141,7 @@ fn clone_split_session_uses_persisted_session_state() { #[tokio::test] async fn enabling_swarm_does_not_auto_elect_coordinator() { 
let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let (member_event_tx, _member_event_rx) = mpsc::unbounded_channel(); let now = Instant::now(); @@ -242,7 +242,7 @@ async fn rename_session_event_uses_agent_session_id_even_when_client_id_is_stale crate::env::set_var("JCODE_HOME", temp.path()); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let agent_session_id = agent.lock().await.session_id().to_string(); let stale_client_session_id = "session_stale_client_id"; @@ -321,7 +321,7 @@ async fn notify_session_runs_scheduled_task_immediately_for_idle_live_session() StreamEvent::MessageEnd { stop_reason: None }, ]); let provider_dyn: Arc = provider.clone(); - let registry = Registry::new(provider_dyn.clone()).await; + let registry = Registry::new(provider_dyn.clone(), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider_dyn, registry))); let session_id = agent.lock().await.session_id().to_string(); let sessions = Arc::new(RwLock::new(HashMap::>>::from([( @@ -422,7 +422,7 @@ async fn notify_session_runs_scheduled_task_immediately_for_idle_live_session() #[tokio::test] async fn notify_session_queues_soft_interrupt_when_live_session_is_busy() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let session_id = agent.lock().await.session_id().to_string(); let queue = agent.lock().await.soft_interrupt_queue(); diff --git a/crates/jcode-app-core/src/server/client_comm_tests.rs b/crates/jcode-app-core/src/server/client_comm_tests.rs index 
0db9680bf..70c2354fd 100644 --- a/crates/jcode-app-core/src/server/client_comm_tests.rs +++ b/crates/jcode-app-core/src/server/client_comm_tests.rs @@ -39,7 +39,7 @@ impl Provider for TestProvider { async fn test_agent() -> Arc> { let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(Mutex::new(Agent::new(provider, registry))) } diff --git a/crates/jcode-app-core/src/server/client_lifecycle.rs b/crates/jcode-app-core/src/server/client_lifecycle.rs index e437e6e49..38fc6d646 100644 --- a/crates/jcode-app-core/src/server/client_lifecycle.rs +++ b/crates/jcode-app-core/src/server/client_lifecycle.rs @@ -418,7 +418,7 @@ pub(super) async fn handle_client( let provider = provider_template.fork(); let t0 = std::time::Instant::now(); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await; let registry_ms = t0.elapsed().as_millis(); let mut swarm_enabled = crate::config::config().features.swarm; diff --git a/crates/jcode-app-core/src/server/client_lifecycle_tests.rs b/crates/jcode-app-core/src/server/client_lifecycle_tests.rs index c02140f5e..4513301fd 100644 --- a/crates/jcode-app-core/src/server/client_lifecycle_tests.rs +++ b/crates/jcode-app-core/src/server/client_lifecycle_tests.rs @@ -23,7 +23,7 @@ async fn session_control_handle_does_not_wait_for_busy_agent_lock() { let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::new(AtomicBool::new(false)), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let queue = Arc::new(std::sync::Mutex::new(Vec::new())); @@ -61,7 +61,7 @@ async fn refreshed_session_control_handle_does_not_wait_for_busy_agent_lock() { let provider: Arc = Arc::new(PanicOnForkProvider { 
forked: Arc::new(AtomicBool::new(false)), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let mut session = crate::session::Session::create_with_id( "session_busy_control_refresh".to_string(), None, @@ -106,7 +106,7 @@ async fn busy_agent_request_rejection_does_not_wait_for_agent_lock() { let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::new(AtomicBool::new(false)), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let agent = Arc::new(Mutex::new(Agent::new(provider, registry))); let (client_event_tx, mut client_event_rx) = mpsc::unbounded_channel::(); @@ -356,7 +356,7 @@ fn reload_starting_rejects_new_turn_without_spawning_processing_task() { let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::clone(&forked), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let mut session = crate::session::Session::create_with_id("session_guard".to_string(), None, None); session.model = Some("panic-on-fork".to_string()); @@ -448,7 +448,7 @@ fn accepted_reload_recovery_continuation_marks_intent_delivered() -> anyhow::Res let rt = tokio::runtime::Runtime::new().expect("runtime"); rt.block_on(async { let provider: Arc = Arc::new(CompleteImmediatelyProvider); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), None).await; let mut session = crate::session::Session::create_with_id(session_id.to_string(), None, None); session.model = Some("complete-immediately".to_string()); @@ -537,7 +537,7 @@ fn reload_starting_rejects_new_turns_for_multiple_sessions() { let provider: Arc = Arc::new(PanicOnForkProvider { forked: Arc::clone(&forked), }); - let registry = Registry::new(Arc::clone(&provider)).await; + let registry = Registry::new(Arc::clone(&provider), 
None).await; let swarm_members = Arc::new(RwLock::new(HashMap::new())); let swarms_by_id = Arc::new(RwLock::new(HashMap::new())); let event_history = Arc::new(RwLock::new(std::collections::VecDeque::new())); diff --git a/crates/jcode-app-core/src/server/client_session.rs b/crates/jcode-app-core/src/server/client_session.rs index 01b229fd1..d0542800a 100644 --- a/crates/jcode-app-core/src/server/client_session.rs +++ b/crates/jcode-app-core/src/server/client_session.rs @@ -592,6 +592,7 @@ pub(super) async fn handle_subscribe( } drop(agent_guard); registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } let mcp_register_ms = if register_mcp_tools { @@ -1039,6 +1040,7 @@ pub(super) async fn handle_resume_session( if is_canary { *client_selfdev = true; registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } *client_session_id = session_id.clone(); @@ -1233,6 +1235,7 @@ pub(super) async fn handle_resume_session( if result.is_ok() && is_canary { *client_selfdev = true; registry.register_selfdev_tools().await; + registry.register_experimental_tools().await; } match result { diff --git a/crates/jcode-app-core/src/server/client_session_tests.rs b/crates/jcode-app-core/src/server/client_session_tests.rs index d8fd02226..2471090e5 100644 --- a/crates/jcode-app-core/src/server/client_session_tests.rs +++ b/crates/jcode-app-core/src/server/client_session_tests.rs @@ -90,7 +90,7 @@ fn test_agent(messages: Vec) -> Agent { let provider: Arc = Arc::new(MockProvider); let rt = tokio::runtime::Runtime::new().expect("runtime"); let _guard = rt.enter(); - let registry = rt.block_on(Registry::new(provider.clone())); + let registry = rt.block_on(Registry::new(provider.clone(), None)); build_test_agent(provider, registry, messages) } diff --git a/crates/jcode-app-core/src/server/client_session_tests/clear.rs b/crates/jcode-app-core/src/server/client_session_tests/clear.rs index 758515e19..09732a67f 100644 --- 
a/crates/jcode-app-core/src/server/client_session_tests/clear.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/clear.rs @@ -8,7 +8,7 @@ async fn handle_clear_session_replaces_runtime_handles_and_updates_shutdown_regi let old_session_id = "session_before_clear"; let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/reload.rs b/crates/jcode-app-core/src/server/client_session_tests/reload.rs index aef88e3a2..4f5d37556 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/reload.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/reload.rs @@ -303,7 +303,7 @@ fn handle_reload_queues_signal_for_canary_session() -> Result<()> { rt.block_on(async { let mut rx = crate::server::subscribe_reload_signal_for_tests(); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = build_test_agent(provider, registry, Vec::new()); agent.set_canary("self-dev"); let agent = Arc::new(Mutex::new(agent)); @@ -407,7 +407,7 @@ async fn handle_reload_does_not_wait_for_busy_agent_lock() -> Result<()> { let mut rx = crate::server::subscribe_reload_signal_for_tests(); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = build_test_agent(provider, registry, Vec::new()); let agent = Arc::new(Mutex::new(agent)); let busy_agent_lock = agent.lock().await; diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs index 
d04acd44e..0057ce38a 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/attach_without_local_history.rs @@ -14,7 +14,7 @@ async fn handle_resume_session_allows_attach_without_local_history() -> Result<( persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -22,7 +22,7 @@ async fn handle_resume_session_allows_attach_without_local_history() -> Result<( Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs index fc5cb93ff..b79f5a724 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/busy_existing_attach.rs @@ -20,7 +20,7 @@ async fn handle_resume_session_allows_live_attach_when_existing_agent_is_busy() }; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -28,7 +28,7 @@ async fn handle_resume_session_allows_live_attach_when_existing_agent_is_busy() vec![persisted_message], ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), 
None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs index 96040ce38..fb134048a 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/different_client_attach.rs @@ -14,7 +14,7 @@ async fn handle_resume_session_allows_attach_from_different_client_instance() -> persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -22,7 +22,7 @@ async fn handle_resume_session_allows_attach_from_different_client_instance() -> Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs index 97558cbdd..e45296af3 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/live_events_before_history.rs @@ -14,7 +14,7 @@ async fn handle_resume_session_registers_live_events_before_history_replay() -> persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = 
Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs index 4dd0edd5a..6293e941d 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/multiple_live_attach.rs @@ -7,7 +7,7 @@ async fn handle_resume_session_allows_multiple_live_tui_attach() -> Result<()> { let temp_session_id = "session_temp_connecting"; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -15,7 +15,7 @@ async fn handle_resume_session_allows_multiple_live_tui_attach() -> Result<()> { Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs index 77aa96899..775090b6b 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/reconnect_takeover_with_history.rs @@ -14,7 +14,7 @@ async fn handle_resume_session_allows_reconnect_takeover_with_local_history() -> persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; 
let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -22,7 +22,7 @@ async fn handle_resume_session_allows_reconnect_takeover_with_local_history() -> Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs b/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs index c044f0f48..cb6ce3b16 100644 --- a/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs +++ b/crates/jcode-app-core/src/server/client_session_tests/resume/same_client_takeover.rs @@ -16,7 +16,7 @@ async fn handle_resume_session_allows_same_client_instance_takeover_without_loca persisted.save()?; let provider: Arc = Arc::new(MockProvider); - let existing_registry = Registry::new(provider.clone()).await; + let existing_registry = Registry::new(provider.clone(), None).await; let existing_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), existing_registry, @@ -24,7 +24,7 @@ async fn handle_resume_session_allows_same_client_instance_takeover_without_loca Vec::new(), ))); - let new_registry = Registry::new(provider.clone()).await; + let new_registry = Registry::new(provider.clone(), None).await; let new_agent = Arc::new(Mutex::new(build_test_agent_with_id( provider.clone(), new_registry.clone(), diff --git a/crates/jcode-app-core/src/server/comm_control_tests.rs b/crates/jcode-app-core/src/server/comm_control_tests.rs index faddcae4f..5108018e0 100644 --- a/crates/jcode-app-core/src/server/comm_control_tests.rs +++ b/crates/jcode-app-core/src/server/comm_control_tests.rs @@ -124,7 +124,7 @@ impl Provider for TestProvider { async fn test_agent() -> Arc> { let provider: Arc = 
Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(Mutex::new(Agent::new(provider, registry))) } diff --git a/crates/jcode-app-core/src/server/comm_session.rs b/crates/jcode-app-core/src/server/comm_session.rs index 3b4e27196..540c03de1 100644 --- a/crates/jcode-app-core/src/server/comm_session.rs +++ b/crates/jcode-app-core/src/server/comm_session.rs @@ -266,9 +266,10 @@ fn resolve_swarm_spawn_selection( } None => SwarmSpawnSelection { model: coordinator.model.clone(), - provider_key: coordinator.provider_key.clone().or_else(|| { - provider_key_for_spawn_model(coordinator.model.as_deref(), None) - }), + provider_key: coordinator + .provider_key + .clone() + .or_else(|| provider_key_for_spawn_model(coordinator.model.as_deref(), None)), route_api_method: coordinator.route_api_method.clone(), }, } diff --git a/crates/jcode-app-core/src/server/comm_session_tests.rs b/crates/jcode-app-core/src/server/comm_session_tests.rs index ed5c59185..eac745636 100644 --- a/crates/jcode-app-core/src/server/comm_session_tests.rs +++ b/crates/jcode-app-core/src/server/comm_session_tests.rs @@ -71,7 +71,7 @@ fn member( async fn test_agent_with_working_dir(session_id: &str, working_dir: &str) -> Arc> { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut session = crate::session::Session::create_with_id(session_id.to_string(), None, None); session.model = Some("mock".to_string()); session.working_dir = Some(working_dir.to_string()); @@ -466,7 +466,11 @@ fn resolve_swarm_spawn_model_inherits_coordinator_auth_route_for_oauth_vs_api() // the same API route, not Claude OAuth (the config default). 
let selection = resolve_swarm_spawn_selection( None, - &coordinator_identity(Some("claude-opus-4-6"), Some("claude-api"), Some("claude-api")), + &coordinator_identity( + Some("claude-opus-4-6"), + Some("claude-api"), + Some("claude-api"), + ), ); assert_eq!(selection.model.as_deref(), Some("claude-opus-4-6")); @@ -478,7 +482,11 @@ fn resolve_swarm_spawn_model_inherits_coordinator_auth_route_for_oauth_vs_api() fn resolve_swarm_spawn_model_keeps_provider_key_when_config_matches_coordinator() { let selection = resolve_swarm_spawn_selection( Some("custom-model".to_string()), - &coordinator_identity(Some("custom-model"), Some("custom-provider"), Some("custom-route")), + &coordinator_identity( + Some("custom-model"), + Some("custom-provider"), + Some("custom-route"), + ), ); assert_eq!(selection.model.as_deref(), Some("custom-model")); @@ -541,8 +549,7 @@ async fn coordinator_identity_falls_back_to_persisted_session_when_agent_busy() // Persist a coordinator session that records a concrete model + auth route. // Persist after the agent is built so it reflects the authoritative on-disk // snapshot the spawn path will read when the agent lock is unavailable. 
- let mut session = - crate::session::Session::create_with_id("coord_busy".to_string(), None, None); + let mut session = crate::session::Session::create_with_id("coord_busy".to_string(), None, None); session.model = Some("claude-opus-4-6".to_string()); session.provider_key = Some("claude-api".to_string()); session.route_api_method = Some("claude-api".to_string()); diff --git a/crates/jcode-app-core/src/server/debug_command_exec.rs b/crates/jcode-app-core/src/server/debug_command_exec.rs index d23f08176..63f7824fa 100644 --- a/crates/jcode-app-core/src/server/debug_command_exec.rs +++ b/crates/jcode-app-core/src/server/debug_command_exec.rs @@ -697,7 +697,7 @@ mod tests { let mut reload_rx = crate::server::subscribe_reload_signal_for_tests(); let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_selfdev_tools().await; let mut agent = Agent::new(provider, registry); @@ -747,7 +747,7 @@ mod tests { #[tokio::test] async fn debug_cancel_does_not_wait_for_busy_agent_lock() { let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let agent = Arc::new(AsyncMutex::new(Agent::new(provider, registry))); let session_id = agent.lock().await.session_id().to_string(); diff --git a/crates/jcode-app-core/src/server/debug_tests.rs b/crates/jcode-app-core/src/server/debug_tests.rs index 0c32dfc26..6e7b3ba65 100644 --- a/crates/jcode-app-core/src/server/debug_tests.rs +++ b/crates/jcode-app-core/src/server/debug_tests.rs @@ -646,7 +646,7 @@ mod debug_execution_tests { async fn test_agent() -> Arc> { let provider = Arc::new(TestProvider) as Arc; - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(AsyncMutex::new(Agent::new(provider, registry))) } diff --git 
a/crates/jcode-app-core/src/server/headless.rs b/crates/jcode-app-core/src/server/headless.rs index 9e1a9610f..8dc03feaa 100644 --- a/crates/jcode-app-core/src/server/headless.rs +++ b/crates/jcode-app-core/src/server/headless.rs @@ -50,7 +50,7 @@ pub(super) async fn create_headless_session( }; let provider = provider_template.fork(); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await; registry.enable_memory_test_mode().await; diff --git a/crates/jcode-app-core/src/server/provider_control.rs b/crates/jcode-app-core/src/server/provider_control.rs index d9e0c86c1..b8309f237 100644 --- a/crates/jcode-app-core/src/server/provider_control.rs +++ b/crates/jcode-app-core/src/server/provider_control.rs @@ -1318,7 +1318,7 @@ mod tests { ) { let provider = Arc::new(TestEffortProvider::default()); let provider_dyn: Arc = provider.clone(); - let registry = crate::tool::Registry::new(Arc::clone(&provider_dyn)).await; + let registry = crate::tool::Registry::new(Arc::clone(&provider_dyn), None).await; let mut session = crate::session::Session::create_with_id(session_id.to_string(), None, None); session.model = Some(provider.model()); diff --git a/crates/jcode-app-core/src/server/queue_tests.rs b/crates/jcode-app-core/src/server/queue_tests.rs index 27eae2c06..35485d0df 100644 --- a/crates/jcode-app-core/src/server/queue_tests.rs +++ b/crates/jcode-app-core/src/server/queue_tests.rs @@ -41,7 +41,7 @@ impl Provider for TestProvider { async fn test_agent() -> Arc> { let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(Mutex::new(Agent::new(provider, registry))) } @@ -165,7 +165,7 @@ async fn queue_soft_interrupt_for_session_persists_when_live_queue_is_unavailabl assert_eq!(persisted[0].source, SoftInterruptSource::BackgroundTask); let provider: Arc = 
Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut restored = Agent::new(provider, registry); restored .restore_session(&session_id) diff --git a/crates/jcode-app-core/src/server/tests.rs b/crates/jcode-app-core/src/server/tests.rs index e2240f2ca..9a59fe918 100644 --- a/crates/jcode-app-core/src/server/tests.rs +++ b/crates/jcode-app-core/src/server/tests.rs @@ -172,7 +172,7 @@ impl Provider for StreamingMockProvider { } async fn test_agent(provider: Arc) -> Arc> { - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; Arc::new(Mutex::new(Agent::new(provider, registry))) } diff --git a/crates/jcode-app-core/src/tool/mod.rs b/crates/jcode-app-core/src/tool/mod.rs index 2d67d0657..2299e5d88 100644 --- a/crates/jcode-app-core/src/tool/mod.rs +++ b/crates/jcode-app-core/src/tool/mod.rs @@ -30,6 +30,8 @@ mod session_search; mod side_panel; mod skill; mod task; +pub mod task_management; +mod team; mod todo; mod webfetch; mod websearch; @@ -96,6 +98,26 @@ fn session_tool_policy(session_id: &str) -> Option { .cloned() } +static SHARED_AGENT_REGISTRY: LazyLock>> = + LazyLock::new(|| { + let home = dirs::home_dir(); + let cwd = std::env::current_dir().ok(); + let mut registry = jcode_agent_runtime::AgentRegistry::new(); + registry.discover_standard_paths( + home.as_deref(), + cwd.as_deref(), + ); + if registry.is_empty() { + None + } else { + Some(Arc::new(registry)) + } + }); + +pub fn shared_agent_registry() -> Option> { + SHARED_AGENT_REGISTRY.clone() +} + /// Registry of available tools (Arc-wrapped for sharing) /// /// Clone creates a fresh CompactionManager so each subagent gets independent @@ -275,7 +297,10 @@ impl Registry { tools } - pub async fn new(provider: Arc) -> Self { + pub async fn new( + provider: Arc, + agent_registry: Option>, + ) -> Self { let start = std::time::Instant::now(); let skills_start 
= std::time::Instant::now(); let skills = Self::shared_skills_registry(); @@ -324,7 +349,7 @@ impl Registry { Self::insert_tool( &mut tools_map, "subagent", - task::SubagentTool::new(provider, registry.clone()), + task::SubagentTool::new(provider, registry.clone(), agent_registry), ); Self::insert_tool( &mut tools_map, @@ -348,6 +373,45 @@ impl Registry { Self::insert_tool(&mut tools_map, "dcp_recompress", DcpRecompressTool::new()); } + // Register experimental team/task tools when opted in via env var. + // Canary sessions register these explicitly via register_experimental_tools(). + let experimental_tools_enabled = matches!( + std::env::var("JCODE_EXPERIMENTAL_TOOLS") + .ok() + .as_deref() + .map(str::trim) + .map(str::to_ascii_lowercase) + .as_deref(), + Some("1") | Some("true") | Some("yes") | Some("on") + ); + if experimental_tools_enabled && !no_builtin { + Self::insert_tool( + &mut tools_map, + "team_create", + team::TeamCreateTool::new(), + ); + Self::insert_tool( + &mut tools_map, + "team_delete", + team::TeamDeleteTool::new(), + ); + Self::insert_tool( + &mut tools_map, + "task_create", + task_management::TaskCreateTool::new(), + ); + Self::insert_tool( + &mut tools_map, + "task_update", + task_management::TaskUpdateTool::new(), + ); + Self::insert_tool( + &mut tools_map, + "task_list", + task_management::TaskListTool::new(), + ); + } + let write_start = std::time::Instant::now(); *registry.tools.write().await = tools_map; let write_ms = write_start.elapsed().as_millis(); @@ -960,6 +1024,39 @@ impl Registry { .await; } + /// Register experimental team/task tools. + /// + /// Gated behind `JCODE_EXPERIMENTAL_TOOLS=1` or canary sessions. + /// These tools expose team and task management primitives that are + /// still under active development and not yet ready for general use. 
+ pub async fn register_experimental_tools(&self) { + self.register( + "team_create".to_string(), + Arc::new(team::TeamCreateTool::new()) as Arc, + ) + .await; + self.register( + "team_delete".to_string(), + Arc::new(team::TeamDeleteTool::new()) as Arc, + ) + .await; + self.register( + "task_create".to_string(), + Arc::new(task_management::TaskCreateTool::new()) as Arc, + ) + .await; + self.register( + "task_update".to_string(), + Arc::new(task_management::TaskUpdateTool::new()) as Arc, + ) + .await; + self.register( + "task_list".to_string(), + Arc::new(task_management::TaskListTool::new()) as Arc, + ) + .await; + } + /// Register ambient-mode tools (only for ambient sessions) pub async fn register_ambient_tools(&self) { self.register( diff --git a/crates/jcode-app-core/src/tool/selfdev/setup.rs b/crates/jcode-app-core/src/tool/selfdev/setup.rs index 3f07483fb..496329daf 100644 --- a/crates/jcode-app-core/src/tool/selfdev/setup.rs +++ b/crates/jcode-app-core/src/tool/selfdev/setup.rs @@ -21,11 +21,7 @@ impl SetupCheck { } } - fn missing( - name: &'static str, - detail: impl Into, - fix: impl Into, - ) -> Self { + fn missing(name: &'static str, detail: impl Into, fix: impl Into) -> Self { Self { name, ok: false, @@ -102,36 +98,25 @@ impl SelfDevTool { if repo_dir.is_none() { // Only attempt a clone when git is available and we're not in a // synthetic test session. 
- let git_available = checks - .iter() - .any(|check| check.name == "git" && check.ok); + let git_available = checks.iter().any(|check| check.name == "git" && check.ok); if SelfDevTool::is_test_session() { - clone_note = Some( - "Test mode: skipped cloning the jcode source.".to_string(), - ); + clone_note = Some("Test mode: skipped cloning the jcode source.".to_string()); } else if git_available { match Self::clone_selfdev_source() { Ok(path) => { - clone_note = Some(format!( - "Cloned jcode source into {}.", - path.display() - )); + clone_note = Some(format!("Cloned jcode source into {}.", path.display())); repo_dir = Some(path); } Err(err) => { - clone_note = Some(format!( - "Could not clone jcode source automatically: {err}", - )); + clone_note = + Some(format!("Could not clone jcode source automatically: {err}",)); } } } } match &repo_dir { - Some(path) => checks.push(SetupCheck::ok( - "repository", - path.display().to_string(), - )), + Some(path) => checks.push(SetupCheck::ok("repository", path.display().to_string())), None => { let target = Self::selfdev_clone_dir() .map(|p| p.display().to_string()) @@ -152,10 +137,9 @@ impl SelfDevTool { // build before `selfdev reload`/`enter` can hand off into a dev binary. 
if let Some(repo) = repo_dir.as_deref() { match build::find_dev_binary(repo) { - Some(binary) => checks.push(SetupCheck::ok( - "dev binary", - binary.display().to_string(), - )), + Some(binary) => { + checks.push(SetupCheck::ok("dev binary", binary.display().to_string())) + } None => checks.push(SetupCheck::missing( "dev binary", "no built binary in target/selfdev or target/release", @@ -222,7 +206,11 @@ impl SelfDevTool { let format_path = |path: Option<&std::path::Path>| match path { Some(p) => { let exists = p.exists(); - format!("{} {}", p.display(), if exists { "(exists)" } else { "(missing)" }) + format!( + "{} {}", + p.display(), + if exists { "(exists)" } else { "(missing)" } + ) } None => "unavailable".to_string(), }; @@ -293,9 +281,7 @@ impl SelfDevTool { /// is strictly newer than the running process). pub(super) async fn do_reload_to_newer_build(&self, _ctx: &ToolContext) -> Result { if SelfDevTool::is_test_session() { - return Ok(ToolOutput::new( - "Test mode: skipped reload-to-newer-build.", - )); + return Ok(ToolOutput::new("Test mode: skipped reload-to-newer-build.")); } if !server::server_has_newer_binary() { diff --git a/crates/jcode-app-core/src/tool/selfdev/tests.rs b/crates/jcode-app-core/src/tool/selfdev/tests.rs index 4f633c3e6..d569cda02 100644 --- a/crates/jcode-app-core/src/tool/selfdev/tests.rs +++ b/crates/jcode-app-core/src/tool/selfdev/tests.rs @@ -325,7 +325,13 @@ fn non_selfdev_schema_only_exposes_onramp_actions() { sorted, vec!["enter", "find-config", "reload", "setup", "status"] ); - for hidden in ["build", "test", "cancel-build", "socket-info", "socket-help"] { + for hidden in [ + "build", + "test", + "cancel-build", + "socket-info", + "socket-help", + ] { assert!( !actions.contains(&hidden), "on-ramp schema should not expose {hidden}" diff --git a/crates/jcode-app-core/src/tool/task.rs b/crates/jcode-app-core/src/tool/task.rs index c390a836e..31546dddf 100644 --- a/crates/jcode-app-core/src/tool/task.rs +++ 
b/crates/jcode-app-core/src/tool/task.rs @@ -1,12 +1,15 @@ use super::{Registry, Tool, ToolContext, ToolOutput}; use crate::agent::Agent; use crate::bus::{Bus, BusEvent, ToolSummary, ToolSummaryState}; +use crate::dcg_bridge; use crate::logging; use crate::protocol::HistoryMessage; use crate::provider::Provider; use crate::session::Session; use anyhow::Result; use async_trait::async_trait; +use jcode_agent_runtime::permission::PermissionMode; +use jcode_agent_runtime::registry::AgentRegistry; use serde::Deserialize; use serde_json::{Value, json}; use std::collections::{HashMap, HashSet}; @@ -16,11 +19,20 @@ use tokio::sync::broadcast; pub struct SubagentTool { provider: Arc, registry: Registry, + agent_registry: Option>, } impl SubagentTool { - pub fn new(provider: Arc, registry: Registry) -> Self { - Self { provider, registry } + pub fn new( + provider: Arc, + registry: Registry, + agent_registry: Option>, + ) -> Self { + Self { + provider, + registry, + agent_registry, + } } fn preferred_parent_subagent_model(parent_session_id: &str) -> Option { @@ -55,6 +67,11 @@ struct SubagentInput { session_id: Option, #[serde(default)] output_mode: SubagentOutputMode, + /// Optional permission mode override from the agent definition. + /// When set, the child session runs under this mode instead of + /// the session-global permission mode. + #[serde(default)] + permission_mode: Option, #[serde(rename = "command", default)] _command: Option, } @@ -115,6 +132,11 @@ impl Tool for SubagentTool { "enum": ["answer", "compact", "full_transcript"], "description": "Return mode. 'answer' returns the final answer only, 'compact' adds a user-visible transcript, and 'full_transcript' adds raw persisted messages. Defaults to 'answer'." }, + "permission_mode": { + "type": "string", + "enum": ["default", "accept-edits", "plan", "dont-ask", "bypass-permissions", "auto"], + "description": "Permission mode override from the agent definition. 
When set, the child session uses this mode instead of the session-global permission mode." + }, "command": { "type": "string", "description": "Source command." @@ -126,6 +148,38 @@ impl Tool for SubagentTool { async fn execute(&self, input: Value, ctx: ToolContext) -> Result { let params: SubagentInput = serde_json::from_value(input)?; + // Look up the agent definition from the registry (if available). + // When found, its fields (tool_names, system_prompt, permission_mode, + // output_mode, max_turns) inform how the child agent is spawned. + let agent_def = self + .agent_registry + .as_ref() + .and_then(|reg| reg.get(¶ms.subagent_type)) + .map(|la| &la.definition); + + // Merge permission_mode: params (LLM override) takes precedence, + // then agent definition, then None (inherits session default). + let effective_permission_mode = params + .permission_mode + .or_else(|| agent_def.and_then(|d| d.permission_mode)); + + // Merge output_mode: if the LLM didn't explicitly set output_mode + // (i.e. it's the default Answer), prefer the agent definition's value. 
+ let effective_output_mode = if params.output_mode == SubagentOutputMode::Answer { + agent_def + .map(|d| subagent_output_mode_from_definition(d.output_mode)) + .unwrap_or(params.output_mode) + } else { + params.output_mode + }; + + if agent_def.is_some() { + logging::info(&format!( + "[tool:subagent] matched agent definition for type '{}'", + params.subagent_type + )); + } + let mut session = if let Some(session_id) = ¶ms.session_id { Session::load(session_id).unwrap_or_else(|err| { logging::warn(&format!( @@ -139,24 +193,92 @@ impl Tool for SubagentTool { }; let parent_subagent_model = Self::preferred_parent_subagent_model(&ctx.session_id); let provider_model = self.provider.model(); - let resolved_model = Self::resolve_model( - params.model.as_deref(), - session.model.as_deref(), - parent_subagent_model.as_deref(), - &provider_model, - ); + // When the agent definition specifies model_override or prefer_tier, + // use its resolve_model() which honours those fields. Otherwise fall + // back to the standard resolution chain. + let resolved_model = if let Some(def) = agent_def { + if def.model_override.is_some() || def.prefer_tier.is_some() { + def.resolve_model(&provider_model) + } else { + Self::resolve_model( + params.model.as_deref(), + session.model.as_deref(), + parent_subagent_model.as_deref(), + &provider_model, + ) + } + } else { + Self::resolve_model( + params.model.as_deref(), + session.model.as_deref(), + parent_subagent_model.as_deref(), + &provider_model, + ) + }; session.model = Some(resolved_model.clone()); if let Some(ref working_dir) = ctx.working_dir { session.working_dir = Some(working_dir.display().to_string()); } + // Register child in parent's session. + // NOTE: This load→mutate→save sequence is not atomic. Concurrent + // subagent spawns sharing the same parent could clobber each + // other's `children` entries. Acceptable for experimental Phase 0; + // a file-lock or in-memory session cache would fix this properly. 
+ if let Ok(mut parent_session) = Session::load(&ctx.session_id) { + parent_session.add_child(session.id.clone()); + let _ = parent_session.save(); + } + session.save()?; - let mut allowed: HashSet = self.registry.tool_names().await.into_iter().collect(); + // Propagate the effective permission mode to the child session so + // that `dcg_bridge::classify_for_session` / `session_mode` observe + // it during the child's tool execution. The guard clears the + // override on drop (both success and error paths). + let child_session_id = session.id.clone(); + let _mode_guard = dcg_bridge::SessionModeGuard::new( + &child_session_id, + effective_permission_mode.map(dcg_bridge::permission_mode_to_dcg), + ); + if effective_permission_mode.is_some() { + logging::info(&format!( + "[tool:subagent] session {} permission mode: {} (from agent definition)", + child_session_id, + effective_permission_mode.unwrap().as_str(), + )); + } + + // Build the allowed tool set for the child agent. + // If the agent definition specifies `tool_names`, use that whitelist + // (intersected with actually-available tools) instead of "all minus + // blocked". `disallowed_tools` from the definition are always removed. + let mut allowed: HashSet = if let Some(def) = agent_def { + if !def.tool_names.is_empty() { + let available: HashSet = + self.registry.tool_names().await.into_iter().collect(); + def.tool_names + .iter() + .filter(|t| available.contains(t.as_str())) + .cloned() + .collect() + } else { + self.registry.tool_names().await.into_iter().collect() + } + } else { + self.registry.tool_names().await.into_iter().collect() + }; + // Always block self-referential / meta tools. for blocked in ["subagent", "task", "todo", "todowrite", "todoread"] { allowed.remove(blocked); } + // Remove agent-definition-level disallowed tools. 
+ if let Some(def) = agent_def { + for blocked in &def.disallowed_tools { + allowed.remove(blocked); + } + } crate::config::config() .tools .apply_to_allowed_set(&mut allowed); @@ -214,25 +336,48 @@ impl Tool for SubagentTool { Some(allowed), ); + // Apply agent definition's system prompt override when the definition + // provides one and does not request parent prompt inheritance. + if let Some(def) = agent_def { + if !def.system_prompt.is_empty() && !def.inherit_parent_system_prompt { + agent.set_system_prompt(&def.system_prompt); + logging::info(&format!( + "[tool:subagent] applied system_prompt from agent definition '{}' ({} chars)", + params.subagent_type, + def.system_prompt.len(), + )); + } + if let Some(max_turns) = def.max_turns { + agent.set_max_turns(max_turns); + logging::info(&format!( + "[tool:subagent] agent definition '{}' max_turns={} enforced", + params.subagent_type, max_turns, + )); + } + } + let start = std::time::Instant::now(); - let final_text = agent.run_once_capture(¶ms.prompt).await.map_err(|err| { - logging::warn(&format!( - "[tool:subagent] subagent failed description={} type={} session_id={} model={} error={}", - params.description, - params.subagent_type, - agent.session_id(), - resolved_model, - err - )); - err - })?; + let final_text = match agent.run_once_capture(¶ms.prompt).await { + Ok(text) => text, + Err(err) => { + logging::warn(&format!( + "[tool:subagent] subagent failed description={} type={} session_id={} model={} error={}", + params.description, + params.subagent_type, + agent.session_id(), + resolved_model, + err + )); + return Err(err); + } + }; let sub_session_id = agent.session_id().to_string(); - let history = if params.output_mode == SubagentOutputMode::Compact { + let history = if effective_output_mode == SubagentOutputMode::Compact { Some(agent.get_history()) } else { None }; - let full_transcript = if params.output_mode == SubagentOutputMode::FullTranscript { + let full_transcript = if effective_output_mode == 
SubagentOutputMode::FullTranscript { let session = Session::load(&sub_session_id)?; Some(serde_json::to_string_pretty(&session.messages)?) } else { @@ -245,6 +390,8 @@ impl Tool for SubagentTool { start.elapsed().as_secs_f64() )); + // _mode_guard drops here, clearing the per-session permission override. + listener.abort(); let mut summary: Vec = summary_map @@ -258,7 +405,7 @@ impl Tool for SubagentTool { let output = format_subagent_output( &final_text, &sub_session_id, - params.output_mode, + effective_output_mode, history.as_deref(), full_transcript.as_deref(), ); @@ -269,7 +416,7 @@ impl Tool for SubagentTool { "summary": summary, "sessionId": sub_session_id, "model": resolved_model, - "outputMode": params.output_mode.as_str(), + "outputMode": effective_output_mode.as_str(), }))) } } @@ -288,6 +435,22 @@ fn subagent_display_title(params: &SubagentInput, model: &str) -> String { ) } +/// Map an `AgentDefinition`'s `OutputMode` to the subagent tool's internal +/// `SubagentOutputMode`. 
The mapping is intentionally conservative: +/// - `LastMessage` → `Answer` (default low-token behaviour) +/// - `AllMessages` → `Compact` (human-readable transcript) +/// - `StructuredOutput` → `Answer` (structured output is a separate mechanism) +fn subagent_output_mode_from_definition( + def_mode: jcode_agent_runtime::output::OutputMode, +) -> SubagentOutputMode { + use jcode_agent_runtime::output::OutputMode as DefOutputMode; + match def_mode { + DefOutputMode::LastMessage => SubagentOutputMode::Answer, + DefOutputMode::AllMessages => SubagentOutputMode::Compact, + DefOutputMode::StructuredOutput => SubagentOutputMode::Answer, + } +} + impl SubagentOutputMode { fn as_str(self) -> &'static str { match self { @@ -382,6 +545,7 @@ mod tests { model: None, session_id: None, output_mode: SubagentOutputMode::Answer, + permission_mode: None, _command: None, }; diff --git a/crates/jcode-app-core/src/tool/task_management.rs b/crates/jcode-app-core/src/tool/task_management.rs new file mode 100644 index 000000000..6533f0b27 --- /dev/null +++ b/crates/jcode-app-core/src/tool/task_management.rs @@ -0,0 +1,261 @@ +use super::team::{TeamConfig, TeamTask}; +use super::{Tool, ToolContext, ToolOutput}; +use anyhow::Result; +use async_trait::async_trait; +use serde::Deserialize; +use serde_json::{Value, json}; + +// --------------------------------------------------------------------------- +// TaskCreateTool +// --------------------------------------------------------------------------- + +pub struct TaskCreateTool; + +impl TaskCreateTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TaskCreateInput { + team_name: String, + subject: String, + description: String, +} + +#[async_trait] +impl Tool for TaskCreateTool { + fn name(&self) -> &str { + "task_create" + } + + fn description(&self) -> &str { + "Create a new task within a team. The task starts with status 'pending' \ + and no owner assigned." 
+ } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["team_name", "subject", "description"], + "properties": { + "intent": super::intent_schema_property(), + "team_name": { + "type": "string", + "description": "Team to add the task to." + }, + "subject": { + "type": "string", + "description": "Short task title." + }, + "description": { + "type": "string", + "description": "Detailed task description." + } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TaskCreateInput = serde_json::from_value(input)?; + + let mut team = match TeamConfig::load(¶ms.team_name)? { + Some(t) => t, + None => { + return Err(anyhow::anyhow!( + "Team '{}' not found. Create it first with team_create.", + params.team_name + )); + } + }; + + let task_id = format!("task-{}", uuid::Uuid::new_v4().as_simple()); + let task = TeamTask { + id: task_id.clone(), + subject: params.subject, + description: params.description, + status: "pending".to_string(), + owner: None, + }; + team.tasks.push(task); + team.save()?; + + Ok(ToolOutput::new(format!( + "Task '{}' created in team '{}'.", + task_id, params.team_name + )) + .with_title(format!("Task created: {}", task_id))) + } +} + +// --------------------------------------------------------------------------- +// TaskUpdateTool +// --------------------------------------------------------------------------- + +pub struct TaskUpdateTool; + +impl TaskUpdateTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TaskUpdateInput { + team_name: String, + task_id: String, + #[serde(default)] + status: Option, + #[serde(default)] + owner: Option, +} + +#[async_trait] +impl Tool for TaskUpdateTool { + fn name(&self) -> &str { + "task_update" + } + + fn description(&self) -> &str { + "Update a task's status or owner within a team." 
+ } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["team_name", "task_id"], + "properties": { + "intent": super::intent_schema_property(), + "team_name": { + "type": "string", + "description": "Team containing the task." + }, + "task_id": { + "type": "string", + "description": "Task ID to update." + }, + "status": { + "type": "string", + "enum": ["pending", "in_progress", "completed"], + "description": "New status for the task." + }, + "owner": { + "type": "string", + "description": "Assign or reassign the task to a team member name." + } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TaskUpdateInput = serde_json::from_value(input)?; + + let mut team = match TeamConfig::load(¶ms.team_name)? { + Some(t) => t, + None => { + return Err(anyhow::anyhow!("Team '{}' not found.", params.team_name)); + } + }; + + let task = team + .tasks + .iter_mut() + .find(|t| t.id == params.task_id) + .ok_or_else(|| anyhow::anyhow!("Task '{}' not found.", params.task_id))?; + + if let Some(status) = params.status { + task.status = status; + } + if let Some(owner) = params.owner { + task.owner = Some(owner); + } + + let updated = task.clone(); + team.save()?; + + Ok(ToolOutput::new(format!( + "Task '{}' updated.\n\n{}", + params.task_id, + serde_json::to_string_pretty(&updated)? + )) + .with_title(format!("Task '{}' updated", params.task_id))) + } +} + +// --------------------------------------------------------------------------- +// TaskListTool +// --------------------------------------------------------------------------- + +pub struct TaskListTool; + +impl TaskListTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TaskListInput { + team_name: String, +} + +#[async_trait] +impl Tool for TaskListTool { + fn name(&self) -> &str { + "task_list" + } + + fn description(&self) -> &str { + "List all tasks in a team, showing their status and owner." 
+ } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["team_name"], + "properties": { + "intent": super::intent_schema_property(), + "team_name": { + "type": "string", + "description": "Team to list tasks for." + } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TaskListInput = serde_json::from_value(input)?; + + let team = match TeamConfig::load(¶ms.team_name)? { + Some(t) => t, + None => { + return Err(anyhow::anyhow!("Team '{}' not found.", params.team_name)); + } + }; + + let output = serde_json::to_string_pretty(&team.tasks)?; + let summary = format!( + "Team '{}': {} task(s) total, {} pending, {} in_progress, {} completed.", + params.team_name, + team.tasks.len(), + team.tasks.iter().filter(|t| t.status == "pending").count(), + team.tasks + .iter() + .filter(|t| t.status == "in_progress") + .count(), + team.tasks + .iter() + .filter(|t| t.status == "completed") + .count(), + ); + + Ok( + ToolOutput::new(format!("{}\n\n{}", summary, output)).with_title(format!( + "{} tasks in '{}'", + team.tasks.len(), + params.team_name + )), + ) + } +} diff --git a/crates/jcode-app-core/src/tool/team.rs b/crates/jcode-app-core/src/tool/team.rs new file mode 100644 index 000000000..39b48fc75 --- /dev/null +++ b/crates/jcode-app-core/src/tool/team.rs @@ -0,0 +1,233 @@ +use super::{Tool, ToolContext, ToolOutput}; +use anyhow::Result; +use async_trait::async_trait; +use serde::Deserialize; +use serde_json::{Value, json}; +use std::path::PathBuf; + +/// Get the teams directory path (~/.jcode/teams/). +fn teams_dir() -> PathBuf { + dirs::home_dir() + .unwrap_or_else(|| PathBuf::from(".")) + .join(".jcode") + .join("teams") +} + +/// Team configuration stored as JSON on disk. 
+#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct TeamConfig { + pub name: String, + pub description: String, + pub created_at: String, + pub members: Vec, + pub tasks: Vec, +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct TeamMember { + pub name: String, + pub session_id: String, + pub agent_type: String, + pub status: String, // "active" | "idle" | "shutdown" +} + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +pub struct TeamTask { + pub id: String, + pub subject: String, + pub description: String, + pub status: String, // "pending" | "in_progress" | "completed" + pub owner: Option, // member name +} + +/// Validate that a team name is safe for use as a filename. +/// Rejects path traversal attempts and special characters. +fn validate_team_name(name: &str) -> Result<()> { + if name.is_empty() { + anyhow::bail!("Team name cannot be empty"); + } + if name.contains("..") || name.contains('/') || name.contains('\\') { + anyhow::bail!( + "Team name '{}' is invalid: must not contain '..', '/', or '\\'", + name + ); + } + if !name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-') { + anyhow::bail!( + "Team name '{}' is invalid: only alphanumeric, hyphen, and underscore allowed", + name + ); + } + Ok(()) +} + +impl TeamConfig { + /// Load a team config from disk by name. + pub fn load(name: &str) -> Result> { + validate_team_name(name)?; + let path = teams_dir().join(format!("{name}.json")); + if !path.exists() { + return Ok(None); + } + let text = std::fs::read_to_string(&path)?; + Ok(Some(serde_json::from_str(&text)?)) + } + + /// Save this team config to disk. 
+ pub fn save(&self) -> Result<()> { + validate_team_name(&self.name)?; + let dir = teams_dir(); + std::fs::create_dir_all(&dir)?; + let path = dir.join(format!("{}.json", self.name)); + let json = serde_json::to_string_pretty(self)?; + std::fs::write(&path, json)?; + Ok(()) + } + + /// Delete a team config from disk by name. + pub fn delete(name: &str) -> Result<()> { + validate_team_name(name)?; + let path = teams_dir().join(format!("{name}.json")); + if path.exists() { + std::fs::remove_file(&path)?; + } + Ok(()) + } +} + +// --------------------------------------------------------------------------- +// TeamCreateTool +// --------------------------------------------------------------------------- + +pub struct TeamCreateTool; + +impl TeamCreateTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TeamCreateInput { + name: String, + description: String, +} + +#[async_trait] +impl Tool for TeamCreateTool { + fn name(&self) -> &str { + "team_create" + } + + fn description(&self) -> &str { + "Create a new team for coordinating sub-agents. Stores a lightweight \ + team config file at ~/.jcode/teams/.json that tracks members, \ + tasks, and status." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["name", "description"], + "properties": { + "intent": super::intent_schema_property(), + "name": { + "type": "string", + "description": "Unique team name (used as filename)." + }, + "description": { + "type": "string", + "description": "What this team is for." + } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TeamCreateInput = serde_json::from_value(input)?; + + if let Some(existing) = TeamConfig::load(¶ms.name)? { + return Ok(ToolOutput::new(format!( + "Team '{}' already exists.\n\n{}", + params.name, + serde_json::to_string_pretty(&existing)? 
+ )) + .with_title(format!("Team '{}' already exists", params.name))); + } + + let team = TeamConfig { + name: params.name.clone(), + description: params.description.clone(), + created_at: chrono::Utc::now().to_rfc3339(), + members: Vec::new(), + tasks: Vec::new(), + }; + team.save()?; + + let output = serde_json::to_string_pretty(&team)?; + Ok( + ToolOutput::new(format!("Team '{}' created.\n\n{}", params.name, output)) + .with_title(format!("Team '{}' created", params.name)), + ) + } +} + +// --------------------------------------------------------------------------- +// TeamDeleteTool +// --------------------------------------------------------------------------- + +pub struct TeamDeleteTool; + +impl TeamDeleteTool { + pub fn new() -> Self { + Self + } +} + +#[derive(Deserialize)] +struct TeamDeleteInput { + name: String, +} + +#[async_trait] +impl Tool for TeamDeleteTool { + fn name(&self) -> &str { + "team_delete" + } + + fn description(&self) -> &str { + "Delete a team configuration. Removes the team config file from \ + ~/.jcode/teams/.json." + } + + fn parameters_schema(&self) -> Value { + json!({ + "type": "object", + "required": ["name"], + "properties": { + "intent": super::intent_schema_property(), + "name": { + "type": "string", + "description": "Team name to delete." 
+ } + } + }) + } + + async fn execute(&self, input: Value, _ctx: ToolContext) -> Result { + let params: TeamDeleteInput = serde_json::from_value(input)?; + + let existed = TeamConfig::load(¶ms.name)?.is_some(); + TeamConfig::delete(¶ms.name)?; + + if existed { + Ok(ToolOutput::new(format!("Team '{}' deleted.", params.name)) + .with_title(format!("Team '{}' deleted", params.name))) + } else { + Ok( + ToolOutput::new(format!("Team '{}' did not exist (no-op).", params.name)) + .with_title(format!("Team '{}' not found", params.name)), + ) + } + } +} diff --git a/crates/jcode-app-core/src/tool/tests.rs b/crates/jcode-app-core/src/tool/tests.rs index 5f6f4f295..8fdbef2f8 100644 --- a/crates/jcode-app-core/src/tool/tests.rs +++ b/crates/jcode-app-core/src/tool/tests.rs @@ -33,7 +33,7 @@ impl Provider for MockProvider { async fn test_tool_definitions_are_sorted() { // Create registry with mock provider let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; // Get definitions multiple times and verify they're always in the same order let defs1 = registry.definitions(None).await; @@ -98,7 +98,7 @@ fn tool_definitions_do_not_auto_inject_intent() { #[tokio::test] async fn first_party_tool_definitions_include_optional_intent_explicitly() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; registry.register_ambient_tools().await; let defs = registry.definitions(None).await; @@ -160,7 +160,7 @@ fn test_resolve_tool_name_oauth_aliases() { #[tokio::test] async fn test_batch_resolves_oauth_names() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let temp_dir = std::env::temp_dir(); let temp_dir_str = temp_dir.to_string_lossy().to_string(); @@ -188,7 +188,7 @@ async fn test_batch_resolves_oauth_names() 
{ #[tokio::test] async fn registry_execute_enforces_session_tool_policy_after_alias_resolution() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let temp_dir = std::env::temp_dir(); let session_id = "test-policy-deny"; set_session_tool_policy(session_id, None, HashSet::from(["grep".to_string()])); @@ -225,7 +225,7 @@ async fn registry_execute_enforces_session_tool_policy_after_alias_resolution() #[tokio::test] async fn test_definitions_keep_batch_schema_generic() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; let batch_def = defs @@ -255,7 +255,7 @@ fn resolve_tool_name_maps_communicate_to_swarm() { #[ignore] async fn print_tool_definition_token_report() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let mut defs = registry.definitions(None).await; defs.sort_by_key(|def| std::cmp::Reverse(def.prompt_token_estimate())); @@ -324,7 +324,7 @@ fn collect_schema_errors(schema: &Value, path: &str, errors: &mut Vec) { #[tokio::test] async fn test_tool_definitions_do_not_expose_invalid_array_schemas() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; let mut errors = Vec::new(); @@ -449,7 +449,7 @@ async fn test_context_guard_zero_budget_passes_through() { #[tokio::test] async fn test_request_permission_is_ambient_only() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; assert!( @@ -476,7 +476,7 @@ async fn test_no_builtin_tools_env_disables_registry() 
{ crate::env::set_var("JCODE_NO_BUILTIN_TOOLS", "1"); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; assert!( @@ -502,7 +502,7 @@ async fn test_default_registry_has_builtin_tools() { crate::env::remove_var("JCODE_NO_BUILTIN_TOOLS"); let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; assert!( @@ -537,7 +537,7 @@ fn closest_tool_names_suggests_near_misses() { #[tokio::test] async fn unknown_tool_error_lists_available_tools_and_suggestions() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; registry.register_ambient_tools().await; let ctx = ToolContext { diff --git a/crates/jcode-base/src/auth/live_provider_probes.rs b/crates/jcode-base/src/auth/live_provider_probes.rs index e551bc0d4..a47f4697a 100644 --- a/crates/jcode-base/src/auth/live_provider_probes.rs +++ b/crates/jcode-base/src/auth/live_provider_probes.rs @@ -1341,7 +1341,10 @@ pub async fn run_live_native_provider_smoke( .with_duration_ms(started.elapsed().as_millis() as u64) .with_evidence("model", serde_json::json!(model)) .with_evidence("matched_expected_content", serde_json::json!(true)) - .with_evidence("stop_reason", serde_json::json!(outcome.stop_reason.clone())); + .with_evidence( + "stop_reason", + serde_json::json!(outcome.stop_reason.clone()), + ); if let Some(usage) = outcome.usage_evidence() { stage = stage.with_evidence("usage", usage); } @@ -1429,7 +1432,10 @@ pub async fn run_live_native_provider_stream_smoke( .with_evidence("attempts", serde_json::json!(attempts)) .with_evidence("total_events", serde_json::json!(outcome.total_events)) .with_evidence("matched_expected_content", serde_json::json!(true)) - 
.with_evidence("stop_reason", serde_json::json!(outcome.stop_reason.clone())); + .with_evidence( + "stop_reason", + serde_json::json!(outcome.stop_reason.clone()), + ); if let Some(usage) = outcome.usage_evidence() { stage = stage.with_evidence("usage", usage); } diff --git a/crates/jcode-base/src/auth/provider_e2e.rs b/crates/jcode-base/src/auth/provider_e2e.rs index cf1f2b3c1..391de4515 100644 --- a/crates/jcode-base/src/auth/provider_e2e.rs +++ b/crates/jcode-base/src/auth/provider_e2e.rs @@ -1321,8 +1321,8 @@ impl NativeProviderKind { /// Returns an error only when the runtime cannot be constructed at all (e.g. /// Copilot with no credential file); model selection happens later. fn build_runtime(self) -> anyhow::Result> { - use anyhow::Context as _; use crate::provider::Provider; + use anyhow::Context as _; let runtime: std::sync::Arc = match self { Self::OpenAi => { let credentials = crate::auth::codex::load_credentials().unwrap_or_else(|_| { @@ -1337,9 +1337,7 @@ impl NativeProviderKind { std::sync::Arc::new(crate::provider::openai::OpenAIProvider::new(credentials)) } Self::Gemini => std::sync::Arc::new(crate::provider::gemini::GeminiProvider::new()), - Self::Cursor => { - std::sync::Arc::new(crate::provider::cursor::CursorCliProvider::new()) - } + Self::Cursor => std::sync::Arc::new(crate::provider::cursor::CursorCliProvider::new()), Self::Copilot => { // `new()` requires a loadable GitHub token; fall back to an empty // token so the offline tier can still construct the runtime for @@ -1354,18 +1352,14 @@ impl NativeProviderKind { crate::env::set_var("JCODE_COPILOT_PREFETCH_STARTUP_GRACE_MS", "0"); let runtime = match crate::provider::copilot::CopilotApiProvider::new() { Ok(runtime) => runtime, - Err(_) => crate::provider::copilot::CopilotApiProvider::new_with_token( - String::new(), - ), + Err(_) => { + crate::provider::copilot::CopilotApiProvider::new_with_token(String::new()) + } }; std::sync::Arc::new(runtime) } - Self::Bedrock => { - 
std::sync::Arc::new(crate::provider::bedrock::BedrockProvider::new()) - } - Self::Jcode => { - std::sync::Arc::new(crate::provider::jcode::JcodeProvider::new()) - } + Self::Bedrock => std::sync::Arc::new(crate::provider::bedrock::BedrockProvider::new()), + Self::Jcode => std::sync::Arc::new(crate::provider::jcode::JcodeProvider::new()), Self::Azure => { // Azure OpenAI is the OpenRouter transport configured via Azure // env; apply that env (endpoint/key/header wiring) before building @@ -1696,8 +1690,14 @@ pub async fn run_generic_native_e2e( )); } } else { - run_generic_native_api_checks(runtime.as_ref(), &selected, spec.label, &mut checks, &mut spend) - .await; + run_generic_native_api_checks( + runtime.as_ref(), + &selected, + spec.label, + &mut checks, + &mut spend, + ) + .await; } } else { for checkpoint in API_DEPENDENT_CHECKPOINTS { diff --git a/crates/jcode-base/src/provider/gemini.rs b/crates/jcode-base/src/provider/gemini.rs index 8e8dc9174..485fb0786 100644 --- a/crates/jcode-base/src/provider/gemini.rs +++ b/crates/jcode-base/src/provider/gemini.rs @@ -849,9 +849,7 @@ impl Provider for GeminiProvider { .await; let _ = tx.send(Ok(StreamEvent::ToolUseEnd)).await; if let Some(signature) = signature { - let _ = tx - .send(Ok(StreamEvent::ToolUseSignature(signature))) - .await; + let _ = tx.send(Ok(StreamEvent::ToolUseSignature(signature))).await; } } else if let Some(signature) = part_signature { // Standalone signature part; remember it for the next diff --git a/crates/jcode-base/src/provider/gemini_tests.rs b/crates/jcode-base/src/provider/gemini_tests.rs index 8d2917a04..b59ce9225 100644 --- a/crates/jcode-base/src/provider/gemini_tests.rs +++ b/crates/jcode-base/src/provider/gemini_tests.rs @@ -386,7 +386,10 @@ fn build_tools_strips_additional_properties_for_gemini_schema_compatibility() { assert!(!schema_contains_key(parameters, "additionalProperties")); assert!(!schema_contains_key(parameters, "$schema")); // Real schema content is preserved. 
- assert_eq!(parameters["properties"]["file_path"]["type"], json!("string")); + assert_eq!( + parameters["properties"]["file_path"]["type"], + json!("string") + ); assert_eq!( parameters["properties"]["opts"]["properties"]["limit"]["type"], json!("integer") @@ -397,7 +400,7 @@ fn build_tools_strips_additional_properties_for_gemini_schema_compatibility() { #[tokio::test] async fn build_tools_from_registry_definitions_omits_const_keywords() { let provider: Arc = Arc::new(MockProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let defs = registry.definitions(None).await; let built = build_tools(&defs).expect("gemini tools"); diff --git a/crates/jcode-base/src/provider/mod.rs b/crates/jcode-base/src/provider/mod.rs index ef4011e37..bfeccd7f9 100644 --- a/crates/jcode-base/src/provider/mod.rs +++ b/crates/jcode-base/src/provider/mod.rs @@ -48,6 +48,7 @@ pub use catalog_routes::{ remote_model_routes_lightweight_fallback, remote_model_should_offer_copilot_route, remote_openai_compatible_route_for_model, simplified_model_routes_for_picker, }; +pub use jcode_provider_core::cli_provider_arg_for_session_key; pub use jcode_provider_core::{ ALL_CLAUDE_MODELS, ALL_OPENAI_MODELS, CHEAPNESS_REFERENCE_INPUT_TOKENS, CHEAPNESS_REFERENCE_OUTPUT_TOKENS, DEFAULT_CONTEXT_LIMIT, EventStream, JCODE_USER_AGENT, @@ -58,7 +59,6 @@ pub use jcode_provider_core::{ normalize_copilot_model_name, provider_from_model_key, shared_http_client, summarize_model_catalog_refresh, }; -pub use jcode_provider_core::cli_provider_arg_for_session_key; pub use jcode_provider_core::{ProviderFailoverPrompt, parse_failover_prompt_message}; pub use route_builders::{ build_anthropic_oauth_route, build_copilot_route, build_openai_api_key_route, diff --git a/crates/jcode-base/src/session.rs b/crates/jcode-base/src/session.rs index a4ca35ad9..59e7725ca 100644 --- a/crates/jcode-base/src/session.rs +++ b/crates/jcode-base/src/session.rs @@ -133,6 +133,11 @@ pub 
struct Session { /// Optional user-provided label for saved sessions #[serde(default, skip_serializing_if = "Option::is_none")] pub save_label: Option, + /// IDs of child sessions spawned from this session. + /// Populated at spawn time by SubagentTool. Persisted so the TUI + /// can display the agent tree. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub children: Vec, /// Environment snapshots for post-mortem debugging #[serde(default, skip_serializing_if = "Vec::is_empty")] pub env_snapshots: Vec, @@ -475,6 +480,7 @@ impl Session { is_debug: self.is_debug, saved: self.saved, save_label: self.save_label.clone(), + children: self.children.clone(), } } @@ -659,6 +665,7 @@ impl Session { self.is_debug = meta.is_debug; self.saved = meta.saved; self.save_label = meta.save_label; + self.children = meta.children; self.mark_memory_profile_dirty(); } @@ -699,6 +706,7 @@ impl Session { is_debug, saved: false, save_label: None, + children: Vec::new(), env_snapshots: Vec::new(), memory_injections: Vec::new(), replay_events: Vec::new(), @@ -760,6 +768,7 @@ impl Session { is_debug, saved: false, save_label: None, + children: Vec::new(), env_snapshots: Vec::new(), memory_injections: Vec::new(), replay_events: Vec::new(), @@ -775,6 +784,14 @@ impl Session { session } + /// Register a child session id. Called by SubagentTool after + /// creating the child session. 
+ pub fn add_child(&mut self, child_id: String) { + if !self.children.contains(&child_id) { + self.children.push(child_id); + } + } + /// Mark this session as a debug/test session pub fn set_debug(&mut self, is_debug: bool) { self.is_debug = is_debug; diff --git a/crates/jcode-base/src/session/journal.rs b/crates/jcode-base/src/session/journal.rs index 5336e1b86..ba7f5619d 100644 --- a/crates/jcode-base/src/session/journal.rs +++ b/crates/jcode-base/src/session/journal.rs @@ -33,6 +33,8 @@ pub(super) struct SessionJournalMeta { pub(super) is_debug: bool, pub(super) saved: bool, pub(super) save_label: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub(super) children: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -91,4 +93,5 @@ pub(super) fn metadata_requires_snapshot( || prev.is_debug != current.is_debug || prev.saved != current.saved || prev.save_label != current.save_label + || prev.children != current.children } diff --git a/crates/jcode-base/src/session/persistence.rs b/crates/jcode-base/src/session/persistence.rs index 23165746e..c6d402c12 100644 --- a/crates/jcode-base/src/session/persistence.rs +++ b/crates/jcode-base/src/session/persistence.rs @@ -241,6 +241,7 @@ impl Session { is_debug: self.is_debug, saved: false, save_label: None, + children: Vec::new(), ..Self::create(Some(self.id.clone()), None) } } diff --git a/crates/jcode-base/src/telemetry/tests.rs b/crates/jcode-base/src/telemetry/tests.rs index 0cade87aa..a5871b080 100644 --- a/crates/jcode-base/src/telemetry/tests.rs +++ b/crates/jcode-base/src/telemetry/tests.rs @@ -30,12 +30,25 @@ fn test_do_not_track() { fn test_is_ci_detects_ci_env() { let _guard = lock_test_env(); // Clear any inherited CI markers so the baseline is deterministic. 
- for key in ["CI", "GITHUB_ACTIONS", "BUILDKITE", "JENKINS_URL", "GITLAB_CI", "CIRCLECI"] { + for key in [ + "CI", + "GITHUB_ACTIONS", + "BUILDKITE", + "JENKINS_URL", + "GITLAB_CI", + "CIRCLECI", + ] { crate::env::remove_var(key); } - assert!(!is_ci(), "expected non-CI baseline after clearing CI markers"); + assert!( + !is_ci(), + "expected non-CI baseline after clearing CI markers" + ); crate::env::set_var("CI", "true"); - assert!(is_ci(), "CI env var should mark the run as CI (gates install skip)"); + assert!( + is_ci(), + "CI env var should mark the run as CI (gates install skip)" + ); crate::env::remove_var("CI"); assert!(!is_ci()); } diff --git a/crates/jcode-provider-core/src/lib.rs b/crates/jcode-provider-core/src/lib.rs index 93fe676e7..73433d8ad 100644 --- a/crates/jcode-provider-core/src/lib.rs +++ b/crates/jcode-provider-core/src/lib.rs @@ -26,10 +26,10 @@ pub use models::{ provider_for_model_with_hint as core_provider_for_model_with_hint, provider_key_from_hint, }; pub use selection::{ - ActiveProvider, ProviderAvailability, auto_default_provider, - cli_provider_arg_for_session_key, dedupe_model_routes, explicit_model_provider_prefix, - fallback_sequence, model_name_for_provider, parse_provider_hint, provider_from_model_key, - provider_key, provider_label, + ActiveProvider, ProviderAvailability, auto_default_provider, cli_provider_arg_for_session_key, + dedupe_model_routes, explicit_model_provider_prefix, fallback_sequence, + model_name_for_provider, parse_provider_hint, provider_from_model_key, provider_key, + provider_label, }; use anyhow::Result; diff --git a/crates/jcode-provider-core/src/selection.rs b/crates/jcode-provider-core/src/selection.rs index 1c4139cba..bc83ae280 100644 --- a/crates/jcode-provider-core/src/selection.rs +++ b/crates/jcode-provider-core/src/selection.rs @@ -361,16 +361,25 @@ mod tests { Some("anthropic-api") ); // Anthropic OAuth -> claude. 
- assert_eq!(cli_provider_arg_for_session_key("claude-oauth"), Some("claude")); + assert_eq!( + cli_provider_arg_for_session_key("claude-oauth"), + Some("claude") + ); assert_eq!(cli_provider_arg_for_session_key("claude"), Some("claude")); // OpenAI variants. - assert_eq!(cli_provider_arg_for_session_key("openai-oauth"), Some("openai")); + assert_eq!( + cli_provider_arg_for_session_key("openai-oauth"), + Some("openai") + ); assert_eq!( cli_provider_arg_for_session_key("openai-api-key"), Some("openai-api") ); // Passthrough providers. - assert_eq!(cli_provider_arg_for_session_key("openrouter"), Some("openrouter")); + assert_eq!( + cli_provider_arg_for_session_key("openrouter"), + Some("openrouter") + ); assert_eq!(cli_provider_arg_for_session_key("copilot"), Some("copilot")); assert_eq!(cli_provider_arg_for_session_key("gemini"), Some("gemini")); assert_eq!(cli_provider_arg_for_session_key("bedrock"), Some("bedrock")); diff --git a/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs b/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs index 2b4a35903..220a2bd14 100644 --- a/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs +++ b/crates/jcode-tui-markdown/src/render_core_adapter_tests.rs @@ -290,13 +290,34 @@ impl Rng { } const WORDS: &[&str] = &[ - "alpha", "beta", "gamma", "delta", "x", "y", "z", "the", "quick", "brown", - "fox", "中文", "데이터", "emoji", "lorem", "ipsum", "a", "I", "we", "code", + "alpha", + "beta", + "gamma", + "delta", + "x", + "y", + "z", + "the", + "quick", + "brown", + "fox", + "中文", + "데이터", + "emoji", + "lorem", + "ipsum", + "a", + "I", + "we", + "code", ]; fn gen_words(rng: &mut Rng, max: usize) -> String { let n = 1 + rng.below(max); - (0..n).map(|_| *rng.pick(WORDS)).collect::>().join(" ") + (0..n) + .map(|_| *rng.pick(WORDS)) + .collect::>() + .join(" ") } /// Generate an inline fragment (no leading/trailing block structure). 
@@ -307,7 +328,11 @@ fn gen_inline(rng: &mut Rng, depth: usize) -> String { 2 => format!("_{}_", gen_words(rng, 3)), 3 => format!("`{}`", gen_words(rng, 2)), 4 => format!("~~{}~~", gen_words(rng, 2)), - 5 => format!("[{}](http://example.com/{})", gen_words(rng, 2), rng.below(99)), + 5 => format!( + "[{}](http://example.com/{})", + gen_words(rng, 2), + rng.below(99) + ), 6 => format!("${}+{}$", rng.pick(WORDS), rng.pick(WORDS)), 7 => format!("${}", rng.below(999)), // currency _ => format!( @@ -536,8 +561,3 @@ fn fuzz_random_documents_wrapped_parity() { .join("\n\n") ); } - - - - - diff --git a/crates/jcode-tui/src/tui/app/misc_ui.rs b/crates/jcode-tui/src/tui/app/misc_ui.rs index 0d408cc80..58789734d 100644 --- a/crates/jcode-tui/src/tui/app/misc_ui.rs +++ b/crates/jcode-tui/src/tui/app/misc_ui.rs @@ -38,9 +38,8 @@ impl ResolvedTokenPricing { cache_read_tokens: u64, cache_creation_tokens: u64, ) -> f32 { - let split_accounting = self.is_anthropic - || cache_creation_tokens > 0 - || cache_read_tokens > input_tokens; + let split_accounting = + self.is_anthropic || cache_creation_tokens > 0 || cache_read_tokens > input_tokens; let fresh_input_tokens = if split_accounting { input_tokens @@ -275,8 +274,7 @@ impl App { let model = ::provider_model(self); let provider_name = ::provider_name(self).to_lowercase(); - let is_anthropic = - provider_name.contains("anthropic") || provider_name.contains("claude"); + let is_anthropic = provider_name.contains("anthropic") || provider_name.contains("claude"); let is_openai = provider_name.contains("openai"); // The server resolves the active credential authoritatively; only bill diff --git a/crates/jcode-tui/src/tui/app/remote_tests.rs b/crates/jcode-tui/src/tui/app/remote_tests.rs index 155178be5..78481dbcf 100644 --- a/crates/jcode-tui/src/tui/app/remote_tests.rs +++ b/crates/jcode-tui/src/tui/app/remote_tests.rs @@ -40,7 +40,7 @@ impl Provider for MockProvider { fn create_test_app() -> crate::tui::app::App { let provider: Arc = 
Arc::new(MockProvider); let rt = tokio::runtime::Runtime::new().expect("runtime"); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = crate::tui::app::App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; diff --git a/crates/jcode-tui/src/tui/app/tests.rs b/crates/jcode-tui/src/tui/app/tests.rs index ea81b409b..0cc1ef805 100644 --- a/crates/jcode-tui/src/tui/app/tests.rs +++ b/crates/jcode-tui/src/tui/app/tests.rs @@ -452,7 +452,10 @@ fn skills_command_marks_active_skill_in_remote_mode() { assert!(content.contains("- /optimization (active)"), "{content}"); assert!(content.contains("- /firefox-browser\n"), "{content}"); // Endorsed list should mark remote-installed skills as installed. - assert!(content.contains("/firefox-browser [installed]"), "{content}"); + assert!( + content.contains("/firefox-browser [installed]"), + "{content}" + ); } #[test] diff --git a/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs b/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs index aa830a570..014d543e9 100644 --- a/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs +++ b/crates/jcode-tui/src/tui/app/tests/state_model_poke_03.rs @@ -451,7 +451,7 @@ fn test_model_picker_reuses_cached_entries_until_invalidated() { delay: Duration::ZERO, }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -492,7 +492,7 @@ fn test_shift_tab_model_favorite_hotkey_preserves_input_line() { delay: Duration::ZERO, }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = 
rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -528,7 +528,7 @@ fn test_tui_api_key_auth_refreshes_catalog_shows_diff_without_opening_picker() { let refreshes = provider.refreshes.clone(); let provider: Arc = Arc::new(provider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -625,7 +625,7 @@ fn test_tui_cerebras_paste_key_lifecycle_has_no_degraded_success_messages() { let set_model_requests = fake_provider.set_model_requests.clone(); let provider: Arc = Arc::new(fake_provider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -917,7 +917,7 @@ fn test_tui_openai_compatible_empty_catalog_does_not_switch_to_profile_default() set_model_attempts: StdArc::clone(&set_model_attempts), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -977,7 +977,7 @@ fn test_tui_openai_compatible_local_refresh_failure_is_pending_not_final_failure 
set_model_attempts: StdArc::clone(&set_model_attempts), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1044,7 +1044,7 @@ fn test_model_picker_opens_simplified_state_before_async_routes_complete() { delay: Duration::from_millis(75), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1083,7 +1083,7 @@ fn test_model_picker_state_space_preserves_provider_labels_after_route_hydration model: StdArc::new(StdMutex::new("gpt-5.5".to_string())), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1154,7 +1154,7 @@ fn test_model_picker_does_not_cache_single_model_fallback() { delay: Duration::ZERO, }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1215,7 +1215,7 @@ fn test_login_completed_spawns_auth_refresh_when_runtime_is_available() { delay: Duration::from_millis(150), 
}); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -1424,7 +1424,7 @@ fn test_azure_login_completion_switches_local_model_without_completion() { complete_calls: StdArc::clone(&complete_calls), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; diff --git a/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs b/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs index 4d05361ee..5af4ec460 100644 --- a/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs +++ b/crates/jcode-tui/src/tui/app/tests/support_failover/part_01.rs @@ -182,7 +182,7 @@ fn create_test_app() -> App { let provider: Arc = Arc::new(MockProvider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -196,7 +196,7 @@ fn create_named_provider_test_app(name: &'static str, model: &'static str) -> Ap let provider: Arc = Arc::new(NamedMockProvider { name, model }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = 
App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -222,7 +222,7 @@ fn create_refresh_summary_test_app(summary: crate::provider::ModelCatalogRefresh let provider: Arc = Arc::new(RefreshSummaryProvider { summary }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -239,7 +239,7 @@ fn create_openrouter_spec_capture_test_app() -> (App, StdArc (App, StdArc App { logged_in: StdArc::new(StdMutex::new(false)), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -331,7 +331,7 @@ fn create_antigravity_picker_test_app() -> App { model: StdArc::new(StdMutex::new("default".to_string())), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -455,7 +455,7 @@ fn create_login_smoke_model_app() -> App { let provider: Arc = Arc::new(LoginSmokeModelProvider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, 
registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -512,7 +512,7 @@ fn create_failing_model_switch_test_app() -> App { let provider: Arc = Arc::new(FailingModelSwitchProvider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -537,7 +537,7 @@ fn create_fast_test_app() -> App { service_tier: StdArc::new(StdMutex::new(None)), }); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; @@ -574,7 +574,7 @@ fn create_gemini_test_app() -> App { let provider: Arc = Arc::new(GeminiMockProvider); let rt = tokio::runtime::Runtime::new().unwrap(); - let registry = rt.block_on(crate::tool::Registry::new(provider.clone())); + let registry = rt.block_on(crate::tool::Registry::new(provider.clone(), None)); let mut app = App::new_for_test_harness(provider, registry); app.queue_mode = false; app.diff_mode = crate::config::DiffDisplayMode::Inline; diff --git a/crates/jcode-tui/src/tui/info_widget.rs b/crates/jcode-tui/src/tui/info_widget.rs index 9448ba98f..e6d669fbe 100644 --- a/crates/jcode-tui/src/tui/info_widget.rs +++ b/crates/jcode-tui/src/tui/info_widget.rs @@ -419,7 +419,11 @@ pub struct CacheMissAttribution { impl CacheHitInfo { /// Effective total prompt tokens across the session (read denominator). 
fn effective_reported_tokens(&self) -> u64 { - effective_prompt_tokens(self.reported_input_tokens, self.read_tokens, self.creation_tokens) + effective_prompt_tokens( + self.reported_input_tokens, + self.read_tokens, + self.creation_tokens, + ) } /// Fraction of the session's prompt tokens that were served from cache. diff --git a/crates/jcode-tui/src/tui/ui_header.rs b/crates/jcode-tui/src/tui/ui_header.rs index 4bc96213d..f780dceb7 100644 --- a/crates/jcode-tui/src/tui/ui_header.rs +++ b/crates/jcode-tui/src/tui/ui_header.rs @@ -789,7 +789,7 @@ mod tests { let provider: Arc = Arc::new(MockProvider); let rt = tokio::runtime::Runtime::new().expect("test runtime"); - let registry = rt.block_on(Registry::new(provider.clone())); + let registry = rt.block_on(Registry::new(provider.clone(), None)); crate::tui::app::App::new_for_test_harness(provider, registry) } diff --git a/evals/jbench/Cargo.toml b/evals/jbench/Cargo.toml new file mode 100644 index 000000000..6a360ffc8 --- /dev/null +++ b/evals/jbench/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "jcode-jbench" +version = "0.1.0" +edition = "2024" +description = "JBench — jcode's git-commit-reconstruction eval framework (scaffold)" + +[lib] +name = "jcode_jbench" +path = "src/lib.rs" + +[[bin]] +name = "jbench" +path = "src/bin/jbench.rs" + +[dependencies] +jcode-agent-runtime = { path = "../../crates/jcode-agent-runtime" } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +anyhow = "1" +tokio = { version = "1", default-features = false, features = ["rt-multi-thread", "macros", "io-util", "process", "time", "sync"] } +futures = "0.3" +reqwest = { version = "0.12", features = ["json"] } +clap = { version = "4", features = ["derive", "env"] } + +[features] +default = [] +agent-runner = [] + +[dev-dependencies] +serde_json = "1" +tempfile = "3" diff --git a/evals/jbench/README.md b/evals/jbench/README.md new file mode 100644 index 000000000..ffd7c01a6 --- /dev/null +++ b/evals/jbench/README.md @@ -0,0 
+1,110 @@ +# JBench + +JBench is jcode's evaluation framework for measuring AI coding agent +performance through real-world git commit reconstruction tasks. It is the +Rust port and adaptation of [Codebuff's BuffBench](https://github.com/codebuff/codebuff/tree/main/evals/buffbench) +to the jcode multi-agent foundation. + +> **Status: scaffolding.** This crate currently provides typed data +> models, module skeletons, and a CLI shell. The actual eval +> orchestration (cloning repos, spawning agents, calling judge models, +> running lessons extraction) is intentionally left as `unimplemented!()` +> stubs so reviewers can validate the shape of the public API before any +> end-to-end behavior lands. Real implementations will arrive in Phases +> 5.3 (`agent_runner`), 5.4 (`judge`), and 5.5 (`lessons`). + +## Why git commit reconstruction? + +The core idea, borrowed directly from BuffBench, is that real git history +contains a near-infinite stream of well-scoped, naturally-occurring tasks +with built-in ground truth: each commit is a self-contained change with a +known intent (the message / spec) and a known correct outcome (the diff). + +For each evaluation: + +1. Pick a commit `C` from a target repository. +2. Reset the working tree to `parent(C)`. +3. Hand the agent a natural-language prompt derived from `C`'s spec. +4. Let the agent edit the repo. +5. Compare the agent's diff against the ground-truth diff in `C`. + +This yields fair head-to-head comparisons across agents because every +agent works from the exact same starting state and is judged against the +same target. + +## Three-judge median + +A single LLM judge is noisy. JBench follows BuffBench's approach: every +agent diff is judged by **three** different frontier models in parallel +(today the planned slate is `gpt-5`, `gemini-pro`, and `claude-sonnet`), +and the median `overall_score` is reported as the canonical result. 
Per- +dimension averages (`completion_score`, `code_quality_score`, +`overall_score`) are reported alongside the median's qualitative +analysis. + +The three-judge pipeline lives in `src/judge.rs` (currently +`unimplemented!()`). See `/tmp/codebuff/evals/buffbench/judge.ts` for the +TypeScript original we are mirroring. + +## Lessons extractor + +After each run, the lessons extractor compares the agent's diff and +trace against the ground-truth diff and emits a small list of +`Lesson { what_went_wrong, what_should_have_been_done }` items. These +lessons are intended to be appended to per-agent lesson files that can +later be folded into the agent's system prompt or memory graph — the +classic "learn from your mistakes" loop. + +The lessons module lives in `src/lessons.rs`. + +## Reuse of `jcode-agent-runtime` + +JBench is built on top of the new agent foundation in +[`crates/jcode-agent-runtime`](../../crates/jcode-agent-runtime/), which +provides: + +- `AgentRegistry` — discovery and loading of `.jcode/agents/*.toml` + agent definitions. +- `AgentDefinition` — the declarative schema describing an agent's + model, tools, system prompt, output mode, etc. + +The agent runner (`src/agent_runner.rs`) will resolve agent IDs against +the registry, spawn a `jcode` subprocess in a clean clone of the target +repo, capture the trace, and return an `EvalRun` populated with the diff +and judging result. + +## Module map + +| Module | Purpose | +| --- | --- | +| `types` | Serializable data structures (`EvalCommit`, `FileDiff`, `EvalDataV2`, `EvalRun`, `JudgingResult`, `AgentEvalResults`). Roundtrip-tested. | +| `judge` | Three-judge median pipeline. **Stub.** | +| `agent_runner` | Spawn an agent in a repo, capture trace + diff. **Stub.** | +| `lessons` | Extract lessons from a failed/imperfect run. **Stub.** | +| `bin/jbench.rs` | CLI: `pick-commits`, `gen-evals`, `run`, `judge`, `meta-analyze`. Each subcommand currently prints a TODO and exits 0. 
| + +## Workflow (planned) + +``` +pick-commits → select high-quality commits from a repo +gen-evals → produce eval-{repo}.json with EvalDataV2 schema +run → run agents against eval data, emit EvalRun per commit +judge → re-judge an existing run with the 3-model median +meta-analyze → aggregate analysis across all tasks for an agent +``` + +## Running + +```bash +cargo check -p jcode-jbench +cargo test -p jcode-jbench +cargo run -p jcode-jbench --bin jbench -- run --help +``` + +## References + +- BuffBench source: `/tmp/codebuff/evals/buffbench/` +- BuffBench README: `/tmp/codebuff/evals/buffbench/README.md` +- Judge design: `/tmp/codebuff/evals/buffbench/judge.ts` +- Agent runner design: `/tmp/codebuff/evals/buffbench/agent-runner.ts` +- Lessons extractor design: `/tmp/codebuff/evals/buffbench/lessons-extractor.ts` diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs new file mode 100644 index 000000000..8b56d4a46 --- /dev/null +++ b/evals/jbench/src/agent_runner.rs @@ -0,0 +1,197 @@ +//! Spawn a jcode agent inside a freshly-prepared repo clone, run a +//! single eval task, and capture the resulting diff and trace. +//! +//! The runner resolves the configured `agent_id` through the +//! [`jcode_agent_runtime::AgentRegistry`] (loaded from +//! `.jcode/agents/*.toml`), spawns the binary as a subprocess in the +//! repo working directory, streams the trace, and finally extracts the +//! unified diff against the parent commit. +//! +//! Design source: `/tmp/codebuff/evals/buffbench/agent-runner.ts`. + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::process::Stdio; +use std::time::{Duration, Instant}; + +use anyhow::{Context, Result}; +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::process::Command; +use tokio::time::timeout; + +use crate::types::EvalRun; + +/// Configuration for a single agent evaluation run. 
+/// +/// `repo_path` should already contain a clean checkout of the eval +/// commit's parent SHA; the runner does not clone for the caller. +#[derive(Debug, Clone)] +pub struct AgentRunConfig { + /// ID of the agent to run, matching an entry in the + /// `jcode-agent-runtime` registry. + pub agent_id: String, + /// Natural-language prompt to send to the agent (typically + /// `EvalCommit::prompt`). + pub prompt: String, + /// Working directory containing the prepared repo at the parent + /// commit. + pub repo_path: PathBuf, + /// Hard cap on the number of agent turns before the run is + /// aborted; mirrors BuffBench's per-task turn budget. + pub max_turns: u32, + /// Timeout for the entire run in seconds (defaults to 60 minutes). + pub timeout_secs: u64, + /// Extra environment variables applied to the agent subprocess on + /// top of the calling process's environment. + pub env: HashMap, + /// Path to the `jcode` binary. Defaults to searching $PATH. + pub jcode_binary: Option, +} + +impl Default for AgentRunConfig { + fn default() -> Self { + Self { + agent_id: String::new(), + prompt: String::new(), + repo_path: PathBuf::new(), + max_turns: 100, + timeout_secs: 60 * 60, + env: HashMap::new(), + jcode_binary: None, + } + } +} + +/// Spawn the configured agent in `config.repo_path`, run it to +/// completion (or the turn / time budget), and return an [`EvalRun`] +/// populated with the agent's diff, judging placeholder, cost, and +/// duration. 
+pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { + let start = Instant::now(); + let timeout_duration = Duration::from_secs(config.timeout_secs); + + let jcode_bin = config + .jcode_binary + .clone() + .unwrap_or_else(|| PathBuf::from("jcode")); + + let mut env_vars: HashMap = std::env::vars().collect(); + env_vars.extend(config.env); + env_vars.insert("JCODE_AGENT_ID".to_owned(), config.agent_id.clone()); + + let mut child = Command::new(&jcode_bin) + .current_dir(&config.repo_path) + .envs(&env_vars) + .args([ + "agent", + "run", + "--agent", + &config.agent_id, + "--output-mode", + "stream", + "--no-interactive", + ]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .with_context(|| format!("failed to spawn jcode binary at {:?}", jcode_bin))?; + + let mut child_stdin = child.stdin.take().expect("stdin captured"); + let stdout = child.stdout.take().expect("stdout captured"); + + // Write the prompt to stdin + { + use tokio::io::AsyncWriteExt; + let mut stdin = tokio::io::BufWriter::new(&mut child_stdin); + stdin.write_all(config.prompt.as_bytes()).await?; + stdin.flush().await?; + drop(stdin); + } + + let mut trace_lines = Vec::new(); + let reader = BufReader::new(stdout); + let mut lines_stream = reader.lines(); + let timed_out = loop { + let line = timeout(timeout_duration, lines_stream.next_line()).await; + match line { + Ok(Ok(Some(l))) => trace_lines.push(l), + Ok(Ok(None)) => break false, // EOF — clean exit + Ok(Err(_)) => break false, // read error + Err(_) => break true, // timeout + } + }; + + if timed_out { + // Kill the child process so it doesn't become an orphan + let _ = child.kill().await; + // Consume the exit status after kill + let _ = child.wait().await; + return Ok(EvalRun { + commit_sha: String::new(), + prompt: config.prompt, + diff: extract_diff_from_repo(&config.repo_path) + .await + .unwrap_or_default(), + judging: Default::default(), + cost_usd: 0.0, + duration_ms: 
start.elapsed().as_millis() as u64, + error: Some("Timed out waiting for jcode subprocess".to_owned()), + }); + } + + let status = child + .wait() + .await + .context("failed to wait for jcode subprocess")?; + + let diff = extract_diff_from_repo(&config.repo_path).await?; + let error = if !status.success() { + Some(format!("jcode exited with status {:?}", status)) + } else { + None + }; + + Ok(EvalRun { + commit_sha: String::new(), + prompt: config.prompt, + diff, + judging: Default::default(), + cost_usd: 0.0, + duration_ms: start.elapsed().as_millis() as u64, + error, + }) +} + +/// Produce a unified diff describing all uncommitted changes in +/// `repo_path` against its currently-checked-out HEAD. +pub async fn extract_diff_from_repo(repo_path: &Path) -> Result { + let repo_path = repo_path.to_owned(); + tokio::task::spawn_blocking(move || { + let output = std::process::Command::new("git") + .args(["diff", "--no-color", "HEAD"]) + .current_dir(&repo_path) + .output() + .context("git diff failed")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("git diff exited with error: {stderr}"); + } + + Ok(String::from_utf8_lossy(&output.stdout).to_string()) + }) + .await + .context("spawn_blocking panicked")? +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn extract_diff_from_repo_nonexistent() { + let result = extract_diff_from_repo(Path::new("/tmp/does-not-exist")).await; + assert!(result.is_err()); + } +} diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs new file mode 100644 index 000000000..5e3c651d0 --- /dev/null +++ b/evals/jbench/src/bin/jbench.rs @@ -0,0 +1,623 @@ +//! `jbench` CLI entry point. +//! +//! Dispatches to the [`jcode_jbench`] library for real work. 
+ +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use clap::{Parser, Subcommand}; + +#[cfg(feature = "agent-runner")] +use jcode_jbench::agent_runner::AgentRunConfig; +#[cfg(feature = "agent-runner")] +use jcode_jbench::types::EvalDataV2; +use jcode_jbench::types::EvalRun; + +/// Top-level `jbench` CLI. +#[derive(Debug, Parser)] +#[command( + name = "jbench", + about = "JBench — jcode's git-commit-reconstruction eval framework", + version +)] +struct Cli { + /// Subcommand to dispatch to. + #[command(subcommand)] + command: Command, +} + +/// JBench subcommands. +#[derive(Debug, Subcommand)] +enum Command { + /// Select high-quality commits from a target repo to use as eval + /// tasks. + PickCommits { + /// URL of the repository to pick commits from. + repo_url: String, + /// Minimum commit message length. + #[arg(long, default_value = "10")] + min_msg_len: usize, + /// Maximum number of commits to pick. + #[arg(long, default_value = "50")] + max_picks: usize, + /// Output file (default: stdout). + #[arg(short, long)] + output: Option, + }, + /// Generate an `eval-{repo}.json` file (`EvalDataV2`) from a list + /// of picked commits. + GenEvals { + /// Input commit list (from pick-commits). + input: PathBuf, + /// Output eval JSON file. + #[arg(short, long)] + output: PathBuf, + }, + /// Run one or more agents against an eval data file and emit + /// per-commit `EvalRun`s. + Run { + /// Path to eval data JSON file. + eval_file: PathBuf, + /// Agent ID to run (must be registered in jcode registry). + #[arg(short, long)] + agent_id: String, + /// Output directory for EvalRun JSON files. + #[arg(short, long)] + output_dir: PathBuf, + /// Path to jcode binary (auto-detected if not set). + #[arg(long)] + jcode_binary: Option, + /// Maximum turns per run. + #[arg(long, default_value = "100")] + max_turns: u32, + /// Timeout per run in seconds. 
+ #[arg(long, default_value = "3600")] + timeout_secs: u64, + }, + /// Re-judge an existing run with the three-judge median pipeline. + Judge { + /// Directory containing EvalRun JSON files. + runs_dir: PathBuf, + /// API base URL. + #[arg(long, env = "JBENCH_API_BASE")] + api_base: Option, + /// API key. + #[arg(long, env = "JBENCH_API_KEY")] + api_key: Option, + }, + /// Aggregate and analyze results across all tasks for an agent. + MetaAnalyze { + /// Directory containing EvalRun JSON files. + runs_dir: PathBuf, + /// Output file for aggregated results. + #[arg(short, long)] + output: Option, + }, +} + +#[tokio::main] +async fn main() -> Result<()> { + let cli = Cli::parse(); + match cli.command { + Command::PickCommits { + repo_url, + min_msg_len, + max_picks, + output, + } => { + pick_commits_impl(&repo_url, min_msg_len, max_picks, output).await?; + } + Command::GenEvals { input, output } => { + gen_evals_impl(&input, &output).await?; + } + Command::Run { + eval_file: _eval_file, + agent_id: _agent_id, + output_dir: _output_dir, + jcode_binary: _jcode_binary, + max_turns: _max_turns, + timeout_secs: _timeout_secs, + } => { + #[cfg(feature = "agent-runner")] + { + run_impl( + &_eval_file, + &_agent_id, + &_output_dir, + _jcode_binary.as_ref(), + _max_turns, + _timeout_secs, + ) + .await?; + } + #[cfg(not(feature = "agent-runner"))] + anyhow::bail!( + "'jbench run' requires the 'agent-runner' feature. Enable with: cargo build --features agent-runner" + ); + } + Command::Judge { + runs_dir, + api_base, + api_key, + } => { + judge_impl(&runs_dir, api_base.as_deref(), api_key.as_deref()).await?; + } + Command::MetaAnalyze { runs_dir, output } => { + meta_analyze_impl(&runs_dir, output.as_ref()).await?; + } + } + Ok(()) +} + +async fn pick_commits_impl( + repo_path: &str, + min_msg_len: usize, + max_picks: usize, + output: Option, +) -> Result<()> { + // Verify the path is a git repository. 
+ let check = std::process::Command::new("git") + .args(["-C", repo_path, "rev-parse", "--is-inside-work-tree"]) + .output() + .context("failed to run git rev-parse")?; + if !check.status.success() { + anyhow::bail!("{} is not a git repository", repo_path); + } + + // Get commit log: SHA, first parent, subject, then shortstat on the + // following line. `COMMIT` acts as a block separator. + let log_out = std::process::Command::new("git") + .args([ + "-C", + repo_path, + "log", + "--format=COMMIT%n%H%n%P%n%s", + "--shortstat", + ]) + .output() + .context("failed to run git log")?; + + if !log_out.status.success() { + let stderr = String::from_utf8_lossy(&log_out.stderr); + anyhow::bail!("git log failed: {}", stderr); + } + + let stdout = String::from_utf8_lossy(&log_out.stdout); + let mut picked: Vec = Vec::new(); + + for block in stdout.split("COMMIT\n").skip(1) { + let lines: Vec<&str> = block.lines().collect(); + if lines.len() < 3 { + continue; + } + + let sha = lines[0].trim(); + let parent_sha = lines[1].split_whitespace().next().unwrap_or("").to_string(); + let subject = lines[2].trim(); + + // Skip root commits (no parent). + if parent_sha.is_empty() { + continue; + } + + // Filter: commit message must meet minimum length. + if subject.len() < min_msg_len { + continue; + } + + // Parse file count from shortstat (e.g. " 3 files changed, …"). + let file_count = lines + .iter() + .rev() + .find(|l| l.contains(" file")) + .and_then(|l| l.split_whitespace().next()?.parse::().ok()) + .unwrap_or(0); + + // Filter: bounded scope — not zero files, not a mega-commit. 
+ if file_count == 0 || file_count > 10 { + continue; + } + + picked.push(serde_json::json!({ + "sha": sha, + "parent_sha": parent_sha, + "spec": subject, + "prompt": subject, + })); + + if picked.len() >= max_picks { + break; + } + } + + let json = serde_json::to_string_pretty(&picked)?; + if let Some(path) = output { + std::fs::write(&path, &json)?; + eprintln!("Wrote {} commits to {}", picked.len(), path.display()); + } else { + println!("{json}"); + } + + Ok(()) +} + +async fn gen_evals_impl(input: &PathBuf, output: &PathBuf) -> Result<()> { + use jcode_jbench::types::{EvalCommit, EvalDataV2}; + + // Intermediate struct matching the pick-commits output format. + #[derive(serde::Deserialize)] + struct PickedCommit { + sha: String, + parent_sha: String, + spec: String, + prompt: String, + } + + // Read input JSON. + let input_text = std::fs::read_to_string(input) + .with_context(|| format!("failed to read input file {}", input.display()))?; + let picked: Vec = serde_json::from_str(&input_text) + .context("failed to parse input JSON as array of picked commits")?; + + if picked.is_empty() { + anyhow::bail!("input file contains no commits"); + } + + // Detect repo URL from the local git remote. + let repo_url = get_repo_url().unwrap_or_else(|| "unknown".to_owned()); + + let mut eval_commits = Vec::with_capacity(picked.len()); + + for pc in &picked { + let id = format!("{}-eval", &pc.sha[..std::cmp::min(8, pc.sha.len())]); + + // git diff --name-status to get file statuses. + let name_status = run_git(&[ + "diff", + "--name-status", + &format!("{}..{}", pc.parent_sha, pc.sha), + ]) + .with_context(|| { + format!( + "git diff --name-status failed for {}..{}", + pc.parent_sha, pc.sha + ) + })?; + + // git diff to get the full unified diff. 
+ let full_diff = run_git(&["diff", &format!("{}..{}", pc.parent_sha, pc.sha)]) + .with_context(|| format!("git diff failed for {}..{}", pc.parent_sha, pc.sha))?; + + let file_diffs = parse_diffs(&name_status, &full_diff); + + eval_commits.push(EvalCommit { + id, + sha: pc.sha.clone(), + parent_sha: pc.parent_sha.clone(), + spec: pc.spec.clone(), + prompt: pc.prompt.clone(), + supplemental_files: Vec::new(), + file_diffs, + }); + } + + let eval_data = EvalDataV2 { + repo_url, + test_repo_name: None, + generation_date: chrono_now(), + init_command: None, + env: std::collections::HashMap::new(), + final_check_commands: Vec::new(), + eval_commits, + }; + + let json = + serde_json::to_string_pretty(&eval_data).context("failed to serialize EvalDataV2")?; + std::fs::write(output, &json) + .with_context(|| format!("failed to write output file {}", output.display()))?; + + println!( + "Wrote {} eval commits to {}", + eval_data.eval_commits.len(), + output.display() + ); + Ok(()) +} + +#[cfg(feature = "agent-runner")] +async fn run_impl( + eval_file: &PathBuf, + agent_id: &str, + output_dir: &PathBuf, + jcode_binary: Option<&PathBuf>, + max_turns: u32, + timeout_secs: u64, +) -> Result<()> { + use std::fs; + use std::time::Duration; + use tokio::time::timeout as tk_timeout; + + // Load eval data + let eval_data: EvalDataV2 = { + let text = fs::read_to_string(eval_file)?; + serde_json::from_str(&text).context("failed to parse eval JSON")? 
+ }; + + if !output_dir.exists() { + fs::create_dir_all(output_dir)?; + } + + for commit in &eval_data.eval_commits { + let config = AgentRunConfig { + agent_id: agent_id.to_owned(), + prompt: commit.prompt.clone(), + repo_path: output_dir.join(&commit.id), + max_turns, + timeout_secs, + env: eval_data.env.clone(), + jcode_binary: jcode_binary.cloned(), + ..Default::default() + }; + + let result = match tk_timeout( + Duration::from_secs(timeout_secs), + jcode_jbench::agent_runner::run_agent_in_repo(config), + ) + .await + { + Ok(Ok(run)) => run, + Ok(Err(err)) => EvalRun { + commit_sha: commit.sha.clone(), + prompt: commit.prompt.clone(), + diff: String::new(), + judging: Default::default(), + cost_usd: 0.0, + duration_ms: 0, + error: Some(format!("Agent error: {err:#}")), + }, + Err(_elapsed) => EvalRun { + commit_sha: commit.sha.clone(), + prompt: commit.prompt.clone(), + diff: String::new(), + judging: Default::default(), + cost_usd: 0.0, + duration_ms: 0, + error: Some("Timed out waiting for run_agent_in_repo".to_owned()), + }, + }; + + let run_file = output_dir.join(format!("{}.run.json", commit.id)); + let json = serde_json::to_string_pretty(&result).context("failed to serialize EvalRun")?; + fs::write(&run_file, json)?; + println!("Wrote {}", run_file.display()); + } + + Ok(()) +} + +async fn judge_impl( + _runs_dir: &PathBuf, + _api_base: Option<&str>, + _api_key: Option<&str>, +) -> Result<()> { + todo_step( + "Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields", + ) +} + +async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Result<()> { + use jcode_jbench::types::AgentEvalResults; + use std::fs; + + let mut all_runs = Vec::new(); + + for entry in fs::read_dir(runs_dir)? { + let entry = entry?; + let path = entry.path(); + // `Path::extension` returns only the trailing component (`json`), + // so matching against `"run.json"` never fires. Match on the full + // file name suffix instead. 
+ let is_run_file = path + .file_name() + .and_then(|s| s.to_str()) + .is_some_and(|s| s.ends_with(".run.json")); + if is_run_file { + let text = fs::read_to_string(&path)?; + if let Ok(run) = serde_json::from_str::(&text) { + all_runs.push(run); + } + } + } + + if all_runs.is_empty() { + anyhow::bail!("No .run.json files found in {}", runs_dir.display()); + } + + let avg_score = all_runs + .iter() + .map(|r| r.judging.overall_score) + .sum::() + / all_runs.len() as f64; + let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::() / all_runs.len() as f64; + let avg_duration = (all_runs.iter().map(|r| r.duration_ms as f64).sum::() + / all_runs.len() as f64) + .round() as u64; + + let summary = AgentEvalResults { + agent_id: "unknown".to_owned(), + runs: all_runs, + average_score: (avg_score * 10.0).round() / 10.0, + average_cost: (avg_cost * 100.0).round() / 100.0, + average_duration_ms: avg_duration, + }; + + let json = serde_json::to_string_pretty(&summary).context("failed to serialize summary")?; + + if let Some(out) = output { + fs::write(out, &json)?; + println!("Wrote {}", out.display()); + } else { + println!("{json}"); + } + + Ok(()) +} + +fn todo_step(phase: &str) -> Result<()> { + eprintln!("{phase}"); + std::process::exit(2); +} + +/// Run a `git` subcommand and return its stdout as a `String`. +fn run_git(args: &[&str]) -> Result { + let output = std::process::Command::new("git") + .args(args) + .output() + .context("failed to spawn git")?; + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("git {} failed: {}", args.join(" "), stderr.trim()); + } + Ok(String::from_utf8_lossy(&output.stdout).into_owned()) +} + +/// Try to detect the repo URL from `git remote get-url origin`. 
/// Try to detect the repo URL from `git remote get-url origin`.
///
/// Runs `git` in the current working directory. Returns `None` when `git`
/// cannot be spawned, exits unsuccessfully, or prints nothing.
fn get_repo_url() -> Option<String> {
    let output = std::process::Command::new("git")
        .args(["remote", "get-url", "origin"])
        .output()
        .ok()?;
    if !output.status.success() {
        return None;
    }
    let url = String::from_utf8_lossy(&output.stdout).trim().to_owned();
    // An empty URL is as useless as a missing one; let the caller's fallback apply.
    (!url.is_empty()).then_some(url)
}

/// Current UTC time as an ISO-8601 timestamp (`YYYY-MM-DDTHH:MM:SSZ`).
///
/// The timestamp is formatted by hand from `SystemTime` so that no datetime
/// crate is needed. A system clock set before the Unix epoch is reported as
/// the epoch itself.
fn chrono_now() -> String {
    use std::time::SystemTime;

    let secs = SystemTime::now()
        .duration_since(SystemTime::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();
    format_unix_timestamp(secs)
}

/// Render `secs` seconds since the Unix epoch as `YYYY-MM-DDTHH:MM:SSZ` (UTC).
fn format_unix_timestamp(secs: u64) -> String {
    const SECS_PER_DAY: u64 = 86_400;

    let days = secs / SECS_PER_DAY;
    let time_of_day = secs % SECS_PER_DAY;
    let hour = time_of_day / 3600;
    let minute = (time_of_day % 3600) / 60;
    let second = time_of_day % 60;
    // `u64::MAX / 86_400` is far below `i64::MAX`, so the cast cannot wrap.
    let (year, month, day) = civil_from_days(days as i64);
    format!("{year:04}-{month:02}-{day:02}T{hour:02}:{minute:02}:{second:02}Z")
}

/// Convert days since 1970-01-01 to a proleptic Gregorian `(year, month, day)`.
///
/// Negative inputs address dates before the epoch. This is Howard Hinnant's
/// `civil_from_days` algorithm, which handles leap years exactly by working
/// in 400-year "eras" whose years start on 1 March.
fn civil_from_days(days: i64) -> (i64, u32, u32) {
    // Number of days in one 400-year Gregorian cycle.
    const DAYS_PER_ERA: i64 = 146_097;

    // Shift the origin from 1970-01-01 to 0000-03-01.
    let z = days + 719_468;
    // Euclidean division floors towards negative infinity, which is what the
    // algorithm needs for dates before the shifted origin.
    let era = z.div_euclid(DAYS_PER_ERA);
    let doe = z.rem_euclid(DAYS_PER_ERA) as u32; // day of era, 0..=146_096
    let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365; // year of era, 0..=399
    let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); // day of the March-based year
    let mp = (5 * doy + 2) / 153; // March-based month, 0 = March .. 11 = February
    let day = doy - (153 * mp + 2) / 5 + 1;
    let month = if mp < 10 { mp + 3 } else { mp - 9 };
    // January and February belong to the next civil year.
    let year = yoe as i64 + era * 400 + i64::from(month <= 2);
    (year, month, day)
}
+fn parse_diffs(name_status: &str, full_diff: &str) -> Vec { + use jcode_jbench::types::{FileDiff, FileDiffStatus}; + + // Parse name-status lines: e.g. "M\tpath/to/file.rs" or "R100\told\tnew". + let mut file_entries: Vec<(FileDiffStatus, String, Option)> = Vec::new(); + for line in name_status.lines() { + let line = line.trim(); + if line.is_empty() { + continue; + } + let parts: Vec<&str> = line.split('\t').collect(); + if parts.len() < 2 { + continue; + } + let code = parts[0]; + let (status, path, old_path) = match code { + "M" => (FileDiffStatus::Modified, parts[1].to_owned(), None), + "A" => (FileDiffStatus::Added, parts[1].to_owned(), None), + "D" => (FileDiffStatus::Deleted, parts[1].to_owned(), None), + r if r.starts_with('R') => { + // Renamed: "R100\told_path\tnew_path" + if parts.len() >= 3 { + ( + FileDiffStatus::Renamed, + parts[2].to_owned(), + Some(parts[1].to_owned()), + ) + } else { + (FileDiffStatus::Modified, parts[1].to_owned(), None) + } + } + "C" => { + // Copied — treat as Added for our purposes. + let path = if parts.len() >= 3 { parts[2] } else { parts[1] }; + (FileDiffStatus::Added, path.to_owned(), None) + } + _ => (FileDiffStatus::Modified, parts[1].to_owned(), None), + }; + file_entries.push((status, path, old_path)); + } + + // Split the full diff by "diff --git" boundaries to get per-file chunks. + let file_diffs_map = split_diff_by_file(full_diff); + + // Build FileDiff structs, matching by path. + let mut result = Vec::with_capacity(file_entries.len()); + for (status, path, old_path) in file_entries { + let diff_text = file_diffs_map.get(&path).cloned().unwrap_or_default(); + result.push(FileDiff { + path, + status, + old_path, + diff: diff_text, + }); + } + + result +} + +/// Split a unified diff into per-file chunks keyed by the post-image path. 
/// Split a unified diff into per-file chunks keyed by the post-image path.
///
/// A chunk starts at its `diff --git` line and runs up to (not including)
/// the next one; every line is stored with a trailing `\n`. Text before the
/// first `diff --git` line is ignored.
///
/// The key is taken from the `diff --git a/… b/…` header and then refined by
/// the `+++ b/…`, `rename to …` or `copy to …` header lines when present,
/// because the `diff --git` line alone is ambiguous for paths that contain
/// `" b/"`. Quoted paths (git's escaping for unusual characters) are not
/// unescaped.
fn split_diff_by_file(full_diff: &str) -> std::collections::HashMap<String, String> {
    /// Best-effort post-image path from the text after `diff --git `.
    fn path_from_git_header(rest: &str) -> String {
        // Unrenamed files look like "a/P b/P": two equal halves around the
        // middle space. Checking that first is exact even when `P` itself
        // contains " b/".
        if rest.len() % 2 == 1 {
            let mid = rest.len() / 2;
            if rest.as_bytes()[mid] == b' ' {
                let pre = rest[..mid].strip_prefix("a/");
                let post = rest[mid + 1..].strip_prefix("b/");
                if let (Some(pre), Some(post)) = (pre, post) {
                    if pre == post {
                        return post.to_owned();
                    }
                }
            }
        }
        rest.split_once(" b/")
            .map(|(_, path)| path)
            .unwrap_or("")
            .to_owned()
    }

    let mut chunks = std::collections::HashMap::new();
    let mut current_path: Option<String> = None;
    let mut current_chunk = String::new();
    // True between a `diff --git` line and the first hunk (`@@`) of that file.
    // Path-bearing header lines are only trusted there, so hunk content that
    // merely looks like a header cannot re-key the chunk.
    let mut in_header = false;

    for line in full_diff.lines() {
        if let Some(rest) = line.strip_prefix("diff --git ") {
            // Save previous chunk.
            if let Some(path) = current_path.take() {
                chunks.insert(path, std::mem::take(&mut current_chunk));
            }
            current_path = Some(path_from_git_header(rest));
            in_header = true;
        } else if in_header {
            if line.starts_with("@@") {
                in_header = false;
            } else if let Some(path) = line
                .strip_prefix("+++ b/")
                .or_else(|| line.strip_prefix("rename to "))
                .or_else(|| line.strip_prefix("copy to "))
            {
                // A trailing tab, if present after the file name, is not part of the path.
                current_path = Some(path.trim_end_matches('\t').to_owned());
            }
        }
        if current_path.is_some() {
            current_chunk.push_str(line);
            current_chunk.push('\n');
        }
    }
    // Don't forget the last chunk.
    if let Some(path) = current_path {
        chunks.insert(path, current_chunk);
    }

    chunks
}
+pub use crate::types::JudgingResult; + +use crate::types::{EvalCommit, JudgingResult as Scorecard}; + +/// Timeout for a single judge call. +const JUDGE_TIMEOUT_SECS: u64 = 20 * 60; + +/// How many judges must succeed for the pipeline to produce a result. +/// If fewer succeed, we return a zero-score error result. +const MIN_JUDGE_SUCCESS_COUNT: usize = 2; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum JudgeProviderKind { + OpenAI, // OpenAI Responses API + output_schema + Anthropic, // Anthropic Messages API + structured_outputs +} + +impl JudgeProviderKind { + pub fn for_model(model: &str) -> Self { + if model.contains("claude") || model.contains("anthropic") { + Self::Anthropic + } else { + Self::OpenAI + } + } +} + +/// Configuration for the judging pipeline. +#[derive(Debug, Clone)] +pub struct JudgeConfig { + /// API base URL for the OpenAI-compatible judge backend. + pub api_base: String, + /// API key for the OpenAI-compatible judge backend. + pub api_key: String, + /// Optional separate base URL for Anthropic-routed judges (e.g. + /// `https://api.anthropic.com`). Falls back to `api_base` when + /// `None`, which only makes sense if the OpenAI-compatible host + /// proxies the Anthropic Messages API too. + pub anthropic_api_base: Option, + /// Optional separate API key for Anthropic-routed judges. Falls + /// back to `api_key` when `None`. + pub anthropic_api_key: Option, + /// Model IDs for the three judges. Order determines the median + /// computation. + pub models: [String; 3], + /// Optional override for judge timeout per call. + pub timeout_secs: Option, + /// Custom HTTP client (uses shared client if None). 
+ pub http_client: Option, +} + +impl Default for JudgeConfig { + fn default() -> Self { + Self { + // Sensible defaults — override before use in production + api_base: std::env::var("JBENCH_API_BASE") + .unwrap_or_else(|_| "https://api.openai.com".to_owned()), + api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(), + anthropic_api_base: std::env::var("JBENCH_ANTHROPIC_API_BASE").ok(), + anthropic_api_key: std::env::var("JBENCH_ANTHROPIC_API_KEY").ok(), + models: [ + "gpt-5-2026-05".to_owned(), + "google/gemini-3.1-pro".to_owned(), + "anthropic/claude-sonnet-4-2026-05".to_owned(), + ], + timeout_secs: None, + http_client: None, + } + } +} + +/// Render the full judge prompt from commit + diff + context. +fn render_judge_prompt( + commit: &EvalCommit, + agent_diff: &str, + context_files: &HashMap, +) -> String { + let ground_truth_diffs = commit + .file_diffs + .iter() + .map(|fd| format!("### {}\n```diff\n{}\n```", fd.path, fd.diff)) + .collect::>() + .join("\n\n"); + + let context_content = context_files + .iter() + .map(|(path, content)| format!("### {path}\n```\n{content}\n```")) + .collect::>() + .join("\n\n"); + + format!( + "## User Prompt (What the agent was asked to do)\n{}\n\n## Context Files (from parent commit)\n{}\n\n## Ground Truth Changes (One valid implementation)\n{}\n\n## Agent's Changes (What the agent actually did)\n```diff\n{}\n```", + commit.prompt, context_content, ground_truth_diffs, agent_diff + ) +} + +/// System prompt for the judge agent (mirrors the TS `judgeAgentBase.systemPrompt`). +fn judge_system_prompt() -> &'static str { + r#"You are an expert software engineer evaluating AI-generated code changes with empathy for the task given. + +## Your Role + +You will receive: +1. The user prompt that the coding agent was given +2. Context files from the codebase +3. The ground truth changes (expected outcome) +4. 
The agent's actual changes + +## Evaluation Philosophy + +**Judge based on what the agent was asked to do, not on perfection.** + +- If the prompt is vague or high-level (e.g., "add authentication"), be lenient and accept any reasonable implementation that achieves the goal +- If the prompt is specific and detailed, expect the implementation to match those details more closely +- Focus on whether the agent understood and addressed the user's intent +- Consider that there are often multiple valid ways to implement the same feature + +## Evaluation Criteria + +- **Completion** (0-10): How well did the agent address what was asked in the prompt? Consider the specificity of the prompt. +- **Code Quality** (0-10): How well-structured and maintainable is the code? +- **Overall** (0-10): Combined assessment of whether the agent successfully completed the task as requested + +## Ground Truth + +The ground truth shows ONE valid implementation, but it's not the only correct answer. The agent's implementation should be judged on: +- Does it achieve the same functional outcome? +- Is it a reasonable approach given the prompt? +- Does it maintain code quality? + +Provide detailed analysis, strengths, weaknesses, and numerical scores."# +} + +#[derive(Serialize)] +struct JudgeRequest<'a> { + model: &'a str, + input: &'a str, + tools: &'a [serde_json::Value], + #[serde(skip_serializing_if = "Option::is_none")] + output_schema: Option<&'a serde_json::Value>, +} + +#[derive(Deserialize)] +struct JudgeResponse { + output: Option, + #[serde(default)] + choices: Vec, +} + +/// Invoke a single judge model with a fully-rendered prompt. +/// +/// `anthropic_api_base` / `anthropic_api_key` are only consulted when +/// the model routes through `JudgeProviderKind::Anthropic`; OpenAI-bound +/// requests always use the primary `api_base` / `api_key`. +/// +/// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` (`runSingleJudge`). 
+pub async fn run_single_judge( + model: &str, + prompt: &str, + api_base: &str, + api_key: &str, + anthropic_api_base: Option<&str>, + anthropic_api_key: Option<&str>, + http_client: &Client, +) -> Result { + let kind = JudgeProviderKind::for_model(model); + let system = judge_system_prompt(); + + if kind == JudgeProviderKind::OpenAI { + run_openai_judge(model, prompt, system, api_base, api_key, http_client).await + } else { + // Fall back to the primary host/key only if no Anthropic-specific + // overrides were configured. The caller is expected to set both + // overrides when targeting `api.anthropic.com` directly. + let base = anthropic_api_base.unwrap_or(api_base); + let key = anthropic_api_key.unwrap_or(api_key); + run_anthropic_judge(model, prompt, system, base, key, http_client).await + } +} + +async fn run_openai_judge( + model: &str, + prompt: &str, + system: &str, + api_base: &str, + api_key: &str, + http_client: &Client, +) -> Result { + let output_schema = serde_json::json!({ + "type": "object", + "properties": { + "analysis": { "type": "string", "description": "Detailed analysis comparing agent changes to ground truth" }, + "strengths": { "type": "array", "items": { "type": "string" }, "description": "Key strengths of the implementation" }, + "weaknesses": { "type": "array", "items": { "type": "string" }, "description": "Key weaknesses or issues found" }, + "completionScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "How completely the prompt was addressed" }, + "codeQualityScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "Code structure and maintainability" }, + "overallScore": { "type": "number", "minimum": 0, "maximum": 10, "description": "Combined assessment" } + }, + "required": ["analysis", "strengths", "weaknesses", "completionScore", "codeQualityScore", "overallScore"] + }); + + let request_body = serde_json::json!({ + "model": model, + "input": [ + { "role": "system", "content": system }, + { 
"role": "user", "content": prompt } + ], + "tools": [ + { + "type": "function", + "name": "set_output", + "description": "Submit the evaluation result", + "parameters": output_schema.clone() + } + ], + "tool_choice": { "type": "function", "name": "set_output" }, + "output_schema": output_schema, + }); + + let url = format!("{api_base}/v1/responses"); + let response = http_client + .post(&url) + .header("Authorization", format!("Bearer {api_key}")) + .header("Content-Type", "application/json") + .json(&request_body) + .timeout(Duration::from_secs(JUDGE_TIMEOUT_SECS)) + .send() + .await + .context("judge HTTP request failed")?; + + let status = response.status(); + let body: serde_json::Value = response + .json() + .await + .context("failed to parse judge response")?; + + if !status.is_success() { + anyhow::bail!("judge API returned {status}: {body}"); + } + + let output = body + .get("output") + .and_then(|o| o.as_array()) + .and_then(|arr| arr.first()) + .and_then(|item| item.get("content")) + .and_then(|c| c.as_array()) + .and_then(|arr| arr.first()) + .and_then(|item| item.get("text")) + .and_then(|t| t.as_str()); + + let output_value = output + .and_then(|t| serde_json::from_str::(t).ok()) + .or_else(|| body.get("output").cloned()) + .unwrap_or(serde_json::json!({ + "analysis": "No structured output received", + "strengths": [], + "weaknesses": ["Judge failed to return structured output"], + "completionScore": 0, + "codeQualityScore": 0, + "overallScore": 0 + })); + + parse_scorecard(output_value) +} + +async fn run_anthropic_judge( + model: &str, + prompt: &str, + system: &str, + api_base: &str, + api_key: &str, + http_client: &Client, +) -> Result { + let request_body = serde_json::json!({ + "model": model, + "messages": [ + { "role": "user", "content": prompt } + ], + "system": system, + "max_tokens": 4096, + "thinking": { + "type": "enabled", + "budget_tokens": 1024 + }, + }); + + // Anthropic Messages API authenticates via `x-api-key`, not + // 
`Authorization: Bearer ...`. Using the wrong header returns 401 + // even with a valid key, which previously made this branch + // permanently dead. + let url = format!("{api_base}/v1/messages"); + let response = http_client + .post(&url) + .header("x-api-key", api_key) + .header("Content-Type", "application/json") + .header("anthropic-version", "2023-06-01") + .json(&request_body) + .timeout(Duration::from_secs(JUDGE_TIMEOUT_SECS)) + .send() + .await + .context("judge HTTP request failed")?; + + let body: serde_json::Value = response + .json() + .await + .context("failed to parse anthropic judge response")?; + + // Anthropic returns content blocks — try to parse the final text block as JSON + let text = body + .get("content") + .and_then(|c| c.as_array()) + .and_then(|arr| arr.last()) + .and_then(|item| item.get("text")) + .and_then(|t| t.as_str()) + .unwrap_or_default(); + + let parsed = serde_json::from_str::(text).unwrap_or(serde_json::json!({ + "analysis": text.to_owned(), + "strengths": [], + "weaknesses": ["Could not parse structured output from Anthropic judge"], + "completionScore": 0, + "codeQualityScore": 0, + "overallScore": 0 + })); + + parse_scorecard(parsed) +} + +fn parse_scorecard(value: serde_json::Value) -> Result { + serde_json::from_value(value).context("failed to parse JudgingResult from judge output") +} + +/// Judge an agent's diff against the ground truth using three models in +/// parallel and return a [`JudgingResult`] whose qualitative analysis +/// comes from the median judge and whose numeric scores are averaged +/// across all judges that returned successfully. +/// +/// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` +/// (`judgeCommitResult`). 
+pub async fn judge_with_three_models( + commit: &EvalCommit, + agent_diff: &str, + context_files: &HashMap, + config: &JudgeConfig, +) -> Result { + let prompt = render_judge_prompt(commit, agent_diff, context_files); + let http: &reqwest::Client = match &config.http_client { + Some(c) => c, + None => shared_client(), + }; + + let timeout_duration = Duration::from_secs(config.timeout_secs.unwrap_or(JUDGE_TIMEOUT_SECS)); + + // Each judge gets its own timeout so a slow model doesn't starve the others. + let judge_futures: Vec<_> = config + .models + .iter() + .map(|model| { + let http = http.clone(); + let prompt = prompt.clone(); + async move { + timeout( + timeout_duration, + run_single_judge( + model, + &prompt, + &config.api_base, + &config.api_key, + config.anthropic_api_base.as_deref(), + config.anthropic_api_key.as_deref(), + &http, + ), + ) + .await + .ok() + .and_then(|r| r.ok()) + } + }) + .collect(); + + let valid: Vec = futures::future::join_all(judge_futures) + .await + .into_iter() + .filter_map(|r| r) + .collect(); + + if valid.len() < MIN_JUDGE_SUCCESS_COUNT { + return Ok(Scorecard { + analysis: format!( + "Error running judge agent — only {}/{} judges succeeded", + valid.len(), + 3 + ), + strengths: vec![], + weaknesses: vec![format!("Only {}/{} judges succeeded", valid.len(), 3)], + completion_score: 0.0, + code_quality_score: 0.0, + overall_score: 0.0, + }); + } + + // Median analysis — sort by overall_score and pick the middle + let mut sorted = valid.clone(); + sorted.sort_by(|a, b| { + a.overall_score + .partial_cmp(&b.overall_score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let median_idx = sorted.len() / 2; + let median = &sorted[median_idx]; + + let avg_completion = valid.iter().map(|r| r.completion_score).sum::() / valid.len() as f64; + let avg_quality = valid.iter().map(|r| r.code_quality_score).sum::() / valid.len() as f64; + let avg_overall = valid.iter().map(|r| r.overall_score).sum::() / valid.len() as f64; + + Ok(Scorecard { + 
analysis: median.analysis.clone(), + strengths: median.strengths.clone(), + weaknesses: median.weaknesses.clone(), + completion_score: (avg_completion * 10.0).round() / 10.0, + code_quality_score: (avg_quality * 10.0).round() / 10.0, + overall_score: (avg_overall * 10.0).round() / 10.0, + }) +} + +static SHARED_CLIENT: OnceLock = OnceLock::new(); + +fn shared_client() -> &'static Client { + SHARED_CLIENT.get_or_init(|| { + reqwest::Client::builder() + .connect_timeout(Duration::from_secs(15)) + .tcp_keepalive(Duration::from_secs(30)) + .pool_idle_timeout(Duration::from_secs(90)) + .build() + .expect("reqwest client must build") + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn judge_provider_kind_for_model() { + assert_eq!( + JudgeProviderKind::for_model("gpt-5"), + JudgeProviderKind::OpenAI + ); + assert_eq!( + JudgeProviderKind::for_model("claude-sonnet-4"), + JudgeProviderKind::Anthropic + ); + assert_eq!( + JudgeProviderKind::for_model("anthropic/claude-opus-4"), + JudgeProviderKind::Anthropic + ); + } + + /// Locks the wire-format contract: the LLM judge returns camelCase + /// (`completionScore`, etc.) per the request schema. Deserialization + /// must accept that even though the on-disk JSON form is snake_case. + #[test] + fn parse_scorecard_accepts_camelcase_from_llm() { + let camel = serde_json::json!({ + "analysis": "looks good", + "strengths": ["clean diff"], + "weaknesses": [], + "completionScore": 8.5, + "codeQualityScore": 7.0, + "overallScore": 7.8 + }); + let parsed = parse_scorecard(camel).expect("camelCase must deserialize"); + assert_eq!(parsed.completion_score, 8.5); + assert_eq!(parsed.code_quality_score, 7.0); + assert_eq!(parsed.overall_score, 7.8); + } + + /// snake_case (on-disk eval JSON) must round-trip as well. 
+ #[test] + fn parse_scorecard_accepts_snake_case_from_disk() { + let snake = serde_json::json!({ + "analysis": "", + "strengths": [], + "weaknesses": [], + "completion_score": 1.0, + "code_quality_score": 2.0, + "overall_score": 3.0 + }); + let parsed = parse_scorecard(snake).expect("snake_case must deserialize"); + assert_eq!(parsed.completion_score, 1.0); + assert_eq!(parsed.code_quality_score, 2.0); + assert_eq!(parsed.overall_score, 3.0); + } +} diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs new file mode 100644 index 000000000..f9cc09d06 --- /dev/null +++ b/evals/jbench/src/lessons.rs @@ -0,0 +1,320 @@ +//! Lessons extractor. +//! +//! After an eval run finishes, the lessons extractor compares the +//! agent's actual diff and trace against the ground-truth diff and +//! distills a small list of [`Lesson`]s describing what went wrong and +//! what the agent should have done instead. These can be appended to a +//! per-agent lessons file and folded back into the agent's system +//! prompt or memory graph. +//! +//! Design source: `/tmp/codebuff/evals/buffbench/lessons-extractor.ts`. + +use std::fs; +use std::path::Path; +use std::sync::OnceLock; +use std::time::Duration; + +use anyhow::{Context, Result}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; +use tokio::time::Duration as TokioDuration; + +/// Timeout for a lessons extraction call. +const LESSONS_TIMEOUT_SECS: u64 = 20 * 60; + +/// One distilled lesson from a single eval run. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Lesson { + pub what_went_wrong: String, + pub what_should_have_been_done: String, +} + +/// Configuration for lessons extraction. 
+#[derive(Clone)]
+pub struct LessonsConfig {
+    /// Base URL of the API; `/v1/responses` is appended to it.
+    pub api_base: String,
+    /// Credential sent in the `Authorization: Bearer` header.
+    pub api_key: String,
+    /// Model identifier sent in the request's `model` field.
+    pub model: String,
+    /// HTTP client to use. When `None`, a single lazily-built client is
+    /// shared by every call in the process.
+    pub http_client: Option<reqwest::Client>,
+}
+
+// Implemented by hand rather than derived so that `{:?}` output never
+// contains the API key.
+impl std::fmt::Debug for LessonsConfig {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("LessonsConfig")
+            .field("api_base", &self.api_base)
+            .field("api_key", &"<redacted>")
+            .field("model", &self.model)
+            .field("http_client", &self.http_client)
+            .finish()
+    }
+}
+
+impl Default for LessonsConfig {
+    /// Reads `JBENCH_API_BASE` and `JBENCH_API_KEY` from the environment,
+    /// falling back to `https://api.openai.com` and an empty key.
+    fn default() -> Self {
+        Self {
+            api_base: std::env::var("JBENCH_API_BASE")
+                .unwrap_or_else(|_| "https://api.openai.com".to_owned()),
+            api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(),
+            model: "gpt-5-2026-05".to_owned(),
+            http_client: None,
+        }
+    }
+}
+
+/// Build the user message for the lessons extractor.
+///
+/// The message always contains the prompt, the ground-truth diff, the
+/// agent's diff and the agent trace. A "Judge Summary" and an "Agent
+/// Error" section are appended only when the matching argument is
+/// `Some`.
+fn render_lessons_prompt(
+    prompt: &str,
+    ground_truth_diff: &str,
+    agent_diff: &str,
+    agent_trace: &str,
+    judge_summary: Option<&str>,
+    error: Option<&str>,
+) -> String {
+    let judge_section = judge_summary
+        .map(|s| format!("\n## Judge Summary\n{s}"))
+        .unwrap_or_default();
+    let error_section = error
+        .map(|e| format!("\n## Agent Error\n{e}"))
+        .unwrap_or_default();
+    // Every placeholder is captured from the local scope, so no explicit
+    // `name = name` arguments are needed.
+    format!(
+        "## User Prompt\n{prompt}\n\n\
+         ## Ground Truth Changes (One valid implementation)\n\
+         ```diff\n{ground_truth_diff}\n```\n\n\
+         ## Agent's Changes\n\
+         ```diff\n{agent_diff}\n```\n\n\
+         ## Agent Trace\n\
+         ```json\n{agent_trace}\n```\
+         {judge_section}{error_section}\n\n\
+         Task: Analyze what went wrong and what should have been done."
+    )
+}
+
+/// System prompt describing the lesson-extraction task and the two fields
+/// (`whatWentWrong`, `whatShouldHaveBeenDone`) every lesson must carry.
+fn lessons_system_prompt() -> &'static str {
+    r#"You are a Lesson Extractor. Your job: analyze agent performance and extract actionable lessons.
+
+Context you receive:
+- User prompt (what the coding agent was asked)
+- Ground truth diffs (one valid solution path)
+- The agent's diffs (what they actually changed)
+- A truncated agent trace showing HOW they worked
+- Optional judge summary (scores, weaknesses)
+
+You must output an array of lessons. Each lesson has two parts:
+
+1. **whatWentWrong**: What the agent did incorrectly, misunderstood, or failed to do
+2. **whatShouldHaveBeenDone**: The correct approach the agent should have taken
+
+Rules:
+- Each lesson should be a complete learning unit (problem + solution)
+- Keep lessons terse but precise (~140 chars per field)
+- Do not include things the agent already did correctly
+- Focus on gaps that, if filled, would have improved the outcome"#
+}
+
+/// Locate the `{ "lessons": [...] }` object inside a response body.
+///
+/// Every item of the `output` array is inspected, not just the first:
+/// a JSON-encoded `arguments` string (how a forced tool call is expected
+/// to arrive) is tried first, then each `content[].text` part. As a last
+/// resort `output` itself is accepted when it already is an object with a
+/// `lessons` key. Returns `None` when no candidate contains `lessons`.
+fn find_lessons_payload(body: &serde_json::Value) -> Option<serde_json::Value> {
+    fn parse_payload(raw: &str) -> Option<serde_json::Value> {
+        serde_json::from_str::<serde_json::Value>(raw)
+            .ok()
+            .filter(|value| value.get("lessons").is_some())
+    }
+
+    let output = body.get("output")?;
+
+    if let Some(items) = output.as_array() {
+        for item in items {
+            let from_tool_call = item
+                .get("arguments")
+                .and_then(|a| a.as_str())
+                .and_then(parse_payload);
+            if from_tool_call.is_some() {
+                return from_tool_call;
+            }
+
+            let from_text = item
+                .get("content")
+                .and_then(|c| c.as_array())
+                .into_iter()
+                .flatten()
+                .filter_map(|part| part.get("text").and_then(|t| t.as_str()))
+                .find_map(parse_payload);
+            if from_text.is_some() {
+                return from_text;
+            }
+        }
+    }
+
+    output.get("lessons").is_some().then(|| output.clone())
+}
+
+/// Run the lessons-extractor judge over a finished eval run and return
+/// zero or more [`Lesson`]s.
+///
+/// An empty vector is returned when the response contains no lessons
+/// payload; individual entries that do not deserialize into a [`Lesson`]
+/// are skipped.
+///
+/// # Errors
+///
+/// Returns an error when the HTTP request fails or times out, when the
+/// server answers with a non-success status, or when the response body is
+/// not valid JSON.
+pub async fn extract_lessons(
+    prompt: &str,
+    ground_truth_diff: &str,
+    agent_diff: &str,
+    agent_trace: &str,
+    config: &LessonsConfig,
+    judge_summary: Option<&str>,
+    error: Option<&str>,
+) -> Result<Vec<Lesson>> {
+    let prompt_text = render_lessons_prompt(
+        prompt,
+        ground_truth_diff,
+        agent_diff,
+        agent_trace,
+        judge_summary,
+        error,
+    );
+
+    let http = match &config.http_client {
+        Some(client) => client,
+        None => {
+            // Built once and reused so connections are pooled across calls.
+            static CLIENT: OnceLock<reqwest::Client> = OnceLock::new();
+            CLIENT.get_or_init(|| {
+                reqwest::Client::builder()
+                    .connect_timeout(Duration::from_secs(15))
+                    .tcp_keepalive(Duration::from_secs(30))
+                    .pool_idle_timeout(Duration::from_secs(90))
+                    .build()
+                    .expect("reqwest client must build")
+            })
+        }
+    };
+
+    // One schema, used both as the tool's parameters and as `output_schema`.
+    let lessons_schema = serde_json::json!({
+        "type": "object",
+        "properties": {
+            "lessons": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "whatWentWrong": { "type": "string" },
+                        "whatShouldHaveBeenDone": { "type": "string" }
+                    },
+                    "required": ["whatWentWrong", "whatShouldHaveBeenDone"]
+                }
+            }
+        },
+        "required": ["lessons"]
+    });
+
+    // NOTE(review): `output_schema` is sent next to the forced tool call;
+    // confirm the target API accepts this field.
+    let request_body = serde_json::json!({
+        "model": &config.model,
+        "input": [
+            { "role": "system", "content": lessons_system_prompt() },
+            { "role": "user", "content": prompt_text }
+        ],
+        "tools": [
+            {
+                "type": "function",
+                "name": "set_output",
+                "description": "Submit lessons derived from this evaluation",
+                "parameters": &lessons_schema
+            }
+        ],
+        "tool_choice": { "type": "function", "name": "set_output" },
+        "output_schema": &lessons_schema,
+    });
+
+    // Tolerate a configured base URL that ends in `/`.
+    let url = format!("{}/v1/responses", config.api_base.trim_end_matches('/'));
+    let response = http
+        .post(&url)
+        .header("Authorization", format!("Bearer {}", config.api_key))
+        .header("Content-Type", "application/json")
+        .json(&request_body)
+        .timeout(TokioDuration::from_secs(LESSONS_TIMEOUT_SECS))
+        .send()
+        .await
+        .context("lessons extraction HTTP request failed")?
+        // Without this, an auth or validation failure would be parsed as
+        // a normal body and reported as "no lessons".
+        .error_for_status()
+        .context("lessons extractor returned an error status")?;
+
+    let body: serde_json::Value = response
+        .json()
+        .await
+        .context("failed to parse lessons extractor response")?;
+
+    let payload = find_lessons_payload(&body);
+    let lessons: Vec<Lesson> = payload
+        .as_ref()
+        .and_then(|p| p.get("lessons"))
+        .and_then(|l| l.as_array())
+        .map(|entries| {
+            entries
+                .iter()
+                .filter_map(|v| serde_json::from_value(v.clone()).ok())
+                .collect()
+        })
+        .unwrap_or_default();
+
+    Ok(lessons)
+}
+
+/// Append `lessons` to the per-agent lessons file at
+/// `lessons_dir/<agent_id>.json`, creating the file (and the directory)
+/// if needed.
+///
+/// The file name is `agent_id` with every character that is not
+/// alphanumeric, `-` or `_` replaced by `_`. Nothing is touched when
+/// `lessons` is empty.
+///
+/// # Errors
+///
+/// Returns an error when the directory cannot be created or the file
+/// cannot be read, parsed, serialized or written. An existing non-empty
+/// file that is not a valid lessons array is reported as an error rather
+/// than overwritten, so previously recorded lessons are never discarded.
+pub fn append_lessons_to_file(
+    agent_id: &str,
+    lessons: &[Lesson],
+    lessons_dir: &Path,
+) -> Result<()> {
+    if lessons.is_empty() {
+        return Ok(());
+    }
+
+    // `create_dir_all` already succeeds when the directory exists, so no
+    // separate existence check is needed.
+    fs::create_dir_all(lessons_dir).context("failed to create lessons directory")?;
+
+    let safe_id = agent_id.replace(|c: char| !c.is_alphanumeric() && c != '-' && c != '_', "_");
+    let file_path = lessons_dir.join(format!("{safe_id}.json"));
+
+    // Read first and interpret "not found" afterwards, instead of probing
+    // with `exists()` and racing against the read.
+    let existing = match fs::read_to_string(&file_path) {
+        Ok(contents) => Some(contents),
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
+        Err(e) => return Err(e).context("failed to read existing lessons file"),
+    };
+
+    let mut all_lessons: Vec<Lesson> = match existing.as_deref().map(str::trim) {
+        // A missing or blank file holds nothing that could be lost.
+        None | Some("") => Vec::new(),
+        Some(contents) => serde_json::from_str(contents)
+            .context("existing lessons file is not a valid lessons array")?,
+    };
+    all_lessons.extend_from_slice(lessons);
+
+    let json = serde_json::to_string_pretty(&all_lessons).context("failed to serialize lessons")?;
+
+    fs::write(&file_path, json).context("failed to write lessons file")?;
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    #[test]
+    fn append_lessons_keeps_unparsable_file_intact() {
+        let tmp = TempDir::new().unwrap();
+        let file_path = tmp.path().join("broken-agent.json");
+        fs::write(&file_path, "not json").unwrap();
+
+        let result = append_lessons_to_file(
+            "broken-agent",
+            &[Lesson {
+                what_went_wrong: "forgot null check".to_owned(),
+                what_should_have_been_done: "add null guard".to_owned(),
+            }],
+            tmp.path(),
+        );
+
+        assert!(result.is_err());
+        assert_eq!(fs::read_to_string(&file_path).unwrap(), "not json");
+    }
+
+    #[test]
+    fn append_lessons_to_empty_dir() {
+        let tmp = TempDir::new().unwrap();
+        let result = append_lessons_to_file(
+            "test-agent",
+            &[Lesson {
+                what_went_wrong: "forgot null check".to_owned(),
+                what_should_have_been_done: "add null guard".to_owned(),
+            }],
+            tmp.path(),
+        );
+        assert!(result.is_ok());
+        let contents = fs::read_to_string(tmp.path().join("test-agent.json")).unwrap();
+        let lessons: Vec<Lesson> = serde_json::from_str(&contents).unwrap();
+        assert_eq!(lessons.len(), 1);
+    }
+
+    #[test]
+    fn append_lessons_accumulates() {
+        let tmp = TempDir::new().unwrap();
+        let agent = "clone-agent";
+
+        fs::create_dir_all(tmp.path()).unwrap();
+        let file_path = tmp.path().join("clone-agent.json");
+        let first = vec![Lesson {
+            what_went_wrong: "first mistake".to_owned(),
+            what_should_have_been_done: "first fix".to_owned(),
+        }];
+        let json = serde_json::to_string_pretty(&first).unwrap();
+        fs::write(&file_path, json).unwrap();
+
+        let second = vec![Lesson
{ + what_went_wrong: "second mistake".to_owned(), + what_should_have_been_done: "second fix".to_owned(), + }]; + append_lessons_to_file(agent, &second, tmp.path()).unwrap(); + + let contents = fs::read_to_string(tmp.path().join("clone-agent.json")).unwrap(); + let lessons: Vec = serde_json::from_str(&contents).unwrap(); + assert_eq!(lessons.len(), 2); + } +} diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs new file mode 100644 index 000000000..48860cdcb --- /dev/null +++ b/evals/jbench/src/lib.rs @@ -0,0 +1,26 @@ +//! JBench — jcode's git-commit-reconstruction evaluation framework. +//! +//! This crate is a scaffold: data types are real and roundtrip-tested, +//! but orchestration logic is stubbed with `unimplemented!()` so that +//! reviewers can validate the public API surface before behavior lands. +//! +//! See `README.md` for the design and the BuffBench reference at +//! `/tmp/codebuff/evals/buffbench/` for the TypeScript original. +//! +//! The crate consumes [`jcode_agent_runtime::AgentRegistry`] and +//! [`jcode_agent_runtime::AgentDefinition`] for agent discovery and +//! configuration; it does not redefine those concepts locally. + +#![forbid(unsafe_code)] + +#[cfg(feature = "agent-runner")] +pub mod agent_runner; +pub mod judge; +pub mod lessons; +pub mod types; + +#[cfg(feature = "agent-runner")] +pub use agent_runner::AgentRunConfig; +pub use judge::JudgeConfig; +pub use lessons::LessonsConfig; +pub use types::{AgentEvalResults, EvalCommit, EvalDataV2, EvalRun, JudgingResult}; diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs new file mode 100644 index 000000000..39d4645c5 --- /dev/null +++ b/evals/jbench/src/types.rs @@ -0,0 +1,195 @@ +//! Serializable data types modeling JBench's eval inputs and outputs. +//! +//! These types are direct Rust analogues of BuffBench's TypeScript types +//! (`/tmp/codebuff/evals/buffbench/types.ts`) with one deliberate +//! 
deviation: every field uses `snake_case` in both the Rust definition +//! and the on-disk JSON form, because the rest of jcode's serialized +//! formats already follow `snake_case`. +//! +//! All public types derive `Debug`, `Clone`, `Serialize`, and +//! `Deserialize`. Numeric scores are `f64` in the `[0.0, 10.0]` range — +//! validation is not enforced at the type level so partial / in-progress +//! results round-trip cleanly. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +/// Status of a single file inside an [`EvalCommit`]'s diff. +/// +/// Mirrors BuffBench's `'modified' | 'added' | 'deleted' | 'renamed'` +/// string union; serialized as lowercase strings so generated eval JSON +/// stays compact and readable. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum FileDiffStatus { + /// File existed before and after, with content changes. + Modified, + /// File was created in this commit. + Added, + /// File was deleted in this commit. + Deleted, + /// File was renamed (and possibly modified) in this commit. + Renamed, +} + +/// Per-file diff entry for a single eval commit. +/// +/// `old_path` is populated only for `Renamed` entries; for all other +/// statuses it is `None` and skipped during serialization to keep the +/// JSON output compact. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FileDiff { + /// Current path of the file (post-commit). For renames this is the + /// new name. + pub path: String, + /// What kind of change this file underwent. + pub status: FileDiffStatus, + /// Previous path, only populated when `status == Renamed`. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub old_path: Option, + /// Unified diff text for the change. May be empty for pure renames. + pub diff: String, +} + +/// One eval task: a single git commit reconstructed from its parent. 
+///
+/// The agent under test starts from `parent_sha`, is given `prompt`,
+/// and is judged against `file_diffs`. `supplemental_files` lists
+/// additional context paths the harness should preload into the agent's
+/// view (BuffBench picks these via a separate filter step).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EvalCommit {
+    /// Stable identifier for the task, typically `<short-sha>-<slug>`
+    /// (e.g. `abc1234-add-readme`).
+    pub id: String,
+    /// Target commit SHA — the ground-truth state.
+    pub sha: String,
+    /// Parent commit SHA — the starting state for the agent.
+    pub parent_sha: String,
+    /// Technical specification distilled from the commit message.
+    pub spec: String,
+    /// Natural-language prompt presented to the agent under test.
+    pub prompt: String,
+    /// Extra files (relative paths) the harness should expose as
+    /// context, in addition to whatever the agent fetches itself.
+    pub supplemental_files: Vec<String>,
+    /// Ground-truth file diffs for this commit.
+    pub file_diffs: Vec<FileDiff>,
+}
+
+/// Top-level eval data file (v2 schema), produced by `gen-evals` and
+/// consumed by `run`.
+///
+/// `env` and `final_check_commands` are reserved for future use by the
+/// runner; they are part of the on-disk schema today so eval JSON files
+/// authored against this scaffold remain forward-compatible.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EvalDataV2 {
+    /// Source repository to clone for each task.
+    pub repo_url: String,
+    /// Optional override for the local clone directory name.
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub test_repo_name: Option<String>,
+    /// ISO-8601 timestamp of when this eval file was generated.
+    pub generation_date: String,
+    /// Optional one-time setup command (e.g. `npm install`).
+    #[serde(skip_serializing_if = "Option::is_none", default)]
+    pub init_command: Option<String>,
+    /// Environment variables to apply when running agents and final
+    /// checks. Defaults to empty.
+    #[serde(default)]
+    pub env: HashMap<String, String>,
+    /// Validation commands run after the agent finishes (e.g. `cargo
+    /// test`). Defaults to empty.
+    #[serde(default)]
+    pub final_check_commands: Vec<String>,
+    /// The actual list of commits to evaluate against.
+    pub eval_commits: Vec<EvalCommit>,
+}
+
+/// Output of a single judge invocation (or the median of three).
+///
+/// All three score fields are on the same `[0.0, 10.0]` scale; `f64` is
+/// used so we can also store the *averaged* per-dimension scores when
+/// aggregating multiple judges (see `judge::judge_with_three_models`).
+///
+/// On-disk JSON stays `snake_case` to match the rest of jcode's eval
+/// outputs, but each score field also accepts the `camelCase` spelling
+/// (`completionScore`, etc.) via `serde(alias = ...)` so we can
+/// deserialize LLM judge responses directly without an intermediate
+/// wire-format struct.
+///
+/// The derived `Default` is the zero-scored placeholder: empty analysis,
+/// no strengths or weaknesses, and every score `0.0`.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct JudgingResult {
+    /// Free-form prose comparing the agent's diff to the ground truth.
+    pub analysis: String,
+    /// Bullet-point strengths called out by the judge.
+    pub strengths: Vec<String>,
+    /// Bullet-point weaknesses called out by the judge.
+    pub weaknesses: Vec<String>,
+    /// How completely the prompt was addressed, `[0.0, 10.0]`.
+    #[serde(alias = "completionScore")]
+    pub completion_score: f64,
+    /// Code structure / maintainability, `[0.0, 10.0]`.
+    #[serde(alias = "codeQualityScore")]
+    pub code_quality_score: f64,
+    /// Combined assessment, `[0.0, 10.0]`. JBench's canonical metric.
+    #[serde(alias = "overallScore")]
+    pub overall_score: f64,
+}
+
+/// Outcome of running one agent on one eval commit.
+/// +/// `error` is `Some` when the agent crashed, timed out, or otherwise +/// failed to produce a usable diff; in that case `judging` will +/// typically contain a zero-scored placeholder. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EvalRun { + /// SHA of the eval commit this run targeted. + pub commit_sha: String, + /// Prompt the agent was given. + pub prompt: String, + /// Unified diff produced by the agent against the parent commit. + pub diff: String, + /// Three-judge result (see [`crate::judge`]). + pub judging: JudgingResult, + /// Estimated USD cost of running the agent. + pub cost_usd: f64, + /// Wall-clock duration of the run in milliseconds. + pub duration_ms: u64, + /// Populated when the run failed to complete cleanly. + #[serde(skip_serializing_if = "Option::is_none", default)] + pub error: Option, +} + +/// Aggregated results for one agent across an entire eval suite. +/// +/// `average_score` here is `overall_score`; cost and duration averages +/// are computed across **all** runs (including failures) so consumers +/// can spot agents that are cheap or fast at the price of correctness. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AgentEvalResults { + /// ID of the agent (matches an `AgentDefinition::id` in the + /// `jcode-agent-runtime` registry). + pub agent_id: String, + /// Per-commit runs, in evaluation order. + pub runs: Vec, + /// Mean of `judging.overall_score` across runs. + pub average_score: f64, + /// Mean of `cost_usd` across runs. + pub average_cost: f64, + /// Mean of `duration_ms` across runs. + pub average_duration_ms: u64, +} diff --git a/evals/jbench/tests/types.rs b/evals/jbench/tests/types.rs new file mode 100644 index 000000000..fcaa832fb --- /dev/null +++ b/evals/jbench/tests/types.rs @@ -0,0 +1,106 @@ +//! Serde round-trip smoke tests for the public data types. +//! +//! These exercise the JSON shape that `gen-evals` and `run` will read +//! 
and write, and they fail loudly if anyone changes a field's +//! `snake_case` name without updating consumers. + +use jcode_jbench::types::{EvalCommit, FileDiff, FileDiffStatus, JudgingResult}; + +#[test] +fn eval_commit_round_trips_through_json() { + let original = EvalCommit { + id: "abc1234-add-readme".to_string(), + sha: "abc1234deadbeef".to_string(), + parent_sha: "0011223344556677".to_string(), + spec: "Add a README describing the project.".to_string(), + prompt: "Please add a README.md at the repo root.".to_string(), + supplemental_files: vec!["Cargo.toml".to_string(), "src/lib.rs".to_string()], + file_diffs: vec![FileDiff { + path: "README.md".to_string(), + status: FileDiffStatus::Added, + old_path: None, + diff: "+++ b/README.md\n@@ -0,0 +1 @@\n+hello\n".to_string(), + }], + }; + + let json = serde_json::to_string(&original).expect("serialize EvalCommit"); + // Sanity-check the wire format is snake_case as documented. + assert!(json.contains("\"parent_sha\"")); + assert!(json.contains("\"supplemental_files\"")); + assert!(json.contains("\"file_diffs\"")); + + let decoded: EvalCommit = serde_json::from_str(&json).expect("deserialize EvalCommit"); + assert_eq!(decoded.id, original.id); + assert_eq!(decoded.sha, original.sha); + assert_eq!(decoded.parent_sha, original.parent_sha); + assert_eq!(decoded.spec, original.spec); + assert_eq!(decoded.prompt, original.prompt); + assert_eq!(decoded.supplemental_files, original.supplemental_files); + assert_eq!(decoded.file_diffs.len(), 1); + assert_eq!(decoded.file_diffs[0].path, "README.md"); + assert!(matches!( + decoded.file_diffs[0].status, + FileDiffStatus::Added + )); +} + +#[test] +fn file_diff_round_trips_renamed_with_old_path() { + let original = FileDiff { + path: "src/new_name.rs".to_string(), + status: FileDiffStatus::Renamed, + old_path: Some("src/old_name.rs".to_string()), + diff: "rename from src/old_name.rs\nrename to src/new_name.rs\n".to_string(), + }; + + let json = 
serde_json::to_string(&original).expect("serialize FileDiff"); + assert!(json.contains("\"status\":\"renamed\"")); + assert!(json.contains("\"old_path\":\"src/old_name.rs\"")); + + let decoded: FileDiff = serde_json::from_str(&json).expect("deserialize FileDiff"); + assert_eq!(decoded.path, original.path); + assert!(matches!(decoded.status, FileDiffStatus::Renamed)); + assert_eq!(decoded.old_path.as_deref(), Some("src/old_name.rs")); + assert_eq!(decoded.diff, original.diff); + + // And a Modified entry should omit `old_path` from the JSON. + let modified = FileDiff { + path: "src/lib.rs".to_string(), + status: FileDiffStatus::Modified, + old_path: None, + diff: "@@ -1 +1 @@\n-old\n+new\n".to_string(), + }; + let modified_json = serde_json::to_string(&modified).expect("serialize Modified FileDiff"); + assert!( + !modified_json.contains("old_path"), + "old_path should be skipped when None, got: {modified_json}" + ); +} + +#[test] +fn judging_result_round_trips_through_json() { + let original = JudgingResult { + analysis: "The agent addressed the prompt and produced clean code.".to_string(), + strengths: vec![ + "Followed existing module structure.".to_string(), + "Added a passing test.".to_string(), + ], + weaknesses: vec!["Missed an edge case in error handling.".to_string()], + completion_score: 8.5, + code_quality_score: 7.0, + overall_score: 7.75, + }; + + let json = serde_json::to_string(&original).expect("serialize JudgingResult"); + assert!(json.contains("\"completion_score\"")); + assert!(json.contains("\"code_quality_score\"")); + assert!(json.contains("\"overall_score\"")); + + let decoded: JudgingResult = serde_json::from_str(&json).expect("deserialize JudgingResult"); + assert_eq!(decoded.analysis, original.analysis); + assert_eq!(decoded.strengths, original.strengths); + assert_eq!(decoded.weaknesses, original.weaknesses); + assert!((decoded.completion_score - original.completion_score).abs() < f64::EPSILON); + assert!((decoded.code_quality_score - 
original.code_quality_score).abs() < f64::EPSILON); + assert!((decoded.overall_score - original.overall_score).abs() < f64::EPSILON); +} diff --git a/src/bin/harness.rs b/src/bin/harness.rs index d6e9a301d..e0a467f98 100644 --- a/src/bin/harness.rs +++ b/src/bin/harness.rs @@ -73,7 +73,7 @@ async fn main() -> Result<()> { eprintln!("Harness workspace: {}", workspace.display()); let provider: Arc = Arc::new(NoopProvider); - let registry = Registry::new(provider).await; + let registry = Registry::new(provider, None).await; let session_id = new_id("harness"); let base_ctx = ToolContext { diff --git a/src/cli/commands.rs b/src/cli/commands.rs index cbe734875..a257ce34e 100644 --- a/src/cli/commands.rs +++ b/src/cli/commands.rs @@ -2595,7 +2595,7 @@ pub async fn run_single_message_command( } else { super::provider_init::init_provider_for_validation(choice, model).await? }; - let registry = crate::tool::Registry::new(provider.clone()).await; + let registry = crate::tool::Registry::new(provider.clone(), crate::tool::shared_agent_registry()).await; let mut agent = crate::agent::Agent::new(provider.clone(), registry); restore_agent_session_if_requested(&mut agent, resume_session)?; diff --git a/src/cli/commands_tests.rs b/src/cli/commands_tests.rs index c8aba0c90..224e4bceb 100644 --- a/src/cli/commands_tests.rs +++ b/src/cli/commands_tests.rs @@ -952,7 +952,7 @@ async fn restore_agent_session_if_requested_restores_resumed_session() { let _guard = crate::storage::lock_test_env(); let provider: Arc = Arc::new(TestProvider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut original = crate::agent::Agent::new(provider.clone(), registry); let original_session_id = original.session_id().to_string(); original @@ -960,7 +960,7 @@ async fn restore_agent_session_if_requested_restores_resumed_session() { .await .expect("seed session"); - let registry = Registry::new(provider.clone()).await; + let registry = 
Registry::new(provider.clone(), None).await; let mut resumed = crate::agent::Agent::new(provider, registry); let fresh_session_id = resumed.session_id().to_string(); assert_ne!(fresh_session_id, original_session_id); diff --git a/src/cli/provider_doctor.rs b/src/cli/provider_doctor.rs index e438604db..c257165c1 100644 --- a/src/cli/provider_doctor.rs +++ b/src/cli/provider_doctor.rs @@ -30,9 +30,8 @@ pub async fn run_provider_doctor_command( Some("claude") => run_claude_native_e2e(provider, model, tier).await?, Some("antigravity") => run_antigravity_native_e2e(provider, model, tier).await?, Some(other) => { - let kind = NativeProviderKind::from_normalized(other).ok_or_else(|| { - anyhow!("`{provider}` has no native provider-doctor driver") - })?; + let kind = NativeProviderKind::from_normalized(other) + .ok_or_else(|| anyhow!("`{provider}` has no native provider-doctor driver"))?; run_generic_native_e2e(kind, model, tier).await? } None => anyhow::bail!("`{provider}` has no native provider-doctor driver"), diff --git a/src/cli/provider_init.rs b/src/cli/provider_init.rs index 7fce11689..f6efc81a7 100644 --- a/src/cli/provider_init.rs +++ b/src/cli/provider_init.rs @@ -1780,7 +1780,7 @@ pub async fn init_provider_and_registry( model: Option<&str>, ) -> Result<(Arc, tool::Registry)> { let provider = init_provider(choice, model).await?; - let registry = tool::Registry::new(provider.clone()).await; + let registry = tool::Registry::new(provider.clone(), tool::shared_agent_registry()).await; Ok((provider, registry)) } @@ -1789,7 +1789,7 @@ pub async fn init_provider_and_registry_for_validation( model: Option<&str>, ) -> Result<(Arc, tool::Registry)> { let provider = init_provider_for_validation(choice, model).await?; - let registry = tool::Registry::new(provider.clone()).await; + let registry = tool::Registry::new(provider.clone(), tool::shared_agent_registry()).await; Ok((provider, registry)) } diff --git a/src/cli/selfdev_tests.rs b/src/cli/selfdev_tests.rs index 
643f73902..0836c9df7 100644 --- a/src/cli/selfdev_tests.rs +++ b/src/cli/selfdev_tests.rs @@ -130,7 +130,7 @@ async fn test_selfdev_tool_registration() { assert!(session.is_canary, "Session should be marked as canary"); let provider = Arc::new(TestProvider) as Arc; - let registry = tool::Registry::new(provider).await; + let registry = tool::Registry::new(provider, None).await; let tools_before: Vec = registry.tool_names().await; let has_selfdev_before = tools_before.contains(&"selfdev".to_string()); @@ -167,7 +167,7 @@ async fn test_selfdev_session_and_registry() { assert!(loaded.is_canary, "Loaded session should be canary"); let provider = Arc::new(TestProvider) as Arc; - let registry = tool::Registry::new(provider.clone()).await; + let registry = tool::Registry::new(provider.clone(), None).await; let tools_before = registry.tool_names().await; assert!( diff --git a/tests/e2e/ambient.rs b/tests/e2e/ambient.rs index d92012834..9438f2b0f 100644 --- a/tests/e2e/ambient.rs +++ b/tests/e2e/ambient.rs @@ -203,7 +203,7 @@ async fn test_ambient_end_cycle_tool() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_ambient_tools().await; let mut agent = Agent::new(provider, registry); @@ -261,7 +261,7 @@ async fn test_ambient_request_permission_tool() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_ambient_tools().await; let mut agent = Agent::new(provider, registry); @@ -309,7 +309,7 @@ async fn test_ambient_schedule_tool() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_ambient_tools().await; let mut agent = Agent::new(provider, registry); 
@@ -585,7 +585,7 @@ async fn test_full_ambient_cycle_simulation() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; registry.register_ambient_tools().await; let mut agent = Agent::new(provider.clone(), registry); diff --git a/tests/e2e/provider_behavior.rs b/tests/e2e/provider_behavior.rs index 5bce2b96f..f82213547 100644 --- a/tests/e2e/provider_behavior.rs +++ b/tests/e2e/provider_behavior.rs @@ -25,7 +25,7 @@ async fn test_multi_turn_conversation() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); // First turn @@ -60,7 +60,7 @@ async fn test_token_usage() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let response = agent.run_once_capture("Test").await?; @@ -84,7 +84,7 @@ async fn test_stream_error() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let result = agent.run_once_capture("Test").await; @@ -800,7 +800,7 @@ async fn test_system_prompt_no_claude_code_identity() -> Result<()> { // Keep a clone of Arc before converting to Arc let provider_for_check = provider.clone(); let provider_dyn: Arc = provider; - let registry = Registry::new(provider_dyn.clone()).await; + let registry = Registry::new(provider_dyn.clone(), None).await; let mut agent = Agent::new(provider_dyn, registry); // Run a simple query - we just need to trigger a complete() call diff --git a/tests/e2e/reload_multiclient.rs 
b/tests/e2e/reload_multiclient.rs index dd8fd6b6f..8e6e077cd 100644 --- a/tests/e2e/reload_multiclient.rs +++ b/tests/e2e/reload_multiclient.rs @@ -160,7 +160,10 @@ async fn reload_notifies_successor_after_session_takeover() -> Result<()> { assert!( b_saw, "the live successor connection must be told the server is reloading; saw: {:?}", - b_events.iter().map(|e| format!("{e:?}")).collect::>() + b_events + .iter() + .map(|e| format!("{e:?}")) + .collect::>() ); // The superseded original connection must end (disconnect) rather than diff --git a/tests/e2e/session_flow.rs b/tests/e2e/session_flow.rs index b84df85a1..587781d8b 100644 --- a/tests/e2e/session_flow.rs +++ b/tests/e2e/session_flow.rs @@ -138,7 +138,7 @@ async fn test_simple_response() -> Result<()> { ]); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); let response = agent.run_once_capture("Say hello").await?; @@ -154,7 +154,7 @@ async fn test_agent_clear_preserves_debug_flag() -> Result<()> { let _env = setup_test_env()?; let provider = MockProvider::new(); let provider: Arc = Arc::new(provider); - let registry = Registry::new(provider.clone()).await; + let registry = Registry::new(provider.clone(), None).await; let mut agent = Agent::new(provider, registry); agent.set_debug(true); let old_session_id = agent.session_id().to_string();