From c0bcaca4c280934f911c6a54cf3021ec8f20101a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tr=E1=BA=A7n=20Quang=20=C4=90=C3=A3ng?= Date: Thu, 28 May 2026 01:50:55 +0000 Subject: [PATCH] fix(agent-runtime): address PR #313 review issues Bugs fixed: 1. JudgingResult deserialization (jbench/types.rs) The judge prompt schema asks for camelCase fields (completionScore, codeQualityScore, overallScore) but the Rust struct used snake_case without serde rename. parse_scorecard would fail on every real judge response. Fix: add #[serde(alias = ...)] on each score field so on-disk JSON stays snake_case while LLM-returned camelCase still deserializes cleanly. 2. Anthropic judge authentication (jbench/judge.rs) run_anthropic_judge used Authorization: Bearer which always 401s on the Anthropic Messages API. Fix: switch to x-api-key header (Anthropic standard). Also split JudgeConfig::api_base / api_key from new anthropic_api_base / anthropic_api_key so the Anthropic branch can target api.anthropic.com without breaking the OpenAI-compatible path. Plumbed through run_single_judge. 3. Duplicate substitute_placeholders (src/prompt_placeholders.rs) Conflicts with the existing prompt_templates::substitute_placeholders. Different semantics (fixed context vs HashMap bindings) but same name made grep / jump-to-def ambiguous. Fix: rename the new one to substitute_context_placeholders and document the relationship in the doc comment. 4. meta_analyze .run.json filter (jbench/bin/jbench.rs) path.extension() returns only the final extension ('json'), so matching against "run.json" never fired. meta-analyze would always report zero runs. Fix: match against file_name().ends_with(".run.json"). Plus: - Run cargo fmt --all to clear the Format CI job that PR #313 was failing. - Add tests parse_scorecard_accepts_camelcase_from_llm and parse_scorecard_accepts_snake_case_from_disk to lock in the wire-format contract. --- crates/jcode-agent-runtime/src/definition.rs | 30 +--- crates/jcode-agent-runtime/src/lib.rs | 6 +- crates/jcode-agent-runtime/src/output.rs | 5 +- crates/jcode-agent-runtime/src/reasoning.rs | 10 +- crates/jcode-agent-runtime/src/registry.rs | 57 ++++---- crates/jcode-agent-runtime/src/tier.rs | 10 +- .../tests/sample_agents.rs | 22 ++- evals/jbench/src/agent_runner.rs | 9 +- evals/jbench/src/bin/jbench.rs | 69 +++++++--- evals/jbench/src/judge.rs | 130 ++++++++++++------ evals/jbench/src/lessons.rs | 13 +- evals/jbench/src/lib.rs | 2 +- evals/jbench/src/types.rs | 9 ++ evals/jbench/tests/types.rs | 4 +- src/agent/prompting.rs | 1 - src/prompt_placeholders.rs | 24 ++-- src/tui/app/commands.rs | 3 +- tests/tool_fixtures.rs | 7 +- 18 files changed, 250 insertions(+), 161 deletions(-) diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs index a067668c6..4adeeabbd 100644 --- a/crates/jcode-agent-runtime/src/definition.rs +++ b/crates/jcode-agent-runtime/src/definition.rs @@ -174,9 +174,7 @@ fn default_version() -> String { /// invariants. Displayed to users when a TOML file fails to load. #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum DefinitionError { - #[error( - "agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen" - )] + #[error("agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen")] InvalidId(String), #[error( @@ -184,9 +182,7 @@ pub enum DefinitionError { )] SystemPromptConflict { id: String }, - #[error( - "agent `{id}` has `output_mode = structured_output` but `output_schema` is missing" - )] + #[error("agent `{id}` has `output_mode = structured_output` but `output_schema` is missing")] StructuredOutputMissingSchema { id: String }, #[error("agent `{id}` references itself in `spawnable_agents`")] @@ -209,9 +205,7 @@ pub enum DefinitionError { /// agent spawn time. #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)] pub enum ReferenceError { - #[error( - "agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}" - )] + #[error("agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}")] UnknownTools { id: String, unknown: String, @@ -245,8 +239,7 @@ impl AgentDefinition { } // 3. structured_output requires schema - if matches!(self.output_mode, OutputMode::StructuredOutput) - && self.output_schema.is_none() + if matches!(self.output_mode, OutputMode::StructuredOutput) && self.output_schema.is_none() { return Err(DefinitionError::StructuredOutputMissingSchema { id: self.id.clone(), @@ -422,30 +415,21 @@ mod tests { fn id_validation_rejects_uppercase() { let mut d = minimal_definition("File-Picker"); d.id = "File-Picker".to_string(); - assert!(matches!( - d.validate(), - Err(DefinitionError::InvalidId(_)) - )); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); } #[test] fn id_validation_rejects_underscore() { let mut d = minimal_definition("file_picker"); d.id = "file_picker".to_string(); - assert!(matches!( - d.validate(), - Err(DefinitionError::InvalidId(_)) - )); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); } #[test] fn id_validation_rejects_leading_hyphen() { let mut d = minimal_definition("ok"); d.id = "-bad".to_string(); - assert!(matches!( - d.validate(), - Err(DefinitionError::InvalidId(_)) - )); + assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_)))); } #[test] diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs index b78ad983f..80979a845 100644 --- a/crates/jcode-agent-runtime/src/lib.rs +++ b/crates/jcode-agent-runtime/src/lib.rs @@ -38,10 +38,8 @@ pub use signals::{ }; // New public surface (Phase 0). -pub use definition::{ - AgentDefinition, DefinitionError, ReferenceError, DEFAULT_AGENT_VERSION, -}; +pub use definition::{AgentDefinition, DEFAULT_AGENT_VERSION, DefinitionError, ReferenceError}; pub use output::OutputMode; pub use reasoning::ReasoningEffort; pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind}; -pub use tier::{resolve_model, resolve_model_with_source, ModelTier, ResolutionSource}; +pub use tier::{ModelTier, ResolutionSource, resolve_model, resolve_model_with_source}; diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs index 1ba93dd1a..93dc60a93 100644 --- a/crates/jcode-agent-runtime/src/output.rs +++ b/crates/jcode-agent-runtime/src/output.rs @@ -53,7 +53,10 @@ mod tests { #[test] fn parse_accepts_aliases() { - assert_eq!(OutputMode::parse("last_message"), Some(OutputMode::LastMessage)); + assert_eq!( + OutputMode::parse("last_message"), + Some(OutputMode::LastMessage) + ); assert_eq!(OutputMode::parse("all"), Some(OutputMode::AllMessages)); assert_eq!( OutputMode::parse("structured"), diff --git a/crates/jcode-agent-runtime/src/reasoning.rs b/crates/jcode-agent-runtime/src/reasoning.rs index d48bafaeb..7cdf8d010 100644 --- a/crates/jcode-agent-runtime/src/reasoning.rs +++ b/crates/jcode-agent-runtime/src/reasoning.rs @@ -79,9 +79,15 @@ mod tests { ReasoningEffort::parse("minimal"), Some(ReasoningEffort::Minimal) ); - assert_eq!(ReasoningEffort::parse("OFF"), Some(ReasoningEffort::Minimal)); + assert_eq!( + ReasoningEffort::parse("OFF"), + Some(ReasoningEffort::Minimal) + ); assert_eq!(ReasoningEffort::parse("max"), Some(ReasoningEffort::High)); - assert_eq!(ReasoningEffort::parse("default"), Some(ReasoningEffort::Medium)); + assert_eq!( + ReasoningEffort::parse("default"), + Some(ReasoningEffort::Medium) + ); assert_eq!(ReasoningEffort::parse(""), None); assert_eq!(ReasoningEffort::parse("absurd"), None); } diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs index 71cab810d..82f182b2d 100644 --- a/crates/jcode-agent-runtime/src/registry.rs +++ b/crates/jcode-agent-runtime/src/registry.rs @@ -90,9 +90,7 @@ pub enum LoadError { source: DefinitionError, }, - #[error( - "filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`." - )] + #[error("filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`.")] FileNameMismatch { path: PathBuf, id: String }, } @@ -175,10 +173,7 @@ impl AgentRegistry { /// Register a builtin agent. Builtins have the lowest priority and /// are overridable by both user and project files of the same id. - pub fn register_builtin( - &mut self, - definition: AgentDefinition, - ) -> Result<(), DefinitionError> { + pub fn register_builtin(&mut self, definition: AgentDefinition) -> Result<(), DefinitionError> { definition.validate()?; self.insert(LoadedAgent { definition, @@ -233,10 +228,7 @@ impl AgentRegistry { AgentSource::ProjectLocal { path: path.clone() } } }; - self.insert(LoadedAgent { - definition, - source, - }); + self.insert(LoadedAgent { definition, source }); loaded += 1; } Err(err) => { @@ -333,7 +325,10 @@ mod tests { fn missing_dir_is_zero_load_not_error() { let mut reg = AgentRegistry::new(); let n = reg - .load_directory(Path::new("/nonexistent/jcode-test-dir"), SourceKind::UserGlobal) + .load_directory( + Path::new("/nonexistent/jcode-test-dir"), + SourceKind::UserGlobal, + ) .unwrap(); assert_eq!(n, 0); assert!(reg.is_empty()); @@ -382,7 +377,10 @@ mod tests { output_schema: None, }; reg.register_builtin(builtin_def.clone()).unwrap(); - assert_eq!(reg.get("editor").unwrap().definition.display_name, "Builtin Editor"); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "Builtin Editor" + ); // User let user_dir = temp_dir("user"); @@ -394,8 +392,12 @@ mod tests { display_name = "User Editor" "#, ); - reg.load_directory(&user_dir, SourceKind::UserGlobal).unwrap(); - assert_eq!(reg.get("editor").unwrap().definition.display_name, "User Editor"); + reg.load_directory(&user_dir, SourceKind::UserGlobal) + .unwrap(); + assert_eq!( + reg.get("editor").unwrap().definition.display_name, + "User Editor" + ); // Project let proj_dir = temp_dir("proj"); @@ -407,7 +409,8 @@ mod tests { display_name = "Project Editor" "#, ); - reg.load_directory(&proj_dir, SourceKind::ProjectLocal).unwrap(); + reg.load_directory(&proj_dir, SourceKind::ProjectLocal) + .unwrap(); assert_eq!( reg.get("editor").unwrap().definition.display_name, "Project Editor" @@ -432,10 +435,7 @@ mod tests { reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); assert!(reg.is_empty(), "no agents registered"); assert_eq!(reg.load_errors().len(), 1); - assert!(matches!( - reg.load_errors()[0], - LoadError::Parse { .. } - )); + assert!(matches!(reg.load_errors()[0], LoadError::Parse { .. })); } #[test] @@ -453,10 +453,7 @@ mod tests { reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); assert!(reg.is_empty()); assert_eq!(reg.load_errors().len(), 1); - assert!(matches!( - reg.load_errors()[0], - LoadError::Invalid { .. } - )); + assert!(matches!(reg.load_errors()[0], LoadError::Invalid { .. })); } #[test] @@ -506,14 +503,20 @@ mod tests { write_toml( &dir, &format!("{id}.toml"), - &format!(r#"id = "{id}" + &format!( + r#"id = "{id}" display_name = "{id}" -"#), +"# + ), ); } let mut reg = AgentRegistry::new(); reg.load_directory(&dir, SourceKind::UserGlobal).unwrap(); - let ids: Vec<_> = reg.iter_sorted().iter().map(|a| a.definition.id.clone()).collect(); + let ids: Vec<_> = reg + .iter_sorted() + .iter() + .map(|a| a.definition.id.clone()) + .collect(); assert_eq!(ids, vec!["alpha", "mid", "zeta"]); } diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs index 200f511ed..33ee6288b 100644 --- a/crates/jcode-agent-runtime/src/tier.rs +++ b/crates/jcode-agent-runtime/src/tier.rs @@ -135,16 +135,10 @@ pub enum ResolutionSource { /// Used `agent.model_override` directly. Override(String), /// Used the env var backing `tier`. - Tier { - tier: ModelTier, - model: String, - }, + Tier { tier: ModelTier, model: String }, /// Tier was preferred but the env var was unset, so fell back to the /// session's current model. - TierFallback { - tier: ModelTier, - model: String, - }, + TierFallback { tier: ModelTier, model: String }, /// No override or tier preference; using the session's current model. SessionDefault(String), } diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs index e850495d5..ee6ee7034 100644 --- a/crates/jcode-agent-runtime/tests/sample_agents.rs +++ b/crates/jcode-agent-runtime/tests/sample_agents.rs @@ -9,9 +9,7 @@ use std::path::PathBuf; -use jcode_agent_runtime::{ - AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind, -}; +use jcode_agent_runtime::{AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind}; /// Path to the project-root sample agents directory, relative to the /// crate manifest. Deliberately constructed via `CARGO_MANIFEST_DIR` so @@ -20,7 +18,12 @@ use jcode_agent_runtime::{ fn samples_dir() -> PathBuf { let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); // crates/jcode-agent-runtime → ../../ .jcode/agents - crate_dir.parent().unwrap().parent().unwrap().join(".jcode/agents") + crate_dir + .parent() + .unwrap() + .parent() + .unwrap() + .join(".jcode/agents") } #[test] @@ -37,7 +40,11 @@ fn loads_bundled_sample_agents() { .load_directory(&dir, SourceKind::ProjectLocal) .expect("load_directory"); assert!(n >= 2, "expected at least 2 sample agents, got {n}"); - assert!(reg.load_errors().is_empty(), "load errors: {:?}", reg.load_errors()); + assert!( + reg.load_errors().is_empty(), + "load errors: {:?}", + reg.load_errors() + ); } #[test] @@ -56,7 +63,10 @@ fn file_picker_sample_has_expected_shape() { assert_eq!(agent.display_name, "Fletcher the File Fetcher"); assert_eq!(agent.prefer_tier, Some(ModelTier::Routine)); assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal)); - assert!(!agent.include_message_history, "file picker uses clean slate"); + assert!( + !agent.include_message_history, + "file picker uses clean slate" + ); assert!(!agent.inherit_parent_system_prompt); assert_eq!(agent.output_mode, OutputMode::LastMessage); assert!(agent.tool_names.iter().any(|t| t == "read")); diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs index de922e71d..3763ee4c2 100644 --- a/evals/jbench/src/agent_runner.rs +++ b/evals/jbench/src/agent_runner.rs @@ -83,9 +83,12 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result { .current_dir(&config.repo_path) .envs(&env_vars) .args([ - "agent", "run", - "--agent", &config.agent_id, - "--output-mode", "stream", + "agent", + "run", + "--agent", + &config.agent_id, + "--output-mode", + "stream", "--no-interactive", ]) .stdin(Stdio::piped()) diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs index 90808dc60..2e54b50e7 100644 --- a/evals/jbench/src/bin/jbench.rs +++ b/evals/jbench/src/bin/jbench.rs @@ -9,8 +9,8 @@ use clap::{Parser, Subcommand}; use jcode_jbench::{ agent_runner::AgentRunConfig, - judge::{judge_with_three_models, JudgeConfig}, - lessons::{append_lessons_to_file, extract_lessons, LessonsConfig}, + judge::{JudgeConfig, judge_with_three_models}, + lessons::{LessonsConfig, append_lessons_to_file, extract_lessons}, types::{AgentEvalResults, EvalDataV2, EvalRun}, }; @@ -100,16 +100,40 @@ enum Command { async fn main() -> Result<()> { let cli = Cli::parse(); match cli.command { - Command::PickCommits { repo_url, min_msg_len, max_picks, output } => { + Command::PickCommits { + repo_url, + min_msg_len, + max_picks, + output, + } => { pick_commits_impl(&repo_url, min_msg_len, max_picks, output).await?; } Command::GenEvals { input, output } => { gen_evals_impl(&input, &output).await?; } - Command::Run { eval_file, agent_id, output_dir, jcode_binary, max_turns, timeout_secs } => { - run_impl(&eval_file, &agent_id, &output_dir, jcode_binary.as_ref(), max_turns, timeout_secs).await?; + Command::Run { + eval_file, + agent_id, + output_dir, + jcode_binary, + max_turns, + timeout_secs, + } => { + run_impl( + &eval_file, + &agent_id, + &output_dir, + jcode_binary.as_ref(), + max_turns, + timeout_secs, + ) + .await?; } - Command::Judge { runs_dir, api_base, api_key } => { + Command::Judge { + runs_dir, + api_base, + api_key, + } => { judge_impl(&runs_dir, api_base.as_deref(), api_key.as_deref()).await?; } Command::MetaAnalyze { runs_dir, output } => { @@ -141,8 +165,8 @@ async fn run_impl( timeout_secs: u64, ) -> Result<()> { use std::fs; - use tokio::time::timeout as tk_timeout; use std::time::Duration; + use tokio::time::timeout as tk_timeout; // Load eval data let eval_data: EvalDataV2 = { @@ -199,22 +223,28 @@ async fn judge_impl( _api_base: Option<&str>, _api_key: Option<&str>, ) -> Result<()> { - todo_step("Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields") + todo_step( + "Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields", + ) } -async fn meta_analyze_impl( - runs_dir: &PathBuf, - output: Option<&PathBuf>, -) -> Result<()> { - use std::fs; +async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Result<()> { use jcode_jbench::types::AgentEvalResults; + use std::fs; let mut all_runs = Vec::new(); for entry in fs::read_dir(runs_dir)? { let entry = entry?; let path = entry.path(); - if path.extension().and_then(|s| s.to_str()) == Some("run.json") { + // `Path::extension` returns only the trailing component (`json`), + // so matching against `"run.json"` never fires. Match on the full + // file name suffix instead. + let is_run_file = path + .file_name() + .and_then(|s| s.to_str()) + .is_some_and(|s| s.ends_with(".run.json")); + if is_run_file { let text = fs::read_to_string(&path)?; if let Ok(run) = serde_json::from_str::(&text) { all_runs.push(run); @@ -226,12 +256,13 @@ async fn meta_analyze_impl( anyhow::bail!("No .run.json files found in {}", runs_dir.display()); } - let avg_score = all_runs.iter().map(|r| r.judging.overall_score).sum::() - / all_runs.len() as f64; - let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::() + let avg_score = all_runs + .iter() + .map(|r| r.judging.overall_score) + .sum::() / all_runs.len() as f64; - let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::() - / all_runs.len() as u64; + let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::() / all_runs.len() as f64; + let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::() / all_runs.len() as u64; let summary = AgentEvalResults { agent_id: "unknown".to_owned(), diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs index 589e5749d..8f9437f47 100644 --- a/evals/jbench/src/judge.rs +++ b/evals/jbench/src/judge.rs @@ -52,10 +52,18 @@ impl JudgeProviderKind { /// Configuration for the judging pipeline. #[derive(Debug, Clone)] pub struct JudgeConfig { - /// API base URL for the judge backend (e.g. OpenAI-compatible). + /// API base URL for the OpenAI-compatible judge backend. pub api_base: String, - /// API key secret. + /// API key for the OpenAI-compatible judge backend. pub api_key: String, + /// Optional separate base URL for Anthropic-routed judges (e.g. + /// `https://api.anthropic.com`). Falls back to `api_base` when + /// `None`, which only makes sense if the OpenAI-compatible host + /// proxies the Anthropic Messages API too. + pub anthropic_api_base: Option, + /// Optional separate API key for Anthropic-routed judges. Falls + /// back to `api_key` when `None`. + pub anthropic_api_key: Option, /// Model IDs for the three judges. Order determines the median /// computation. pub models: [String; 3], @@ -72,6 +80,8 @@ impl Default for JudgeConfig { api_base: std::env::var("JBENCH_API_BASE") .unwrap_or_else(|_| "https://api.openai.com".to_owned()), api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(), + anthropic_api_base: std::env::var("JBENCH_ANTHROPIC_API_BASE").ok(), + anthropic_api_key: std::env::var("JBENCH_ANTHROPIC_API_KEY").ok(), models: [ "gpt-5-2026-05".to_owned(), "google/gemini-3.1-pro".to_owned(), @@ -84,17 +94,15 @@ impl Default for JudgeConfig { } /// Render the full judge prompt from commit + diff + context. -fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &HashMap) -> String { +fn render_judge_prompt( + commit: &EvalCommit, + agent_diff: &str, + context_files: &HashMap, +) -> String { let ground_truth_diffs = commit .file_diffs .iter() - .map(|fd| { - format!( - "### {}\n```diff\n{}\n```", - fd.path, - fd.diff - ) - }) + .map(|fd| format!("### {}\n```diff\n{}\n```", fd.path, fd.diff)) .collect::>() .join("\n\n"); @@ -106,10 +114,7 @@ fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &Ha format!( "## User Prompt (What the agent was asked to do)\n{}\n\n## Context Files (from parent commit)\n{}\n\n## Ground Truth Changes (One valid implementation)\n{}\n\n## Agent's Changes (What the agent actually did)\n```diff\n{}\n```", - commit.prompt, - context_content, - ground_truth_diffs, - agent_diff + commit.prompt, context_content, ground_truth_diffs, agent_diff ) } @@ -168,12 +173,18 @@ struct JudgeResponse { /// Invoke a single judge model with a fully-rendered prompt. /// +/// `anthropic_api_base` / `anthropic_api_key` are only consulted when +/// the model routes through `JudgeProviderKind::Anthropic`; OpenAI-bound +/// requests always use the primary `api_base` / `api_key`. +/// /// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` (`runSingleJudge`). pub async fn run_single_judge( model: &str, prompt: &str, api_base: &str, api_key: &str, + anthropic_api_base: Option<&str>, + anthropic_api_key: Option<&str>, http_client: &Client, ) -> Result { let kind = JudgeProviderKind::for_model(model); @@ -182,7 +193,12 @@ pub async fn run_single_judge( if kind == JudgeProviderKind::OpenAI { run_openai_judge(model, prompt, system, api_base, api_key, http_client).await } else { - run_anthropic_judge(model, prompt, system, api_base, api_key, http_client).await + // Fall back to the primary host/key only if no Anthropic-specific + // overrides were configured. The caller is expected to set both + // overrides when targeting `api.anthropic.com` directly. + let base = anthropic_api_base.unwrap_or(api_base); + let key = anthropic_api_key.unwrap_or(api_key); + run_anthropic_judge(model, prompt, system, base, key, http_client).await } } @@ -292,10 +308,14 @@ async fn run_anthropic_judge( }, }); + // Anthropic Messages API authenticates via `x-api-key`, not + // `Authorization: Bearer ...`. Using the wrong header returns 401 + // even with a valid key, which previously made this branch + // permanently dead. let url = format!("{api_base}/v1/messages"); let response = http_client .post(&url) - .header("Authorization", format!("Bearer {api_key}")) + .header("x-api-key", api_key) .header("Content-Type", "application/json") .header("anthropic-version", "2023-06-01") .json(&request_body) @@ -318,15 +338,14 @@ async fn run_anthropic_judge( .and_then(|t| t.as_str()) .unwrap_or_default(); - let parsed = serde_json::from_str::(text) - .unwrap_or(serde_json::json!({ - "analysis": text.to_owned(), - "strengths": [], - "weaknesses": ["Could not parse structured output from Anthropic judge"], - "completionScore": 0, - "codeQualityScore": 0, - "overallScore": 0 - })); + let parsed = serde_json::from_str::(text).unwrap_or(serde_json::json!({ + "analysis": text.to_owned(), + "strengths": [], + "weaknesses": ["Could not parse structured output from Anthropic judge"], + "completionScore": 0, + "codeQualityScore": 0, + "overallScore": 0 + })); parse_scorecard(parsed) } @@ -365,22 +384,21 @@ pub async fn judge_with_three_models( &prompt, &config.api_base, &config.api_key, + config.anthropic_api_base.as_deref(), + config.anthropic_api_key.as_deref(), http, ) }) .collect(); // Run all three judges in parallel with an overall timeout - let valid: Vec = timeout( - timeout_duration, - futures::future::join_all(judge_futures), - ) - .await - .ok() - .into_iter() // IntoIterator>> - .flatten() // Iterator> - .filter_map(|r| r.ok()) - .collect(); + let valid: Vec = timeout(timeout_duration, futures::future::join_all(judge_futures)) + .await + .ok() + .into_iter() // IntoIterator>> + .flatten() // Iterator> + .filter_map(|r| r.ok()) + .collect(); if valid.len() < MIN_JUDGE_SUCCESS_COUNT { return Ok(Scorecard { @@ -390,11 +408,7 @@ pub async fn judge_with_three_models( 3 ), strengths: vec![], - weaknesses: vec![format!( - "Only {}/{} judges succeeded", - valid.len(), - 3 - )], + weaknesses: vec![format!("Only {}/{} judges succeeded", valid.len(), 3)], completion_score: 0.0, code_quality_score: 0.0, overall_score: 0.0, @@ -453,4 +467,40 @@ mod tests { JudgeProviderKind::Anthropic ); } + + /// Locks the wire-format contract: the LLM judge returns camelCase + /// (`completionScore`, etc.) per the request schema. Deserialization + /// must accept that even though the on-disk JSON form is snake_case. + #[test] + fn parse_scorecard_accepts_camelcase_from_llm() { + let camel = serde_json::json!({ + "analysis": "looks good", + "strengths": ["clean diff"], + "weaknesses": [], + "completionScore": 8.5, + "codeQualityScore": 7.0, + "overallScore": 7.8 + }); + let parsed = parse_scorecard(camel).expect("camelCase must deserialize"); + assert_eq!(parsed.completion_score, 8.5); + assert_eq!(parsed.code_quality_score, 7.0); + assert_eq!(parsed.overall_score, 7.8); + } + + /// snake_case (on-disk eval JSON) must round-trip as well. + #[test] + fn parse_scorecard_accepts_snake_case_from_disk() { + let snake = serde_json::json!({ + "analysis": "", + "strengths": [], + "weaknesses": [], + "completion_score": 1.0, + "code_quality_score": 2.0, + "overall_score": 3.0 + }); + let parsed = parse_scorecard(snake).expect("snake_case must deserialize"); + assert_eq!(parsed.completion_score, 1.0); + assert_eq!(parsed.code_quality_score, 2.0); + assert_eq!(parsed.overall_score, 3.0); + } } diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs index 31bf1b661..f9cc09d06 100644 --- a/evals/jbench/src/lessons.rs +++ b/evals/jbench/src/lessons.rs @@ -245,16 +245,15 @@ pub fn append_lessons_to_file( } if !lessons_dir.exists() { - fs::create_dir_all(lessons_dir) - .context("failed to create lessons directory")?; + fs::create_dir_all(lessons_dir).context("failed to create lessons directory")?; } let safe_id = agent_id.replace(|c: char| !c.is_alphanumeric() && c != '-' && c != '_', "_"); let file_path = lessons_dir.join(format!("{safe_id}.json")); let existing: Vec = if file_path.exists() { - let contents = fs::read_to_string(&file_path) - .context("failed to read existing lessons file")?; + let contents = + fs::read_to_string(&file_path).context("failed to read existing lessons file")?; serde_json::from_str(&contents).unwrap_or_default() } else { Vec::new() @@ -265,11 +264,9 @@ pub fn append_lessons_to_file( .chain(lessons.iter().cloned()) .collect(); - let json = serde_json::to_string_pretty(&all_lessons) - .context("failed to serialize lessons")?; + let json = serde_json::to_string_pretty(&all_lessons).context("failed to serialize lessons")?; - fs::write(&file_path, json) - .context("failed to write lessons file")?; + fs::write(&file_path, json).context("failed to write lessons file")?; Ok(()) } diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs index 97d0eb7c1..36363dc5b 100644 --- a/evals/jbench/src/lib.rs +++ b/evals/jbench/src/lib.rs @@ -18,7 +18,7 @@ pub mod judge; pub mod lessons; pub mod types; -pub use types::{EvalCommit, EvalDataV2, EvalRun, JudgingResult, AgentEvalResults}; pub use agent_runner::AgentRunConfig; pub use judge::JudgeConfig; pub use lessons::LessonsConfig; +pub use types::{AgentEvalResults, EvalCommit, EvalDataV2, EvalRun, JudgingResult}; diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs index 1cb51e17e..39d4645c5 100644 --- a/evals/jbench/src/types.rs +++ b/evals/jbench/src/types.rs @@ -112,6 +112,12 @@ pub struct EvalDataV2 { /// All three score fields are on the same `[0.0, 10.0]` scale; `f64` is /// used so we can also store the *averaged* per-dimension scores when /// aggregating multiple judges (see `judge::judge_with_three_models`). +/// +/// On-disk JSON stays `snake_case` to match the rest of jcode's eval +/// outputs, but each score field also accepts the `camelCase` spelling +/// (`completionScore`, etc.) via `serde(alias = ...)` so we can +/// deserialize LLM judge responses directly without an intermediate +/// wire-format struct. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct JudgingResult { /// Free-form prose comparing the agent's diff to the ground truth. @@ -121,10 +127,13 @@ pub struct JudgingResult { /// Bullet-point weaknesses called out by the judge. pub weaknesses: Vec, /// How completely the prompt was addressed, `[0.0, 10.0]`. + #[serde(alias = "completionScore")] pub completion_score: f64, /// Code structure / maintainability, `[0.0, 10.0]`. + #[serde(alias = "codeQualityScore")] pub code_quality_score: f64, /// Combined assessment, `[0.0, 10.0]`. JBench's canonical metric. + #[serde(alias = "overallScore")] pub overall_score: f64, } diff --git a/evals/jbench/tests/types.rs b/evals/jbench/tests/types.rs index 2a8efd02e..fcaa832fb 100644 --- a/evals/jbench/tests/types.rs +++ b/evals/jbench/tests/types.rs @@ -4,9 +4,7 @@ //! and write, and they fail loudly if anyone changes a field's //! `snake_case` name without updating consumers. -use jcode_jbench::types::{ - EvalCommit, FileDiff, FileDiffStatus, JudgingResult, -}; +use jcode_jbench::types::{EvalCommit, FileDiff, FileDiffStatus, JudgingResult}; #[test] fn eval_commit_round_trips_through_json() { diff --git a/src/agent/prompting.rs b/src/agent/prompting.rs index ba9719985..6107a314f 100644 --- a/src/agent/prompting.rs +++ b/src/agent/prompting.rs @@ -122,7 +122,6 @@ impl Agent { } } - /// Wrap a step prompt body in `...` tags. /// /// Step prompts are emitted by the harness (not typed by the user), but they diff --git a/src/prompt_placeholders.rs b/src/prompt_placeholders.rs index 68cee139a..635beb8cc 100644 --- a/src/prompt_placeholders.rs +++ b/src/prompt_placeholders.rs @@ -77,7 +77,12 @@ fn truncate_chars(s: &str, max_chars: usize) -> String { /// /// Length caps documented on [`PlaceholderContext`] are enforced here, so /// callers may pass un-truncated input and trust the output to be bounded. -pub fn substitute_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String { +/// +/// This is the **context-driven** substitution path used for built-in +/// Phase 4 placeholders. For user-supplied template bindings (arbitrary +/// `HashMap`), use +/// [`crate::prompt_templates::substitute_placeholders`] instead. +pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String { if prompt.is_empty() { return String::new(); } @@ -123,11 +128,8 @@ mod tests { k=[{{KNOWLEDGE_FILES}}] git=[{{GIT_CHANGES}}] \ date=[{{CURRENT_DATE}}] steps=[{{REMAINING_STEPS}}] \ sys=[{{SYSTEM_INFO}}]"; - let out = substitute_placeholders(input, &ctx); - assert_eq!( - out, - "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]" - ); + let out = substitute_context_placeholders(input, &ctx); + assert_eq!(out, "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]"); } #[test] @@ -136,11 +138,11 @@ mod tests { current_date: "2026-05-25".to_string(), ..Default::default() }; - let out = substitute_placeholders("today is {{CURRENT_DATE}}.", &ctx); + let out = substitute_context_placeholders("today is {{CURRENT_DATE}}.", &ctx); assert_eq!(out, "today is 2026-05-25."); // Unrelated placeholder stays empty in the same call. - let out2 = substitute_placeholders( + let out2 = substitute_context_placeholders( "date={{CURRENT_DATE}} steps={{REMAINING_STEPS}}", &ctx, ); @@ -161,7 +163,7 @@ mod tests { {{KNOWLEDGE_FILES}}\n\n## Meta\n\ date={{CURRENT_DATE}} steps={{REMAINING_STEPS}} \ sys={{SYSTEM_INFO}}"; - let out = substitute_placeholders(input, &ctx); + let out = substitute_context_placeholders(input, &ctx); let expected = "## Tree\nsrc/\n lib.rs\n\n## Knowledge\n\ AGENTS.md contents\n\n## Meta\n\ date=2026-05-25 steps=7 sys=linux x86_64"; @@ -176,7 +178,7 @@ mod tests { }; let input = "known={{CURRENT_DATE}} unknown={{NOT_A_REAL_TOKEN}} \ other={{ALSO_BOGUS}}"; - let out = substitute_placeholders(input, &ctx); + let out = substitute_context_placeholders(input, &ctx); assert_eq!( out, "known=2026-05-25 unknown={{NOT_A_REAL_TOKEN}} other={{ALSO_BOGUS}}" @@ -191,7 +193,7 @@ mod tests { file_tree_small: big.clone(), ..Default::default() }; - let out = substitute_placeholders("[{{FILE_TREE_SMALL}}]", &ctx); + let out = substitute_context_placeholders("[{{FILE_TREE_SMALL}}]", &ctx); // Two bracket characters plus the cap. assert_eq!(out.chars().count(), FILE_TREE_SMALL_MAX_CHARS + 2); assert!(out.starts_with('[')); diff --git a/src/tui/app/commands.rs b/src/tui/app/commands.rs index 536d50231..4238e65fe 100644 --- a/src/tui/app/commands.rs +++ b/src/tui/app/commands.rs @@ -1925,7 +1925,8 @@ pub(super) fn handle_session_command(app: &mut App, trimmed: &str) -> bool { Ok(out) if out.status.success() => { let _ = std::fs::remove_file(&tmp_path); let url = String::from_utf8_lossy(&out.stdout) - .lines().rfind(|l| l.starts_with("https://")) + .lines() + .rfind(|l| l.starts_with("https://")) .unwrap_or("") .trim() .to_string(); diff --git a/tests/tool_fixtures.rs b/tests/tool_fixtures.rs index 9a7d98e97..6c7fc0318 100644 --- a/tests/tool_fixtures.rs +++ b/tests/tool_fixtures.rs @@ -105,9 +105,10 @@ fn collect_fixtures() -> Vec<(String, Fixture)> { .unwrap_or("") .to_string(); if let Some(needle) = filter.as_deref() - && !stem.contains(needle) { - continue; - } + && !stem.contains(needle) + { + continue; + } let raw = std::fs::read_to_string(&path).expect("read fixture"); let fixture: Fixture = serde_json::from_str(&raw).unwrap_or_else(|e| panic!("parse fixture {}: {}", stem, e));