From c0bcaca4c280934f911c6a54cf3021ec8f20101a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tr=E1=BA=A7n=20Quang=20=C4=90=C3=A3ng?=
 <tranquangdang21@gmail.com>
Date: Thu, 28 May 2026 01:50:55 +0000
Subject: [PATCH] fix(agent-runtime): address PR #313 review issues

Bugs fixed:

1. JudgingResult deserialization (jbench/types.rs)
   The judge prompt schema asks for camelCase fields
   (completionScore, codeQualityScore, overallScore) but
   the Rust struct used snake_case without serde rename.
   parse_scorecard would fail on every real judge response.

   Fix: add #[serde(alias = ...)] on each score field so
   on-disk JSON stays snake_case while LLM-returned
   camelCase still deserializes cleanly.

2. Anthropic judge authentication (jbench/judge.rs)
   run_anthropic_judge used Authorization: Bearer <key>
   which always 401s on the Anthropic Messages API.

   Fix: switch to x-api-key header (Anthropic standard).
   Also split JudgeConfig::api_base / api_key from new
   anthropic_api_base / anthropic_api_key so the Anthropic
   branch can target api.anthropic.com without breaking
   the OpenAI-compatible path. Plumbed through
   run_single_judge.

3. Duplicate substitute_placeholders (src/prompt_placeholders.rs)
   Conflicts with the existing
   prompt_templates::substitute_placeholders. Different
   semantics (fixed context vs HashMap bindings) but same
   name made grep / jump-to-def ambiguous.

   Fix: rename the new one to
   substitute_context_placeholders and document the
   relationship in the doc comment.

4. meta_analyze .run.json filter (jbench/bin/jbench.rs)
   path.extension() returns only the final extension
   ('json'), so matching against "run.json" never fired.
   meta-analyze would always report zero runs.

   Fix: match against file_name().ends_with(".run.json").

Plus:
- Run cargo fmt --all to clear the Format CI job that PR
  #313 was failing.
- Add tests parse_scorecard_accepts_camelcase_from_llm and
  parse_scorecard_accepts_snake_case_from_disk to lock in
  the wire-format contract.
---
 crates/jcode-agent-runtime/src/definition.rs  |  30 +---
 crates/jcode-agent-runtime/src/lib.rs         |   6 +-
 crates/jcode-agent-runtime/src/output.rs      |   5 +-
 crates/jcode-agent-runtime/src/reasoning.rs   |  10 +-
 crates/jcode-agent-runtime/src/registry.rs    |  57 ++++----
 crates/jcode-agent-runtime/src/tier.rs        |  10 +-
 .../tests/sample_agents.rs                    |  22 ++-
 evals/jbench/src/agent_runner.rs              |   9 +-
 evals/jbench/src/bin/jbench.rs                |  69 +++++++---
 evals/jbench/src/judge.rs                     | 130 ++++++++++++------
 evals/jbench/src/lessons.rs                   |  13 +-
 evals/jbench/src/lib.rs                       |   2 +-
 evals/jbench/src/types.rs                     |   9 ++
 evals/jbench/tests/types.rs                   |   4 +-
 src/agent/prompting.rs                        |   1 -
 src/prompt_placeholders.rs                    |  24 ++--
 src/tui/app/commands.rs                       |   3 +-
 tests/tool_fixtures.rs                        |   7 +-
 18 files changed, 250 insertions(+), 161 deletions(-)
diff --git a/crates/jcode-agent-runtime/src/definition.rs b/crates/jcode-agent-runtime/src/definition.rs
index a067668c6..4adeeabbd 100644
--- a/crates/jcode-agent-runtime/src/definition.rs
+++ b/crates/jcode-agent-runtime/src/definition.rs
@@ -174,9 +174,7 @@ fn default_version() -> String {
 /// invariants. Displayed to users when a TOML file fails to load.
 #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
 pub enum DefinitionError {
-    #[error(
-        "agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen"
-    )]
+    #[error("agent id `{0}` is invalid: must be non-empty, lowercase ASCII alphanumeric or hyphen")]
     InvalidId(String),
 
     #[error(
@@ -184,9 +182,7 @@ pub enum DefinitionError {
     )]
     SystemPromptConflict { id: String },
 
-    #[error(
-        "agent `{id}` has `output_mode = structured_output` but `output_schema` is missing"
-    )]
+    #[error("agent `{id}` has `output_mode = structured_output` but `output_schema` is missing")]
     StructuredOutputMissingSchema { id: String },
 
     #[error("agent `{id}` references itself in `spawnable_agents`")]
@@ -209,9 +205,7 @@ pub enum DefinitionError {
 /// agent spawn time.
 #[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
 pub enum ReferenceError {
-    #[error(
-        "agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}"
-    )]
+    #[error("agent `{id}` references unknown tool(s): {unknown}. Available tools: {available}")]
     UnknownTools {
         id: String,
         unknown: String,
@@ -245,8 +239,7 @@ impl AgentDefinition {
         }
 
         // 3. structured_output requires schema
-        if matches!(self.output_mode, OutputMode::StructuredOutput)
-            && self.output_schema.is_none()
+        if matches!(self.output_mode, OutputMode::StructuredOutput) && self.output_schema.is_none()
         {
             return Err(DefinitionError::StructuredOutputMissingSchema {
                 id: self.id.clone(),
@@ -422,30 +415,21 @@ mod tests {
     fn id_validation_rejects_uppercase() {
         let mut d = minimal_definition("File-Picker");
         d.id = "File-Picker".to_string();
-        assert!(matches!(
-            d.validate(),
-            Err(DefinitionError::InvalidId(_))
-        ));
+        assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_))));
     }
 
     #[test]
     fn id_validation_rejects_underscore() {
         let mut d = minimal_definition("file_picker");
         d.id = "file_picker".to_string();
-        assert!(matches!(
-            d.validate(),
-            Err(DefinitionError::InvalidId(_))
-        ));
+        assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_))));
     }
 
     #[test]
     fn id_validation_rejects_leading_hyphen() {
         let mut d = minimal_definition("ok");
         d.id = "-bad".to_string();
-        assert!(matches!(
-            d.validate(),
-            Err(DefinitionError::InvalidId(_))
-        ));
+        assert!(matches!(d.validate(), Err(DefinitionError::InvalidId(_))));
     }
 
     #[test]
diff --git a/crates/jcode-agent-runtime/src/lib.rs b/crates/jcode-agent-runtime/src/lib.rs
index b78ad983f..80979a845 100644
--- a/crates/jcode-agent-runtime/src/lib.rs
+++ b/crates/jcode-agent-runtime/src/lib.rs
@@ -38,10 +38,8 @@ pub use signals::{
 };
 
 // New public surface (Phase 0).
-pub use definition::{
-    AgentDefinition, DefinitionError, ReferenceError, DEFAULT_AGENT_VERSION,
-};
+pub use definition::{AgentDefinition, DEFAULT_AGENT_VERSION, DefinitionError, ReferenceError};
 pub use output::OutputMode;
 pub use reasoning::ReasoningEffort;
 pub use registry::{AgentRegistry, AgentSource, LoadError, LoadedAgent, SourceKind};
-pub use tier::{resolve_model, resolve_model_with_source, ModelTier, ResolutionSource};
+pub use tier::{ModelTier, ResolutionSource, resolve_model, resolve_model_with_source};
diff --git a/crates/jcode-agent-runtime/src/output.rs b/crates/jcode-agent-runtime/src/output.rs
index 1ba93dd1a..93dc60a93 100644
--- a/crates/jcode-agent-runtime/src/output.rs
+++ b/crates/jcode-agent-runtime/src/output.rs
@@ -53,7 +53,10 @@ mod tests {
 
     #[test]
     fn parse_accepts_aliases() {
-        assert_eq!(OutputMode::parse("last_message"), Some(OutputMode::LastMessage));
+        assert_eq!(
+            OutputMode::parse("last_message"),
+            Some(OutputMode::LastMessage)
+        );
         assert_eq!(OutputMode::parse("all"), Some(OutputMode::AllMessages));
         assert_eq!(
             OutputMode::parse("structured"),
diff --git a/crates/jcode-agent-runtime/src/reasoning.rs b/crates/jcode-agent-runtime/src/reasoning.rs
index d48bafaeb..7cdf8d010 100644
--- a/crates/jcode-agent-runtime/src/reasoning.rs
+++ b/crates/jcode-agent-runtime/src/reasoning.rs
@@ -79,9 +79,15 @@ mod tests {
             ReasoningEffort::parse("minimal"),
             Some(ReasoningEffort::Minimal)
         );
-        assert_eq!(ReasoningEffort::parse("OFF"), Some(ReasoningEffort::Minimal));
+        assert_eq!(
+            ReasoningEffort::parse("OFF"),
+            Some(ReasoningEffort::Minimal)
+        );
         assert_eq!(ReasoningEffort::parse("max"), Some(ReasoningEffort::High));
-        assert_eq!(ReasoningEffort::parse("default"), Some(ReasoningEffort::Medium));
+        assert_eq!(
+            ReasoningEffort::parse("default"),
+            Some(ReasoningEffort::Medium)
+        );
         assert_eq!(ReasoningEffort::parse(""), None);
         assert_eq!(ReasoningEffort::parse("absurd"), None);
     }
diff --git a/crates/jcode-agent-runtime/src/registry.rs b/crates/jcode-agent-runtime/src/registry.rs
index 71cab810d..82f182b2d 100644
--- a/crates/jcode-agent-runtime/src/registry.rs
+++ b/crates/jcode-agent-runtime/src/registry.rs
@@ -90,9 +90,7 @@ pub enum LoadError {
         source: DefinitionError,
     },
 
-    #[error(
-        "filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`."
-    )]
+    #[error("filename `{path}` does not match agent id `{id}`. Rename the file to `{id}.toml`.")]
     FileNameMismatch { path: PathBuf, id: String },
 }
 
@@ -175,10 +173,7 @@ impl AgentRegistry {
 
     /// Register a builtin agent. Builtins have the lowest priority and
     /// are overridable by both user and project files of the same id.
-    pub fn register_builtin(
-        &mut self,
-        definition: AgentDefinition,
-    ) -> Result<(), DefinitionError> {
+    pub fn register_builtin(&mut self, definition: AgentDefinition) -> Result<(), DefinitionError> {
         definition.validate()?;
         self.insert(LoadedAgent {
             definition,
@@ -233,10 +228,7 @@ impl AgentRegistry {
                             AgentSource::ProjectLocal { path: path.clone() }
                         }
                     };
-                    self.insert(LoadedAgent {
-                        definition,
-                        source,
-                    });
+                    self.insert(LoadedAgent { definition, source });
                     loaded += 1;
                 }
                 Err(err) => {
@@ -333,7 +325,10 @@ mod tests {
     fn missing_dir_is_zero_load_not_error() {
         let mut reg = AgentRegistry::new();
         let n = reg
-            .load_directory(Path::new("/nonexistent/jcode-test-dir"), SourceKind::UserGlobal)
+            .load_directory(
+                Path::new("/nonexistent/jcode-test-dir"),
+                SourceKind::UserGlobal,
+            )
             .unwrap();
         assert_eq!(n, 0);
         assert!(reg.is_empty());
@@ -382,7 +377,10 @@ mod tests {
             output_schema: None,
         };
         reg.register_builtin(builtin_def.clone()).unwrap();
-        assert_eq!(reg.get("editor").unwrap().definition.display_name, "Builtin Editor");
+        assert_eq!(
+            reg.get("editor").unwrap().definition.display_name,
+            "Builtin Editor"
+        );
 
         // User
         let user_dir = temp_dir("user");
@@ -394,8 +392,12 @@ mod tests {
                 display_name = "User Editor"
             "#,
         );
-        reg.load_directory(&user_dir, SourceKind::UserGlobal).unwrap();
-        assert_eq!(reg.get("editor").unwrap().definition.display_name, "User Editor");
+        reg.load_directory(&user_dir, SourceKind::UserGlobal)
+            .unwrap();
+        assert_eq!(
+            reg.get("editor").unwrap().definition.display_name,
+            "User Editor"
+        );
 
         // Project
         let proj_dir = temp_dir("proj");
@@ -407,7 +409,8 @@ mod tests {
                 display_name = "Project Editor"
             "#,
         );
-        reg.load_directory(&proj_dir, SourceKind::ProjectLocal).unwrap();
+        reg.load_directory(&proj_dir, SourceKind::ProjectLocal)
+            .unwrap();
         assert_eq!(
             reg.get("editor").unwrap().definition.display_name,
             "Project Editor"
@@ -432,10 +435,7 @@ mod tests {
         reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
         assert!(reg.is_empty(), "no agents registered");
         assert_eq!(reg.load_errors().len(), 1);
-        assert!(matches!(
-            reg.load_errors()[0],
-            LoadError::Parse { .. }
-        ));
+        assert!(matches!(reg.load_errors()[0], LoadError::Parse { .. }));
     }
 
     #[test]
@@ -453,10 +453,7 @@ mod tests {
         reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
         assert!(reg.is_empty());
         assert_eq!(reg.load_errors().len(), 1);
-        assert!(matches!(
-            reg.load_errors()[0],
-            LoadError::Invalid { .. }
-        ));
+        assert!(matches!(reg.load_errors()[0], LoadError::Invalid { .. }));
     }
 
     #[test]
@@ -506,14 +503,20 @@ mod tests {
             write_toml(
                 &dir,
                 &format!("{id}.toml"),
-                &format!(r#"id = "{id}"
+                &format!(
+                    r#"id = "{id}"
 display_name = "{id}"
-"#),
+"#
+                ),
             );
         }
         let mut reg = AgentRegistry::new();
         reg.load_directory(&dir, SourceKind::UserGlobal).unwrap();
-        let ids: Vec<_> = reg.iter_sorted().iter().map(|a| a.definition.id.clone()).collect();
+        let ids: Vec<_> = reg
+            .iter_sorted()
+            .iter()
+            .map(|a| a.definition.id.clone())
+            .collect();
         assert_eq!(ids, vec!["alpha", "mid", "zeta"]);
     }
 
diff --git a/crates/jcode-agent-runtime/src/tier.rs b/crates/jcode-agent-runtime/src/tier.rs
index 200f511ed..33ee6288b 100644
--- a/crates/jcode-agent-runtime/src/tier.rs
+++ b/crates/jcode-agent-runtime/src/tier.rs
@@ -135,16 +135,10 @@ pub enum ResolutionSource {
     /// Used `agent.model_override` directly.
     Override(String),
     /// Used the env var backing `tier`.
-    Tier {
-        tier: ModelTier,
-        model: String,
-    },
+    Tier { tier: ModelTier, model: String },
     /// Tier was preferred but the env var was unset, so fell back to the
     /// session's current model.
-    TierFallback {
-        tier: ModelTier,
-        model: String,
-    },
+    TierFallback { tier: ModelTier, model: String },
     /// No override or tier preference; using the session's current model.
     SessionDefault(String),
 }
diff --git a/crates/jcode-agent-runtime/tests/sample_agents.rs b/crates/jcode-agent-runtime/tests/sample_agents.rs
index e850495d5..ee6ee7034 100644
--- a/crates/jcode-agent-runtime/tests/sample_agents.rs
+++ b/crates/jcode-agent-runtime/tests/sample_agents.rs
@@ -9,9 +9,7 @@
 
 use std::path::PathBuf;
 
-use jcode_agent_runtime::{
-    AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind,
-};
+use jcode_agent_runtime::{AgentRegistry, ModelTier, OutputMode, ReasoningEffort, SourceKind};
 
 /// Path to the project-root sample agents directory, relative to the
 /// crate manifest. Deliberately constructed via `CARGO_MANIFEST_DIR` so
@@ -20,7 +18,12 @@ use jcode_agent_runtime::{
 fn samples_dir() -> PathBuf {
     let crate_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
     // crates/jcode-agent-runtime → ../../ .jcode/agents
-    crate_dir.parent().unwrap().parent().unwrap().join(".jcode/agents")
+    crate_dir
+        .parent()
+        .unwrap()
+        .parent()
+        .unwrap()
+        .join(".jcode/agents")
 }
 
 #[test]
@@ -37,7 +40,11 @@ fn loads_bundled_sample_agents() {
         .load_directory(&dir, SourceKind::ProjectLocal)
         .expect("load_directory");
     assert!(n >= 2, "expected at least 2 sample agents, got {n}");
-    assert!(reg.load_errors().is_empty(), "load errors: {:?}", reg.load_errors());
+    assert!(
+        reg.load_errors().is_empty(),
+        "load errors: {:?}",
+        reg.load_errors()
+    );
 }
 
 #[test]
@@ -56,7 +63,10 @@ fn file_picker_sample_has_expected_shape() {
     assert_eq!(agent.display_name, "Fletcher the File Fetcher");
     assert_eq!(agent.prefer_tier, Some(ModelTier::Routine));
     assert_eq!(agent.reasoning, Some(ReasoningEffort::Minimal));
-    assert!(!agent.include_message_history, "file picker uses clean slate");
+    assert!(
+        !agent.include_message_history,
+        "file picker uses clean slate"
+    );
     assert!(!agent.inherit_parent_system_prompt);
     assert_eq!(agent.output_mode, OutputMode::LastMessage);
     assert!(agent.tool_names.iter().any(|t| t == "read"));
diff --git a/evals/jbench/src/agent_runner.rs b/evals/jbench/src/agent_runner.rs
index de922e71d..3763ee4c2 100644
--- a/evals/jbench/src/agent_runner.rs
+++ b/evals/jbench/src/agent_runner.rs
@@ -83,9 +83,12 @@ pub async fn run_agent_in_repo(config: AgentRunConfig) -> Result<EvalRun> {
         .current_dir(&config.repo_path)
         .envs(&env_vars)
         .args([
-            "agent", "run",
-            "--agent", &config.agent_id,
-            "--output-mode", "stream",
+            "agent",
+            "run",
+            "--agent",
+            &config.agent_id,
+            "--output-mode",
+            "stream",
             "--no-interactive",
         ])
         .stdin(Stdio::piped())
diff --git a/evals/jbench/src/bin/jbench.rs b/evals/jbench/src/bin/jbench.rs
index 90808dc60..2e54b50e7 100644
--- a/evals/jbench/src/bin/jbench.rs
+++ b/evals/jbench/src/bin/jbench.rs
@@ -9,8 +9,8 @@ use clap::{Parser, Subcommand};
 
 use jcode_jbench::{
     agent_runner::AgentRunConfig,
-    judge::{judge_with_three_models, JudgeConfig},
-    lessons::{append_lessons_to_file, extract_lessons, LessonsConfig},
+    judge::{JudgeConfig, judge_with_three_models},
+    lessons::{LessonsConfig, append_lessons_to_file, extract_lessons},
     types::{AgentEvalResults, EvalDataV2, EvalRun},
 };
 
@@ -100,16 +100,40 @@ enum Command {
 async fn main() -> Result<()> {
     let cli = Cli::parse();
     match cli.command {
-        Command::PickCommits { repo_url, min_msg_len, max_picks, output } => {
+        Command::PickCommits {
+            repo_url,
+            min_msg_len,
+            max_picks,
+            output,
+        } => {
             pick_commits_impl(&repo_url, min_msg_len, max_picks, output).await?;
         }
         Command::GenEvals { input, output } => {
             gen_evals_impl(&input, &output).await?;
         }
-        Command::Run { eval_file, agent_id, output_dir, jcode_binary, max_turns, timeout_secs } => {
-            run_impl(&eval_file, &agent_id, &output_dir, jcode_binary.as_ref(), max_turns, timeout_secs).await?;
+        Command::Run {
+            eval_file,
+            agent_id,
+            output_dir,
+            jcode_binary,
+            max_turns,
+            timeout_secs,
+        } => {
+            run_impl(
+                &eval_file,
+                &agent_id,
+                &output_dir,
+                jcode_binary.as_ref(),
+                max_turns,
+                timeout_secs,
+            )
+            .await?;
         }
-        Command::Judge { runs_dir, api_base, api_key } => {
+        Command::Judge {
+            runs_dir,
+            api_base,
+            api_key,
+        } => {
             judge_impl(&runs_dir, api_base.as_deref(), api_key.as_deref()).await?;
         }
         Command::MetaAnalyze { runs_dir, output } => {
@@ -141,8 +165,8 @@ async fn run_impl(
     timeout_secs: u64,
 ) -> Result<()> {
     use std::fs;
-    use tokio::time::timeout as tk_timeout;
     use std::time::Duration;
+    use tokio::time::timeout as tk_timeout;
 
     // Load eval data
     let eval_data: EvalDataV2 = {
@@ -199,22 +223,28 @@ async fn judge_impl(
     _api_base: Option<&str>,
     _api_key: Option<&str>,
 ) -> Result<()> {
-    todo_step("Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields")
+    todo_step(
+        "Phase 5.4: load EvalRun JSONs, call judge_with_three_models, overwrite judging fields",
+    )
 }
 
-async fn meta_analyze_impl(
-    runs_dir: &PathBuf,
-    output: Option<&PathBuf>,
-) -> Result<()> {
-    use std::fs;
+async fn meta_analyze_impl(runs_dir: &PathBuf, output: Option<&PathBuf>) -> Result<()> {
     use jcode_jbench::types::AgentEvalResults;
+    use std::fs;
 
     let mut all_runs = Vec::new();
 
     for entry in fs::read_dir(runs_dir)? {
         let entry = entry?;
         let path = entry.path();
-        if path.extension().and_then(|s| s.to_str()) == Some("run.json") {
+        // `Path::extension` returns only the trailing component (`json`),
+        // so matching against `"run.json"` never fires. Match on the full
+        // file name suffix instead.
+        let is_run_file = path
+            .file_name()
+            .and_then(|s| s.to_str())
+            .is_some_and(|s| s.ends_with(".run.json"));
+        if is_run_file {
             let text = fs::read_to_string(&path)?;
             if let Ok(run) = serde_json::from_str::<EvalRun>(&text) {
                 all_runs.push(run);
@@ -226,12 +256,13 @@ async fn meta_analyze_impl(
         anyhow::bail!("No .run.json files found in {}", runs_dir.display());
     }
 
-    let avg_score = all_runs.iter().map(|r| r.judging.overall_score).sum::<f64>()
-        / all_runs.len() as f64;
-    let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::<f64>()
+    let avg_score = all_runs
+        .iter()
+        .map(|r| r.judging.overall_score)
+        .sum::<f64>()
         / all_runs.len() as f64;
-    let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::<u64>()
-        / all_runs.len() as u64;
+    let avg_cost = all_runs.iter().map(|r| r.cost_usd).sum::<f64>() / all_runs.len() as f64;
+    let avg_duration = all_runs.iter().map(|r| r.duration_ms).sum::<u64>() / all_runs.len() as u64;
 
     let summary = AgentEvalResults {
         agent_id: "unknown".to_owned(),
diff --git a/evals/jbench/src/judge.rs b/evals/jbench/src/judge.rs
index 589e5749d..8f9437f47 100644
--- a/evals/jbench/src/judge.rs
+++ b/evals/jbench/src/judge.rs
@@ -52,10 +52,18 @@ impl JudgeProviderKind {
 /// Configuration for the judging pipeline.
 #[derive(Debug, Clone)]
 pub struct JudgeConfig {
-    /// API base URL for the judge backend (e.g. OpenAI-compatible).
+    /// API base URL for the OpenAI-compatible judge backend.
     pub api_base: String,
-    /// API key secret.
+    /// API key for the OpenAI-compatible judge backend.
     pub api_key: String,
+    /// Optional separate base URL for Anthropic-routed judges (e.g.
+    /// `https://api.anthropic.com`). Falls back to `api_base` when
+    /// `None`, which only makes sense if the OpenAI-compatible host
+    /// proxies the Anthropic Messages API too.
+    pub anthropic_api_base: Option<String>,
+    /// Optional separate API key for Anthropic-routed judges. Falls
+    /// back to `api_key` when `None`.
+    pub anthropic_api_key: Option<String>,
     /// Model IDs for the three judges. Order determines the median
     /// computation.
     pub models: [String; 3],
@@ -72,6 +80,8 @@ impl Default for JudgeConfig {
             api_base: std::env::var("JBENCH_API_BASE")
                 .unwrap_or_else(|_| "https://api.openai.com".to_owned()),
             api_key: std::env::var("JBENCH_API_KEY").unwrap_or_default(),
+            anthropic_api_base: std::env::var("JBENCH_ANTHROPIC_API_BASE").ok(),
+            anthropic_api_key: std::env::var("JBENCH_ANTHROPIC_API_KEY").ok(),
             models: [
                 "gpt-5-2026-05".to_owned(),
                 "google/gemini-3.1-pro".to_owned(),
@@ -84,17 +94,15 @@ impl Default for JudgeConfig {
 }
 
 /// Render the full judge prompt from commit + diff + context.
-fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &HashMap<String, String>) -> String {
+fn render_judge_prompt(
+    commit: &EvalCommit,
+    agent_diff: &str,
+    context_files: &HashMap<String, String>,
+) -> String {
     let ground_truth_diffs = commit
         .file_diffs
         .iter()
-        .map(|fd| {
-            format!(
-                "### {}\n```diff\n{}\n```",
-                fd.path,
-                fd.diff
-            )
-        })
+        .map(|fd| format!("### {}\n```diff\n{}\n```", fd.path, fd.diff))
         .collect::<Vec<_>>()
         .join("\n\n");
 
@@ -106,10 +114,7 @@ fn render_judge_prompt(commit: &EvalCommit, agent_diff: &str, context_files: &Ha
 
     format!(
         "## User Prompt (What the agent was asked to do)\n{}\n\n## Context Files (from parent commit)\n{}\n\n## Ground Truth Changes (One valid implementation)\n{}\n\n## Agent's Changes (What the agent actually did)\n```diff\n{}\n```",
-        commit.prompt,
-        context_content,
-        ground_truth_diffs,
-        agent_diff
+        commit.prompt, context_content, ground_truth_diffs, agent_diff
     )
 }
 
@@ -168,12 +173,18 @@ struct JudgeResponse {
 
 /// Invoke a single judge model with a fully-rendered prompt.
 ///
+/// `anthropic_api_base` / `anthropic_api_key` are only consulted when
+/// the model routes through `JudgeProviderKind::Anthropic`; OpenAI-bound
+/// requests always use the primary `api_base` / `api_key`.
+///
 /// Design source: `/tmp/codebuff/evals/buffbench/judge.ts` (`runSingleJudge`).
 pub async fn run_single_judge(
     model: &str,
     prompt: &str,
     api_base: &str,
     api_key: &str,
+    anthropic_api_base: Option<&str>,
+    anthropic_api_key: Option<&str>,
     http_client: &Client,
 ) -> Result<Scorecard> {
     let kind = JudgeProviderKind::for_model(model);
@@ -182,7 +193,12 @@ pub async fn run_single_judge(
     if kind == JudgeProviderKind::OpenAI {
         run_openai_judge(model, prompt, system, api_base, api_key, http_client).await
     } else {
-        run_anthropic_judge(model, prompt, system, api_base, api_key, http_client).await
+        // Fall back to the primary host/key only if no Anthropic-specific
+        // overrides were configured. The caller is expected to set both
+        // overrides when targeting `api.anthropic.com` directly.
+        let base = anthropic_api_base.unwrap_or(api_base);
+        let key = anthropic_api_key.unwrap_or(api_key);
+        run_anthropic_judge(model, prompt, system, base, key, http_client).await
     }
 }
 
@@ -292,10 +308,14 @@ async fn run_anthropic_judge(
         },
     });
 
+    // Anthropic Messages API authenticates via `x-api-key`, not
+    // `Authorization: Bearer ...`. Using the wrong header returns 401
+    // even with a valid key, which previously made this branch
+    // permanently dead.
     let url = format!("{api_base}/v1/messages");
     let response = http_client
         .post(&url)
-        .header("Authorization", format!("Bearer {api_key}"))
+        .header("x-api-key", api_key)
         .header("Content-Type", "application/json")
         .header("anthropic-version", "2023-06-01")
         .json(&request_body)
@@ -318,15 +338,14 @@ async fn run_anthropic_judge(
         .and_then(|t| t.as_str())
         .unwrap_or_default();
 
-    let parsed = serde_json::from_str::<serde_json::Value>(text)
-        .unwrap_or(serde_json::json!({
-            "analysis": text.to_owned(),
-            "strengths": [],
-            "weaknesses": ["Could not parse structured output from Anthropic judge"],
-            "completionScore": 0,
-            "codeQualityScore": 0,
-            "overallScore": 0
-        }));
+    let parsed = serde_json::from_str::<serde_json::Value>(text).unwrap_or(serde_json::json!({
+        "analysis": text.to_owned(),
+        "strengths": [],
+        "weaknesses": ["Could not parse structured output from Anthropic judge"],
+        "completionScore": 0,
+        "codeQualityScore": 0,
+        "overallScore": 0
+    }));
 
     parse_scorecard(parsed)
 }
@@ -365,22 +384,21 @@ pub async fn judge_with_three_models(
                 &prompt,
                 &config.api_base,
                 &config.api_key,
+                config.anthropic_api_base.as_deref(),
+                config.anthropic_api_key.as_deref(),
                 http,
             )
         })
         .collect();
 
     // Run all three judges in parallel with an overall timeout
-    let valid: Vec<Scorecard> = timeout(
-        timeout_duration,
-        futures::future::join_all(judge_futures),
-    )
-    .await
-    .ok()
-    .into_iter()           // IntoIterator<Item = Vec<Result<Scorecard>>>
-    .flatten()            // Iterator<Item = Result<Scorecard>>
-    .filter_map(|r| r.ok())
-    .collect();
+    let valid: Vec<Scorecard> = timeout(timeout_duration, futures::future::join_all(judge_futures))
+        .await
+        .ok()
+        .into_iter() // IntoIterator<Item = Vec<Result<Scorecard>>>
+        .flatten() // Iterator<Item = Result<Scorecard>>
+        .filter_map(|r| r.ok())
+        .collect();
 
     if valid.len() < MIN_JUDGE_SUCCESS_COUNT {
         return Ok(Scorecard {
@@ -390,11 +408,7 @@ pub async fn judge_with_three_models(
                 3
             ),
             strengths: vec![],
-            weaknesses: vec![format!(
-                "Only {}/{} judges succeeded",
-                valid.len(),
-                3
-            )],
+            weaknesses: vec![format!("Only {}/{} judges succeeded", valid.len(), 3)],
             completion_score: 0.0,
             code_quality_score: 0.0,
             overall_score: 0.0,
@@ -453,4 +467,40 @@ mod tests {
             JudgeProviderKind::Anthropic
         );
     }
+
+    /// Locks the wire-format contract: the LLM judge returns camelCase
+    /// (`completionScore`, etc.) per the request schema. Deserialization
+    /// must accept that even though the on-disk JSON form is snake_case.
+    #[test]
+    fn parse_scorecard_accepts_camelcase_from_llm() {
+        let camel = serde_json::json!({
+            "analysis": "looks good",
+            "strengths": ["clean diff"],
+            "weaknesses": [],
+            "completionScore": 8.5,
+            "codeQualityScore": 7.0,
+            "overallScore": 7.8
+        });
+        let parsed = parse_scorecard(camel).expect("camelCase must deserialize");
+        assert_eq!(parsed.completion_score, 8.5);
+        assert_eq!(parsed.code_quality_score, 7.0);
+        assert_eq!(parsed.overall_score, 7.8);
+    }
+
+    /// snake_case (on-disk eval JSON) must round-trip as well.
+    #[test]
+    fn parse_scorecard_accepts_snake_case_from_disk() {
+        let snake = serde_json::json!({
+            "analysis": "",
+            "strengths": [],
+            "weaknesses": [],
+            "completion_score": 1.0,
+            "code_quality_score": 2.0,
+            "overall_score": 3.0
+        });
+        let parsed = parse_scorecard(snake).expect("snake_case must deserialize");
+        assert_eq!(parsed.completion_score, 1.0);
+        assert_eq!(parsed.code_quality_score, 2.0);
+        assert_eq!(parsed.overall_score, 3.0);
+    }
 }
diff --git a/evals/jbench/src/lessons.rs b/evals/jbench/src/lessons.rs
index 31bf1b661..f9cc09d06 100644
--- a/evals/jbench/src/lessons.rs
+++ b/evals/jbench/src/lessons.rs
@@ -245,16 +245,15 @@ pub fn append_lessons_to_file(
     }
 
     if !lessons_dir.exists() {
-        fs::create_dir_all(lessons_dir)
-            .context("failed to create lessons directory")?;
+        fs::create_dir_all(lessons_dir).context("failed to create lessons directory")?;
     }
 
     let safe_id = agent_id.replace(|c: char| !c.is_alphanumeric() && c != '-' && c != '_', "_");
     let file_path = lessons_dir.join(format!("{safe_id}.json"));
 
     let existing: Vec<Lesson> = if file_path.exists() {
-        let contents = fs::read_to_string(&file_path)
-            .context("failed to read existing lessons file")?;
+        let contents =
+            fs::read_to_string(&file_path).context("failed to read existing lessons file")?;
         serde_json::from_str(&contents).unwrap_or_default()
     } else {
         Vec::new()
@@ -265,11 +264,9 @@ pub fn append_lessons_to_file(
         .chain(lessons.iter().cloned())
         .collect();
 
-    let json = serde_json::to_string_pretty(&all_lessons)
-        .context("failed to serialize lessons")?;
+    let json = serde_json::to_string_pretty(&all_lessons).context("failed to serialize lessons")?;
 
-    fs::write(&file_path, json)
-        .context("failed to write lessons file")?;
+    fs::write(&file_path, json).context("failed to write lessons file")?;
 
     Ok(())
 }
diff --git a/evals/jbench/src/lib.rs b/evals/jbench/src/lib.rs
index 97d0eb7c1..36363dc5b 100644
--- a/evals/jbench/src/lib.rs
+++ b/evals/jbench/src/lib.rs
@@ -18,7 +18,7 @@ pub mod judge;
 pub mod lessons;
 pub mod types;
 
-pub use types::{EvalCommit, EvalDataV2, EvalRun, JudgingResult, AgentEvalResults};
 pub use agent_runner::AgentRunConfig;
 pub use judge::JudgeConfig;
 pub use lessons::LessonsConfig;
+pub use types::{AgentEvalResults, EvalCommit, EvalDataV2, EvalRun, JudgingResult};
diff --git a/evals/jbench/src/types.rs b/evals/jbench/src/types.rs
index 1cb51e17e..39d4645c5 100644
--- a/evals/jbench/src/types.rs
+++ b/evals/jbench/src/types.rs
@@ -112,6 +112,12 @@ pub struct EvalDataV2 {
 /// All three score fields are on the same `[0.0, 10.0]` scale; `f64` is
 /// used so we can also store the *averaged* per-dimension scores when
 /// aggregating multiple judges (see `judge::judge_with_three_models`).
+///
+/// On-disk JSON stays `snake_case` to match the rest of jcode's eval
+/// outputs, but each score field also accepts the `camelCase` spelling
+/// (`completionScore`, etc.) via `serde(alias = ...)` so we can
+/// deserialize LLM judge responses directly without an intermediate
+/// wire-format struct.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct JudgingResult {
     /// Free-form prose comparing the agent's diff to the ground truth.
@@ -121,10 +127,13 @@ pub struct JudgingResult {
     /// Bullet-point weaknesses called out by the judge.
     pub weaknesses: Vec<String>,
     /// How completely the prompt was addressed, `[0.0, 10.0]`.
+    #[serde(alias = "completionScore")]
     pub completion_score: f64,
     /// Code structure / maintainability, `[0.0, 10.0]`.
+    #[serde(alias = "codeQualityScore")]
     pub code_quality_score: f64,
     /// Combined assessment, `[0.0, 10.0]`. JBench's canonical metric.
+    #[serde(alias = "overallScore")]
     pub overall_score: f64,
 }
 
diff --git a/evals/jbench/tests/types.rs b/evals/jbench/tests/types.rs
index 2a8efd02e..fcaa832fb 100644
--- a/evals/jbench/tests/types.rs
+++ b/evals/jbench/tests/types.rs
@@ -4,9 +4,7 @@
 //! and write, and they fail loudly if anyone changes a field's
 //! `snake_case` name without updating consumers.
 
-use jcode_jbench::types::{
-    EvalCommit, FileDiff, FileDiffStatus, JudgingResult,
-};
+use jcode_jbench::types::{EvalCommit, FileDiff, FileDiffStatus, JudgingResult};
 
 #[test]
 fn eval_commit_round_trips_through_json() {
diff --git a/src/agent/prompting.rs b/src/agent/prompting.rs
index ba9719985..6107a314f 100644
--- a/src/agent/prompting.rs
+++ b/src/agent/prompting.rs
@@ -122,7 +122,6 @@ impl Agent {
     }
 }
 
-
 /// Wrap a step prompt body in `<system_reminder>...</system_reminder>` tags.
 ///
 /// Step prompts are emitted by the harness (not typed by the user), but they
diff --git a/src/prompt_placeholders.rs b/src/prompt_placeholders.rs
index 68cee139a..635beb8cc 100644
--- a/src/prompt_placeholders.rs
+++ b/src/prompt_placeholders.rs
@@ -77,7 +77,12 @@ fn truncate_chars(s: &str, max_chars: usize) -> String {
 ///
 /// Length caps documented on [`PlaceholderContext`] are enforced here, so
 /// callers may pass un-truncated input and trust the output to be bounded.
-pub fn substitute_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String {
+///
+/// This is the **context-driven** substitution path used for built-in
+/// Phase 4 placeholders. For user-supplied template bindings (arbitrary
+/// `HashMap<String, String>`), use
+/// [`crate::prompt_templates::substitute_placeholders`] instead.
+pub fn substitute_context_placeholders(prompt: &str, ctx: &PlaceholderContext) -> String {
     if prompt.is_empty() {
         return String::new();
     }
@@ -123,11 +128,8 @@ mod tests {
                      k=[{{KNOWLEDGE_FILES}}] git=[{{GIT_CHANGES}}] \
                      date=[{{CURRENT_DATE}}] steps=[{{REMAINING_STEPS}}] \
                      sys=[{{SYSTEM_INFO}}]";
-        let out = substitute_placeholders(input, &ctx);
-        assert_eq!(
-            out,
-            "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]"
-        );
+        let out = substitute_context_placeholders(input, &ctx);
+        assert_eq!(out, "tree=[] full=[] k=[] git=[] date=[] steps=[] sys=[]");
     }
 
     #[test]
@@ -136,11 +138,11 @@ mod tests {
             current_date: "2026-05-25".to_string(),
             ..Default::default()
         };
-        let out = substitute_placeholders("today is {{CURRENT_DATE}}.", &ctx);
+        let out = substitute_context_placeholders("today is {{CURRENT_DATE}}.", &ctx);
         assert_eq!(out, "today is 2026-05-25.");
 
         // Unrelated placeholder stays empty in the same call.
-        let out2 = substitute_placeholders(
+        let out2 = substitute_context_placeholders(
             "date={{CURRENT_DATE}} steps={{REMAINING_STEPS}}",
             &ctx,
         );
@@ -161,7 +163,7 @@ mod tests {
                      {{KNOWLEDGE_FILES}}\n\n## Meta\n\
                      date={{CURRENT_DATE}} steps={{REMAINING_STEPS}} \
                      sys={{SYSTEM_INFO}}";
-        let out = substitute_placeholders(input, &ctx);
+        let out = substitute_context_placeholders(input, &ctx);
         let expected = "## Tree\nsrc/\n  lib.rs\n\n## Knowledge\n\
                         AGENTS.md contents\n\n## Meta\n\
                         date=2026-05-25 steps=7 sys=linux x86_64";
@@ -176,7 +178,7 @@ mod tests {
         };
         let input = "known={{CURRENT_DATE}} unknown={{NOT_A_REAL_TOKEN}} \
                      other={{ALSO_BOGUS}}";
-        let out = substitute_placeholders(input, &ctx);
+        let out = substitute_context_placeholders(input, &ctx);
         assert_eq!(
             out,
             "known=2026-05-25 unknown={{NOT_A_REAL_TOKEN}} other={{ALSO_BOGUS}}"
@@ -191,7 +193,7 @@ mod tests {
             file_tree_small: big.clone(),
             ..Default::default()
         };
-        let out = substitute_placeholders("[{{FILE_TREE_SMALL}}]", &ctx);
+        let out = substitute_context_placeholders("[{{FILE_TREE_SMALL}}]", &ctx);
         // Two bracket characters plus the cap.
         assert_eq!(out.chars().count(), FILE_TREE_SMALL_MAX_CHARS + 2);
         assert!(out.starts_with('['));
diff --git a/src/tui/app/commands.rs b/src/tui/app/commands.rs
index 536d50231..4238e65fe 100644
--- a/src/tui/app/commands.rs
+++ b/src/tui/app/commands.rs
@@ -1925,7 +1925,8 @@ pub(super) fn handle_session_command(app: &mut App, trimmed: &str) -> bool {
             Ok(out) if out.status.success() => {
                 let _ = std::fs::remove_file(&tmp_path);
                 let url = String::from_utf8_lossy(&out.stdout)
-                    .lines().rfind(|l| l.starts_with("https://"))
+                    .lines()
+                    .rfind(|l| l.starts_with("https://"))
                     .unwrap_or("")
                     .trim()
                     .to_string();
diff --git a/tests/tool_fixtures.rs b/tests/tool_fixtures.rs
index 9a7d98e97..6c7fc0318 100644
--- a/tests/tool_fixtures.rs
+++ b/tests/tool_fixtures.rs
@@ -105,9 +105,10 @@ fn collect_fixtures() -> Vec<(String, Fixture)> {
             .unwrap_or("")
             .to_string();
         if let Some(needle) = filter.as_deref()
-            && !stem.contains(needle) {
-                continue;
-            }
+            && !stem.contains(needle)
+        {
+            continue;
+        }
         let raw = std::fs::read_to_string(&path).expect("read fixture");
         let fixture: Fixture =
             serde_json::from_str(&raw).unwrap_or_else(|e| panic!("parse fixture {}: {}", stem, e));