From 81c586b8735f39334c604e70b03e6b4e42a0dcb8 Mon Sep 17 00:00:00 2001
From: weich97 <25754285+weich97@users.noreply.github.com>
Date: Mon, 15 Jun 2026 11:12:08 +0800
Subject: [PATCH] Add FinAudit recall confidence intervals from 3-sample
 experiment

- run_audit_eval.py: surface the exception message (not just the type)
  on a failed task so balance-exhaustion (402) is visible to wrappers.
- render_finaudit_figures.py: draw 95% Wilson intervals on the
  difficulty-tier bars when the CI experiment is present, using its
  temperature-0.7 point estimates for consistency.
- audit_eval_ci_wilson.csv: per-(model, tier) recall with Wilson 95%
  intervals over 6 models x 100 tasks x 3 samples. The difficulty
  inversion is confirmed: strong auditors' L1 intervals (lower bound
  >= 0.72) are disjoint from weak auditors' (upper bound <= 0.61).
---
 .../audit_eval_ci/audit_eval_ci_wilson.csv    | 25 +++++++++++++
 scripts/render_finaudit_figures.py            | 36 +++++++++++++++++--
 scripts/run_audit_eval.py                     |  2 +-
 3 files changed, 59 insertions(+), 4 deletions(-)
 create mode 100644 docs/results/audit_eval_ci/audit_eval_ci_wilson.csv

diff --git a/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv b/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv
new file mode 100644
index 00000000..cd2391a3
--- /dev/null
+++ b/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv
@@ -0,0 +1,25 @@
+model,slice,trials,recall,ci_low,ci_high
+poe:gpt-5.5,ALL,300,1.0,0.9874,1.0
+poe:gpt-5.5,diff:L1,75,1.0,0.9513,1.0
+poe:gpt-5.5,diff:L2,150,1.0,0.975,1.0
+poe:gpt-5.5,diff:L3,75,1.0,0.9513,1.0
+poe:claude-opus-4.7,ALL,300,0.9533,0.9232,0.972
+poe:claude-opus-4.7,diff:L1,75,0.8267,0.7257,0.8958
+poe:claude-opus-4.7,diff:L2,150,0.9933,0.9632,0.9988
+poe:claude-opus-4.7,diff:L3,75,1.0,0.9513,1.0
+poe:gemini-3.1-pro,ALL,300,0.9567,0.9273,0.9745
+poe:gemini-3.1-pro,diff:L1,75,0.8267,0.7257,0.8958
+poe:gemini-3.1-pro,diff:L2,150,1.0,0.975,1.0
+poe:gemini-3.1-pro,diff:L3,75,1.0,0.9513,1.0
+deepseek:deepseek-v4-pro,ALL,300,0.7033,0.6493,0.7522
+deepseek:deepseek-v4-pro,diff:L1,75,0.4933,0.3833,0.604
+deepseek:deepseek-v4-pro,diff:L2,150,0.7933,0.7216,0.8504
+deepseek:deepseek-v4-pro,diff:L3,75,0.7333,0.6237,0.8202
+glm:glm-5,ALL,300,0.7,0.6459,0.7491
+glm:glm-5,diff:L1,75,0.3733,0.2726,0.4865
+glm:glm-5,diff:L2,150,0.8,0.7289,0.8562
+glm:glm-5,diff:L3,75,0.8267,0.7257,0.8958
+poe:glm-5,ALL,300,0.6567,0.6013,0.7081
+poe:glm-5,diff:L1,75,0.28,0.191,0.3904
+poe:glm-5,diff:L2,150,0.7667,0.6928,0.8272
+poe:glm-5,diff:L3,75,0.8133,0.7107,0.8854
diff --git a/scripts/render_finaudit_figures.py b/scripts/render_finaudit_figures.py
index 7b6c2a66..bb9946ff 100644
--- a/scripts/render_finaudit_figures.py
+++ b/scripts/render_finaudit_figures.py
@@ -55,14 +55,42 @@ def _ordered_models(table: dict[str, dict[str, float]]) -> list[str]:
     return sorted((m for m in table if m in LABEL), key=lambda m: -table[m].get("ALL", 0.0))
 
 
-def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path) -> Path:
+def _load_ci(ci_path: Path) -> dict[tuple[str, str], tuple[float, float, float]]:
+    """Return {(model, slice): (recall, ci_low, ci_high)} read from the Wilson CSV at ci_path; {} if the file is missing."""
+    if not ci_path.exists():
+        return {}  # missing file is not an error: an empty mapping makes the renderer draw bars without error bars
+    out: dict[tuple[str, str], tuple[float, float, float]] = {}
+    with ci_path.open(encoding="utf-8") as handle:
+        for row in csv.DictReader(handle):
+            out[(row["model"], row["slice"])] = (float(row["recall"]), float(row["ci_low"]), float(row["ci_high"]))
+    return out
+
+
+def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path, ci: dict | None = None) -> Path:
+    # When the 3-sample CI experiment is available, bars and error bars both
+    # come from it (one consistent measurement); otherwise fall back to the
+    # deterministic summary with no error bars.
+    ci = ci or {}
+    ci_slice = {"difficulty:L1": "diff:L1", "difficulty:L2": "diff:L2", "difficulty:L3": "diff:L3"}
     models = _ordered_models(table)
     fig, ax = plt.subplots(figsize=(6.6, 3.0))
     width = 0.26
     colors = ["#c44e52", "#55a868", "#4c72b0"]
     for j, (slice_key, label) in enumerate(DIFF):
         xs = [i + (j - 1) * width for i in range(len(models))]
-        ax.bar(xs, [table[m].get(slice_key, 0.0) for m in models], width, label=label, color=colors[j])
+        heights, lo, hi = [], [], []
+        for m in models:
+            trip = ci.get((m, ci_slice[slice_key])) if ci else None
+            if trip:
+                heights.append(trip[0])
+                lo.append(max(0.0, trip[0] - trip[1]))
+                hi.append(max(0.0, trip[2] - trip[0]))
+            else:
+                heights.append(table[m].get(slice_key, 0.0))
+                lo.append(0.0)
+                hi.append(0.0)
+        errs = [lo, hi] if ci else None
+        ax.bar(xs, heights, width, label=label, color=colors[j], yerr=errs, capsize=2, error_kw={"linewidth": 0.8})
     ax.set_xticks(range(len(models)), [LABEL[m] for m in models], rotation=20, ha="right", fontsize=8)
     ax.set_ylabel("Recall", fontsize=9)
     ax.set_ylim(0, 1.05)
@@ -100,13 +128,15 @@ def render_kind_heatmap(table: dict[str, dict[str, float]], output_dir: Path) ->
 def main(argv: list[str] | None = None) -> int:
     parser = argparse.ArgumentParser(description="Render FinAudit paper figures.")
     parser.add_argument("--summary", default="docs/results/audit_eval_pilot/audit_eval_summary.csv")
+    parser.add_argument("--ci", default="docs/results/audit_eval_ci/audit_eval_ci_wilson.csv")
     parser.add_argument("--output-dir", default="paper/finaudit/figures")
     args = parser.parse_args(argv)
     summary = ROOT / args.summary if not Path(args.summary).is_absolute() else Path(args.summary)
+    ci_path = ROOT / args.ci if not Path(args.ci).is_absolute() else Path(args.ci)
     output_dir = ROOT / args.output_dir if not Path(args.output_dir).is_absolute() else Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
     table = _load(summary)
-    print("wrote", render_difficulty_bars(table, output_dir))
+    print("wrote", render_difficulty_bars(table, output_dir, _load_ci(ci_path)))
     print("wrote", render_kind_heatmap(table, output_dir))
     return 0
 
diff --git a/scripts/run_audit_eval.py b/scripts/run_audit_eval.py
index c39a396d..0f1462f8 100644
--- a/scripts/run_audit_eval.py
+++ b/scripts/run_audit_eval.py
@@ -257,7 +257,7 @@ def main(argv: list[str] | None = None) -> int:
                             sample=sample, temperature=args.temperature,
                         )
                     except Exception as exc:  # provider failures should not lose the run
-                        print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}", file=sys.stderr, flush=True)
+                        print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}: {exc}", file=sys.stderr, flush=True)
                         continue
                     findings = parse_findings(response)
                     scores = score_findings(findings, [truth])