From 81c586b8735f39334c604e70b03e6b4e42a0dcb8 Mon Sep 17 00:00:00 2001 From: weich97 <25754285+weich97@users.noreply.github.com> Date: Mon, 15 Jun 2026 11:12:08 +0800 Subject: [PATCH] Add FinAudit recall confidence intervals from 3-sample experiment - run_audit_eval.py: surface the exception message (not just the type) on a failed task so balance-exhaustion (402) is visible to wrappers. - render_finaudit_figures.py: draw 95% Wilson intervals on the difficulty-tier bars when the CI experiment is present, using its temperature-0.7 point estimates for consistency. - audit_eval_ci_wilson.csv: per-(model, tier) recall with Wilson 95% intervals over 6 models x 100 tasks x 3 samples. The difficulty inversion is confirmed: strong auditors' L1 intervals (lower bound >= 0.7257) are disjoint from weak auditors' (upper bound <= 0.604). --- .../audit_eval_ci/audit_eval_ci_wilson.csv | 25 +++++++++++++ scripts/render_finaudit_figures.py | 36 +++++++++++++++++-- scripts/run_audit_eval.py | 2 +- 3 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 docs/results/audit_eval_ci/audit_eval_ci_wilson.csv diff --git a/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv b/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv new file mode 100644 index 00000000..cd2391a3 --- /dev/null +++ b/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv @@ -0,0 +1,25 @@ +model,slice,trials,recall,ci_low,ci_high +poe:gpt-5.5,ALL,300,1.0,0.9874,1.0 +poe:gpt-5.5,diff:L1,75,1.0,0.9513,1.0 +poe:gpt-5.5,diff:L2,150,1.0,0.975,1.0 +poe:gpt-5.5,diff:L3,75,1.0,0.9513,1.0 +poe:claude-opus-4.7,ALL,300,0.9533,0.9232,0.972 +poe:claude-opus-4.7,diff:L1,75,0.8267,0.7257,0.8958 +poe:claude-opus-4.7,diff:L2,150,0.9933,0.9632,0.9988 +poe:claude-opus-4.7,diff:L3,75,1.0,0.9513,1.0 +poe:gemini-3.1-pro,ALL,300,0.9567,0.9273,0.9745 +poe:gemini-3.1-pro,diff:L1,75,0.8267,0.7257,0.8958 +poe:gemini-3.1-pro,diff:L2,150,1.0,0.975,1.0 +poe:gemini-3.1-pro,diff:L3,75,1.0,0.9513,1.0 
+deepseek:deepseek-v4-pro,ALL,300,0.7033,0.6493,0.7522 +deepseek:deepseek-v4-pro,diff:L1,75,0.4933,0.3833,0.604 +deepseek:deepseek-v4-pro,diff:L2,150,0.7933,0.7216,0.8504 +deepseek:deepseek-v4-pro,diff:L3,75,0.7333,0.6237,0.8202 +glm:glm-5,ALL,300,0.7,0.6459,0.7491 +glm:glm-5,diff:L1,75,0.3733,0.2726,0.4865 +glm:glm-5,diff:L2,150,0.8,0.7289,0.8562 +glm:glm-5,diff:L3,75,0.8267,0.7257,0.8958 +poe:glm-5,ALL,300,0.6567,0.6013,0.7081 +poe:glm-5,diff:L1,75,0.28,0.191,0.3904 +poe:glm-5,diff:L2,150,0.7667,0.6928,0.8272 +poe:glm-5,diff:L3,75,0.8133,0.7107,0.8854 diff --git a/scripts/render_finaudit_figures.py b/scripts/render_finaudit_figures.py index 7b6c2a66..bb9946ff 100644 --- a/scripts/render_finaudit_figures.py +++ b/scripts/render_finaudit_figures.py @@ -55,14 +55,42 @@ def _ordered_models(table: dict[str, dict[str, float]]) -> list[str]: return sorted((m for m in table if m in LABEL), key=lambda m: -table[m].get("ALL", 0.0)) -def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path) -> Path: +def _load_ci(ci_path: Path) -> dict[tuple[str, str], tuple[float, float, float]]: + """(model, slice) -> (recall, ci_low, ci_high) from the Wilson CSV, if present.""" + if not ci_path.exists(): + return {} + out: dict[tuple[str, str], tuple[float, float, float]] = {} + with ci_path.open(encoding="utf-8") as handle: + for row in csv.DictReader(handle): + out[(row["model"], row["slice"])] = (float(row["recall"]), float(row["ci_low"]), float(row["ci_high"])) + return out + + +def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path, ci: dict | None = None) -> Path: + # When the 3-sample CI experiment is available, bars and error bars both + # come from it (one consistent measurement); otherwise fall back to the + # deterministic summary with no error bars. 
+ ci = ci or {} + ci_slice = {"difficulty:L1": "diff:L1", "difficulty:L2": "diff:L2", "difficulty:L3": "diff:L3"} models = _ordered_models(table) fig, ax = plt.subplots(figsize=(6.6, 3.0)) width = 0.26 colors = ["#c44e52", "#55a868", "#4c72b0"] for j, (slice_key, label) in enumerate(DIFF): xs = [i + (j - 1) * width for i in range(len(models))] - ax.bar(xs, [table[m].get(slice_key, 0.0) for m in models], width, label=label, color=colors[j]) + heights, lo, hi = [], [], [] + for m in models: + trip = ci.get((m, ci_slice[slice_key])) if ci else None + if trip: + heights.append(trip[0]) + lo.append(max(0.0, trip[0] - trip[1])) + hi.append(max(0.0, trip[2] - trip[0])) + else: + heights.append(table[m].get(slice_key, 0.0)) + lo.append(0.0) + hi.append(0.0) + errs = [lo, hi] if ci else None + ax.bar(xs, heights, width, label=label, color=colors[j], yerr=errs, capsize=2, error_kw={"linewidth": 0.8}) ax.set_xticks(range(len(models)), [LABEL[m] for m in models], rotation=20, ha="right", fontsize=8) ax.set_ylabel("Recall", fontsize=9) ax.set_ylim(0, 1.05) @@ -100,13 +128,15 @@ def render_kind_heatmap(table: dict[str, dict[str, float]], output_dir: Path) -> def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description="Render FinAudit paper figures.") parser.add_argument("--summary", default="docs/results/audit_eval_pilot/audit_eval_summary.csv") + parser.add_argument("--ci", default="docs/results/audit_eval_ci/audit_eval_ci_wilson.csv") parser.add_argument("--output-dir", default="paper/finaudit/figures") args = parser.parse_args(argv) summary = ROOT / args.summary if not Path(args.summary).is_absolute() else Path(args.summary) + ci_path = ROOT / args.ci if not Path(args.ci).is_absolute() else Path(args.ci) output_dir = ROOT / args.output_dir if not Path(args.output_dir).is_absolute() else Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) table = _load(summary) - print("wrote", render_difficulty_bars(table, output_dir)) + 
print("wrote", render_difficulty_bars(table, output_dir, _load_ci(ci_path))) print("wrote", render_kind_heatmap(table, output_dir)) return 0 diff --git a/scripts/run_audit_eval.py b/scripts/run_audit_eval.py index c39a396d..0f1462f8 100644 --- a/scripts/run_audit_eval.py +++ b/scripts/run_audit_eval.py @@ -257,7 +257,7 @@ def main(argv: list[str] | None = None) -> int: sample=sample, temperature=args.temperature, ) except Exception as exc: # provider failures should not lose the run - print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}", file=sys.stderr, flush=True) + print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}: {exc}", file=sys.stderr, flush=True) continue findings = parse_findings(response) scores = score_findings(findings, [truth])