Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions docs/results/audit_eval_ci/audit_eval_ci_wilson.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
model,slice,trials,recall,ci_low,ci_high
poe:gpt-5.5,ALL,300,1.0,0.9874,1.0
poe:gpt-5.5,diff:L1,75,1.0,0.9513,1.0
poe:gpt-5.5,diff:L2,150,1.0,0.975,1.0
poe:gpt-5.5,diff:L3,75,1.0,0.9513,1.0
poe:claude-opus-4.7,ALL,300,0.9533,0.9232,0.972
poe:claude-opus-4.7,diff:L1,75,0.8267,0.7257,0.8958
poe:claude-opus-4.7,diff:L2,150,0.9933,0.9632,0.9988
poe:claude-opus-4.7,diff:L3,75,1.0,0.9513,1.0
poe:gemini-3.1-pro,ALL,300,0.9567,0.9273,0.9745
poe:gemini-3.1-pro,diff:L1,75,0.8267,0.7257,0.8958
poe:gemini-3.1-pro,diff:L2,150,1.0,0.975,1.0
poe:gemini-3.1-pro,diff:L3,75,1.0,0.9513,1.0
deepseek:deepseek-v4-pro,ALL,300,0.7033,0.6493,0.7522
deepseek:deepseek-v4-pro,diff:L1,75,0.4933,0.3833,0.604
deepseek:deepseek-v4-pro,diff:L2,150,0.7933,0.7216,0.8504
deepseek:deepseek-v4-pro,diff:L3,75,0.7333,0.6237,0.8202
glm:glm-5,ALL,300,0.7,0.6459,0.7491
glm:glm-5,diff:L1,75,0.3733,0.2726,0.4865
glm:glm-5,diff:L2,150,0.8,0.7289,0.8562
glm:glm-5,diff:L3,75,0.8267,0.7257,0.8958
poe:glm-5,ALL,300,0.6567,0.6013,0.7081
poe:glm-5,diff:L1,75,0.28,0.191,0.3904
poe:glm-5,diff:L2,150,0.7667,0.6928,0.8272
poe:glm-5,diff:L3,75,0.8133,0.7107,0.8854
36 changes: 33 additions & 3 deletions scripts/render_finaudit_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,42 @@ def _ordered_models(table: dict[str, dict[str, float]]) -> list[str]:
return sorted((m for m in table if m in LABEL), key=lambda m: -table[m].get("ALL", 0.0))


def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path) -> Path:
def _load_ci(ci_path: Path) -> dict[tuple[str, str], tuple[float, float, float]]:
    """Load Wilson confidence intervals keyed by ``(model, slice)``.

    Args:
        ci_path: Path to a CSV file with ``model``, ``slice``, ``recall``,
            ``ci_low`` and ``ci_high`` columns (extra columns are ignored).

    Returns:
        Mapping of ``(model, slice)`` to ``(recall, ci_low, ci_high)``.
        Empty when ``ci_path`` is not an existing regular file, so callers
        can treat the CI data as optional.

    Raises:
        KeyError: If the CSV header lacks one of the required columns.
        ValueError: If a numeric field cannot be parsed as a float.
    """
    # is_file() rather than exists(): a directory at this path should count
    # as "no CI data" instead of failing inside open().
    if not ci_path.is_file():
        return {}
    # newline="" is what the csv module expects so that quoted fields with
    # embedded newlines are parsed correctly on every platform.
    with ci_path.open(encoding="utf-8", newline="") as handle:
        return {
            (row["model"], row["slice"]): (
                float(row["recall"]),
                float(row["ci_low"]),
                float(row["ci_high"]),
            )
            for row in csv.DictReader(handle)
        }


def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path, ci: dict | None = None) -> Path:
# When the 3-sample CI experiment is available, bars and error bars both
# come from it (one consistent measurement); otherwise fall back to the
# deterministic summary with no error bars.
ci = ci or {}
ci_slice = {"difficulty:L1": "diff:L1", "difficulty:L2": "diff:L2", "difficulty:L3": "diff:L3"}
models = _ordered_models(table)
fig, ax = plt.subplots(figsize=(6.6, 3.0))
width = 0.26
colors = ["#c44e52", "#55a868", "#4c72b0"]
for j, (slice_key, label) in enumerate(DIFF):
xs = [i + (j - 1) * width for i in range(len(models))]
ax.bar(xs, [table[m].get(slice_key, 0.0) for m in models], width, label=label, color=colors[j])
heights, lo, hi = [], [], []
for m in models:
trip = ci.get((m, ci_slice[slice_key])) if ci else None
if trip:
heights.append(trip[0])
lo.append(max(0.0, trip[0] - trip[1]))
hi.append(max(0.0, trip[2] - trip[0]))
else:
heights.append(table[m].get(slice_key, 0.0))
lo.append(0.0)
hi.append(0.0)
errs = [lo, hi] if ci else None
ax.bar(xs, heights, width, label=label, color=colors[j], yerr=errs, capsize=2, error_kw={"linewidth": 0.8})
ax.set_xticks(range(len(models)), [LABEL[m] for m in models], rotation=20, ha="right", fontsize=8)
ax.set_ylabel("Recall", fontsize=9)
ax.set_ylim(0, 1.05)
Expand Down Expand Up @@ -100,13 +128,15 @@ def render_kind_heatmap(table: dict[str, dict[str, float]], output_dir: Path) ->
def main(argv: list[str] | None = None) -> int:
    """Render the FinAudit paper figures from the evaluation CSVs.

    Args:
        argv: Command-line arguments; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Process exit status, always ``0``.
    """
    parser = argparse.ArgumentParser(description="Render FinAudit paper figures.")
    parser.add_argument("--summary", default="docs/results/audit_eval_pilot/audit_eval_summary.csv")
    parser.add_argument("--ci", default="docs/results/audit_eval_ci/audit_eval_ci_wilson.csv")
    parser.add_argument("--output-dir", default="paper/finaudit/figures")
    args = parser.parse_args(argv)

    def _resolve(raw: str) -> Path:
        # Relative paths are anchored at ROOT; absolute paths pass through unchanged.
        return Path(raw) if Path(raw).is_absolute() else ROOT / raw

    summary = _resolve(args.summary)
    ci_path = _resolve(args.ci)
    output_dir = _resolve(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    table = _load(summary)
    # Render the difficulty figure once, with the CI data; _load_ci is called
    # unconditionally and its result is passed as the optional third argument.
    print("wrote", render_difficulty_bars(table, output_dir, _load_ci(ci_path)))
    print("wrote", render_kind_heatmap(table, output_dir))
    return 0

Expand Down
2 changes: 1 addition & 1 deletion scripts/run_audit_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def main(argv: list[str] | None = None) -> int:
sample=sample, temperature=args.temperature,
)
except Exception as exc: # provider failures should not lose the run
print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}", file=sys.stderr, flush=True)
print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}: {exc}", file=sys.stderr, flush=True)
continue
findings = parse_findings(response)
scores = score_findings(findings, [truth])
Expand Down