Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions docs/results/audit_eval_ci/audit_eval_ci_wilson.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
model,slice,trials,recall,ci_low,ci_high
poe:gpt-5.5,ALL,300,1.0,0.9874,1.0
poe:gpt-5.5,diff:L1,75,1.0,0.9513,1.0
poe:gpt-5.5,diff:L2,150,1.0,0.975,1.0
poe:gpt-5.5,diff:L3,75,1.0,0.9513,1.0
poe:claude-opus-4.7,ALL,300,0.9533,0.9232,0.972
poe:claude-opus-4.7,diff:L1,75,0.8267,0.7257,0.8958
poe:claude-opus-4.7,diff:L2,150,0.9933,0.9632,0.9988
poe:claude-opus-4.7,diff:L3,75,1.0,0.9513,1.0
poe:gemini-3.1-pro,ALL,300,0.9567,0.9273,0.9745
poe:gemini-3.1-pro,diff:L1,75,0.8267,0.7257,0.8958
poe:gemini-3.1-pro,diff:L2,150,1.0,0.975,1.0
poe:gemini-3.1-pro,diff:L3,75,1.0,0.9513,1.0
deepseek:deepseek-v4-pro,ALL,300,0.7033,0.6493,0.7522
deepseek:deepseek-v4-pro,diff:L1,75,0.4933,0.3833,0.604
deepseek:deepseek-v4-pro,diff:L2,150,0.7933,0.7216,0.8504
deepseek:deepseek-v4-pro,diff:L3,75,0.7333,0.6237,0.8202
glm:glm-5,ALL,300,0.7,0.6459,0.7491
glm:glm-5,diff:L1,75,0.3733,0.2726,0.4865
glm:glm-5,diff:L2,150,0.8,0.7289,0.8562
glm:glm-5,diff:L3,75,0.8267,0.7257,0.8958
poe:glm-5,ALL,300,0.6567,0.6013,0.7081
poe:glm-5,diff:L1,75,0.28,0.191,0.3904
poe:glm-5,diff:L2,150,0.7667,0.6928,0.8272
poe:glm-5,diff:L3,75,0.8133,0.7107,0.8854
50 changes: 25 additions & 25 deletions docs/results/memory_pollution_llm/model_difference.csv
Original file line number Diff line number Diff line change
@@ -1,25 +1,25 @@
agent,kind,outcome,max_dose,mean_delta,permutation_p_value,cohens_d
deepseek:deepseek-v4-pro,fake_rejections,hold_ratio,0.75,0.07916666666666668,0.0078125,0.7937620406122988
deepseek:deepseek-v4-pro,fake_violations,hold_ratio,0.75,0.125,0.05078125,0.6858006858010287
glm:glm-5,fake_rejections,hold_ratio,0.75,-0.012499999999999997,0.125,-0.6210590034081187
glm:glm-5,fake_violations,hold_ratio,0.75,0.012500000000000006,0.25,0.45703667411923465
poe:claude-opus-4.7,fake_rejections,hold_ratio,0.75,-0.005555555555555554,0.5,-0.4743416490252569
poe:claude-opus-4.7,fake_violations,hold_ratio,0.75,-0.001388888888888888,1.0,-0.1355261854357876
poe:gemini-3.1-pro,fake_rejections,hold_ratio,0.75,0.041666666666666664,0.1953125,0.46662826262869134
poe:gemini-3.1-pro,fake_violations,hold_ratio,0.75,0.08055555555555553,0.009765625,1.219439788928185
poe:glm-5,fake_rejections,hold_ratio,0.75,-0.005555555555555555,0.671875,-0.18428853505018533
poe:glm-5,fake_violations,hold_ratio,0.75,0.015277777777777782,0.34375,0.4537180990676371
poe:gpt-5.5,fake_rejections,hold_ratio,0.75,0.0,1.0,0.0
poe:gpt-5.5,fake_violations,hold_ratio,0.75,-0.0013888888888888896,1.0,-0.31622776601683794
deepseek:deepseek-v4-pro,fake_rejections,turnover_events,0.75,-0.9666666666666668,0.365234375,-0.3295596404413543
deepseek:deepseek-v4-pro,fake_violations,turnover_events,0.75,-1.1666666666666665,0.271484375,-0.3772928612673862
glm:glm-5,fake_rejections,turnover_events,0.75,0.0,1.0,0.0
glm:glm-5,fake_violations,turnover_events,0.75,-0.1,0.8828125,-0.07951845705060932
poe:claude-opus-4.7,fake_rejections,turnover_events,0.75,0.4999999999999999,0.125,0.6095569153307366
poe:claude-opus-4.7,fake_violations,turnover_events,0.75,0.5333333333333333,0.125,0.5105227557738203
poe:gemini-3.1-pro,fake_rejections,turnover_events,0.75,-1.4333333333333333,0.01953125,-0.9259274142755227
poe:gemini-3.1-pro,fake_violations,turnover_events,0.75,-1.3,0.11328125,-0.5843429835076802
poe:glm-5,fake_rejections,turnover_events,0.75,-1.0000000000000004,0.09375,-0.564710224624343
poe:glm-5,fake_violations,turnover_events,0.75,-0.2333333333333334,0.5,-0.2966883079863587
poe:gpt-5.5,fake_rejections,turnover_events,0.75,-0.6333333333333331,1.0,-0.29839251436340897
poe:gpt-5.5,fake_violations,turnover_events,0.75,-0.6333333333333332,0.625,-0.2882982594743907
agent,kind,outcome,max_dose,mean_delta,ci_low,ci_high,permutation_p_value,cohens_d
deepseek:deepseek-v4-pro,fake_rejections,hold_ratio,0.75,0.07916666666666668,0.034722222222222224,0.14444444444444446,0.0078125,0.7937620406122988
deepseek:deepseek-v4-pro,fake_violations,hold_ratio,0.75,0.125,0.025000000000000015,0.23750000000000004,0.05078125,0.6858006858010287
glm:glm-5,fake_rejections,hold_ratio,0.75,-0.012499999999999997,-0.024999999999999994,-0.0013888888888888868,0.125,-0.6210590034081187
glm:glm-5,fake_violations,hold_ratio,0.75,0.012500000000000006,-0.0027777777777777735,0.029166666666666674,0.25,0.45703667411923465
poe:claude-opus-4.7,fake_rejections,hold_ratio,0.75,-0.005555555555555554,-0.013888888888888885,0.0,0.5,-0.4743416490252569
poe:claude-opus-4.7,fake_violations,hold_ratio,0.75,-0.001388888888888888,-0.008333333333333333,0.004166666666666668,1.0,-0.1355261854357876
poe:gemini-3.1-pro,fake_rejections,hold_ratio,0.75,0.041666666666666664,-0.0125,0.09305555555555553,0.1953125,0.46662826262869134
poe:gemini-3.1-pro,fake_violations,hold_ratio,0.75,0.08055555555555553,0.03888888888888886,0.11805555555555554,0.009765625,1.219439788928185
poe:glm-5,fake_rejections,hold_ratio,0.75,-0.005555555555555555,-0.022222222222222223,0.012499999999999997,0.671875,-0.18428853505018533
poe:glm-5,fake_violations,hold_ratio,0.75,0.015277777777777782,-0.0027777777777777735,0.037500000000000006,0.34375,0.4537180990676371
poe:gpt-5.5,fake_rejections,hold_ratio,0.75,0.0,0.0,0.0,1.0,0.0
poe:gpt-5.5,fake_violations,hold_ratio,0.75,-0.0013888888888888896,-0.004166666666666668,0.0,1.0,-0.31622776601683794
deepseek:deepseek-v4-pro,fake_rejections,turnover_events,0.75,-0.9666666666666668,-2.7,0.8999999999999997,0.365234375,-0.3295596404413543
deepseek:deepseek-v4-pro,fake_violations,turnover_events,0.75,-1.1666666666666665,-3.0333333333333337,0.6333333333333332,0.271484375,-0.3772928612673862
glm:glm-5,fake_rejections,turnover_events,0.75,0.0,-1.1999999999999997,0.9999999999999998,1.0,0.0
glm:glm-5,fake_violations,turnover_events,0.75,-0.1,-0.7666666666666673,0.6333333333333334,0.8828125,-0.07951845705060932
poe:claude-opus-4.7,fake_rejections,turnover_events,0.75,0.4999999999999999,0.06666666666666661,1.0666666666666667,0.125,0.6095569153307366
poe:claude-opus-4.7,fake_violations,turnover_events,0.75,0.5333333333333333,0.06666666666666661,1.2666666666666668,0.125,0.5105227557738203
poe:gemini-3.1-pro,fake_rejections,turnover_events,0.75,-1.4333333333333333,-2.3999999999999995,-0.5666666666666659,0.01953125,-0.9259274142755227
poe:gemini-3.1-pro,fake_violations,turnover_events,0.75,-1.3,-2.6666666666666665,-0.03333333333333286,0.11328125,-0.5843429835076802
poe:glm-5,fake_rejections,turnover_events,0.75,-1.0000000000000004,-2.166666666666667,-0.10000000000000035,0.09375,-0.564710224624343
poe:glm-5,fake_violations,turnover_events,0.75,-0.2333333333333334,-0.7333333333333332,0.19999999999999965,0.5,-0.2966883079863587
poe:gpt-5.5,fake_rejections,turnover_events,0.75,-0.6333333333333331,-2.0,0.10000000000000071,1.0,-0.29839251436340897
poe:gpt-5.5,fake_violations,turnover_events,0.75,-0.6333333333333332,-2.1666666666666665,0.26666666666666694,0.625,-0.2882982594743907
4 changes: 3 additions & 1 deletion scripts/analyze_memory_pollution_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ def model_difference_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
"outcome": outcome,
"max_dose": max_dose,
"mean_delta": result["mean_delta"],
"ci_low": result["delta_ci_low"],
"ci_high": result["delta_ci_high"],
"permutation_p_value": result["permutation_p_value"],
"cohens_d": result["cohens_d"],
}
Expand Down Expand Up @@ -189,7 +191,7 @@ def main(argv: list[str] | None = None) -> int:
_write_csv(
output_dir / "model_difference.csv",
model_diff,
["agent", "kind", "outcome", "max_dose", "mean_delta", "permutation_p_value", "cohens_d"],
["agent", "kind", "outcome", "max_dose", "mean_delta", "ci_low", "ci_high", "permutation_p_value", "cohens_d"],
)
_write_markdown(output_dir / "memory_pollution_llm.md", dose_response, model_diff, rows)
sig = sum(1 for r in dose_response if r["q_value"] is not None and float(r["q_value"]) < 0.05)
Expand Down
36 changes: 33 additions & 3 deletions scripts/render_finaudit_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,42 @@ def _ordered_models(table: dict[str, dict[str, float]]) -> list[str]:
return sorted((m for m in table if m in LABEL), key=lambda m: -table[m].get("ALL", 0.0))


def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path) -> Path:
def _load_ci(ci_path: Path) -> dict[tuple[str, str], tuple[float, float, float]]:
    """Load Wilson-interval recall estimates keyed by ``(model, slice)``.

    Args:
        ci_path: CSV file with ``model``, ``slice``, ``recall``, ``ci_low``
            and ``ci_high`` columns.

    Returns:
        Mapping ``(model, slice) -> (recall, ci_low, ci_high)``. The mapping
        is empty when ``ci_path`` is not an existing regular file. Rows whose
        ``recall``/``ci_low``/``ci_high`` cell is blank are left out, so one
        incomplete row does not abort figure rendering.

    Raises:
        KeyError: If a required column is missing from the CSV.
        ValueError: If a non-blank numeric cell cannot be parsed as a float.
    """
    # is_file() (rather than exists()) also covers the case where the path
    # points at a directory, which open() would reject.
    if not ci_path.is_file():
        return {}
    out: dict[tuple[str, str], tuple[float, float, float]] = {}
    # The csv module asks for files to be opened with newline="" so that
    # newlines embedded in quoted fields are handled by the reader itself.
    with ci_path.open(encoding="utf-8", newline="") as handle:
        for row in csv.DictReader(handle):
            cells = (row["recall"], row["ci_low"], row["ci_high"])
            # DictReader yields None for cells missing from a short row and
            # "" for empty ones; neither carries an estimate.
            if any(cell is None or not cell.strip() for cell in cells):
                continue
            recall, ci_low, ci_high = (float(cell) for cell in cells)
            out[(row["model"], row["slice"])] = (recall, ci_low, ci_high)
    return out


def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path, ci: dict | None = None) -> Path:
# When the 3-sample CI experiment is available, bars and error bars both
# come from it (one consistent measurement); otherwise fall back to the
# deterministic summary with no error bars.
ci = ci or {}
ci_slice = {"difficulty:L1": "diff:L1", "difficulty:L2": "diff:L2", "difficulty:L3": "diff:L3"}
models = _ordered_models(table)
fig, ax = plt.subplots(figsize=(6.6, 3.0))
width = 0.26
colors = ["#c44e52", "#55a868", "#4c72b0"]
for j, (slice_key, label) in enumerate(DIFF):
xs = [i + (j - 1) * width for i in range(len(models))]
ax.bar(xs, [table[m].get(slice_key, 0.0) for m in models], width, label=label, color=colors[j])
heights, lo, hi = [], [], []
for m in models:
trip = ci.get((m, ci_slice[slice_key])) if ci else None
if trip:
heights.append(trip[0])
lo.append(max(0.0, trip[0] - trip[1]))
hi.append(max(0.0, trip[2] - trip[0]))
else:
heights.append(table[m].get(slice_key, 0.0))
lo.append(0.0)
hi.append(0.0)
errs = [lo, hi] if ci else None
ax.bar(xs, heights, width, label=label, color=colors[j], yerr=errs, capsize=2, error_kw={"linewidth": 0.8})
ax.set_xticks(range(len(models)), [LABEL[m] for m in models], rotation=20, ha="right", fontsize=8)
ax.set_ylabel("Recall", fontsize=9)
ax.set_ylim(0, 1.05)
Expand Down Expand Up @@ -100,13 +128,15 @@ def render_kind_heatmap(table: dict[str, dict[str, float]], output_dir: Path) ->
def main(argv: list[str] | None = None) -> int:
    """Render the FinAudit paper figures from the summary and Wilson-CI CSVs.

    Args:
        argv: Command-line arguments to parse; ``None`` defers to argparse's
            default of reading the process arguments.

    Returns:
        The process exit code, always ``0``.
    """
    parser = argparse.ArgumentParser(description="Render FinAudit paper figures.")
    parser.add_argument("--summary", default="docs/results/audit_eval_pilot/audit_eval_summary.csv")
    parser.add_argument("--ci", default="docs/results/audit_eval_ci/audit_eval_ci_wilson.csv")
    parser.add_argument("--output-dir", default="paper/finaudit/figures")
    args = parser.parse_args(argv)

    def _resolve(raw: str) -> Path:
        # Absolute arguments are used as given; relative ones are anchored at ROOT.
        candidate = Path(raw)
        return candidate if candidate.is_absolute() else ROOT / raw

    summary = _resolve(args.summary)
    ci_path = _resolve(args.ci)
    output_dir = _resolve(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    table = _load(summary)
    # Render the difficulty figure once, passing the CI table. A preceding
    # call without it would only produce output that this call replaces and
    # print a second "wrote" line for the same figure.
    print("wrote", render_difficulty_bars(table, output_dir, _load_ci(ci_path)))
    print("wrote", render_kind_heatmap(table, output_dir))
    return 0

Expand Down
14 changes: 10 additions & 4 deletions scripts/render_mempollution_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,13 @@ def render_decay_invariance(output_dir: Path) -> Path:
def render_model_shift(output_dir: Path) -> Path:
rows = [r for r in csv.DictReader(LLM.open(encoding="utf-8")) if r["outcome"] == "hold_ratio"]
shift: dict[str, dict[str, float]] = defaultdict(dict)
err: dict[str, dict[str, tuple[float, float]]] = defaultdict(dict)
sig: dict[str, dict[str, bool]] = defaultdict(dict)
for r in rows:
shift[r["agent"]][r["kind"]] = float(r["mean_delta"])
d = float(r["mean_delta"])
shift[r["agent"]][r["kind"]] = d
if r.get("ci_low") not in ("", None):
err[r["agent"]][r["kind"]] = (max(0.0, d - float(r["ci_low"])), max(0.0, float(r["ci_high"]) - d))
p = r["permutation_p_value"]
sig[r["agent"]][r["kind"]] = p not in ("", None) and float(p) < 0.05
models = [m for m in LLM_LABEL if m in shift]
Expand All @@ -79,10 +83,12 @@ def render_model_shift(output_dir: Path) -> Path:
for j, (k, label, color) in enumerate(kinds):
xs = [i + (j - 0.5) * width for i in range(len(models))]
vals = [shift[m].get(k, 0.0) for m in models]
bars = ax.bar(xs, vals, width, label=label, color=color)
for x, m, b in zip(xs, models, bars):
lo = [err[m].get(k, (0.0, 0.0))[0] for m in models]
hi = [err[m].get(k, (0.0, 0.0))[1] for m in models]
bars = ax.bar(xs, vals, width, label=label, color=color, yerr=[lo, hi], capsize=2, error_kw={"linewidth": 0.7})
for x, m, b, h in zip(xs, models, bars, hi):
if sig[m].get(k):
ax.text(x, b.get_height() + 0.004, "*", ha="center", fontsize=11)
ax.text(x, b.get_height() + h + 0.006, "*", ha="center", fontsize=11)
ax.axhline(0.0, color="black", linewidth=0.7)
ax.set_xticks(range(len(models)), [LLM_LABEL[m] for m in models], rotation=20, ha="right", fontsize=8)
ax.set_ylabel("Hold-ratio shift at max dose", fontsize=9)
Expand Down
2 changes: 1 addition & 1 deletion scripts/run_audit_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def main(argv: list[str] | None = None) -> int:
sample=sample, temperature=args.temperature,
)
except Exception as exc: # provider failures should not lose the run
print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}", file=sys.stderr, flush=True)
print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}: {exc}", file=sys.stderr, flush=True)
continue
findings = parse_findings(response)
scores = score_findings(findings, [truth])
Expand Down