diff --git a/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv b/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv new file mode 100644 index 00000000..cd2391a3 --- /dev/null +++ b/docs/results/audit_eval_ci/audit_eval_ci_wilson.csv @@ -0,0 +1,25 @@ +model,slice,trials,recall,ci_low,ci_high +poe:gpt-5.5,ALL,300,1.0,0.9874,1.0 +poe:gpt-5.5,diff:L1,75,1.0,0.9513,1.0 +poe:gpt-5.5,diff:L2,150,1.0,0.975,1.0 +poe:gpt-5.5,diff:L3,75,1.0,0.9513,1.0 +poe:claude-opus-4.7,ALL,300,0.9533,0.9232,0.972 +poe:claude-opus-4.7,diff:L1,75,0.8267,0.7257,0.8958 +poe:claude-opus-4.7,diff:L2,150,0.9933,0.9632,0.9988 +poe:claude-opus-4.7,diff:L3,75,1.0,0.9513,1.0 +poe:gemini-3.1-pro,ALL,300,0.9567,0.9273,0.9745 +poe:gemini-3.1-pro,diff:L1,75,0.8267,0.7257,0.8958 +poe:gemini-3.1-pro,diff:L2,150,1.0,0.975,1.0 +poe:gemini-3.1-pro,diff:L3,75,1.0,0.9513,1.0 +deepseek:deepseek-v4-pro,ALL,300,0.7033,0.6493,0.7522 +deepseek:deepseek-v4-pro,diff:L1,75,0.4933,0.3833,0.604 +deepseek:deepseek-v4-pro,diff:L2,150,0.7933,0.7216,0.8504 +deepseek:deepseek-v4-pro,diff:L3,75,0.7333,0.6237,0.8202 +glm:glm-5,ALL,300,0.7,0.6459,0.7491 +glm:glm-5,diff:L1,75,0.3733,0.2726,0.4865 +glm:glm-5,diff:L2,150,0.8,0.7289,0.8562 +glm:glm-5,diff:L3,75,0.8267,0.7257,0.8958 +poe:glm-5,ALL,300,0.6567,0.6013,0.7081 +poe:glm-5,diff:L1,75,0.28,0.191,0.3904 +poe:glm-5,diff:L2,150,0.7667,0.6928,0.8272 +poe:glm-5,diff:L3,75,0.8133,0.7107,0.8854 diff --git a/docs/results/memory_pollution_llm/model_difference.csv b/docs/results/memory_pollution_llm/model_difference.csv index aee3e756..f234340e 100644 --- a/docs/results/memory_pollution_llm/model_difference.csv +++ b/docs/results/memory_pollution_llm/model_difference.csv @@ -1,25 +1,25 @@ -agent,kind,outcome,max_dose,mean_delta,permutation_p_value,cohens_d -deepseek:deepseek-v4-pro,fake_rejections,hold_ratio,0.75,0.07916666666666668,0.0078125,0.7937620406122988 -deepseek:deepseek-v4-pro,fake_violations,hold_ratio,0.75,0.125,0.05078125,0.6858006858010287 -glm:glm-5,fake_rejections,hold_ratio,0.75,-0.012499999999999997,0.125,-0.6210590034081187 -glm:glm-5,fake_violations,hold_ratio,0.75,0.012500000000000006,0.25,0.45703667411923465 -poe:claude-opus-4.7,fake_rejections,hold_ratio,0.75,-0.005555555555555554,0.5,-0.4743416490252569 -poe:claude-opus-4.7,fake_violations,hold_ratio,0.75,-0.001388888888888888,1.0,-0.1355261854357876 -poe:gemini-3.1-pro,fake_rejections,hold_ratio,0.75,0.041666666666666664,0.1953125,0.46662826262869134 -poe:gemini-3.1-pro,fake_violations,hold_ratio,0.75,0.08055555555555553,0.009765625,1.219439788928185 -poe:glm-5,fake_rejections,hold_ratio,0.75,-0.005555555555555555,0.671875,-0.18428853505018533 -poe:glm-5,fake_violations,hold_ratio,0.75,0.015277777777777782,0.34375,0.4537180990676371 -poe:gpt-5.5,fake_rejections,hold_ratio,0.75,0.0,1.0,0.0 -poe:gpt-5.5,fake_violations,hold_ratio,0.75,-0.0013888888888888896,1.0,-0.31622776601683794 -deepseek:deepseek-v4-pro,fake_rejections,turnover_events,0.75,-0.9666666666666668,0.365234375,-0.3295596404413543 -deepseek:deepseek-v4-pro,fake_violations,turnover_events,0.75,-1.1666666666666665,0.271484375,-0.3772928612673862 -glm:glm-5,fake_rejections,turnover_events,0.75,0.0,1.0,0.0 -glm:glm-5,fake_violations,turnover_events,0.75,-0.1,0.8828125,-0.07951845705060932 -poe:claude-opus-4.7,fake_rejections,turnover_events,0.75,0.4999999999999999,0.125,0.6095569153307366 -poe:claude-opus-4.7,fake_violations,turnover_events,0.75,0.5333333333333333,0.125,0.5105227557738203 -poe:gemini-3.1-pro,fake_rejections,turnover_events,0.75,-1.4333333333333333,0.01953125,-0.9259274142755227 -poe:gemini-3.1-pro,fake_violations,turnover_events,0.75,-1.3,0.11328125,-0.5843429835076802 -poe:glm-5,fake_rejections,turnover_events,0.75,-1.0000000000000004,0.09375,-0.564710224624343 -poe:glm-5,fake_violations,turnover_events,0.75,-0.2333333333333334,0.5,-0.2966883079863587 -poe:gpt-5.5,fake_rejections,turnover_events,0.75,-0.6333333333333331,1.0,-0.29839251436340897 -poe:gpt-5.5,fake_violations,turnover_events,0.75,-0.6333333333333332,0.625,-0.2882982594743907 +agent,kind,outcome,max_dose,mean_delta,ci_low,ci_high,permutation_p_value,cohens_d +deepseek:deepseek-v4-pro,fake_rejections,hold_ratio,0.75,0.07916666666666668,0.034722222222222224,0.14444444444444446,0.0078125,0.7937620406122988 +deepseek:deepseek-v4-pro,fake_violations,hold_ratio,0.75,0.125,0.025000000000000015,0.23750000000000004,0.05078125,0.6858006858010287 +glm:glm-5,fake_rejections,hold_ratio,0.75,-0.012499999999999997,-0.024999999999999994,-0.0013888888888888868,0.125,-0.6210590034081187 +glm:glm-5,fake_violations,hold_ratio,0.75,0.012500000000000006,-0.0027777777777777735,0.029166666666666674,0.25,0.45703667411923465 +poe:claude-opus-4.7,fake_rejections,hold_ratio,0.75,-0.005555555555555554,-0.013888888888888885,0.0,0.5,-0.4743416490252569 +poe:claude-opus-4.7,fake_violations,hold_ratio,0.75,-0.001388888888888888,-0.008333333333333333,0.004166666666666668,1.0,-0.1355261854357876 +poe:gemini-3.1-pro,fake_rejections,hold_ratio,0.75,0.041666666666666664,-0.0125,0.09305555555555553,0.1953125,0.46662826262869134 +poe:gemini-3.1-pro,fake_violations,hold_ratio,0.75,0.08055555555555553,0.03888888888888886,0.11805555555555554,0.009765625,1.219439788928185 +poe:glm-5,fake_rejections,hold_ratio,0.75,-0.005555555555555555,-0.022222222222222223,0.012499999999999997,0.671875,-0.18428853505018533 +poe:glm-5,fake_violations,hold_ratio,0.75,0.015277777777777782,-0.0027777777777777735,0.037500000000000006,0.34375,0.4537180990676371 +poe:gpt-5.5,fake_rejections,hold_ratio,0.75,0.0,0.0,0.0,1.0,0.0 +poe:gpt-5.5,fake_violations,hold_ratio,0.75,-0.0013888888888888896,-0.004166666666666668,0.0,1.0,-0.31622776601683794 +deepseek:deepseek-v4-pro,fake_rejections,turnover_events,0.75,-0.9666666666666668,-2.7,0.8999999999999997,0.365234375,-0.3295596404413543 +deepseek:deepseek-v4-pro,fake_violations,turnover_events,0.75,-1.1666666666666665,-3.0333333333333337,0.6333333333333332,0.271484375,-0.3772928612673862 +glm:glm-5,fake_rejections,turnover_events,0.75,0.0,-1.1999999999999997,0.9999999999999998,1.0,0.0 +glm:glm-5,fake_violations,turnover_events,0.75,-0.1,-0.7666666666666673,0.6333333333333334,0.8828125,-0.07951845705060932 +poe:claude-opus-4.7,fake_rejections,turnover_events,0.75,0.4999999999999999,0.06666666666666661,1.0666666666666667,0.125,0.6095569153307366 +poe:claude-opus-4.7,fake_violations,turnover_events,0.75,0.5333333333333333,0.06666666666666661,1.2666666666666668,0.125,0.5105227557738203 +poe:gemini-3.1-pro,fake_rejections,turnover_events,0.75,-1.4333333333333333,-2.3999999999999995,-0.5666666666666659,0.01953125,-0.9259274142755227 +poe:gemini-3.1-pro,fake_violations,turnover_events,0.75,-1.3,-2.6666666666666665,-0.03333333333333286,0.11328125,-0.5843429835076802 +poe:glm-5,fake_rejections,turnover_events,0.75,-1.0000000000000004,-2.166666666666667,-0.10000000000000035,0.09375,-0.564710224624343 +poe:glm-5,fake_violations,turnover_events,0.75,-0.2333333333333334,-0.7333333333333332,0.19999999999999965,0.5,-0.2966883079863587 +poe:gpt-5.5,fake_rejections,turnover_events,0.75,-0.6333333333333331,-2.0,0.10000000000000071,1.0,-0.29839251436340897 +poe:gpt-5.5,fake_violations,turnover_events,0.75,-0.6333333333333332,-2.1666666666666665,0.26666666666666694,0.625,-0.2882982594743907 diff --git a/scripts/analyze_memory_pollution_llm.py b/scripts/analyze_memory_pollution_llm.py index ad691f1f..4eed9a77 100644 --- a/scripts/analyze_memory_pollution_llm.py +++ b/scripts/analyze_memory_pollution_llm.py @@ -149,6 +149,8 @@ def model_difference_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: "outcome": outcome, "max_dose": max_dose, "mean_delta": result["mean_delta"], + "ci_low": result["delta_ci_low"], + "ci_high": result["delta_ci_high"], "permutation_p_value": result["permutation_p_value"], "cohens_d": result["cohens_d"], } @@ -189,7 +191,7 @@ def main(argv: list[str] | None = None) -> int: _write_csv( output_dir / "model_difference.csv", model_diff, - ["agent", "kind", "outcome", "max_dose", "mean_delta", "permutation_p_value", "cohens_d"], + ["agent", "kind", "outcome", "max_dose", "mean_delta", "ci_low", "ci_high", "permutation_p_value", "cohens_d"], ) _write_markdown(output_dir / "memory_pollution_llm.md", dose_response, model_diff, rows) sig = sum(1 for r in dose_response if r["q_value"] is not None and float(r["q_value"]) < 0.05) diff --git a/scripts/render_finaudit_figures.py b/scripts/render_finaudit_figures.py index 7b6c2a66..bb9946ff 100644 --- a/scripts/render_finaudit_figures.py +++ b/scripts/render_finaudit_figures.py @@ -55,14 +55,42 @@ def _ordered_models(table: dict[str, dict[str, float]]) -> list[str]: return sorted((m for m in table if m in LABEL), key=lambda m: -table[m].get("ALL", 0.0)) -def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path) -> Path: +def _load_ci(ci_path: Path) -> dict[tuple[str, str], tuple[float, float, float]]: + """(model, slice) -> (recall, ci_low, ci_high) from the Wilson CSV, if present.""" + if not ci_path.exists(): + return {} + out: dict[tuple[str, str], tuple[float, float, float]] = {} + with ci_path.open(encoding="utf-8") as handle: + for row in csv.DictReader(handle): + out[(row["model"], row["slice"])] = (float(row["recall"]), float(row["ci_low"]), float(row["ci_high"])) + return out + + +def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path, ci: dict | None = None) -> Path: + # When the 3-sample CI experiment is available, bars and error bars both + # come from it (one consistent measurement); otherwise fall back to the + # deterministic summary with no error bars. + ci = ci or {} + ci_slice = {"difficulty:L1": "diff:L1", "difficulty:L2": "diff:L2", "difficulty:L3": "diff:L3"} models = _ordered_models(table) fig, ax = plt.subplots(figsize=(6.6, 3.0)) width = 0.26 colors = ["#c44e52", "#55a868", "#4c72b0"] for j, (slice_key, label) in enumerate(DIFF): xs = [i + (j - 1) * width for i in range(len(models))] - ax.bar(xs, [table[m].get(slice_key, 0.0) for m in models], width, label=label, color=colors[j]) + heights, lo, hi = [], [], [] + for m in models: + trip = ci.get((m, ci_slice[slice_key])) if ci else None + if trip: + heights.append(trip[0]) + lo.append(max(0.0, trip[0] - trip[1])) + hi.append(max(0.0, trip[2] - trip[0])) + else: + heights.append(table[m].get(slice_key, 0.0)) + lo.append(0.0) + hi.append(0.0) + errs = [lo, hi] if ci else None + ax.bar(xs, heights, width, label=label, color=colors[j], yerr=errs, capsize=2, error_kw={"linewidth": 0.8}) ax.set_xticks(range(len(models)), [LABEL[m] for m in models], rotation=20, ha="right", fontsize=8) ax.set_ylabel("Recall", fontsize=9) ax.set_ylim(0, 1.05) @@ -100,13 +128,15 @@ def render_kind_heatmap(table: dict[str, dict[str, float]], output_dir: Path) -> def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser(description="Render FinAudit paper figures.") parser.add_argument("--summary", default="docs/results/audit_eval_pilot/audit_eval_summary.csv") + parser.add_argument("--ci", default="docs/results/audit_eval_ci/audit_eval_ci_wilson.csv") parser.add_argument("--output-dir", default="paper/finaudit/figures") args = parser.parse_args(argv) summary = ROOT / args.summary if not Path(args.summary).is_absolute() else Path(args.summary) + ci_path = ROOT / args.ci if not Path(args.ci).is_absolute() else Path(args.ci) output_dir = ROOT / args.output_dir if not Path(args.output_dir).is_absolute() else Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) table = _load(summary) - print("wrote", render_difficulty_bars(table, output_dir)) + print("wrote", render_difficulty_bars(table, output_dir, _load_ci(ci_path))) print("wrote", render_kind_heatmap(table, output_dir)) return 0 diff --git a/scripts/render_mempollution_figures.py b/scripts/render_mempollution_figures.py index b915edaa..144bb606 100644 --- a/scripts/render_mempollution_figures.py +++ b/scripts/render_mempollution_figures.py @@ -66,9 +66,13 @@ def render_decay_invariance(output_dir: Path) -> Path: def render_model_shift(output_dir: Path) -> Path: rows = [r for r in csv.DictReader(LLM.open(encoding="utf-8")) if r["outcome"] == "hold_ratio"] shift: dict[str, dict[str, float]] = defaultdict(dict) + err: dict[str, dict[str, tuple[float, float]]] = defaultdict(dict) sig: dict[str, dict[str, bool]] = defaultdict(dict) for r in rows: - shift[r["agent"]][r["kind"]] = float(r["mean_delta"]) + d = float(r["mean_delta"]) + shift[r["agent"]][r["kind"]] = d + if r.get("ci_low") not in ("", None): + err[r["agent"]][r["kind"]] = (max(0.0, d - float(r["ci_low"])), max(0.0, float(r["ci_high"]) - d)) p = r["permutation_p_value"] sig[r["agent"]][r["kind"]] = p not in ("", None) and float(p) < 0.05 models = [m for m in LLM_LABEL if m in shift] @@ -79,10 +83,12 @@ def render_model_shift(output_dir: Path) -> Path: for j, (k, label, color) in enumerate(kinds): xs = [i + (j - 0.5) * width for i in range(len(models))] vals = [shift[m].get(k, 0.0) for m in models] - bars = ax.bar(xs, vals, width, label=label, color=color) - for x, m, b in zip(xs, models, bars): + lo = [err[m].get(k, (0.0, 0.0))[0] for m in models] + hi = [err[m].get(k, (0.0, 0.0))[1] for m in models] + bars = ax.bar(xs, vals, width, label=label, color=color, yerr=[lo, hi], capsize=2, error_kw={"linewidth": 0.7}) + for x, m, b, h in zip(xs, models, bars, hi): if sig[m].get(k): - ax.text(x, b.get_height() + 0.004, "*", ha="center", fontsize=11) + ax.text(x, b.get_height() + h + 0.006, "*", ha="center", fontsize=11) ax.axhline(0.0, color="black", linewidth=0.7) ax.set_xticks(range(len(models)), [LLM_LABEL[m] for m in models], rotation=20, ha="right", fontsize=8) ax.set_ylabel("Hold-ratio shift at max dose", fontsize=9) diff --git a/scripts/run_audit_eval.py b/scripts/run_audit_eval.py index c39a396d..0f1462f8 100644 --- a/scripts/run_audit_eval.py +++ b/scripts/run_audit_eval.py @@ -257,7 +257,7 @@ def main(argv: list[str] | None = None) -> int: sample=sample, temperature=args.temperature, ) except Exception as exc: # provider failures should not lose the run - print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}", file=sys.stderr, flush=True) + print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}: {exc}", file=sys.stderr, flush=True) continue findings = parse_findings(response) scores = score_findings(findings, [truth])