weich97 · weich97 · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
@@ -0,0 +1,25 @@
+model,slice,trials,recall,ci_low,ci_high
+poe:gpt-5.5,ALL,300,1.0,0.9874,1.0
+poe:gpt-5.5,diff:L1,75,1.0,0.9513,1.0
+poe:gpt-5.5,diff:L2,150,1.0,0.975,1.0
+poe:gpt-5.5,diff:L3,75,1.0,0.9513,1.0
+poe:claude-opus-4.7,ALL,300,0.9533,0.9232,0.972
+poe:claude-opus-4.7,diff:L1,75,0.8267,0.7257,0.8958
+poe:claude-opus-4.7,diff:L2,150,0.9933,0.9632,0.9988
+poe:claude-opus-4.7,diff:L3,75,1.0,0.9513,1.0
+poe:gemini-3.1-pro,ALL,300,0.9567,0.9273,0.9745
+poe:gemini-3.1-pro,diff:L1,75,0.8267,0.7257,0.8958
+poe:gemini-3.1-pro,diff:L2,150,1.0,0.975,1.0
+poe:gemini-3.1-pro,diff:L3,75,1.0,0.9513,1.0
+deepseek:deepseek-v4-pro,ALL,300,0.7033,0.6493,0.7522
+deepseek:deepseek-v4-pro,diff:L1,75,0.4933,0.3833,0.604
+deepseek:deepseek-v4-pro,diff:L2,150,0.7933,0.7216,0.8504
+deepseek:deepseek-v4-pro,diff:L3,75,0.7333,0.6237,0.8202
+glm:glm-5,ALL,300,0.7,0.6459,0.7491
+glm:glm-5,diff:L1,75,0.3733,0.2726,0.4865
+glm:glm-5,diff:L2,150,0.8,0.7289,0.8562
+glm:glm-5,diff:L3,75,0.8267,0.7257,0.8958
+poe:glm-5,ALL,300,0.6567,0.6013,0.7081
+poe:glm-5,diff:L1,75,0.28,0.191,0.3904
+poe:glm-5,diff:L2,150,0.7667,0.6928,0.8272
+poe:glm-5,diff:L3,75,0.8133,0.7107,0.8854
@@ -1,25 +1,25 @@
-agent,kind,outcome,max_dose,mean_delta,permutation_p_value,cohens_d
-deepseek:deepseek-v4-pro,fake_rejections,hold_ratio,0.75,0.07916666666666668,0.0078125,0.7937620406122988
-deepseek:deepseek-v4-pro,fake_violations,hold_ratio,0.75,0.125,0.05078125,0.6858006858010287
-glm:glm-5,fake_rejections,hold_ratio,0.75,-0.012499999999999997,0.125,-0.6210590034081187
-glm:glm-5,fake_violations,hold_ratio,0.75,0.012500000000000006,0.25,0.45703667411923465
-poe:claude-opus-4.7,fake_rejections,hold_ratio,0.75,-0.005555555555555554,0.5,-0.4743416490252569
-poe:claude-opus-4.7,fake_violations,hold_ratio,0.75,-0.001388888888888888,1.0,-0.1355261854357876
-poe:gemini-3.1-pro,fake_rejections,hold_ratio,0.75,0.041666666666666664,0.1953125,0.46662826262869134
-poe:gemini-3.1-pro,fake_violations,hold_ratio,0.75,0.08055555555555553,0.009765625,1.219439788928185
-poe:glm-5,fake_rejections,hold_ratio,0.75,-0.005555555555555555,0.671875,-0.18428853505018533
-poe:glm-5,fake_violations,hold_ratio,0.75,0.015277777777777782,0.34375,0.4537180990676371
-poe:gpt-5.5,fake_rejections,hold_ratio,0.75,0.0,1.0,0.0
-poe:gpt-5.5,fake_violations,hold_ratio,0.75,-0.0013888888888888896,1.0,-0.31622776601683794
-deepseek:deepseek-v4-pro,fake_rejections,turnover_events,0.75,-0.9666666666666668,0.365234375,-0.3295596404413543
-deepseek:deepseek-v4-pro,fake_violations,turnover_events,0.75,-1.1666666666666665,0.271484375,-0.3772928612673862
-glm:glm-5,fake_rejections,turnover_events,0.75,0.0,1.0,0.0
-glm:glm-5,fake_violations,turnover_events,0.75,-0.1,0.8828125,-0.07951845705060932
-poe:claude-opus-4.7,fake_rejections,turnover_events,0.75,0.4999999999999999,0.125,0.6095569153307366
-poe:claude-opus-4.7,fake_violations,turnover_events,0.75,0.5333333333333333,0.125,0.5105227557738203
-poe:gemini-3.1-pro,fake_rejections,turnover_events,0.75,-1.4333333333333333,0.01953125,-0.9259274142755227
-poe:gemini-3.1-pro,fake_violations,turnover_events,0.75,-1.3,0.11328125,-0.5843429835076802
-poe:glm-5,fake_rejections,turnover_events,0.75,-1.0000000000000004,0.09375,-0.564710224624343
-poe:glm-5,fake_violations,turnover_events,0.75,-0.2333333333333334,0.5,-0.2966883079863587
-poe:gpt-5.5,fake_rejections,turnover_events,0.75,-0.6333333333333331,1.0,-0.29839251436340897
-poe:gpt-5.5,fake_violations,turnover_events,0.75,-0.6333333333333332,0.625,-0.2882982594743907
+agent,kind,outcome,max_dose,mean_delta,ci_low,ci_high,permutation_p_value,cohens_d
+deepseek:deepseek-v4-pro,fake_rejections,hold_ratio,0.75,0.07916666666666668,0.034722222222222224,0.14444444444444446,0.0078125,0.7937620406122988
+deepseek:deepseek-v4-pro,fake_violations,hold_ratio,0.75,0.125,0.025000000000000015,0.23750000000000004,0.05078125,0.6858006858010287
+glm:glm-5,fake_rejections,hold_ratio,0.75,-0.012499999999999997,-0.024999999999999994,-0.0013888888888888868,0.125,-0.6210590034081187
+glm:glm-5,fake_violations,hold_ratio,0.75,0.012500000000000006,-0.0027777777777777735,0.029166666666666674,0.25,0.45703667411923465
+poe:claude-opus-4.7,fake_rejections,hold_ratio,0.75,-0.005555555555555554,-0.013888888888888885,0.0,0.5,-0.4743416490252569
+poe:claude-opus-4.7,fake_violations,hold_ratio,0.75,-0.001388888888888888,-0.008333333333333333,0.004166666666666668,1.0,-0.1355261854357876
+poe:gemini-3.1-pro,fake_rejections,hold_ratio,0.75,0.041666666666666664,-0.0125,0.09305555555555553,0.1953125,0.46662826262869134
+poe:gemini-3.1-pro,fake_violations,hold_ratio,0.75,0.08055555555555553,0.03888888888888886,0.11805555555555554,0.009765625,1.219439788928185
+poe:glm-5,fake_rejections,hold_ratio,0.75,-0.005555555555555555,-0.022222222222222223,0.012499999999999997,0.671875,-0.18428853505018533
+poe:glm-5,fake_violations,hold_ratio,0.75,0.015277777777777782,-0.0027777777777777735,0.037500000000000006,0.34375,0.4537180990676371
+poe:gpt-5.5,fake_rejections,hold_ratio,0.75,0.0,0.0,0.0,1.0,0.0
+poe:gpt-5.5,fake_violations,hold_ratio,0.75,-0.0013888888888888896,-0.004166666666666668,0.0,1.0,-0.31622776601683794
+deepseek:deepseek-v4-pro,fake_rejections,turnover_events,0.75,-0.9666666666666668,-2.7,0.8999999999999997,0.365234375,-0.3295596404413543
+deepseek:deepseek-v4-pro,fake_violations,turnover_events,0.75,-1.1666666666666665,-3.0333333333333337,0.6333333333333332,0.271484375,-0.3772928612673862
+glm:glm-5,fake_rejections,turnover_events,0.75,0.0,-1.1999999999999997,0.9999999999999998,1.0,0.0
+glm:glm-5,fake_violations,turnover_events,0.75,-0.1,-0.7666666666666673,0.6333333333333334,0.8828125,-0.07951845705060932
+poe:claude-opus-4.7,fake_rejections,turnover_events,0.75,0.4999999999999999,0.06666666666666661,1.0666666666666667,0.125,0.6095569153307366
+poe:claude-opus-4.7,fake_violations,turnover_events,0.75,0.5333333333333333,0.06666666666666661,1.2666666666666668,0.125,0.5105227557738203
+poe:gemini-3.1-pro,fake_rejections,turnover_events,0.75,-1.4333333333333333,-2.3999999999999995,-0.5666666666666659,0.01953125,-0.9259274142755227
+poe:gemini-3.1-pro,fake_violations,turnover_events,0.75,-1.3,-2.6666666666666665,-0.03333333333333286,0.11328125,-0.5843429835076802
+poe:glm-5,fake_rejections,turnover_events,0.75,-1.0000000000000004,-2.166666666666667,-0.10000000000000035,0.09375,-0.564710224624343
+poe:glm-5,fake_violations,turnover_events,0.75,-0.2333333333333334,-0.7333333333333332,0.19999999999999965,0.5,-0.2966883079863587
+poe:gpt-5.5,fake_rejections,turnover_events,0.75,-0.6333333333333331,-2.0,0.10000000000000071,1.0,-0.29839251436340897
+poe:gpt-5.5,fake_violations,turnover_events,0.75,-0.6333333333333332,-2.1666666666666665,0.26666666666666694,0.625,-0.2882982594743907
@@ -149,6 +149,8 @@ def model_difference_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
                         "outcome": outcome,
                         "max_dose": max_dose,
                         "mean_delta": result["mean_delta"],
+                        "ci_low": result["delta_ci_low"],
+                        "ci_high": result["delta_ci_high"],
                         "permutation_p_value": result["permutation_p_value"],
                         "cohens_d": result["cohens_d"],
                     }
@@ -189,7 +191,7 @@ def main(argv: list[str] | None = None) -> int:
     _write_csv(
         output_dir / "model_difference.csv",
         model_diff,
-        ["agent", "kind", "outcome", "max_dose", "mean_delta", "permutation_p_value", "cohens_d"],
+        ["agent", "kind", "outcome", "max_dose", "mean_delta", "ci_low", "ci_high", "permutation_p_value", "cohens_d"],
     )
     _write_markdown(output_dir / "memory_pollution_llm.md", dose_response, model_diff, rows)
     sig = sum(1 for r in dose_response if r["q_value"] is not None and float(r["q_value"]) < 0.05)

@@ -55,14 +55,42 @@ def _ordered_models(table: dict[str, dict[str, float]]) -> list[str]:
     return sorted((m for m in table if m in LABEL), key=lambda m: -table[m].get("ALL", 0.0))
 
 
-def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path) -> Path:
+def _load_ci(ci_path: Path) -> dict[tuple[str, str], tuple[float, float, float]]:
+    """Map (model, slice) -> (recall, ci_low, ci_high) read from the Wilson CSV; return {} if the file is absent."""
+    if not ci_path.exists():  # a missing CSV is not an error; an empty mapping is returned instead
+        return {}
+    out: dict[tuple[str, str], tuple[float, float, float]] = {}
+    with ci_path.open(encoding="utf-8") as handle:
+        for row in csv.DictReader(handle):  # a repeated (model, slice) key overwrites the earlier row
+            out[(row["model"], row["slice"])] = (float(row["recall"]), float(row["ci_low"]), float(row["ci_high"]))
+    return out
+
+
+def render_difficulty_bars(table: dict[str, dict[str, float]], output_dir: Path, ci: dict | None = None) -> Path:
+    # A bar with a row in the CI table takes its height and error bar from that
+    # row; a bar without one falls back to the deterministic summary value with a
+    # zero-length error bar. With no CI table at all, no error bars are drawn.
+    ci = ci or {}
+    ci_slice = {"difficulty:L1": "diff:L1", "difficulty:L2": "diff:L2", "difficulty:L3": "diff:L3"}
     models = _ordered_models(table)
     fig, ax = plt.subplots(figsize=(6.6, 3.0))
     width = 0.26
     colors = ["#c44e52", "#55a868", "#4c72b0"]
     for j, (slice_key, label) in enumerate(DIFF):
         xs = [i + (j - 1) * width for i in range(len(models))]
-        ax.bar(xs, [table[m].get(slice_key, 0.0) for m in models], width, label=label, color=colors[j])
+        heights, lo, hi = [], [], []
+        for m in models:
+            trip = ci.get((m, ci_slice[slice_key])) if ci else None
+            if trip:
+                heights.append(trip[0])
+                lo.append(max(0.0, trip[0] - trip[1]))
+                hi.append(max(0.0, trip[2] - trip[0]))
+            else:
+                heights.append(table[m].get(slice_key, 0.0))
+                lo.append(0.0)
+                hi.append(0.0)
+        errs = [lo, hi] if ci else None
+        ax.bar(xs, heights, width, label=label, color=colors[j], yerr=errs, capsize=2, error_kw={"linewidth": 0.8})
     ax.set_xticks(range(len(models)), [LABEL[m] for m in models], rotation=20, ha="right", fontsize=8)
     ax.set_ylabel("Recall", fontsize=9)
     ax.set_ylim(0, 1.05)
@@ -100,13 +128,15 @@ def render_kind_heatmap(table: dict[str, dict[str, float]], output_dir: Path) ->
 def main(argv: list[str] | None = None) -> int:
     parser = argparse.ArgumentParser(description="Render FinAudit paper figures.")
     parser.add_argument("--summary", default="docs/results/audit_eval_pilot/audit_eval_summary.csv")
+    parser.add_argument("--ci", default="docs/results/audit_eval_ci/audit_eval_ci_wilson.csv")
     parser.add_argument("--output-dir", default="paper/finaudit/figures")
     args = parser.parse_args(argv)
     summary = ROOT / args.summary if not Path(args.summary).is_absolute() else Path(args.summary)
+    ci_path = ROOT / args.ci if not Path(args.ci).is_absolute() else Path(args.ci)
     output_dir = ROOT / args.output_dir if not Path(args.output_dir).is_absolute() else Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
     table = _load(summary)
-    print("wrote", render_difficulty_bars(table, output_dir))
+    print("wrote", render_difficulty_bars(table, output_dir, _load_ci(ci_path)))
     print("wrote", render_kind_heatmap(table, output_dir))
     return 0
 

@@ -66,9 +66,13 @@ def render_decay_invariance(output_dir: Path) -> Path:
 def render_model_shift(output_dir: Path) -> Path:
     rows = [r for r in csv.DictReader(LLM.open(encoding="utf-8")) if r["outcome"] == "hold_ratio"]
     shift: dict[str, dict[str, float]] = defaultdict(dict)
+    err: dict[str, dict[str, tuple[float, float]]] = defaultdict(dict)
     sig: dict[str, dict[str, bool]] = defaultdict(dict)
     for r in rows:
-        shift[r["agent"]][r["kind"]] = float(r["mean_delta"])
+        d = float(r["mean_delta"])
+        shift[r["agent"]][r["kind"]] = d
+        if r.get("ci_low") not in ("", None):
+            err[r["agent"]][r["kind"]] = (max(0.0, d - float(r["ci_low"])), max(0.0, float(r["ci_high"]) - d))
         p = r["permutation_p_value"]
         sig[r["agent"]][r["kind"]] = p not in ("", None) and float(p) < 0.05
     models = [m for m in LLM_LABEL if m in shift]
@@ -79,10 +83,12 @@ def render_model_shift(output_dir: Path) -> Path:
     for j, (k, label, color) in enumerate(kinds):
         xs = [i + (j - 0.5) * width for i in range(len(models))]
         vals = [shift[m].get(k, 0.0) for m in models]
-        bars = ax.bar(xs, vals, width, label=label, color=color)
-        for x, m, b in zip(xs, models, bars):
+        lo = [err[m].get(k, (0.0, 0.0))[0] for m in models]
+        hi = [err[m].get(k, (0.0, 0.0))[1] for m in models]
+        bars = ax.bar(xs, vals, width, label=label, color=color, yerr=[lo, hi], capsize=2, error_kw={"linewidth": 0.7})
+        for x, m, b, h in zip(xs, models, bars, hi):
             if sig[m].get(k):
-                ax.text(x, b.get_height() + 0.004, "*", ha="center", fontsize=11)
+                ax.text(x, b.get_height() + h + 0.006, "*", ha="center", fontsize=11)
     ax.axhline(0.0, color="black", linewidth=0.7)
     ax.set_xticks(range(len(models)), [LLM_LABEL[m] for m in models], rotation=20, ha="right", fontsize=8)
     ax.set_ylabel("Hold-ratio shift at max dose", fontsize=9)

@@ -257,7 +257,7 @@ def main(argv: list[str] | None = None) -> int:
                             sample=sample, temperature=args.temperature,
                         )
                     except Exception as exc:  # provider failures should not lose the run
-                        print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}", file=sys.stderr, flush=True)
+                        print(f"FAILED {spec} {task_id} s{sample}: {type(exc).__name__}: {exc}", file=sys.stderr, flush=True)
                         continue
                     findings = parse_findings(response)
                     scores = score_findings(findings, [truth])