From f33f5d0a5588e8eab05562e73ed5194a0bbae8c8 Mon Sep 17 00:00:00 2001 From: Eva Date: Tue, 16 Jun 2026 21:01:18 +0700 Subject: [PATCH] qa(phase-3): wire the per-run latency sidecar so the dormant RRI latency gate activates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #954 added additive latency hard-gates (latency_s_per_beat / latency_coldopen) to qa/release_readiness.py, but they were DORMANT on a real sweep: read_latency() reads each PERSONA run dir's latency.json sidecar, while the runners derive the latency rollup into the TRANSCRIPT dir ($T/$RUN.latency.json) — so the per-run sidecar never existed and the gate fell through to a (safe-by-design) evidence-gap SKIP instead of actually gating. Wire it (all additive): - qa/latency_rollup.py: new reusable stamp_sidecars(rollup, run_dirs) + a --stamp-into CLI flag. It writes {s_per_beat, coldopen_s, turns_per_beat} into each run dir as <run>/latency.json — the exact shape read_latency() consumes. NULL columns are preserved (read_latency treats null as ABSENT -> skip, never a fabricated 0.0); a non-existent run dir is skipped, never created. - qa/release_gate.sh: after the duo run (which produced the per-beat ledger), re-derive the SAME rollup and stamp it into every persona run dir BEFORE the RRI rollup reads them. Non-fatal: a stamp hiccup / a duo with no derivable beat leaves the gate a documented skip. - qa/release_readiness.py: corrected read_latency()'s stale docstring (it claimed run_duo.sh writes the per-run sidecar; in fact run_duo writes to the transcript dir and release_gate.sh stamps the per-run sidecar) — the inaccuracy is part of why the gate looked wired but wasn't. - qa/evidence_audit.py: refreshed the stale "canonical 11 RRI gates" comment — the evaluated set is 11 by default and 13 once latency gates carry evidence (gates_total is read dynamically; RRI_GATE_NAMES stays the always-required baseline). 
Verify: - A real RRI rollup over runs WITH over-budget latency evidence (s_per_beat>120 or coldopen_s>240) now FAILS the latency gates (gates_total 13, release_ready=False); under budget PASSES; absent stays a byte-identical skip (gates_total 11). - New tests: stamp_sidecars unit coverage + a CLI --stamp-into test (test_latency_rollup.py); an end-to-end SEAM test driving the production rollup→stamp→gate path (test_release_readiness.py); a static contract locking the release_gate.sh wiring so the gate can't silently go dormant again (test_release_gate_static.py). - Single-process: qa/test_release_readiness.py + qa/test_deterministic_rri_gate.py + qa/test_latency_rollup.py + affected static/audit/scope/orchestrate tests — all green. --- qa/evidence_audit.py | 8 +++- qa/latency_rollup.py | 46 +++++++++++++++++++++- qa/release_gate.sh | 18 +++++++++ qa/release_readiness.py | 11 ++++-- qa/test_latency_rollup.py | 62 +++++++++++++++++++++++++++++ qa/test_release_gate_static.py | 14 +++++++ qa/test_release_readiness.py | 71 ++++++++++++++++++++++++++++++++++ 7 files changed, 223 insertions(+), 7 deletions(-) diff --git a/qa/evidence_audit.py b/qa/evidence_audit.py index c07faa6d..6a08ee1f 100644 --- a/qa/evidence_audit.py +++ b/qa/evidence_audit.py @@ -50,8 +50,12 @@ REQUIREMENTS_PATH = HERE / "verdict_requirements.json" DEFAULT_VERDICT = "rri_release" -# The canonical 11 RRI gates (mirrors release_readiness.py — kept here so the audit -# can report REQUIRED items even when a gate did not appear in the rollup output). +# The always-present RRI gates (mirrors release_readiness.py — kept here so the audit can +# report REQUIRED items even when a gate did not appear in the rollup output). 
The EVALUATED +# set is 11 by default and 13 once the two additive latency gates (latency_s_per_beat / +# latency_coldopen) carry evidence; release_readiness counts gates_total dynamically and the +# audit reads gates_passed/gates_total straight from the rollup, so this list stays the +# always-required baseline (the conditional latency gates are intentionally not in it). RRI_GATE_NAMES = [ "native_gate", "arc_completed", diff --git a/qa/latency_rollup.py b/qa/latency_rollup.py index 3b8b74c5..bd71eeff 100644 --- a/qa/latency_rollup.py +++ b/qa/latency_rollup.py @@ -49,8 +49,14 @@ import json import os import re +import sys from pathlib import Path -from typing import Any, Optional +from typing import Any, Iterable, Optional + +# The latency columns release_readiness.py:read_latency() reads from a /latency.json +# sidecar (it only consumes s_per_beat + coldopen_s; turns_per_beat is carried for parity +# with scores_db.add_run and is harmless extra detail for the reader). +SIDECAR_COLUMNS = ("s_per_beat", "coldopen_s", "turns_per_beat") # A beat transcript is ".dm..jsonl"; capture the nanos for beat ordering. _DM_RE = re.compile(r"\.dm\.(\d+)\.jsonl$") @@ -153,11 +159,42 @@ def rollup_run(transcript_dir: str | Path, run_id: str) -> dict[str, Any]: return rollup_files(beat_files(transcript_dir, run_id)) +def stamp_sidecars(rollup: dict[str, Any], run_dirs: Iterable[str | Path]) -> list[str]: + """Write a run's latency ``rollup`` as a ``/latency.json`` sidecar into each run dir, + in the exact shape ``qa/release_readiness.py:read_latency()`` reads + (``{s_per_beat, coldopen_s, turns_per_beat}``). + + This is the bridge that ACTIVATES the additive RRI latency gate on a real sweep: the runners + derive the per-beat ledger into the TRANSCRIPT dir (``$T/$RUN.latency.json``), but + release_readiness reads each PERSONA run dir's sidecar — so without this stamp the gate is a + dormant evidence-gap SKIP. 
The rollup is a BUILD-level measurement (one deep duo play), so it + is replicated into every persona run dir; the gate aggregates the MAX across personas, and + identical values yield exactly that build figure. + + NULL columns are preserved verbatim — read_latency treats a null ``s_per_beat``/``coldopen_s`` + as ABSENT evidence (an evidence-gap skip), never a fabricated 0.0 that would silently pass. + A run dir that does not already exist is SKIPPED (never created), so a stale/typo path can + never fabricate latency evidence. Returns the sidecar paths actually written.""" + sidecar = {k: rollup.get(k) for k in SIDECAR_COLUMNS} + written: list[str] = [] + for raw in run_dirs: + d = Path(raw) + if not d.is_dir(): + continue + target = d / "latency.json" + target.write_text(json.dumps(sidecar) + "\n", encoding="utf-8") + written.append(str(target)) + return written + + def _main(argv: Optional[list[str]] = None) -> int: ap = argparse.ArgumentParser(description="Derive the F13-4 latency ledger from DM beat transcripts.") ap.add_argument("--dir", help="transcript directory ($T) — used with --run") ap.add_argument("--run", help="run id ($RUN) — used with --dir") ap.add_argument("--out", help="write the rollup JSON here (also printed to stdout)") + ap.add_argument("--stamp-into", default="", help="comma-separated PERSONA run dirs to stamp the " + "rollup into as /latency.json (the shape release_readiness.read_latency reads) " + "— this is what ACTIVATES the additive RRI latency gate on a real sweep") ap.add_argument("files", nargs="*", help="explicit beat transcript paths (overrides --dir/--run)") args = ap.parse_args(argv) @@ -170,6 +207,13 @@ def _main(argv: Optional[list[str]] = None) -> int: ap.error("pass either explicit transcript files, or --dir and --run") return 2 + if args.stamp_into: + dirs = [p.strip() for p in args.stamp_into.split(",") if p.strip()] + written = stamp_sidecars(result, dirs) + # stderr so --out / stdout stay pure JSON for piping; a no-op (no 
existing dirs) is silent. + if written: + print(f"latency: stamped sidecar into {len(written)} run dir(s)", file=sys.stderr) + blob = json.dumps(result, indent=2) if args.out: Path(args.out).write_text(blob + "\n", encoding="utf-8") diff --git a/qa/release_gate.sh b/qa/release_gate.sh index 39ac1fe9..6ce93fc7 100755 --- a/qa/release_gate.sh +++ b/qa/release_gate.sh @@ -298,6 +298,24 @@ else [ -f "$STORY" ] && ok "story: $(python3 -c "import json;print(json.load(open('$STORY')).get('overall'))" 2>/dev/null)" || warn "no story score" [ -f "$MECH" ] && ok "mech: $(python3 -c "import json;print(json.load(open('$MECH')).get('overall'))" 2>/dev/null)" || warn "no mech score" + # ── LATENCY SIDECAR — activate the additive RRI latency gate (Phase-3) ────────── + # run_duo.sh just derived the per-beat latency ledger from the duo's *.dm..jsonl beats, but it + # wrote it to the TRANSCRIPT dir (qa/transcripts/${RUNID}-duo.latency.json). release_readiness.py's + # read_latency() reads each PERSONA run dir's latency.json sidecar instead, so on a real sweep the + # s_per_beat/coldopen_s gates were a dormant evidence-gap SKIP. Re-derive the SAME rollup and stamp it + # into every persona run dir in the shape the reader expects, so the figures are judged against + # qa/latency_baseline.json (s_per_beat>120 or coldopen_s>240 -> FAIL). Non-fatal: a stamp hiccup (or a + # duo that produced no derivable beat -> NULL columns) leaves the gate a documented SKIP, never a new + # false fail — additive, exactly today's behavior when latency evidence is absent. 
+ if [ -n "$RUN_DIRS" ]; then + if python3 qa/latency_rollup.py --dir "$ROOT/qa/transcripts" --run "${RUNID}-duo" \ + --stamp-into "$RUN_DIRS" >/dev/null 2>&1; then + ok "latency sidecar stamped into persona run dirs (RRI latency gate active)" + else + warn "latency sidecar stamp skipped — RRI latency gate stays an evidence-gap skip" + fi + fi + # ── BEHAVIORAL + AXE/UI-AUDIT ──────────────────────────────────────────────── echo "── BEHAVIORAL + UI-AUDIT ─────────────────────────────────────────" BEHAV_PATH="$ROOT/qa/ui_playtest_runs/${RUNID}-behavioral.txt" diff --git a/qa/release_readiness.py b/qa/release_readiness.py index 8dcf92ed..fd5c8dbd 100644 --- a/qa/release_readiness.py +++ b/qa/release_readiness.py @@ -263,10 +263,13 @@ def _latency_float(value) -> Optional[float]: def read_latency(run: Path, run_json: dict, score: dict) -> tuple[Optional[float], Optional[float], str]: """Read (s_per_beat, coldopen_s, source) from the same on-disk artifacts the rollup - already reads — a run's ``latency.json`` sidecar first (what qa/run_duo.sh writes via - qa/latency_rollup.py --out), then a ``latency`` block inside run.json, then top-level - latency fields on run.json / score.json. ABSENT everywhere -> (None, None, "none"), - which makes the latency gates a documented EVIDENCE-GAP/skip, never a new false fail.""" + already reads — a run's ``latency.json`` sidecar first (the per-run ledger qa/release_gate.sh + stamps into each persona run dir via ``qa/latency_rollup.py --stamp-into``, derived from the + duo's per-beat transcripts; NOTE the runners themselves write the rollup to the TRANSCRIPT dir, + so without that stamp this gate stays a dormant evidence-gap skip), then a ``latency`` block + inside run.json, then top-level latency fields on run.json / score.json. 
ABSENT everywhere -> + (None, None, "none"), which makes the latency gates a documented EVIDENCE-GAP/skip, never a + new false fail.""" sidecar = read_json(run / "latency.json") candidates: list[tuple[dict, str]] = [ (sidecar, str(run / "latency.json")), diff --git a/qa/test_latency_rollup.py b/qa/test_latency_rollup.py index 9f8e5543..7f80c41e 100644 --- a/qa/test_latency_rollup.py +++ b/qa/test_latency_rollup.py @@ -124,3 +124,65 @@ def test_cli_writes_out_json(tmp_path, capsys): assert rc == 0 data = json.loads(out.read_text()) assert data["coldopen_s"] == 240.0 and data["s_per_beat"] == 100.0 + + +# ── stamp_sidecars: the bridge that activates the RRI latency gate ───────────────── +# The runners derive the ledger into the TRANSCRIPT dir; release_readiness reads each PERSONA +# run dir's /latency.json sidecar. These cover the stamp that closes that gap. + +def test_stamp_sidecars_writes_per_run_latency_json(tmp_path): + run = "duo-stamp" + _write_beat(tmp_path, run, 1000, api_ms=240000, num_turns=18) # cold open 240s + _write_beat(tmp_path, run, 2000, api_ms=100000, num_turns=4) # routine 100s + r = latency_rollup.rollup_run(tmp_path, run) + rundirs = [tmp_path / "gate-newbie", tmp_path / "gate-veteran"] + for d in rundirs: + d.mkdir() + written = latency_rollup.stamp_sidecars(r, rundirs) + assert len(written) == 2 + for d in rundirs: + sidecar = json.loads((d / "latency.json").read_text()) + # exactly the columns release_readiness.read_latency() consumes + assert sidecar["s_per_beat"] == 100.0 + assert sidecar["coldopen_s"] == 240.0 + assert sidecar["turns_per_beat"] == 4.0 + + +def test_stamp_sidecars_skips_nonexistent_dirs_never_creating_them(tmp_path): + # A stale/typo run-dir path must never fabricate latency evidence by creating a dir. 
+ run = "duo-skip" + _write_beat(tmp_path, run, 1000, api_ms=200000, num_turns=10) + _write_beat(tmp_path, run, 2000, api_ms=90000, num_turns=4) + r = latency_rollup.rollup_run(tmp_path, run) + real = tmp_path / "gate-real"; real.mkdir() + missing = tmp_path / "gate-missing" # does NOT exist + written = latency_rollup.stamp_sidecars(r, [real, missing]) + assert written == [str(real / "latency.json")] + assert not missing.exists() + + +def test_stamp_sidecars_preserves_null_columns_not_zero(tmp_path): + # A cold-open-only run has NULL routine stats; the sidecar must keep null so read_latency + # treats it as ABSENT (a skip), never a fabricated 0.0 that silently passes the gate. + run = "duo-null" + _write_beat(tmp_path, run, 1000, api_ms=180000, num_turns=12) # cold open only + r = latency_rollup.rollup_run(tmp_path, run) + d = tmp_path / "gate-x"; d.mkdir() + latency_rollup.stamp_sidecars(r, [d]) + sidecar = json.loads((d / "latency.json").read_text()) + assert sidecar["s_per_beat"] is None + assert sidecar["coldopen_s"] == 180.0 + + +def test_cli_stamp_into_writes_sidecars(tmp_path): + # The exact path qa/release_gate.sh drives: --dir/--run + --stamp-into "dir1,dir2". 
+ run = "duo-clistamp" + _write_beat(tmp_path, run, 1000, api_ms=240000, num_turns=18) + _write_beat(tmp_path, run, 2000, api_ms=130000, num_turns=5) + d1 = tmp_path / "gate-a"; d1.mkdir() + d2 = tmp_path / "gate-b"; d2.mkdir() + rc = latency_rollup._main(["--dir", str(tmp_path), "--run", run, "--stamp-into", f"{d1},{d2}"]) + assert rc == 0 + for d in (d1, d2): + sidecar = json.loads((d / "latency.json").read_text()) + assert sidecar["coldopen_s"] == 240.0 and sidecar["s_per_beat"] == 130.0 diff --git a/qa/test_release_gate_static.py b/qa/test_release_gate_static.py index 565d7a98..328d3299 100644 --- a/qa/test_release_gate_static.py +++ b/qa/test_release_gate_static.py @@ -57,6 +57,20 @@ def test_release_gate_uses_real_duo_prompt_file(self): self.assertIn('DUO_PROMPT="$ROOT/qa/play_player_duo.txt"', source) self.assertNotIn('qa/run_duo.sh "${RUNID}-duo" baldurs-gate veteran', source) + def test_release_gate_stamps_latency_sidecar_into_persona_run_dirs(self): + # The RRI latency gate was DORMANT: run_duo.sh derives the per-beat ledger into the + # TRANSCRIPT dir, but release_readiness.read_latency() reads each PERSONA run dir's + # latency.json sidecar. Lock in the wiring that activates the gate — release_gate.sh must + # stamp the duo rollup into the run dirs via latency_rollup.py --stamp-into, BEFORE the + # RRI rollup reads them — so the gate can never silently fall back to a skip again. 
+ source = (ROOT / "qa" / "release_gate.sh").read_text(encoding="utf-8") + + self.assertIn("qa/latency_rollup.py", source) + self.assertIn('--run "${RUNID}-duo"', source) + self.assertIn('--stamp-into "$RUN_DIRS"', source) + # the stamp must run BEFORE release_readiness reads the per-run sidecars + self.assertLess(source.index("--stamp-into"), source.index("python3 qa/release_readiness.py")) + def test_ui_playtest_persists_final_session_surface_before_teardown(self): source = (ROOT / "qa" / "ui_playtest_app.sh").read_text(encoding="utf-8") diff --git a/qa/test_release_readiness.py b/qa/test_release_readiness.py index e1c242d5..a40885b9 100644 --- a/qa/test_release_readiness.py +++ b/qa/test_release_readiness.py @@ -2634,6 +2634,77 @@ def test_latency_absent_is_skip_and_byte_identical_release(self): self.assertIsNone(payload["signals"]["latency_s_per_beat"]) self.assertIsNone(payload["signals"]["latency_coldopen_s"]) + def _write_duo_beats(self, transcript_dir: Path, run: str, coldopen_ms: int, routine_ms: int) -> None: + # Minimal stream-json duo beat transcripts (one result event per beat) in the + # .dm..jsonl shape the runners write and latency_rollup reads: a cold + # open (nanos=1000) + two routine beats. duration_api_ms is the only timing field read. + transcript_dir.mkdir(parents=True, exist_ok=True) + for nanos, ms, turns in ((1000, coldopen_ms, 18), (2000, routine_ms, 4), (3000, routine_ms, 5)): + res = { + "type": "result", "subtype": "success", "is_error": False, + "api_error_status": None, "duration_api_ms": ms, "num_turns": turns, "result": "prose", + } + (transcript_dir / f"{run}.dm.{nanos}.jsonl").write_text( + json.dumps(res) + "\n", encoding="utf-8") + + def test_latency_rollup_stamp_sidecars_activates_the_gate_end_to_end(self): + # END-TO-END SEAM (the wiring PR #954 left dormant): the runners derive the latency + # ledger into the TRANSCRIPT dir, but release_readiness reads each PERSONA run dir's + # latency.json. 
This drives the REAL production path qa/release_gate.sh now uses — + # latency_rollup.rollup_run() over real duo beat transcripts, then + # latency_rollup.stamp_sidecars() into the persona run dirs — and proves the gate then + # FAILS over budget and PASSES under budget (no hand-written sidecar dict). + sys.path.insert(0, str(ROOT / "qa")) + import latency_rollup + + def build_and_run(coldopen_ms: int, routine_ms: int): + with tempfile.TemporaryDirectory() as td: + tmp = Path(td) + run = "gate-duo" + self._write_duo_beats(tmp / "transcripts", run, coldopen_ms, routine_ms) + rollup = latency_rollup.rollup_run(tmp / "transcripts", run) + runs = self._five_clean_runs_with_latency(tmp, None) # NO hand-written latency + # THE WIRING: stamp the build-level duo rollup into every persona run dir. + written = latency_rollup.stamp_sidecars(rollup, runs) + self.assertEqual(len(written), 5) + # the sidecar landed in the exact shape release_readiness.read_latency() reads + sidecar = json.loads((runs[0] / "latency.json").read_text()) + self.assertIn("s_per_beat", sidecar) + self.assertIn("coldopen_s", sidecar) + story, mech, behavioral, audit, palette = self.write_release_inputs(tmp) + return self.run_rri( + tmp, + "--runs", ",".join(str(r) for r in runs), + "--expected-personas", "newbie,veteran,adversarial,narrative,optimizer", + "--story", str(story), "--mech", str(mech), + "--behavioral", "GREEN", "--behavioral-path", str(behavioral), + "--ui-audit", "PASS", "--ui-audit-log", str(audit), + "--palette-live", "true", "--palette-source", str(palette), + "--build-sha", "deadbee", + ) + + # OVER budget: cold open 500s (> 240), routine 300s/beat (> 120) -> FAIL, gate evaluated. 
+ rc, _text, payload = build_and_run(coldopen_ms=500_000, routine_ms=300_000) + self.assertEqual(rc, 1) + self.assertFalse(payload["release_ready"]) + self.assertIn("latency_s_per_beat", payload["failed_gates"]) + self.assertIn("latency_coldopen", payload["failed_gates"]) + self.assertEqual(payload["signals"]["latency_s_per_beat"], 300.0) + self.assertEqual(payload["signals"]["latency_coldopen_s"], 500.0) + # the gate is now ACTIVE, not a dormant evidence-gap skip + self.assertEqual(payload["skipped_gates"], []) + self.assertEqual(payload["gates_total"], 13) + + # UNDER budget: cold open 150s (< 240), routine 80s/beat (< 120) -> PASS. + rc, _text, payload = build_and_run(coldopen_ms=150_000, routine_ms=80_000) + self.assertEqual(rc, 0) + self.assertTrue(payload["release_ready"]) + self.assertEqual(payload["evidence_gaps"], []) + self.assertNotIn("latency_s_per_beat", payload["failed_gates"]) + self.assertNotIn("latency_coldopen", payload["failed_gates"]) + self.assertEqual(payload["signals"]["latency_s_per_beat"], 80.0) + self.assertEqual(payload["signals"]["latency_coldopen_s"], 150.0) + def test_deterministic_only_marks_llm_gates_skipped_not_failed(self): with tempfile.TemporaryDirectory() as td: tmp = Path(td)