From f33f5d0a5588e8eab05562e73ed5194a0bbae8c8 Mon Sep 17 00:00:00 2001
From: Eva <arncalso@gmail.com>
Date: Tue, 16 Jun 2026 21:01:18 +0700
Subject: [PATCH] qa(phase-3): wire the per-run latency sidecar so the dormant
 RRI latency gate activates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #954 added additive latency hard-gates (latency_s_per_beat / latency_coldopen) to
qa/release_readiness.py, but they were DORMANT on a real sweep: read_latency() reads each
PERSONA run dir's latency.json sidecar, while the runners derive the latency rollup into the
TRANSCRIPT dir ($T/$RUN.latency.json) — so the per-run sidecar never existed and the gate
fell through to a (safe-by-design) evidence-gap SKIP instead of actually gating.

Wire it (all additive):
- qa/latency_rollup.py: new reusable stamp_sidecars(rollup, run_dirs) + a --stamp-into CLI
  flag. It writes {s_per_beat, coldopen_s, turns_per_beat} into each run dir as
  <run>/latency.json — the shape read_latency() reads (it consumes s_per_beat and coldopen_s;
  turns_per_beat is carried along as extra detail). NULL columns are preserved (read_latency
  treats null as ABSENT -> skip, never a fabricated 0.0); a non-existent run dir is skipped,
  never created.
- qa/release_gate.sh: after the duo run (which produced the per-beat ledger), re-derive the
  SAME rollup and stamp it into every persona run dir BEFORE the RRI rollup reads them.
  Non-fatal: a stamp hiccup / a duo with no derivable beat leaves the gate a documented skip.
- qa/release_readiness.py: corrected read_latency()'s stale docstring (it claimed run_duo.sh
  writes the per-run sidecar; in fact run_duo writes to the transcript dir and release_gate.sh
  stamps the per-run sidecar) — the inaccuracy is part of why the gate looked wired but wasn't.
- qa/evidence_audit.py: refreshed the stale "canonical 11 RRI gates" comment — the evaluated
  set is 11 by default and 13 once latency gates carry evidence (gates_total is read
  dynamically; RRI_GATE_NAMES stays the always-required baseline).

Verify:
- A real RRI rollup over runs WITH over-budget latency evidence (s_per_beat>120 or
  coldopen_s>240) now FAILS the latency gates (gates_total 13, release_ready=False); with
  under-budget evidence it PASSES; with NO latency evidence the gates stay a skip and the
  release output stays byte-identical (gates_total 11).
- New tests: stamp_sidecars unit coverage + a CLI --stamp-into test (test_latency_rollup.py);
  an end-to-end SEAM test driving the production rollup→stamp→gate path
  (test_release_readiness.py); a static contract locking the release_gate.sh wiring so the gate
  can't silently go dormant again (test_release_gate_static.py).
- Single-process: qa/test_release_readiness.py + qa/test_deterministic_rri_gate.py +
  qa/test_latency_rollup.py + affected static/audit/scope/orchestrate tests — all green.
---
 qa/evidence_audit.py           |  8 +++-
 qa/latency_rollup.py           | 46 +++++++++++++++++++++-
 qa/release_gate.sh             | 18 +++++++++
 qa/release_readiness.py        | 11 ++++--
 qa/test_latency_rollup.py      | 62 +++++++++++++++++++++++++++++
 qa/test_release_gate_static.py | 14 +++++++
 qa/test_release_readiness.py   | 71 ++++++++++++++++++++++++++++++++++
 7 files changed, 223 insertions(+), 7 deletions(-)
diff --git a/qa/evidence_audit.py b/qa/evidence_audit.py
index c07faa6d..6a08ee1f 100644
--- a/qa/evidence_audit.py
+++ b/qa/evidence_audit.py
@@ -50,8 +50,12 @@
 REQUIREMENTS_PATH = HERE / "verdict_requirements.json"
 DEFAULT_VERDICT = "rri_release"
 
-# The canonical 11 RRI gates (mirrors release_readiness.py — kept here so the audit
-# can report REQUIRED items even when a gate did not appear in the rollup output).
+# The always-present RRI gates (mirrors release_readiness.py — kept here so the audit can
+# report REQUIRED items even when a gate did not appear in the rollup output). The EVALUATED
+# set is 11 by default and 13 once the two additive latency gates (latency_s_per_beat /
+# latency_coldopen) carry evidence; release_readiness counts gates_total dynamically and the
+# audit reads gates_passed/gates_total straight from the rollup, so this list stays the
+# always-required baseline (the conditional latency gates are intentionally not in it).
 RRI_GATE_NAMES = [
     "native_gate",
     "arc_completed",
diff --git a/qa/latency_rollup.py b/qa/latency_rollup.py
index 3b8b74c5..bd71eeff 100644
--- a/qa/latency_rollup.py
+++ b/qa/latency_rollup.py
@@ -49,8 +49,14 @@
 import json
 import os
 import re
+import sys
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Iterable, Optional
+
+# The columns stamp_sidecars() writes into a <run>/latency.json sidecar. Of these,
+# release_readiness.py:read_latency() consumes only s_per_beat + coldopen_s; turns_per_beat is
+# carried for parity with scores_db.add_run and is harmless extra detail for the reader.
+SIDECAR_COLUMNS = ("s_per_beat", "coldopen_s", "turns_per_beat")
 
 # A beat transcript is "<run>.dm.<nanoseconds>.jsonl"; capture the nanos for beat ordering.
 _DM_RE = re.compile(r"\.dm\.(\d+)\.jsonl$")
@@ -153,11 +159,42 @@ def rollup_run(transcript_dir: str | Path, run_id: str) -> dict[str, Any]:
     return rollup_files(beat_files(transcript_dir, run_id))
 
 
+def stamp_sidecars(rollup: dict[str, Any], run_dirs: Iterable[str | Path]) -> list[str]:
+    """Write a run's latency ``rollup`` as a ``<run>/latency.json`` sidecar into each run dir,
+    in the exact shape ``qa/release_readiness.py:read_latency()`` reads
+    (``{s_per_beat, coldopen_s, turns_per_beat}``).
+
+    This is the bridge that ACTIVATES the additive RRI latency gate on a real sweep: the runners
+    derive the per-beat ledger into the TRANSCRIPT dir (``$T/$RUN.latency.json``), but
+    release_readiness reads each PERSONA run dir's sidecar — so without this stamp the gate is a
+    dormant evidence-gap SKIP. The rollup is a BUILD-level measurement (one deep duo play), so it
+    is replicated into every persona run dir; the gate aggregates the MAX across personas, and
+    identical values yield exactly that build figure.
+
+    NULL columns are preserved verbatim — read_latency treats a null ``s_per_beat``/``coldopen_s``
+    as ABSENT evidence (an evidence-gap skip), never a fabricated 0.0 that would silently pass.
+    A run dir that does not already exist is SKIPPED (never created), so a stale/typo path can
+    never fabricate latency evidence. Returns the sidecar paths actually written."""
+    sidecar = {k: rollup.get(k) for k in SIDECAR_COLUMNS}
+    written: list[str] = []
+    for raw in run_dirs:
+        d = Path(raw)
+        if not d.is_dir():
+            continue
+        target = d / "latency.json"
+        target.write_text(json.dumps(sidecar) + "\n", encoding="utf-8")
+        written.append(str(target))
+    return written
+
+
 def _main(argv: Optional[list[str]] = None) -> int:
     ap = argparse.ArgumentParser(description="Derive the F13-4 latency ledger from DM beat transcripts.")
     ap.add_argument("--dir", help="transcript directory ($T) — used with --run")
     ap.add_argument("--run", help="run id ($RUN) — used with --dir")
     ap.add_argument("--out", help="write the rollup JSON here (also printed to stdout)")
+    ap.add_argument("--stamp-into", default="", help="comma-separated PERSONA run dirs to stamp the "
+                    "rollup into as <dir>/latency.json (the shape release_readiness.read_latency reads) "
+                    "— this is what ACTIVATES the additive RRI latency gate on a real sweep")
     ap.add_argument("files", nargs="*", help="explicit beat transcript paths (overrides --dir/--run)")
     args = ap.parse_args(argv)
 
@@ -170,6 +207,13 @@ def _main(argv: Optional[list[str]] = None) -> int:
         ap.error("pass either explicit transcript files, or --dir and --run")
         return 2
 
+    if args.stamp_into:
+        dirs = [p.strip() for p in args.stamp_into.split(",") if p.strip()]
+        written = stamp_sidecars(result, dirs)
+        # stderr so --out / stdout stay pure JSON for piping; a no-op (no existing dirs) is silent.
+        if written:
+            print(f"latency: stamped sidecar into {len(written)} run dir(s)", file=sys.stderr)
+
     blob = json.dumps(result, indent=2)
     if args.out:
         Path(args.out).write_text(blob + "\n", encoding="utf-8")
diff --git a/qa/release_gate.sh b/qa/release_gate.sh
index 39ac1fe9..6ce93fc7 100755
--- a/qa/release_gate.sh
+++ b/qa/release_gate.sh
@@ -298,6 +298,24 @@ else
   [ -f "$STORY" ] && ok "story: $(python3 -c "import json;print(json.load(open('$STORY')).get('overall'))" 2>/dev/null)" || warn "no story score"
   [ -f "$MECH" ] && ok "mech:  $(python3 -c "import json;print(json.load(open('$MECH')).get('overall'))" 2>/dev/null)" || warn "no mech score"
 
+  # ── LATENCY SIDECAR — activate the additive RRI latency gate (Phase-3) ──────────
+  # run_duo.sh just derived the per-beat latency ledger from the duo's *.dm.<ns>.jsonl beats, but it
+  # wrote it to the TRANSCRIPT dir (qa/transcripts/${RUNID}-duo.latency.json). release_readiness.py's
+  # read_latency() reads each PERSONA run dir's latency.json sidecar instead, so on a real sweep the
+  # s_per_beat/coldopen_s gates were a dormant evidence-gap SKIP. Re-derive the SAME rollup and stamp it
+  # into every persona run dir in the shape the reader expects, so the figures are judged against
+  # qa/latency_baseline.json (s_per_beat>120 or coldopen_s>240 -> FAIL). Non-fatal: a stamp hiccup (or a
+  # duo that produced no derivable beat -> NULL columns) leaves the gate a documented SKIP, never a new
+  # false fail — additive, exactly today's behavior when latency evidence is absent.
+  if [ -n "$RUN_DIRS" ]; then
+    if python3 qa/latency_rollup.py --dir "$ROOT/qa/transcripts" --run "${RUNID}-duo" \
+         --stamp-into "$RUN_DIRS" >/dev/null 2>&1; then
+      ok "latency sidecar stamped into persona run dirs (RRI latency gate active)"
+    else
+      warn "latency sidecar stamp skipped — RRI latency gate stays an evidence-gap skip"
+    fi
+  fi
+
   # ── BEHAVIORAL + AXE/UI-AUDIT ────────────────────────────────────────────────
   echo "── BEHAVIORAL + UI-AUDIT ─────────────────────────────────────────"
   BEHAV_PATH="$ROOT/qa/ui_playtest_runs/${RUNID}-behavioral.txt"
diff --git a/qa/release_readiness.py b/qa/release_readiness.py
index 8dcf92ed..fd5c8dbd 100644
--- a/qa/release_readiness.py
+++ b/qa/release_readiness.py
@@ -263,10 +263,13 @@ def _latency_float(value) -> Optional[float]:
 
 def read_latency(run: Path, run_json: dict, score: dict) -> tuple[Optional[float], Optional[float], str]:
     """Read (s_per_beat, coldopen_s, source) from the same on-disk artifacts the rollup
-    already reads — a run's ``latency.json`` sidecar first (what qa/run_duo.sh writes via
-    qa/latency_rollup.py --out), then a ``latency`` block inside run.json, then top-level
-    latency fields on run.json / score.json. ABSENT everywhere -> (None, None, "none"),
-    which makes the latency gates a documented EVIDENCE-GAP/skip, never a new false fail."""
+    already reads — a run's ``latency.json`` sidecar first (the per-run ledger qa/release_gate.sh
+    stamps into each persona run dir via ``qa/latency_rollup.py --stamp-into``, derived from the
+    duo's per-beat transcripts; NOTE the runners themselves write the rollup to the TRANSCRIPT dir,
+    so without that stamp this gate stays a dormant evidence-gap skip), then a ``latency`` block
+    inside run.json, then top-level latency fields on run.json / score.json. ABSENT everywhere ->
+    (None, None, "none"), which makes the latency gates a documented EVIDENCE-GAP/skip, never a
+    new false fail."""
     sidecar = read_json(run / "latency.json")
     candidates: list[tuple[dict, str]] = [
         (sidecar, str(run / "latency.json")),
diff --git a/qa/test_latency_rollup.py b/qa/test_latency_rollup.py
index 9f8e5543..7f80c41e 100644
--- a/qa/test_latency_rollup.py
+++ b/qa/test_latency_rollup.py
@@ -124,3 +124,65 @@ def test_cli_writes_out_json(tmp_path, capsys):
     assert rc == 0
     data = json.loads(out.read_text())
     assert data["coldopen_s"] == 240.0 and data["s_per_beat"] == 100.0
+
+
+# ── stamp_sidecars: the bridge that activates the RRI latency gate ─────────────────
+# The runners derive the ledger into the TRANSCRIPT dir; release_readiness reads each PERSONA
+# run dir's <run>/latency.json sidecar. These cover the stamp that closes that gap.
+
+def test_stamp_sidecars_writes_per_run_latency_json(tmp_path):
+    run = "duo-stamp"
+    _write_beat(tmp_path, run, 1000, api_ms=240000, num_turns=18)   # cold open 240s
+    _write_beat(tmp_path, run, 2000, api_ms=100000, num_turns=4)    # routine 100s
+    r = latency_rollup.rollup_run(tmp_path, run)
+    rundirs = [tmp_path / "gate-newbie", tmp_path / "gate-veteran"]
+    for d in rundirs:
+        d.mkdir()
+    written = latency_rollup.stamp_sidecars(r, rundirs)
+    assert len(written) == 2
+    for d in rundirs:
+        sidecar = json.loads((d / "latency.json").read_text())
+        # exactly the columns release_readiness.read_latency() consumes
+        assert sidecar["s_per_beat"] == 100.0
+        assert sidecar["coldopen_s"] == 240.0
+        assert sidecar["turns_per_beat"] == 4.0
+
+
+def test_stamp_sidecars_skips_nonexistent_dirs_never_creating_them(tmp_path):
+    # A stale/typo run-dir path must never fabricate latency evidence by creating a dir.
+    run = "duo-skip"
+    _write_beat(tmp_path, run, 1000, api_ms=200000, num_turns=10)
+    _write_beat(tmp_path, run, 2000, api_ms=90000, num_turns=4)
+    r = latency_rollup.rollup_run(tmp_path, run)
+    real = tmp_path / "gate-real"; real.mkdir()
+    missing = tmp_path / "gate-missing"          # does NOT exist
+    written = latency_rollup.stamp_sidecars(r, [real, missing])
+    assert written == [str(real / "latency.json")]
+    assert not missing.exists()
+
+
+def test_stamp_sidecars_preserves_null_columns_not_zero(tmp_path):
+    # A cold-open-only run has NULL routine stats; the sidecar must keep null so read_latency
+    # treats it as ABSENT (a skip), never a fabricated 0.0 that silently passes the gate.
+    run = "duo-null"
+    _write_beat(tmp_path, run, 1000, api_ms=180000, num_turns=12)   # cold open only
+    r = latency_rollup.rollup_run(tmp_path, run)
+    d = tmp_path / "gate-x"; d.mkdir()
+    latency_rollup.stamp_sidecars(r, [d])
+    sidecar = json.loads((d / "latency.json").read_text())
+    assert sidecar["s_per_beat"] is None
+    assert sidecar["coldopen_s"] == 180.0
+
+
+def test_cli_stamp_into_writes_sidecars(tmp_path):
+    # The exact path qa/release_gate.sh drives: --dir/--run + --stamp-into "dir1,dir2".
+    run = "duo-clistamp"
+    _write_beat(tmp_path, run, 1000, api_ms=240000, num_turns=18)
+    _write_beat(tmp_path, run, 2000, api_ms=130000, num_turns=5)
+    d1 = tmp_path / "gate-a"; d1.mkdir()
+    d2 = tmp_path / "gate-b"; d2.mkdir()
+    rc = latency_rollup._main(["--dir", str(tmp_path), "--run", run, "--stamp-into", f"{d1},{d2}"])
+    assert rc == 0
+    for d in (d1, d2):
+        sidecar = json.loads((d / "latency.json").read_text())
+        assert sidecar["coldopen_s"] == 240.0 and sidecar["s_per_beat"] == 130.0
diff --git a/qa/test_release_gate_static.py b/qa/test_release_gate_static.py
index 565d7a98..328d3299 100644
--- a/qa/test_release_gate_static.py
+++ b/qa/test_release_gate_static.py
@@ -57,6 +57,20 @@ def test_release_gate_uses_real_duo_prompt_file(self):
         self.assertIn('DUO_PROMPT="$ROOT/qa/play_player_duo.txt"', source)
         self.assertNotIn('qa/run_duo.sh "${RUNID}-duo" baldurs-gate veteran', source)
 
+    def test_release_gate_stamps_latency_sidecar_into_persona_run_dirs(self):
+        # The RRI latency gate was DORMANT: run_duo.sh derives the per-beat ledger into the
+        # TRANSCRIPT dir, but release_readiness.read_latency() reads each PERSONA run dir's
+        # latency.json sidecar. Lock in the wiring that activates the gate — release_gate.sh must
+        # stamp the duo rollup into the run dirs via latency_rollup.py --stamp-into, BEFORE the
+        # RRI rollup reads them — so the gate can never silently fall back to a skip again.
+        source = (ROOT / "qa" / "release_gate.sh").read_text(encoding="utf-8")
+
+        self.assertIn("qa/latency_rollup.py", source)
+        self.assertIn('--run "${RUNID}-duo"', source)
+        self.assertIn('--stamp-into "$RUN_DIRS"', source)
+        # the stamp must run BEFORE release_readiness reads the per-run sidecars
+        self.assertLess(source.index("--stamp-into"), source.index("python3 qa/release_readiness.py"))
+
     def test_ui_playtest_persists_final_session_surface_before_teardown(self):
         source = (ROOT / "qa" / "ui_playtest_app.sh").read_text(encoding="utf-8")
 
diff --git a/qa/test_release_readiness.py b/qa/test_release_readiness.py
index e1c242d5..a40885b9 100644
--- a/qa/test_release_readiness.py
+++ b/qa/test_release_readiness.py
@@ -2634,6 +2634,77 @@ def test_latency_absent_is_skip_and_byte_identical_release(self):
             self.assertIsNone(payload["signals"]["latency_s_per_beat"])
             self.assertIsNone(payload["signals"]["latency_coldopen_s"])
 
+    def _write_duo_beats(self, transcript_dir: Path, run: str, coldopen_ms: int, routine_ms: int) -> None:
+        # Minimal stream-json duo beat transcripts (one result event per beat) in the
+        # <run>.dm.<nanos>.jsonl shape the runners write and latency_rollup reads: a cold
+        # open (nanos=1000) + two routine beats. duration_api_ms is the only timing field read.
+        transcript_dir.mkdir(parents=True, exist_ok=True)
+        for nanos, ms, turns in ((1000, coldopen_ms, 18), (2000, routine_ms, 4), (3000, routine_ms, 5)):
+            res = {
+                "type": "result", "subtype": "success", "is_error": False,
+                "api_error_status": None, "duration_api_ms": ms, "num_turns": turns, "result": "prose",
+            }
+            (transcript_dir / f"{run}.dm.{nanos}.jsonl").write_text(
+                json.dumps(res) + "\n", encoding="utf-8")
+
+    def test_latency_rollup_stamp_sidecars_activates_the_gate_end_to_end(self):
+        # END-TO-END SEAM (the wiring PR #954 left dormant): the runners derive the latency
+        # ledger into the TRANSCRIPT dir, but release_readiness reads each PERSONA run dir's
+        # latency.json. This drives the REAL production path qa/release_gate.sh now uses —
+        # latency_rollup.rollup_run() over real duo beat transcripts, then
+        # latency_rollup.stamp_sidecars() into the persona run dirs — and proves the gate then
+        # FAILS over budget and PASSES under budget (no hand-written sidecar dict).
+        sys.path.insert(0, str(ROOT / "qa"))
+        import latency_rollup
+
+        def build_and_run(coldopen_ms: int, routine_ms: int):
+            with tempfile.TemporaryDirectory() as td:
+                tmp = Path(td)
+                run = "gate-duo"
+                self._write_duo_beats(tmp / "transcripts", run, coldopen_ms, routine_ms)
+                rollup = latency_rollup.rollup_run(tmp / "transcripts", run)
+                runs = self._five_clean_runs_with_latency(tmp, None)  # NO hand-written latency
+                # THE WIRING: stamp the build-level duo rollup into every persona run dir.
+                written = latency_rollup.stamp_sidecars(rollup, runs)
+                self.assertEqual(len(written), 5)
+                # the sidecar landed in the exact shape release_readiness.read_latency() reads
+                sidecar = json.loads((runs[0] / "latency.json").read_text())
+                self.assertIn("s_per_beat", sidecar)
+                self.assertIn("coldopen_s", sidecar)
+                story, mech, behavioral, audit, palette = self.write_release_inputs(tmp)
+                return self.run_rri(
+                    tmp,
+                    "--runs", ",".join(str(r) for r in runs),
+                    "--expected-personas", "newbie,veteran,adversarial,narrative,optimizer",
+                    "--story", str(story), "--mech", str(mech),
+                    "--behavioral", "GREEN", "--behavioral-path", str(behavioral),
+                    "--ui-audit", "PASS", "--ui-audit-log", str(audit),
+                    "--palette-live", "true", "--palette-source", str(palette),
+                    "--build-sha", "deadbee",
+                )
+
+        # OVER budget: cold open 500s (> 240), routine 300s/beat (> 120) -> FAIL, gate evaluated.
+        rc, _text, payload = build_and_run(coldopen_ms=500_000, routine_ms=300_000)
+        self.assertEqual(rc, 1)
+        self.assertFalse(payload["release_ready"])
+        self.assertIn("latency_s_per_beat", payload["failed_gates"])
+        self.assertIn("latency_coldopen", payload["failed_gates"])
+        self.assertEqual(payload["signals"]["latency_s_per_beat"], 300.0)
+        self.assertEqual(payload["signals"]["latency_coldopen_s"], 500.0)
+        # the gate is now ACTIVE, not a dormant evidence-gap skip
+        self.assertEqual(payload["skipped_gates"], [])
+        self.assertEqual(payload["gates_total"], 13)
+
+        # UNDER budget: cold open 150s (< 240), routine 80s/beat (< 120) -> PASS.
+        rc, _text, payload = build_and_run(coldopen_ms=150_000, routine_ms=80_000)
+        self.assertEqual(rc, 0)
+        self.assertTrue(payload["release_ready"])
+        self.assertEqual(payload["evidence_gaps"], [])
+        self.assertNotIn("latency_s_per_beat", payload["failed_gates"])
+        self.assertNotIn("latency_coldopen", payload["failed_gates"])
+        self.assertEqual(payload["signals"]["latency_s_per_beat"], 80.0)
+        self.assertEqual(payload["signals"]["latency_coldopen_s"], 150.0)
+
     def test_deterministic_only_marks_llm_gates_skipped_not_failed(self):
         with tempfile.TemporaryDirectory() as td:
             tmp = Path(td)