Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions qa/evidence_audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,12 @@
REQUIREMENTS_PATH = HERE / "verdict_requirements.json"
DEFAULT_VERDICT = "rri_release"

# The canonical 11 RRI gates (mirrors release_readiness.py — kept here so the audit
# can report REQUIRED items even when a gate did not appear in the rollup output).
# The always-present RRI gates (mirrors release_readiness.py — kept here so the audit can
# report REQUIRED items even when a gate did not appear in the rollup output). The EVALUATED
# set is 11 by default and 13 once the two additive latency gates (latency_s_per_beat /
# latency_coldopen) carry evidence; release_readiness counts gates_total dynamically and the
# audit reads gates_passed/gates_total straight from the rollup, so this list stays the
# always-required baseline (the conditional latency gates are intentionally not in it).
RRI_GATE_NAMES = [
"native_gate",
"arc_completed",
Expand Down
46 changes: 45 additions & 1 deletion qa/latency_rollup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,14 @@
import json
import os
import re
import sys
from pathlib import Path
from typing import Any, Optional
from typing import Any, Iterable, Optional

# The latency columns release_readiness.py:read_latency() reads from a <run>/latency.json
# sidecar (it only consumes s_per_beat + coldopen_s; turns_per_beat is carried for parity
# with scores_db.add_run and is harmless extra detail for the reader).
SIDECAR_COLUMNS = ("s_per_beat", "coldopen_s", "turns_per_beat")

# A beat transcript is "<run>.dm.<nanoseconds>.jsonl"; capture the nanos for beat ordering.
_DM_RE = re.compile(r"\.dm\.(\d+)\.jsonl$")
Expand Down Expand Up @@ -153,11 +159,42 @@ def rollup_run(transcript_dir: str | Path, run_id: str) -> dict[str, Any]:
return rollup_files(beat_files(transcript_dir, run_id))


def stamp_sidecars(rollup: dict[str, Any], run_dirs: Iterable[str | Path]) -> list[str]:
    """Write a run's latency ``rollup`` as a ``<run>/latency.json`` sidecar into each run dir,
    in the exact shape ``qa/release_readiness.py:read_latency()`` reads
    (``{s_per_beat, coldopen_s, turns_per_beat}``).

    This is the bridge that ACTIVATES the additive RRI latency gate on a real sweep: the runners
    derive the per-beat ledger into the TRANSCRIPT dir (``$T/$RUN.latency.json``), but
    release_readiness reads each PERSONA run dir's sidecar — so without this stamp the gate is a
    dormant evidence-gap SKIP. The rollup is a BUILD-level measurement (one deep duo play), so it
    is replicated into every persona run dir; the gate aggregates the MAX across personas, and
    identical values yield exactly that build figure.

    NULL columns are preserved verbatim — read_latency treats a null ``s_per_beat``/``coldopen_s``
    as ABSENT evidence (an evidence-gap skip), never a fabricated 0.0 that would silently pass.
    A run dir that does not already exist is SKIPPED (never created), so a stale/typo path can
    never fabricate latency evidence.

    Args:
        rollup: the latency rollup mapping; only the ``SIDECAR_COLUMNS`` keys are copied, and a
            missing key is written as ``null``.
        run_dirs: the persona run dirs to stamp. A single ``str``/path-like value is accepted
            and treated as ONE run dir rather than being iterated.

    Returns:
        The sidecar paths actually written, in input order.
    """
    # A bare string is itself iterable: walking it character by character would probe "." and
    # "/" — both existing directories — and stamp a sidecar into the cwd or filesystem root.
    if isinstance(run_dirs, (str, os.PathLike)):
        run_dirs = [run_dirs]

    # Every run dir receives the same bytes, so serialise once outside the loop.
    payload = json.dumps({k: rollup.get(k) for k in SIDECAR_COLUMNS}) + "\n"

    written: list[str] = []
    for raw in run_dirs:
        # Path("") normalises to ".", so a blank entry would silently stamp the cwd; treat it
        # like any other stale/typo path and skip it (the CLI already drops blank entries).
        if isinstance(raw, str) and not raw.strip():
            continue
        run_dir = Path(raw)
        if not run_dir.is_dir():
            continue
        target = run_dir / "latency.json"
        target.write_text(payload, encoding="utf-8")
        written.append(str(target))
    return written


def _main(argv: Optional[list[str]] = None) -> int:
ap = argparse.ArgumentParser(description="Derive the F13-4 latency ledger from DM beat transcripts.")
ap.add_argument("--dir", help="transcript directory ($T) — used with --run")
ap.add_argument("--run", help="run id ($RUN) — used with --dir")
ap.add_argument("--out", help="write the rollup JSON here (also printed to stdout)")
ap.add_argument("--stamp-into", default="", help="comma-separated PERSONA run dirs to stamp the "
"rollup into as <dir>/latency.json (the shape release_readiness.read_latency reads) "
"— this is what ACTIVATES the additive RRI latency gate on a real sweep")
ap.add_argument("files", nargs="*", help="explicit beat transcript paths (overrides --dir/--run)")
args = ap.parse_args(argv)

Expand All @@ -170,6 +207,13 @@ def _main(argv: Optional[list[str]] = None) -> int:
ap.error("pass either explicit transcript files, or --dir and --run")
return 2

if args.stamp_into:
dirs = [p.strip() for p in args.stamp_into.split(",") if p.strip()]
written = stamp_sidecars(result, dirs)
# stderr so --out / stdout stay pure JSON for piping; a no-op (no existing dirs) is silent.
if written:
print(f"latency: stamped sidecar into {len(written)} run dir(s)", file=sys.stderr)

blob = json.dumps(result, indent=2)
if args.out:
Path(args.out).write_text(blob + "\n", encoding="utf-8")
Expand Down
18 changes: 18 additions & 0 deletions qa/release_gate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,24 @@ else
[ -f "$STORY" ] && ok "story: $(python3 -c "import json;print(json.load(open('$STORY')).get('overall'))" 2>/dev/null)" || warn "no story score"
[ -f "$MECH" ] && ok "mech: $(python3 -c "import json;print(json.load(open('$MECH')).get('overall'))" 2>/dev/null)" || warn "no mech score"

# ── LATENCY SIDECAR — activate the additive RRI latency gate (Phase-3) ──────────
# run_duo.sh just derived the per-beat latency ledger from the duo's *.dm.<ns>.jsonl beats, but it
# wrote it to the TRANSCRIPT dir (qa/transcripts/${RUNID}-duo.latency.json). release_readiness.py's
# read_latency() reads each PERSONA run dir's latency.json sidecar instead, so on a real sweep the
# s_per_beat/coldopen_s gates were a dormant evidence-gap SKIP. Re-derive the SAME rollup and stamp it
# into every persona run dir in the shape the reader expects, so the figures are judged against
# qa/latency_baseline.json (s_per_beat>120 or coldopen_s>240 -> FAIL). Non-fatal: a stamp hiccup (or a
# duo that produced no derivable beat -> NULL columns) leaves the gate a documented SKIP, never a new
# false fail — additive, exactly today's behavior when latency evidence is absent.
if [ -n "$RUN_DIRS" ]; then
if python3 qa/latency_rollup.py --dir "$ROOT/qa/transcripts" --run "${RUNID}-duo" \
--stamp-into "$RUN_DIRS" >/dev/null 2>&1; then
ok "latency sidecar stamped into persona run dirs (RRI latency gate active)"
else
warn "latency sidecar stamp skipped — RRI latency gate stays an evidence-gap skip"
fi
fi

# ── BEHAVIORAL + AXE/UI-AUDIT ────────────────────────────────────────────────
echo "── BEHAVIORAL + UI-AUDIT ─────────────────────────────────────────"
BEHAV_PATH="$ROOT/qa/ui_playtest_runs/${RUNID}-behavioral.txt"
Expand Down
11 changes: 7 additions & 4 deletions qa/release_readiness.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,10 +263,13 @@ def _latency_float(value) -> Optional[float]:

def read_latency(run: Path, run_json: dict, score: dict) -> tuple[Optional[float], Optional[float], str]:
"""Read (s_per_beat, coldopen_s, source) from the same on-disk artifacts the rollup
already reads — a run's ``latency.json`` sidecar first (what qa/run_duo.sh writes via
qa/latency_rollup.py --out), then a ``latency`` block inside run.json, then top-level
latency fields on run.json / score.json. ABSENT everywhere -> (None, None, "none"),
which makes the latency gates a documented EVIDENCE-GAP/skip, never a new false fail."""
already reads — a run's ``latency.json`` sidecar first (the per-run ledger qa/release_gate.sh
stamps into each persona run dir via ``qa/latency_rollup.py --stamp-into``, derived from the
duo's per-beat transcripts; NOTE the runners themselves write the rollup to the TRANSCRIPT dir,
so without that stamp this gate stays a dormant evidence-gap skip), then a ``latency`` block
inside run.json, then top-level latency fields on run.json / score.json. ABSENT everywhere ->
(None, None, "none"), which makes the latency gates a documented EVIDENCE-GAP/skip, never a
new false fail."""
sidecar = read_json(run / "latency.json")
candidates: list[tuple[dict, str]] = [
(sidecar, str(run / "latency.json")),
Expand Down
62 changes: 62 additions & 0 deletions qa/test_latency_rollup.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,65 @@ def test_cli_writes_out_json(tmp_path, capsys):
assert rc == 0
data = json.loads(out.read_text())
assert data["coldopen_s"] == 240.0 and data["s_per_beat"] == 100.0


# ── stamp_sidecars: the bridge that activates the RRI latency gate ─────────────────
# The runners derive the ledger into the TRANSCRIPT dir; release_readiness reads each PERSONA
# run dir's <run>/latency.json sidecar. These cover the stamp that closes that gap.

def test_stamp_sidecars_writes_per_run_latency_json(tmp_path):
    """stamp_sidecars writes one latency.json per existing run dir with the rollup's columns."""
    run_id = "duo-stamp"
    _write_beat(tmp_path, run_id, 1000, api_ms=240000, num_turns=18)  # cold open 240s
    _write_beat(tmp_path, run_id, 2000, api_ms=100000, num_turns=4)   # routine 100s
    rollup = latency_rollup.rollup_run(tmp_path, run_id)

    persona_dirs = [tmp_path / name for name in ("gate-newbie", "gate-veteran")]
    for persona_dir in persona_dirs:
        persona_dir.mkdir()

    stamped = latency_rollup.stamp_sidecars(rollup, persona_dirs)
    assert len(stamped) == 2

    for persona_dir in persona_dirs:
        payload = json.loads((persona_dir / "latency.json").read_text())
        # exactly the columns release_readiness.read_latency() consumes
        assert payload["s_per_beat"] == 100.0
        assert payload["coldopen_s"] == 240.0
        assert payload["turns_per_beat"] == 4.0


def test_stamp_sidecars_skips_nonexistent_dirs_never_creating_them(tmp_path):
    """Only pre-existing run dirs are stamped; a missing one is skipped and never created,
    so a stale/typo run-dir path cannot fabricate latency evidence."""
    run_id = "duo-skip"
    _write_beat(tmp_path, run_id, 1000, api_ms=200000, num_turns=10)
    _write_beat(tmp_path, run_id, 2000, api_ms=90000, num_turns=4)
    rollup = latency_rollup.rollup_run(tmp_path, run_id)

    present = tmp_path / "gate-real"
    present.mkdir()
    absent = tmp_path / "gate-missing"  # deliberately never created

    stamped = latency_rollup.stamp_sidecars(rollup, [present, absent])

    assert stamped == [str(present / "latency.json")]
    assert not absent.exists()


def test_stamp_sidecars_preserves_null_columns_not_zero(tmp_path):
    """A cold-open-only run has NULL routine stats; the sidecar must keep the null so
    read_latency treats it as ABSENT (a skip), never a fabricated 0.0 that silently passes."""
    run_id = "duo-null"
    _write_beat(tmp_path, run_id, 1000, api_ms=180000, num_turns=12)  # cold open only
    rollup = latency_rollup.rollup_run(tmp_path, run_id)

    run_dir = tmp_path / "gate-x"
    run_dir.mkdir()
    latency_rollup.stamp_sidecars(rollup, [run_dir])

    payload = json.loads((run_dir / "latency.json").read_text())
    assert payload["s_per_beat"] is None
    assert payload["coldopen_s"] == 180.0


def test_cli_stamp_into_writes_sidecars(tmp_path):
    """Drive the exact CLI path qa/release_gate.sh uses: --dir/--run + --stamp-into "dir1,dir2"."""
    run_id = "duo-clistamp"
    _write_beat(tmp_path, run_id, 1000, api_ms=240000, num_turns=18)
    _write_beat(tmp_path, run_id, 2000, api_ms=130000, num_turns=5)

    run_dirs = [tmp_path / "gate-a", tmp_path / "gate-b"]
    for run_dir in run_dirs:
        run_dir.mkdir()

    argv = [
        "--dir", str(tmp_path),
        "--run", run_id,
        "--stamp-into", ",".join(str(run_dir) for run_dir in run_dirs),
    ]
    exit_code = latency_rollup._main(argv)
    assert exit_code == 0

    for run_dir in run_dirs:
        payload = json.loads((run_dir / "latency.json").read_text())
        assert payload["coldopen_s"] == 240.0
        assert payload["s_per_beat"] == 130.0
14 changes: 14 additions & 0 deletions qa/test_release_gate_static.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,20 @@ def test_release_gate_uses_real_duo_prompt_file(self):
self.assertIn('DUO_PROMPT="$ROOT/qa/play_player_duo.txt"', source)
self.assertNotIn('qa/run_duo.sh "${RUNID}-duo" baldurs-gate veteran', source)

def test_release_gate_stamps_latency_sidecar_into_persona_run_dirs(self):
    """Lock in the wiring that activates the RRI latency gate.

    The gate was DORMANT: run_duo.sh derives the per-beat ledger into the TRANSCRIPT dir, but
    release_readiness.read_latency() reads each PERSONA run dir's latency.json sidecar. So
    release_gate.sh must stamp the duo rollup into the run dirs via
    ``latency_rollup.py --stamp-into`` BEFORE the RRI rollup reads them — otherwise the gate
    silently falls back to a skip.
    """
    source = (ROOT / "qa" / "release_gate.sh").read_text(encoding="utf-8")
    stamp_call = '--stamp-into "$RUN_DIRS"'
    rri_call = "python3 qa/release_readiness.py"

    self.assertIn("qa/latency_rollup.py", source)
    self.assertIn('--run "${RUNID}-duo"', source)
    self.assertIn(stamp_call, source)
    # Assert presence first so a missing RRI invocation is a readable failure, not a
    # ValueError raised by str.index below.
    self.assertIn(rri_call, source)
    # The stamp must run BEFORE release_readiness reads the per-run sidecars. Anchor on the
    # full invocation, not the bare flag, so an earlier comment or help string that merely
    # mentions --stamp-into cannot satisfy the ordering check.
    self.assertLess(source.index(stamp_call), source.index(rri_call))

def test_ui_playtest_persists_final_session_surface_before_teardown(self):
source = (ROOT / "qa" / "ui_playtest_app.sh").read_text(encoding="utf-8")

Expand Down
71 changes: 71 additions & 0 deletions qa/test_release_readiness.py
Original file line number Diff line number Diff line change
Expand Up @@ -2634,6 +2634,77 @@ def test_latency_absent_is_skip_and_byte_identical_release(self):
self.assertIsNone(payload["signals"]["latency_s_per_beat"])
self.assertIsNone(payload["signals"]["latency_coldopen_s"])

def _write_duo_beats(self, transcript_dir: Path, run: str, coldopen_ms: int, routine_ms: int) -> None:
    """Write three minimal stream-json duo beat transcripts into ``transcript_dir``.

    Each file is named ``<run>.dm.<nanos>.jsonl`` and holds a single ``result`` event: a cold
    open (nanos=1000, ``coldopen_ms``) followed by two routine beats (nanos=2000 and 3000,
    both ``routine_ms``). The directory is created if it does not exist.
    """
    transcript_dir.mkdir(parents=True, exist_ok=True)
    beats = [
        (1000, coldopen_ms, 18),  # cold open
        (2000, routine_ms, 4),    # routine beat
        (3000, routine_ms, 5),    # routine beat
    ]
    for nanos, duration_ms, num_turns in beats:
        event = {
            "type": "result",
            "subtype": "success",
            "is_error": False,
            "api_error_status": None,
            "duration_api_ms": duration_ms,
            "num_turns": num_turns,
            "result": "prose",
        }
        beat_path = transcript_dir / f"{run}.dm.{nanos}.jsonl"
        beat_path.write_text(json.dumps(event) + "\n", encoding="utf-8")

def test_latency_rollup_stamp_sidecars_activates_the_gate_end_to_end(self):
    """END-TO-END SEAM (the wiring PR #954 left dormant).

    The runners derive the latency ledger into the TRANSCRIPT dir, but release_readiness reads
    each PERSONA run dir's latency.json. This drives the REAL production path
    qa/release_gate.sh now uses — latency_rollup.rollup_run() over real duo beat transcripts,
    then latency_rollup.stamp_sidecars() into the persona run dirs — and proves the gate then
    FAILS over budget and PASSES under budget (no hand-written sidecar dict).
    """
    sys.path.insert(0, str(ROOT / "qa"))
    import latency_rollup

    def run_gate(coldopen_ms: int, routine_ms: int):
        # Build a scratch sweep, stamp the duo rollup into it, and return run_rri's result.
        with tempfile.TemporaryDirectory() as scratch:
            tmp = Path(scratch)
            run_id = "gate-duo"
            transcripts = tmp / "transcripts"
            self._write_duo_beats(transcripts, run_id, coldopen_ms, routine_ms)
            rollup = latency_rollup.rollup_run(transcripts, run_id)
            # NO hand-written latency: the persona run dirs start without any sidecar.
            run_dirs = self._five_clean_runs_with_latency(tmp, None)

            # THE WIRING: stamp the build-level duo rollup into every persona run dir.
            stamped = latency_rollup.stamp_sidecars(rollup, run_dirs)
            self.assertEqual(len(stamped), 5)

            # The sidecar landed in the exact shape release_readiness.read_latency() reads.
            first_sidecar = json.loads((run_dirs[0] / "latency.json").read_text())
            self.assertIn("s_per_beat", first_sidecar)
            self.assertIn("coldopen_s", first_sidecar)

            story, mech, behavioral, audit, palette = self.write_release_inputs(tmp)
            cli_args = [
                "--runs", ",".join(str(r) for r in run_dirs),
                "--expected-personas", "newbie,veteran,adversarial,narrative,optimizer",
                "--story", str(story),
                "--mech", str(mech),
                "--behavioral", "GREEN",
                "--behavioral-path", str(behavioral),
                "--ui-audit", "PASS",
                "--ui-audit-log", str(audit),
                "--palette-live", "true",
                "--palette-source", str(palette),
                "--build-sha", "deadbee",
            ]
            return self.run_rri(tmp, *cli_args)

    # OVER budget: cold open 500s (> 240), routine 300s/beat (> 120) -> FAIL, gate evaluated.
    rc, _text, payload = run_gate(coldopen_ms=500_000, routine_ms=300_000)
    self.assertEqual(rc, 1)
    self.assertFalse(payload["release_ready"])
    failed = payload["failed_gates"]
    self.assertIn("latency_s_per_beat", failed)
    self.assertIn("latency_coldopen", failed)
    self.assertEqual(payload["signals"]["latency_s_per_beat"], 300.0)
    self.assertEqual(payload["signals"]["latency_coldopen_s"], 500.0)
    # The gate is now ACTIVE, not a dormant evidence-gap skip.
    self.assertEqual(payload["skipped_gates"], [])
    self.assertEqual(payload["gates_total"], 13)

    # UNDER budget: cold open 150s (< 240), routine 80s/beat (< 120) -> PASS.
    rc, _text, payload = run_gate(coldopen_ms=150_000, routine_ms=80_000)
    self.assertEqual(rc, 0)
    self.assertTrue(payload["release_ready"])
    self.assertEqual(payload["evidence_gaps"], [])
    failed = payload["failed_gates"]
    self.assertNotIn("latency_s_per_beat", failed)
    self.assertNotIn("latency_coldopen", failed)
    self.assertEqual(payload["signals"]["latency_s_per_beat"], 80.0)
    self.assertEqual(payload["signals"]["latency_coldopen_s"], 150.0)

def test_deterministic_only_marks_llm_gates_skipped_not_failed(self):
with tempfile.TemporaryDirectory() as td:
tmp = Path(td)
Expand Down
Loading