diff --git a/qa/assert_behavioral.py b/qa/assert_behavioral.py index db8739b3..cbf88397 100644 --- a/qa/assert_behavioral.py +++ b/qa/assert_behavioral.py @@ -326,13 +326,75 @@ def chk(name: str, ok: bool, detail: str = "", fatal: bool = True) -> None: # (state.get("combat") truthy) — a non-combat session has no block and these skip. combat = state.get("combat") or {} if combat: - # FATAL: combat left active at end-of-run is a state-integrity failure — a clean run - # ends_combat. Only for a substantial session (a short smoke test cut off mid-fight - # is not a real defect). - if len(mv) >= MIN_BEATS: - chk("combat_not_left_active", not combat.get("active"), - f"combat.active={combat.get('active')!r} at end-of-run — combat left active " - f"(state-integrity fail: a finished session should end_combat)") + # combat left active at end-of-run. Naively this is a state-integrity failure (a clean + # run end_combats), BUT the dominant cause in QA is a HARNESS-LENGTH ARTIFACT, not a DM + # bug: a short emergent duo that ENTERS combat near its beat budget and TRUNCATES + # mid-fight legitimately never reaches end_combat — the fight was cut off, not abandoned. + # (Proven: qa/transcripts/claude-1v1-2 — an opus duo where start_combat fired in the last + # handful of tool calls and the final DM line is literally cut off mid-sentence; the old + # bare FATAL RED-capped all three lenses on a run that did nothing wrong.) So make the + # SEVERITY beat-scoped, exactly like party_traveled: + # + # • A SHORT facade run (< COMBAT_ABANDON_MIN_BEATS) with combat still active is treated + # as a TRUNCATED mid-combat scene → WARN. The standard 6-8 beat emergent combat duo + # lives here: it can't both run a full fight AND wrap it inside its budget. + # • A LONG run (>= COMBAT_ABANDON_MIN_BEATS) that ALSO truncated mid-fight (start_combat + # fired in the last beat or two of the tool stream) is STILL a truncation, not an + # abandon → WARN. 
The length alone doesn't make a cut-off fight a defect. + # • Only a LONG run where combat started EARLY (room to resolve), end_combat never fired, + # and the fight is STILL active at the snapshot is a genuine ABANDON — a real + # state-integrity bug that corrupts the next load → FATAL. + # + # This is the same conservative, beat-scoped pattern as the party_traveled fix: it keeps + # the FATAL path for the real defect (a long run that left a fight hanging with room to + # resolve) and stops the model-agnostic false-cap on short/truncated emergent runs. + if len(mv) >= MIN_BEATS and combat.get("active"): + # Strictly above MIN_BEATS(6): a real multi-encounter arc, not a short single-fight + # vignette. A run shorter than this that's still mid-fight is presumed truncated. + COMBAT_ABANDON_MIN_BEATS = 10 + # WHERE did the (last) start_combat fire in the ordered tool stream? If it's in the + # final stretch of calls the fight only just began before the run ended ⇒ truncation, + # never an abandon. Build the ordered short-name list once (mirrors the round1 scan's + # extraction below); cheap and self-contained. 
+ _ordered_short: list[str] = [] + for _ev in events: + if _ev.get("type") != "assistant": + continue + for _b in (_ev.get("message", {}) or {}).get("content") or []: + if isinstance(_b, dict) and _b.get("type") == "tool_use": + _ordered_short.append((_b.get("name") or "").split("__")[-1]) + _total_calls = len(_ordered_short) + _last_sc = max((i for i, c in enumerate(_ordered_short) if c == "start_combat"), + default=-1) + # "Started late" = the last start_combat landed in the final ~20% of the tool stream + # (truncation: the fight only just began before the run ended), OR there is NO + # start_combat in the stream at all — a resume-into-combat session whose fight carried + # over from a prior session, which we CANNOT prove started early this run, so it is a + # truncation/resume, never an abandon (this matches the rationale comment above; the + # earlier form FATAL'd a resumed-into-combat run, contradicting it). + started_late = ( + _last_sc < 0 + or (_last_sc >= 0 and _total_calls > 0 + and _last_sc >= int(_total_calls * 0.8)) + ) + # A genuine ABANDON: a SUBSTANTIAL run, combat started EARLY (room to resolve), the DM + # never end_combat'd, yet the fight is still active. Everything else (short run, OR a + # late/truncated start, OR no start_combat in the stream) is a truncation ⇒ WARN. 
+ _abandoned = ( + len(mv) >= COMBAT_ABANDON_MIN_BEATS + and not started_late + and tools.get("end_combat", 0) == 0 + ) + chk("combat_not_left_active", False, + f"combat.active=True at end-of-run — combat left active " + f"(beats={len(mv)}, start_combat@{_last_sc}/{_total_calls} calls, " + f"end_combat={tools.get('end_combat', 0)}) — " + + ("FATAL: substantial run, fight started early with room to resolve and was " + "never end_combat'd (state-integrity fail: a finished session should end_combat)" + if _abandoned else + "WARN: run plausibly TRUNCATED mid-combat (short run or combat started near " + "the beat budget) — a harness-length artifact, not an abandoned fight"), + fatal=_abandoned) # WARN: if a fight started, the action economy should have engaged at some point — # an action was consumed / an attack was made. The final snapshot does not reliably # expose mid-fight action use (it may have been reset on end_combat), so probe the @@ -614,12 +676,24 @@ def _has_spells(c: dict) -> bool: SINGLE_SCENE_MIN_BEATS = 8 # strictly above MIN_BEATS(6): a real arc, not a smoke test in_place_progression = (visited >= 1 and clock_advanced and arc_resolved and session_beats >= SINGLE_SCENE_MIN_BEATS) + # SEVERITY IS BEAT-SCOPED (false-cap fix): "the DM never left the opening scene" is only a + # STUCK-DM failure on a SUBSTANTIAL run. A SHORT run (< SINGLE_SCENE_MIN_BEATS) in one + # location is a legitimate single-scene vignette — the standard 6-beat social duo lives here + # — NOT a frozen stall, so below that length this is a WARN, never a lens-capping RED. It was + # FATAL-capping legitimate short single-scene play on BOTH models (Claude opus AND GLM — a + # model-agnostic false-cap that deflated the duo scores). 
At/above SINGLE_SCENE_MIN_BEATS the + # strict exception is UNCHANGED (travel >=2, OR a clock-advancing arc-resolving in-place + # drama) — a substantial run that never moves AND never progresses is still a FATAL stuck DM, + # and the anti-gaming AND-logic (clock-only/beats-only deliberately excluded) is preserved. + _pt_fatal = session_beats >= SINGLE_SCENE_MIN_BEATS chk("party_traveled", visited >= 2 or in_place_progression, f"visited {visited}/{len(locs)} location(s) after {session_beats} beats — the party never " f"left the opening scene (travel_to / add_location make_current=True); " f"in-place-progression exception NOT met " f"(clock_advanced={clock_advanced} arc_resolved={arc_resolved} " - f"beats>={SINGLE_SCENE_MIN_BEATS}? {session_beats >= SINGLE_SCENE_MIN_BEATS})") + f"beats>={SINGLE_SCENE_MIN_BEATS}? {session_beats >= SINGLE_SCENE_MIN_BEATS}) — " + f"{'FATAL (substantial run, stuck)' if _pt_fatal else 'WARN (short single-scene vignette)'}", + fatal=_pt_fatal) # WARN (the metric is softer): did the world gain/engage faces, or just sit in the seed? npcs_met = sum(1 for c in chars.values() if isinstance(c, dict) and c.get("kind") == "npc" and c.get("met")) diff --git a/qa/gate_corpus/builder.py b/qa/gate_corpus/builder.py index e5d06bfc..f82cabf0 100644 --- a/qa/gate_corpus/builder.py +++ b/qa/gate_corpus/builder.py @@ -224,30 +224,56 @@ def case_combat_resolved(): def case_combat_not_left_active(): - # chk: facade, mv>=MIN_BEATS, state.combat.active=True at end-of-run. World floors must pass - # (mv>=6 activates them) -> the baseline state already has day=2 + 2 visited locations. + # chk: a GENUINE ABANDON (the preserved FATAL path after the truncation-vs-abandon split). 
+ # combat_not_left_active is FATAL only when ALL of: + # • facade mv >= COMBAT_ABANDON_MIN_BEATS(10) — a SUBSTANTIAL run (room to resolve), AND + # • start_combat fired EARLY in the tool stream (NOT in the final ~20%) — the fight had time + # to be wrapped, so it wasn't truncated, AND + # • end_combat was never called, AND + # • state.combat.active is still True at the snapshot. + # A SHORT run (< 10 beats) OR a LATE start_combat is treated as a harness-length truncation ⇒ + # WARN (the qa/transcripts/claude-1v1-2 opus duo: start_combat@36/42 calls + only 7 beats), so + # this fixture must model the real-defect shape, not the truncation shape. state = _clean_player_state() state["combat"] = {"active": True, "round": 3} - moves = [_move("say") for _ in range(6)] - # 6 player beats (>=MIN_BEATS) interleaved with DM replies carrying quoted dialogue so - # both_sides_acted + dm_voices_characters both pass and combat_not_left_active is the SOLE fail. + moves = [_move("say") for _ in range(12)] # >= COMBAT_ABANDON_MIN_BEATS(10): substantial run + # 12 player beats (>= COMBAT_ABANDON_MIN_BEATS) interleaved with DM replies carrying quoted + # dialogue so both_sides_acted + dm_voices_characters both pass. No companion in state -> + # structural_completeness stays inert even at >=10 beats, so combat_not_left_active is the SOLE + # fatal fail. chat = [] - for i in range(6): + for i in range(12): chat.append(_player_chat_row(f"[say] beat {i}")) chat.append(_dm_chat_row('"The fight rages on," she calls.')) - return _roll(), state, chat, moves + # start_combat fires EARLY (first of the tool calls) and an attack resolves it (so combat_resolved + # passes), but end_combat is NEVER called and combat stays active -> a fight ABANDONED with room + # to resolve. The clean trailing `_roll()` keeps start_combat well inside the first 80% of the + # stream (last_sc index 0 of 3 tool calls; late threshold int(3 * 0.8) = 2), so `started_late` is False -> the abandon path, FATAL. 
+ events = [ + _assistant_tool_use("t_sc", "mcp__engine__start_combat", {}), + _user_tool_result("t_sc", json.dumps({"ok": True})), + _assistant_tool_use("t_atk", "mcp__engine__attack", {"target": "g1"}), + _user_tool_result("t_atk", json.dumps({"hit": True, "damage": 6})), + ] + _roll() + return events, state, chat, moves def case_party_traveled(): - # chk: session_beats>=MIN_BEATS, day>1 (world_advanced_time passes) but visited < 2 (only the - # opening scene). Isolates party_traveled. Uses chat beats (player rows) for session_beats. + # chk: session_beats >= SINGLE_SCENE_MIN_BEATS(8), day>1 (world_advanced_time passes) but + # visited < 2 (only the opening scene). Isolates party_traveled. Uses chat beats (player rows) + # for session_beats. + # SEVERITY IS BEAT-SCOPED (the 2026-06-19 false-cap fix): party_traveled is FATAL only at + # >= SINGLE_SCENE_MIN_BEATS(8) — below that a single-scene vignette is a legitimate WARN, not a + # frozen stall. So the fixture must carry 8 player beats to land on the PRESERVED FATAL path + # (at 6 it would now correctly WARN and this case would no longer flip RED). Keep the builder in + # lock-step with the committed 8-beat fixture so a regenerate doesn't silently revert the fix. state = _clean_player_state() state["day"] = 2 # world_advanced_time passes state["locations"] = {"loc_start": {"name": "Tavern", "visited": True}} # visited == 1 state["current_location_id"] = "loc_start" state["characters"]["pc1"]["location_id"] = "loc_start" # dm rows must carry dialogue so dm_voices passes (>=3 dm rows present). 
- chat = ([_player_chat_row(f"[say] beat {i}") for i in range(6)] + + chat = ([_player_chat_row(f"[say] beat {i}") for i in range(8)] + [_dm_chat_row('"We press on," she says.') for _ in range(3)]) return _roll(), state, chat, None diff --git a/qa/gate_corpus/cases/combat_not_left_active/chat.jsonl b/qa/gate_corpus/cases/combat_not_left_active/chat.jsonl index c075e2a9..18310533 100644 --- a/qa/gate_corpus/cases/combat_not_left_active/chat.jsonl +++ b/qa/gate_corpus/cases/combat_not_left_active/chat.jsonl @@ -10,3 +10,15 @@ {"role": "dm", "text": "\"The fight rages on,\" she calls."} {"role": "player", "text": "[say] beat 5"} {"role": "dm", "text": "\"The fight rages on,\" she calls."} +{"role": "player", "text": "[say] beat 6"} +{"role": "dm", "text": "\"The fight rages on,\" she calls."} +{"role": "player", "text": "[say] beat 7"} +{"role": "dm", "text": "\"The fight rages on,\" she calls."} +{"role": "player", "text": "[say] beat 8"} +{"role": "dm", "text": "\"The fight rages on,\" she calls."} +{"role": "player", "text": "[say] beat 9"} +{"role": "dm", "text": "\"The fight rages on,\" she calls."} +{"role": "player", "text": "[say] beat 10"} +{"role": "dm", "text": "\"The fight rages on,\" she calls."} +{"role": "player", "text": "[say] beat 11"} +{"role": "dm", "text": "\"The fight rages on,\" she calls."} diff --git a/qa/gate_corpus/cases/combat_not_left_active/moves.jsonl b/qa/gate_corpus/cases/combat_not_left_active/moves.jsonl index fc04a054..fcce91cb 100644 --- a/qa/gate_corpus/cases/combat_not_left_active/moves.jsonl +++ b/qa/gate_corpus/cases/combat_not_left_active/moves.jsonl @@ -4,3 +4,9 @@ {"role": "player", "kind": "say", "text": "[say] does a thing"} {"role": "player", "kind": "say", "text": "[say] does a thing"} {"role": "player", "kind": "say", "text": "[say] does a thing"} +{"role": "player", "kind": "say", "text": "[say] does a thing"} +{"role": "player", "kind": "say", "text": "[say] does a thing"} +{"role": "player", "kind": "say", "text": 
"[say] does a thing"} +{"role": "player", "kind": "say", "text": "[say] does a thing"} +{"role": "player", "kind": "say", "text": "[say] does a thing"} +{"role": "player", "kind": "say", "text": "[say] does a thing"} diff --git a/qa/gate_corpus/cases/combat_not_left_active/run.jsonl b/qa/gate_corpus/cases/combat_not_left_active/run.jsonl index b56367f9..f0d836a3 100644 --- a/qa/gate_corpus/cases/combat_not_left_active/run.jsonl +++ b/qa/gate_corpus/cases/combat_not_left_active/run.jsonl @@ -1,2 +1,6 @@ +{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_sc", "name": "mcp__engine__start_combat", "input": {}}]}} +{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_sc", "content": [{"type": "text", "text": "{\"ok\": true}"}], "is_error": false}]}} +{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_atk", "name": "mcp__engine__attack", "input": {"target": "g1"}}]}} +{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_atk", "content": [{"type": "text", "text": "{\"hit\": true, \"damage\": 6}"}], "is_error": false}]}} {"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_roll", "name": "mcp__engine__roll", "input": {"sides": 20}}]}} {"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_roll", "content": [{"type": "text", "text": "{\"total\": 14}"}], "is_error": false}]}} diff --git a/qa/gate_corpus/cases/party_traveled/chat.jsonl b/qa/gate_corpus/cases/party_traveled/chat.jsonl index 5a3fde09..ebfb5b5b 100644 --- a/qa/gate_corpus/cases/party_traveled/chat.jsonl +++ b/qa/gate_corpus/cases/party_traveled/chat.jsonl @@ -4,6 +4,8 @@ {"role": "player", "text": "[say] beat 3"} {"role": "player", "text": "[say] beat 4"} {"role": "player", "text": "[say] beat 5"} +{"role": "player", "text": "[say] beat 6"} +{"role": "player", "text": "[say] beat 7"} {"role": "dm", "text": "\"We press on,\" she says."} {"role": "dm", 
"text": "\"We press on,\" she says."} {"role": "dm", "text": "\"We press on,\" she says."} diff --git a/qa/test_assert_behavioral.py b/qa/test_assert_behavioral.py index 76d229fb..7454756c 100644 --- a/qa/test_assert_behavioral.py +++ b/qa/test_assert_behavioral.py @@ -862,12 +862,30 @@ def test_party_traveled_red_despite_status_blind_quest_tool_count(tmp_path): assert rc == 1, out -def test_party_traveled_still_red_when_arc_resolved_but_too_few_beats(tmp_path): - # Guard against broadening to beats-only / arc-only: clock advanced + quest completed but - # only 7 beats (< SINGLE_SCENE_MIN_BEATS 8) → exception not met → party_traveled RED. +def test_party_traveled_warns_not_red_on_short_single_scene_vignette(tmp_path): + # SEVERITY IS BEAT-SCOPED (2026-06-19 false-cap fix): a SHORT single-scene run + # (< SINGLE_SCENE_MIN_BEATS 8) that stayed in one location is a legitimate vignette — the + # standard 6-8 beat emergent social/combat duo — NOT a frozen stall. Below 8 beats + # party_traveled is a WARN, never a lens-capping RED (it was over-capping legitimate short + # play on BOTH Claude and GLM). Here: visited=1, day advanced + quest completed but only 7 + # beats (< 8) → the in-place exception's beat-floor isn't met (so it doesn't PASS via the + # exception) AND the FATAL beat-floor isn't met (so it's a WARN, not RED). The run stays GREEN. events = _dm_text_turns(7) state = _single_scene_state(day=2, quest_completed=True, visited_count=1) rc, out = _run_gate(tmp_path, events, state) + assert "[WARN] party_traveled" in out, out + assert "[FAIL] party_traveled" not in out, out + assert rc == 0, out # a short single-scene vignette is not a fatal frozen stall + + +def test_party_traveled_still_red_on_substantial_frozen_run_too_few_visited(tmp_path): + # The PRESERVED FATAL path: at/above SINGLE_SCENE_MIN_BEATS(8) a run that stayed in ONE + # location AND did not progress in place (no completed quest → arc_resolved False) is a real + # stuck-DM frozen stall → RED. 
(Guards against the beat-scoping weakening the substantial-run + # FATAL: 8 beats, visited=1, clock advanced but arc unresolved → the in-place AND fails → RED.) + events = _dm_text_turns(8) + state = _single_scene_state(day=2, quest_completed=False, visited_count=1) + rc, out = _run_gate(tmp_path, events, state) assert "[FAIL] party_traveled" in out, out assert rc == 1, out