electricsheephq · 100yenadmin · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/qa/assert_behavioral.py b/qa/assert_behavioral.py
@@ -326,13 +326,75 @@ def chk(name: str, ok: bool, detail: str = "", fatal: bool = True) -> None:
         # (state.get("combat") truthy) — a non-combat session has no block and these skip.
         combat = state.get("combat") or {}
         if combat:
-            # FATAL: combat left active at end-of-run is a state-integrity failure — a clean run
-            # ends_combat. Only for a substantial session (a short smoke test cut off mid-fight
-            # is not a real defect).
-            if len(mv) >= MIN_BEATS:
-                chk("combat_not_left_active", not combat.get("active"),
-                    f"combat.active={combat.get('active')!r} at end-of-run — combat left active "
-                    f"(state-integrity fail: a finished session should end_combat)")
+            # combat left active at end-of-run. Naively this is a state-integrity failure (a clean
+            # run end_combats), BUT the dominant cause in QA is a HARNESS-LENGTH ARTIFACT, not a DM
+            # bug: a short emergent duo that ENTERS combat near its beat budget and TRUNCATES
+            # mid-fight legitimately never reaches end_combat — the fight was cut off, not abandoned.
+            # (Proven: qa/transcripts/claude-1v1-2 — an opus duo where start_combat fired in the last
+            # handful of tool calls and the final DM line is literally cut off mid-sentence; the old
+            # bare FATAL RED-capped all three lenses on a run that did nothing wrong.) So make the
+            # SEVERITY beat-scoped, exactly like party_traveled:
+            #
+            #   • A SHORT facade run (< COMBAT_ABANDON_MIN_BEATS) with combat still active is treated
+            #     as a TRUNCATED mid-combat scene → WARN. The standard 6-8 beat emergent combat duo
+            #     lives here: it can't both run a full fight AND wrap it inside its budget.
+            #   • A LONG run (>= COMBAT_ABANDON_MIN_BEATS) that ALSO truncated mid-fight (the last
+            #     start_combat fired in the final ~20% of the tool calls) is STILL a truncation, not
+            #     an abandon → WARN. The length alone doesn't make a cut-off fight a defect.
+            #   • Only a LONG run where combat started EARLY (room to resolve), end_combat never fired,
+            #     and the fight is STILL active at the snapshot is a genuine ABANDON — a real
+            #     state-integrity bug that corrupts the next load → FATAL.
+            #
+            # This is the same conservative, beat-scoped pattern as the party_traveled fix: it keeps
+            # the FATAL path for the real defect (a long run that left a fight hanging with room to
+            # resolve) and stops the model-agnostic false-cap on short/truncated emergent runs.
+            if len(mv) >= MIN_BEATS and combat.get("active"):
+                # Strictly above MIN_BEATS(6): a real multi-encounter arc, not a short single-fight
+                # vignette. A run shorter than this that's still mid-fight is presumed truncated.
+                COMBAT_ABANDON_MIN_BEATS = 10
+                # WHERE did the (last) start_combat fire in the ordered tool stream? If it's in the
+                # final stretch of calls the fight only just began before the run ended ⇒ truncation,
+                # never an abandon. Build the ordered short-name list once (mirrors the round1 scan's
+                # extraction below); cheap and self-contained.
+                _ordered_short: list[str] = []
+                for _ev in events:
+                    if _ev.get("type") != "assistant":
+                        continue
+                    for _b in (_ev.get("message", {}) or {}).get("content") or []:
+                        if isinstance(_b, dict) and _b.get("type") == "tool_use":
+                            _ordered_short.append((_b.get("name") or "").split("__")[-1])
+                _total_calls = len(_ordered_short)
+                _last_sc = max((i for i, c in enumerate(_ordered_short) if c == "start_combat"),
+                               default=-1)
+                # "Started late" = the last start_combat landed in the final ~20% of the tool stream
+                # (truncation: the fight only just began before the run ended), OR there is NO
+                # start_combat in the stream at all — a resume-into-combat session whose fight carried
+                # over from a prior session, which we CANNOT prove started early this run, so it is a
+                # truncation/resume, never an abandon (this matches the rationale comment above; the
+                # earlier form FATAL'd a resumed-into-combat run, contradicting it).
+                started_late = (
+                    _last_sc < 0
+                    or (_last_sc >= 0 and _total_calls > 0
+                        and _last_sc >= int(_total_calls * 0.8))
+                )
+                # A genuine ABANDON: a SUBSTANTIAL run, combat started EARLY (room to resolve), the DM
+                # never end_combat'd, yet the fight is still active. Everything else (short run, late
+                # start, no start_combat in the stream, OR any recorded end_combat) is only a WARN.
+                _abandoned = (
+                    len(mv) >= COMBAT_ABANDON_MIN_BEATS
+                    and not started_late
+                    and tools.get("end_combat", 0) == 0
+                )
+                chk("combat_not_left_active", False,
+                    f"combat.active=True at end-of-run — combat left active "
+                    f"(beats={len(mv)}, start_combat@{_last_sc}/{_total_calls} calls, "
+                    f"end_combat={tools.get('end_combat', 0)}) — "
+                    + ("FATAL: substantial run, fight started early with room to resolve and was "
+                       "never end_combat'd (state-integrity fail: a finished session should end_combat)"
+                       if _abandoned else
+                       "WARN: run plausibly TRUNCATED mid-combat (short run or combat started near "
+                       "the beat budget) — a harness-length artifact, not an abandoned fight"),
+                    fatal=_abandoned)
             # WARN: if a fight started, the action economy should have engaged at some point —
             # an action was consumed / an attack was made. The final snapshot does not reliably
             # expose mid-fight action use (it may have been reset on end_combat), so probe the
@@ -614,12 +676,24 @@ def _has_spells(c: dict) -> bool:
         SINGLE_SCENE_MIN_BEATS = 8  # strictly above MIN_BEATS(6): a real arc, not a smoke test
         in_place_progression = (visited >= 1 and clock_advanced and arc_resolved
                                 and session_beats >= SINGLE_SCENE_MIN_BEATS)
+        # SEVERITY IS BEAT-SCOPED (false-cap fix): "the DM never left the opening scene" is only a
+        # STUCK-DM failure on a SUBSTANTIAL run. A SHORT run (< SINGLE_SCENE_MIN_BEATS) in one
+        # location is a legitimate single-scene vignette — the standard 6-beat social duo lives here
+        # — NOT a frozen stall, so below that length this is a WARN, never a lens-capping RED. It was
+        # FATAL-capping legitimate short single-scene play on BOTH models (Claude opus AND GLM — a
+        # model-agnostic false-cap that deflated the duo scores). At/above SINGLE_SCENE_MIN_BEATS the
+        # strict exception is UNCHANGED (travel >=2, OR a clock-advancing arc-resolving in-place
+        # drama) — a substantial run that never moves AND never progresses is still a FATAL stuck DM,
+        # and the anti-gaming AND-logic (clock-only/beats-only deliberately excluded) is preserved.
+        _pt_fatal = session_beats >= SINGLE_SCENE_MIN_BEATS
         chk("party_traveled", visited >= 2 or in_place_progression,
             f"visited {visited}/{len(locs)} location(s) after {session_beats} beats — the party never "
             f"left the opening scene (travel_to / add_location make_current=True); "
             f"in-place-progression exception NOT met "
             f"(clock_advanced={clock_advanced} arc_resolved={arc_resolved} "
-            f"beats>={SINGLE_SCENE_MIN_BEATS}? {session_beats >= SINGLE_SCENE_MIN_BEATS})")
+            f"beats>={SINGLE_SCENE_MIN_BEATS}? {session_beats >= SINGLE_SCENE_MIN_BEATS}) — "
+            f"{'FATAL (substantial run, stuck)' if _pt_fatal else 'WARN (short single-scene vignette)'}",
+            fatal=_pt_fatal)
         # WARN (the metric is softer): did the world gain/engage faces, or just sit in the seed?
         npcs_met = sum(1 for c in chars.values()
                        if isinstance(c, dict) and c.get("kind") == "npc" and c.get("met"))

diff --git a/qa/gate_corpus/builder.py b/qa/gate_corpus/builder.py
@@ -224,30 +224,56 @@ def case_combat_resolved():
 
 
 def case_combat_not_left_active():
-    # chk: facade, mv>=MIN_BEATS, state.combat.active=True at end-of-run. World floors must pass
-    # (mv>=6 activates them) -> the baseline state already has day=2 + 2 visited locations.
+    # chk: a GENUINE ABANDON (the preserved FATAL path after the truncation-vs-abandon split).
+    # combat_not_left_active is FATAL only when ALL of:
+    #   • facade mv >= COMBAT_ABANDON_MIN_BEATS(10) — a SUBSTANTIAL run (room to resolve), AND
+    #   • start_combat fired EARLY in the tool stream (NOT in the final ~20%) — the fight had time
+    #     to be wrapped, so it wasn't truncated, AND
+    #   • end_combat was never called, AND
+    #   • state.combat.active is still True at the snapshot.
+    # A SHORT run (< 10 beats) OR a LATE start_combat is treated as a harness-length truncation ⇒
+    # WARN (the qa/transcripts/claude-1v1-2 opus duo: start_combat@36/42 calls + only 7 beats), so
+    # this fixture must model the real-defect shape, not the truncation shape.
     state = _clean_player_state()
     state["combat"] = {"active": True, "round": 3}
-    moves = [_move("say") for _ in range(6)]
-    # 6 player beats (>=MIN_BEATS) interleaved with DM replies carrying quoted dialogue so
-    # both_sides_acted + dm_voices_characters both pass and combat_not_left_active is the SOLE fail.
+    moves = [_move("say") for _ in range(12)]  # >= COMBAT_ABANDON_MIN_BEATS(10): substantial run
+    # 12 player beats (>= COMBAT_ABANDON_MIN_BEATS) interleaved with DM replies carrying quoted
+    # dialogue so both_sides_acted + dm_voices_characters both pass. No companion in state ->
+    # structural_completeness stays inert even at >=10 beats, so combat_not_left_active is the SOLE
+    # fatal fail.
     chat = []
-    for i in range(6):
+    for i in range(12):
         chat.append(_player_chat_row(f"[say] beat {i}"))
         chat.append(_dm_chat_row('"The fight rages on," she calls.'))
-    return _roll(), state, chat, moves
+    # start_combat fires EARLY (first of the tool calls) and an attack resolves it (so combat_resolved
+    # passes), but end_combat is NEVER called and combat stays active -> a fight ABANDONED with room
+    # to resolve. Only assistant tool_use blocks are counted, so the stream is 3 calls (start_combat,
+    # attack, roll): last_sc index 0 < int(3 * 0.8) == 2, so `started_late` is False -> FATAL.
+    events = [
+        _assistant_tool_use("t_sc", "mcp__engine__start_combat", {}),
+        _user_tool_result("t_sc", json.dumps({"ok": True})),
+        _assistant_tool_use("t_atk", "mcp__engine__attack", {"target": "g1"}),
+        _user_tool_result("t_atk", json.dumps({"hit": True, "damage": 6})),
+    ] + _roll()
+    return events, state, chat, moves
 
 
 def case_party_traveled():
-    # chk: session_beats>=MIN_BEATS, day>1 (world_advanced_time passes) but visited < 2 (only the
-    # opening scene). Isolates party_traveled. Uses chat beats (player rows) for session_beats.
+    # chk: session_beats >= SINGLE_SCENE_MIN_BEATS(8), day>1 (world_advanced_time passes) but
+    # visited < 2 (only the opening scene). Isolates party_traveled. Uses chat beats (player rows)
+    # for session_beats.
+    # SEVERITY IS BEAT-SCOPED (the 2026-06-19 false-cap fix): party_traveled is FATAL only at
+    # >= SINGLE_SCENE_MIN_BEATS(8) — below that a single-scene vignette is a legitimate WARN, not a
+    # frozen stall. So the fixture must carry 8 player beats to land on the PRESERVED FATAL path
+    # (at 6 it would now correctly WARN and this case would no longer flip RED). Keep the builder in
+    # lock-step with the committed 8-beat fixture so a regenerate doesn't silently revert the fix.
     state = _clean_player_state()
     state["day"] = 2  # world_advanced_time passes
     state["locations"] = {"loc_start": {"name": "Tavern", "visited": True}}  # visited == 1
     state["current_location_id"] = "loc_start"
     state["characters"]["pc1"]["location_id"] = "loc_start"
     # dm rows must carry dialogue so dm_voices passes (>=3 dm rows present).
-    chat = ([_player_chat_row(f"[say] beat {i}") for i in range(6)] +
+    chat = ([_player_chat_row(f"[say] beat {i}") for i in range(8)] +
             [_dm_chat_row('"We press on," she says.') for _ in range(3)])
     return _roll(), state, chat, None
 

diff --git a/qa/gate_corpus/cases/combat_not_left_active/chat.jsonl b/qa/gate_corpus/cases/combat_not_left_active/chat.jsonl
@@ -10,3 +10,15 @@
 {"role": "dm", "text": "\"The fight rages on,\" she calls."}
 {"role": "player", "text": "[say] beat 5"}
 {"role": "dm", "text": "\"The fight rages on,\" she calls."}
+{"role": "player", "text": "[say] beat 6"}
+{"role": "dm", "text": "\"The fight rages on,\" she calls."}
+{"role": "player", "text": "[say] beat 7"}
+{"role": "dm", "text": "\"The fight rages on,\" she calls."}
+{"role": "player", "text": "[say] beat 8"}
+{"role": "dm", "text": "\"The fight rages on,\" she calls."}
+{"role": "player", "text": "[say] beat 9"}
+{"role": "dm", "text": "\"The fight rages on,\" she calls."}
+{"role": "player", "text": "[say] beat 10"}
+{"role": "dm", "text": "\"The fight rages on,\" she calls."}
+{"role": "player", "text": "[say] beat 11"}
+{"role": "dm", "text": "\"The fight rages on,\" she calls."}
diff --git a/qa/gate_corpus/cases/combat_not_left_active/moves.jsonl b/qa/gate_corpus/cases/combat_not_left_active/moves.jsonl
@@ -4,3 +4,9 @@
 {"role": "player", "kind": "say", "text": "[say] does a thing"}
 {"role": "player", "kind": "say", "text": "[say] does a thing"}
 {"role": "player", "kind": "say", "text": "[say] does a thing"}
+{"role": "player", "kind": "say", "text": "[say] does a thing"}
+{"role": "player", "kind": "say", "text": "[say] does a thing"}
+{"role": "player", "kind": "say", "text": "[say] does a thing"}
+{"role": "player", "kind": "say", "text": "[say] does a thing"}
+{"role": "player", "kind": "say", "text": "[say] does a thing"}
+{"role": "player", "kind": "say", "text": "[say] does a thing"}
diff --git a/qa/gate_corpus/cases/combat_not_left_active/run.jsonl b/qa/gate_corpus/cases/combat_not_left_active/run.jsonl
@@ -1,2 +1,6 @@
+{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_sc", "name": "mcp__engine__start_combat", "input": {}}]}}
+{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_sc", "content": [{"type": "text", "text": "{\"ok\": true}"}], "is_error": false}]}}
+{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_atk", "name": "mcp__engine__attack", "input": {"target": "g1"}}]}}
+{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_atk", "content": [{"type": "text", "text": "{\"hit\": true, \"damage\": 6}"}], "is_error": false}]}}
 {"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_roll", "name": "mcp__engine__roll", "input": {"sides": 20}}]}}
 {"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_roll", "content": [{"type": "text", "text": "{\"total\": 14}"}], "is_error": false}]}}
diff --git a/qa/gate_corpus/cases/party_traveled/chat.jsonl b/qa/gate_corpus/cases/party_traveled/chat.jsonl
@@ -4,6 +4,8 @@
 {"role": "player", "text": "[say] beat 3"}
 {"role": "player", "text": "[say] beat 4"}
 {"role": "player", "text": "[say] beat 5"}
+{"role": "player", "text": "[say] beat 6"}
+{"role": "player", "text": "[say] beat 7"}
 {"role": "dm", "text": "\"We press on,\" she says."}
 {"role": "dm", "text": "\"We press on,\" she says."}
 {"role": "dm", "text": "\"We press on,\" she says."}
diff --git a/qa/test_assert_behavioral.py b/qa/test_assert_behavioral.py
@@ -862,12 +862,30 @@ def test_party_traveled_red_despite_status_blind_quest_tool_count(tmp_path):
     assert rc == 1, out
 
 
-def test_party_traveled_still_red_when_arc_resolved_but_too_few_beats(tmp_path):
-    # Guard against broadening to beats-only / arc-only: clock advanced + quest completed but
-    # only 7 beats (< SINGLE_SCENE_MIN_BEATS 8) → exception not met → party_traveled RED.
+def test_party_traveled_warns_not_red_on_short_single_scene_vignette(tmp_path):
+    # SEVERITY IS BEAT-SCOPED (2026-06-19 false-cap fix): a SHORT single-scene run
+    # (< SINGLE_SCENE_MIN_BEATS 8) that stayed in one location is a legitimate vignette — e.g. a
+    # 6- or 7-beat emergent social/combat duo — NOT a frozen stall. Below 8 beats
+    # party_traveled is a WARN, never a lens-capping RED (it was over-capping legitimate short
+    # play on BOTH Claude and GLM). Here: visited=1, day advanced + quest completed but only 7
+    # beats (< 8) → the in-place exception's beat-floor isn't met (so it doesn't PASS via the
+    # exception) AND the FATAL beat-floor isn't met (so it's a WARN, not RED). The run stays GREEN.
     events = _dm_text_turns(7)
     state = _single_scene_state(day=2, quest_completed=True, visited_count=1)
     rc, out = _run_gate(tmp_path, events, state)
+    assert "[WARN] party_traveled" in out, out
+    assert "[FAIL] party_traveled" not in out, out
+    assert rc == 0, out  # a short single-scene vignette is not a fatal frozen stall
+
+
+def test_party_traveled_still_red_on_substantial_frozen_run_too_few_visited(tmp_path):
+    # The PRESERVED FATAL path: at/above SINGLE_SCENE_MIN_BEATS(8) a run that stayed in ONE
+    # location AND did not progress in place (no completed quest → arc_resolved False) is a real
+    # stuck-DM frozen stall → RED. (Guards against the beat-scoping weakening the substantial-run
+    # FATAL: 8 beats, visited=1, clock advanced but arc unresolved → the in-place AND fails → RED.)
+    events = _dm_text_turns(8)
+    state = _single_scene_state(day=2, quest_completed=False, visited_count=1)
+    rc, out = _run_gate(tmp_path, events, state)
     assert "[FAIL] party_traveled" in out, out
     assert rc == 1, out