Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 82 additions & 8 deletions qa/assert_behavioral.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,75 @@ def chk(name: str, ok: bool, detail: str = "", fatal: bool = True) -> None:
# (state.get("combat") truthy) — a non-combat session has no block and these skip.
combat = state.get("combat") or {}
if combat:
# FATAL: combat left active at end-of-run is a state-integrity failure — a clean run
# ends_combat. Only for a substantial session (a short smoke test cut off mid-fight
# is not a real defect).
if len(mv) >= MIN_BEATS:
chk("combat_not_left_active", not combat.get("active"),
f"combat.active={combat.get('active')!r} at end-of-run — combat left active "
f"(state-integrity fail: a finished session should end_combat)")
# combat left active at end-of-run. Naively this is a state-integrity failure (a clean
# run calls end_combat), BUT the dominant cause in QA is a HARNESS-LENGTH ARTIFACT, not a DM
# bug: a short emergent duo that ENTERS combat near its beat budget and TRUNCATES
# mid-fight legitimately never reaches end_combat — the fight was cut off, not abandoned.
# (Proven: qa/transcripts/claude-1v1-2 — an opus duo where start_combat fired in the last
# handful of tool calls and the final DM line is literally cut off mid-sentence; the old
# bare FATAL RED-capped all three lenses on a run that did nothing wrong.) So make the
# SEVERITY beat-scoped, exactly like party_traveled:
#
# • A SHORT facade run (< COMBAT_ABANDON_MIN_BEATS) with combat still active is treated
# as a TRUNCATED mid-combat scene → WARN. The standard 6-8 beat emergent combat duo
# lives here: it can't both run a full fight AND wrap it inside its budget.
# • A LONG run (>= COMBAT_ABANDON_MIN_BEATS) that ALSO truncated mid-fight (start_combat
# fired in the last beat or two of the tool stream) is STILL a truncation, not an
# abandon → WARN. The length alone doesn't make a cut-off fight a defect.
# • Only a LONG run where combat started EARLY (room to resolve), end_combat never fired,
# and the fight is STILL active at the snapshot is a genuine ABANDON — a real
# state-integrity bug that corrupts the next load → FATAL.
#
# This is the same conservative, beat-scoped pattern as the party_traveled fix: it keeps
# the FATAL path for the real defect (a long run that left a fight hanging with room to
# resolve) and stops the model-agnostic false-cap on short/truncated emergent runs.
if len(mv) >= MIN_BEATS and combat.get("active"):
# Strictly above MIN_BEATS(6): a real multi-encounter arc, not a short single-fight
# vignette. A run shorter than this that's still mid-fight is presumed truncated.
COMBAT_ABANDON_MIN_BEATS = 10
# WHERE did the (last) start_combat fire in the ordered tool stream? If it's in the
# final stretch of calls the fight only just began before the run ended ⇒ truncation,
# never an abandon. Build the ordered short-name list once (mirrors the round1 scan's
# extraction below); cheap and self-contained.
_ordered_short: list[str] = []
for _ev in events:
if _ev.get("type") != "assistant":
continue
for _b in (_ev.get("message", {}) or {}).get("content") or []:
if isinstance(_b, dict) and _b.get("type") == "tool_use":
_ordered_short.append((_b.get("name") or "").split("__")[-1])
_total_calls = len(_ordered_short)
_last_sc = max((i for i, c in enumerate(_ordered_short) if c == "start_combat"),
default=-1)
# "Started late" = the last start_combat landed in the final ~20% of the tool stream
# (truncation: the fight only just began before the run ended), OR there is NO
# start_combat in the stream at all — a resume-into-combat session whose fight carried
# over from a prior session, which we CANNOT prove started early this run, so it is a
# truncation/resume, never an abandon (this matches the rationale comment above; the
# earlier form FATAL'd a resumed-into-combat run, contradicting it).
started_late = (
_last_sc < 0
or (_last_sc >= 0 and _total_calls > 0
and _last_sc >= int(_total_calls * 0.8))
)
# A genuine ABANDON: a SUBSTANTIAL run, combat started EARLY (room to resolve), the DM
# never end_combat'd, yet the fight is still active. Everything else (short run, OR a
# late/truncated start, OR no start_combat in the stream) is a truncation ⇒ WARN.
_abandoned = (
len(mv) >= COMBAT_ABANDON_MIN_BEATS
and not started_late
and tools.get("end_combat", 0) == 0
)
Comment on lines +383 to +387

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Scope abandon detection to the latest combat segment.

At Line 386, _abandoned checks tools.get("end_combat", 0) == 0 globally. If one combat ended earlier but a later combat was left active, this incorrectly downgrades a genuine abandon to WARN.

💡 Suggested fix
                 _last_sc = max((i for i, c in enumerate(_ordered_short) if c == "start_combat"),
                                default=-1)
+                _last_ec = max((i for i, c in enumerate(_ordered_short) if c == "end_combat"),
+                               default=-1)
@@
-                _abandoned = (
+                current_fight_unclosed = (_last_sc >= 0 and _last_ec < _last_sc)
+                _abandoned = (
                     len(mv) >= COMBAT_ABANDON_MIN_BEATS
                     and not started_late
-                    and tools.get("end_combat", 0) == 0
+                    and current_fight_unclosed
                 )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@qa/assert_behavioral.py` around lines 383 - 387, the `_abandoned`
calculation checks `tools.get("end_combat", 0) == 0` against the whole run,
which is wrong when the run contains more than one combat segment. If an
earlier combat ended properly but a later combat remains active, the global
`end_combat` count is non-zero, so the check evaluates to False, `_abandoned`
becomes False, and a genuine abandon is downgraded to WARN. Modify the
condition to scope the `end_combat` check to only the latest/current combat
segment (e.g. require that no `end_combat` call follows the last
`start_combat` in the ordered tool stream) rather than consulting the global
tools dictionary, so abandons are detected accurately for the most recent
combat segment only.

chk("combat_not_left_active", False,
f"combat.active=True at end-of-run — combat left active "
f"(beats={len(mv)}, start_combat@{_last_sc}/{_total_calls} calls, "
f"end_combat={tools.get('end_combat', 0)}) — "
+ ("FATAL: substantial run, fight started early with room to resolve and was "
"never end_combat'd (state-integrity fail: a finished session should end_combat)"
if _abandoned else
"WARN: run plausibly TRUNCATED mid-combat (short run or combat started near "
"the beat budget) — a harness-length artifact, not an abandoned fight"),
fatal=_abandoned)
# WARN: if a fight started, the action economy should have engaged at some point —
# an action was consumed / an attack was made. The final snapshot does not reliably
# expose mid-fight action use (it may have been reset on end_combat), so probe the
Expand Down Expand Up @@ -614,12 +676,24 @@ def _has_spells(c: dict) -> bool:
SINGLE_SCENE_MIN_BEATS = 8 # strictly above MIN_BEATS(6): a real arc, not a smoke test
in_place_progression = (visited >= 1 and clock_advanced and arc_resolved
and session_beats >= SINGLE_SCENE_MIN_BEATS)
# SEVERITY IS BEAT-SCOPED (false-cap fix): "the DM never left the opening scene" is only a
# STUCK-DM failure on a SUBSTANTIAL run. A SHORT run (< SINGLE_SCENE_MIN_BEATS) in one
# location is a legitimate single-scene vignette — the standard 6-beat social duo lives here
# — NOT a frozen stall, so below that length this is a WARN, never a lens-capping RED. It was
# FATAL-capping legitimate short single-scene play on BOTH models (Claude opus AND GLM — a
# model-agnostic false-cap that deflated the duo scores). At/above SINGLE_SCENE_MIN_BEATS the
# strict exception is UNCHANGED (travel >=2, OR a clock-advancing arc-resolving in-place
# drama) — a substantial run that never moves AND never progresses is still a FATAL stuck DM,
# and the anti-gaming AND-logic (clock-only/beats-only deliberately excluded) is preserved.
_pt_fatal = session_beats >= SINGLE_SCENE_MIN_BEATS
chk("party_traveled", visited >= 2 or in_place_progression,
f"visited {visited}/{len(locs)} location(s) after {session_beats} beats — the party never "
f"left the opening scene (travel_to / add_location make_current=True); "
f"in-place-progression exception NOT met "
f"(clock_advanced={clock_advanced} arc_resolved={arc_resolved} "
f"beats>={SINGLE_SCENE_MIN_BEATS}? {session_beats >= SINGLE_SCENE_MIN_BEATS})")
f"beats>={SINGLE_SCENE_MIN_BEATS}? {session_beats >= SINGLE_SCENE_MIN_BEATS}) — "
f"{'FATAL (substantial run, stuck)' if _pt_fatal else 'WARN (short single-scene vignette)'}",
fatal=_pt_fatal)
# WARN (the metric is softer): did the world gain/engage faces, or just sit in the seed?
npcs_met = sum(1 for c in chars.values()
if isinstance(c, dict) and c.get("kind") == "npc" and c.get("met"))
Expand Down
46 changes: 36 additions & 10 deletions qa/gate_corpus/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,30 +224,56 @@ def case_combat_resolved():


def case_combat_not_left_active():
# chk: facade, mv>=MIN_BEATS, state.combat.active=True at end-of-run. World floors must pass
# (mv>=6 activates them) -> the baseline state already has day=2 + 2 visited locations.
# chk: a GENUINE ABANDON (the preserved FATAL path after the truncation-vs-abandon split).
# combat_not_left_active is FATAL only when ALL of:
# • facade mv >= COMBAT_ABANDON_MIN_BEATS(10) — a SUBSTANTIAL run (room to resolve), AND
# • start_combat fired EARLY in the tool stream (NOT in the final ~20%) — the fight had time
# to be wrapped, so it wasn't truncated, AND
# • end_combat was never called, AND
# • state.combat.active is still True at the snapshot.
# A SHORT run (< 10 beats) OR a LATE start_combat is treated as a harness-length truncation ⇒
# WARN (the qa/transcripts/claude-1v1-2 opus duo: start_combat@36/42 calls + only 7 beats), so
# this fixture must model the real-defect shape, not the truncation shape.
state = _clean_player_state()
state["combat"] = {"active": True, "round": 3}
moves = [_move("say") for _ in range(6)]
# 6 player beats (>=MIN_BEATS) interleaved with DM replies carrying quoted dialogue so
# both_sides_acted + dm_voices_characters both pass and combat_not_left_active is the SOLE fail.
moves = [_move("say") for _ in range(12)] # >= COMBAT_ABANDON_MIN_BEATS(10): substantial run
# 12 player beats (>= COMBAT_ABANDON_MIN_BEATS) interleaved with DM replies carrying quoted
# dialogue so both_sides_acted + dm_voices_characters both pass. No companion in state ->
# structural_completeness stays inert even at >=10 beats, so combat_not_left_active is the SOLE
# fatal fail.
chat = []
for i in range(6):
for i in range(12):
chat.append(_player_chat_row(f"[say] beat {i}"))
chat.append(_dm_chat_row('"The fight rages on," she calls.'))
return _roll(), state, chat, moves
# start_combat fires EARLY (first of the tool calls) and an attack resolves it (so combat_resolved
# passes), but end_combat is NEVER called and combat stays active -> a fight ABANDONED with room
# to resolve. The clean trailing `_roll()` keeps start_combat well inside the first 80% of the
# stream (last_sc index 0 of 3 tool calls — start_combat, attack, roll — and 0 < int(3 * 0.8) = 2),
# so `started_late` is False -> the abandon path, FATAL.
events = [
_assistant_tool_use("t_sc", "mcp__engine__start_combat", {}),
_user_tool_result("t_sc", json.dumps({"ok": True})),
_assistant_tool_use("t_atk", "mcp__engine__attack", {"target": "g1"}),
_user_tool_result("t_atk", json.dumps({"hit": True, "damage": 6})),
] + _roll()
return events, state, chat, moves


def case_party_traveled():
# chk: session_beats>=MIN_BEATS, day>1 (world_advanced_time passes) but visited < 2 (only the
# opening scene). Isolates party_traveled. Uses chat beats (player rows) for session_beats.
# chk: session_beats >= SINGLE_SCENE_MIN_BEATS(8), day>1 (world_advanced_time passes) but
# visited < 2 (only the opening scene). Isolates party_traveled. Uses chat beats (player rows)
# for session_beats.
# SEVERITY IS BEAT-SCOPED (the 2026-06-19 false-cap fix): party_traveled is FATAL only at
# >= SINGLE_SCENE_MIN_BEATS(8) — below that a single-scene vignette is a legitimate WARN, not a
# frozen stall. So the fixture must carry 8 player beats to land on the PRESERVED FATAL path
# (at 6 it would now correctly WARN and this case would no longer flip RED). Keep the builder in
# lock-step with the committed 8-beat fixture so a regenerate doesn't silently revert the fix.
state = _clean_player_state()
state["day"] = 2 # world_advanced_time passes
state["locations"] = {"loc_start": {"name": "Tavern", "visited": True}} # visited == 1
state["current_location_id"] = "loc_start"
state["characters"]["pc1"]["location_id"] = "loc_start"
# dm rows must carry dialogue so dm_voices passes (>=3 dm rows present).
chat = ([_player_chat_row(f"[say] beat {i}") for i in range(6)] +
chat = ([_player_chat_row(f"[say] beat {i}") for i in range(8)] +
[_dm_chat_row('"We press on," she says.') for _ in range(3)])
return _roll(), state, chat, None

Expand Down
12 changes: 12 additions & 0 deletions qa/gate_corpus/cases/combat_not_left_active/chat.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,15 @@
{"role": "dm", "text": "\"The fight rages on,\" she calls."}
{"role": "player", "text": "[say] beat 5"}
{"role": "dm", "text": "\"The fight rages on,\" she calls."}
{"role": "player", "text": "[say] beat 6"}
{"role": "dm", "text": "\"The fight rages on,\" she calls."}
{"role": "player", "text": "[say] beat 7"}
{"role": "dm", "text": "\"The fight rages on,\" she calls."}
{"role": "player", "text": "[say] beat 8"}
{"role": "dm", "text": "\"The fight rages on,\" she calls."}
{"role": "player", "text": "[say] beat 9"}
{"role": "dm", "text": "\"The fight rages on,\" she calls."}
{"role": "player", "text": "[say] beat 10"}
{"role": "dm", "text": "\"The fight rages on,\" she calls."}
{"role": "player", "text": "[say] beat 11"}
{"role": "dm", "text": "\"The fight rages on,\" she calls."}
6 changes: 6 additions & 0 deletions qa/gate_corpus/cases/combat_not_left_active/moves.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@
{"role": "player", "kind": "say", "text": "[say] does a thing"}
{"role": "player", "kind": "say", "text": "[say] does a thing"}
{"role": "player", "kind": "say", "text": "[say] does a thing"}
{"role": "player", "kind": "say", "text": "[say] does a thing"}
{"role": "player", "kind": "say", "text": "[say] does a thing"}
{"role": "player", "kind": "say", "text": "[say] does a thing"}
{"role": "player", "kind": "say", "text": "[say] does a thing"}
{"role": "player", "kind": "say", "text": "[say] does a thing"}
{"role": "player", "kind": "say", "text": "[say] does a thing"}
4 changes: 4 additions & 0 deletions qa/gate_corpus/cases/combat_not_left_active/run.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_sc", "name": "mcp__engine__start_combat", "input": {}}]}}
{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_sc", "content": [{"type": "text", "text": "{\"ok\": true}"}], "is_error": false}]}}
{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_atk", "name": "mcp__engine__attack", "input": {"target": "g1"}}]}}
{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_atk", "content": [{"type": "text", "text": "{\"hit\": true, \"damage\": 6}"}], "is_error": false}]}}
{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_roll", "name": "mcp__engine__roll", "input": {"sides": 20}}]}}
{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_roll", "content": [{"type": "text", "text": "{\"total\": 14}"}], "is_error": false}]}}
2 changes: 2 additions & 0 deletions qa/gate_corpus/cases/party_traveled/chat.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
{"role": "player", "text": "[say] beat 3"}
{"role": "player", "text": "[say] beat 4"}
{"role": "player", "text": "[say] beat 5"}
{"role": "player", "text": "[say] beat 6"}
{"role": "player", "text": "[say] beat 7"}
{"role": "dm", "text": "\"We press on,\" she says."}
{"role": "dm", "text": "\"We press on,\" she says."}
{"role": "dm", "text": "\"We press on,\" she says."}
24 changes: 21 additions & 3 deletions qa/test_assert_behavioral.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,12 +862,30 @@ def test_party_traveled_red_despite_status_blind_quest_tool_count(tmp_path):
assert rc == 1, out


def test_party_traveled_still_red_when_arc_resolved_but_too_few_beats(tmp_path):
# Guard against broadening to beats-only / arc-only: clock advanced + quest completed but
# only 7 beats (< SINGLE_SCENE_MIN_BEATS 8) → exception not met → party_traveled RED.
def test_party_traveled_warns_not_red_on_short_single_scene_vignette(tmp_path):
# SEVERITY IS BEAT-SCOPED (2026-06-19 false-cap fix): a SHORT single-scene run
# (< SINGLE_SCENE_MIN_BEATS 8) that stayed in one location is a legitimate vignette — the
# standard emergent social/combat duo at 6-7 beats — NOT a frozen stall. Below 8 beats
# party_traveled is a WARN, never a lens-capping RED (it was over-capping legitimate short
# play on BOTH Claude and GLM). Here: visited=1, day advanced + quest completed but only 7
# beats (< 8) → the in-place exception's beat-floor isn't met (so it doesn't PASS via the
# exception) AND the FATAL beat-floor isn't met (so it's a WARN, not RED). The run stays GREEN.
events = _dm_text_turns(7)
state = _single_scene_state(day=2, quest_completed=True, visited_count=1)
rc, out = _run_gate(tmp_path, events, state)
assert "[WARN] party_traveled" in out, out
assert "[FAIL] party_traveled" not in out, out
assert rc == 0, out # a short single-scene vignette is not a fatal frozen stall


def test_party_traveled_still_red_on_substantial_frozen_run_too_few_visited(tmp_path):
# The PRESERVED FATAL path: at/above SINGLE_SCENE_MIN_BEATS(8) a run that stayed in ONE
# location AND did not progress in place (no completed quest → arc_resolved False) is a real
# stuck-DM frozen stall → RED. (Guards against the beat-scoping weakening the substantial-run
# FATAL: 8 beats, visited=1, clock advanced but arc unresolved → the in-place AND fails → RED.)
events = _dm_text_turns(8)
state = _single_scene_state(day=2, quest_completed=False, visited_count=1)
rc, out = _run_gate(tmp_path, events, state)
assert "[FAIL] party_traveled" in out, out
assert rc == 1, out

Expand Down
Loading