From 50e4c6c681704353ac5ec81f2021ea7d671d90fc Mon Sep 17 00:00:00 2001 From: Eva Date: Sat, 20 Jun 2026 02:33:48 +0700 Subject: [PATCH] fix(qa): scope-guard structural_completeness unresolved_arc for authored campaigns (#1036) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ROOT CAUSE The `structural_completeness` behavioral gate (qa/assert_behavioral.py) FATAL-capped AUTHORED golden-spine runs to 2.5. Sub-check (b) `unresolved_arc` fires when an active quest reaches session end open across a >=2-location arc with no quest-resolution call. But the campaign-arc quest is SEEDED from the authored adventure `hook` and is multi- session by design; the authored adventures (e.g. embergloom-pact) author NO closable sub-quests, so the DM legitimately never calls complete_quest / set_quest_status — and (b) FATAL-REDs even a clean 25-beat authored run. A self-inflicted false-cap. Sibling #1030 fixed party_traveled / combat_not_left_active the same way but missed this one. FIX (Option A scope guard — mirrors #1030's WARN-vs-FATAL discipline EXACTLY) - Compute `is_authored_campaign = bool(tools.get("start_adventure") or state.get("scenes"))`. `start_adventure` is the authored cold-open call (server.py:697), always in the tool stream `_tally` sees; `state["scenes"]` is non-empty only for seeded authored adventures (server.py serializes it; content.py persists authored scenes). - Demote ONLY sub-check (b) unresolved_arc from FATAL->WARN when authored AND the only open quest is the hook-seeded arc. The gate still APPENDS the WARN message (visibility kept); the run is no longer RED-capped on (b) alone. - Clause (a) approval-frozen stays FATAL ALWAYS. - PRESERVE FATAL for: any NON-authored run (the original narrated-not-engaged failure), AND an authored run that called add_quest (server.py:10165 — the DM's own quest-creation tool, distinguishable from the hook-seeded quest at gate time) and left it unresolved — a genuine dropped thread. 
New severity: `_unresolved_fatal = unresolved_arc and (not is_authored_campaign or bool(tools.get("add_quest")))`; `_structural_fatal = approval_frozen_run or _unresolved_fatal`. ANTI-SCORE-GAMING DUAL CORPUS PROOF - NEW GREEN fixture qa/gate_corpus/cases/structural_completeness_authored_warn/ (built by builder.py `case_structural_completeness_authored_warn`, recorded under a new RED-only-safe `green_cases` manifest key): authored profile (start_adventure + scenes + frozen companion + active hook quest + 2 locations + no resolution) -> gate exits GREEN with structural_completeness as [WARN]. Locked by test_behavioral_gate_corpus.py::test_green_case_warns_but_stays_green (the inverse guard: re-promoting (b) to FATAL flips it RED and fails). - The EXISTING non-authored qa/gate_corpus/cases/structural_completeness/ fixture (no start_adventure, no scenes) regenerated cleanly and STILL exits FATAL RED. - 4 unit tests in qa/test_assert_behavioral.py: authored->WARN/GREEN, non-authored->FATAL/RED, authored+add_quest->FATAL (carve-out), authored-via-scenes->WARN. - Coverage audit (test_manifest_covers_every_fatal_check) stays green — the gate still classifies structural_completeness as FATAL (fatal=_structural_fatal, not fatal=False), no drift. - BEHAVIORAL_GATE_TAXONOMY.json hint updated to document the #1036 authored-WARN behavior. Additive, no existing guard weakened. 78 focused tests pass single-process (no xdist).
--- qa/BEHAVIORAL_GATE_TAXONOMY.json | 2 +- qa/assert_behavioral.py | 34 +++++++- qa/gate_corpus/builder.py | 81 ++++++++++++++++++- .../run.jsonl | 17 ++++ .../state.json | 53 ++++++++++++ qa/gate_corpus/manifest.json | 13 ++- qa/test_assert_behavioral.py | 60 ++++++++++++++ qa/test_behavioral_gate_corpus.py | 52 ++++++++++++ 8 files changed, 304 insertions(+), 8 deletions(-) create mode 100644 qa/gate_corpus/cases/structural_completeness_authored_warn/run.jsonl create mode 100644 qa/gate_corpus/cases/structural_completeness_authored_warn/state.json diff --git a/qa/BEHAVIORAL_GATE_TAXONOMY.json b/qa/BEHAVIORAL_GATE_TAXONOMY.json index 434fe7fe..fbec8ded 100644 --- a/qa/BEHAVIORAL_GATE_TAXONOMY.json +++ b/qa/BEHAVIORAL_GATE_TAXONOMY.json @@ -174,7 +174,7 @@ "category": "ENGINE_INVARIANT", "likely_code_locations": ["servers/engine/server.py", "servers/engine/models.py", "servers/engine/companion.py"], "retest": "bash qa/run_duo.sh duo-retest", - "hint": "A >=10-beat session with a companion never engaged a core relationship/quest system: either no companion's attitude_value moved off 0 AND no camp/long_rest happened, or an active quest was left open across a >=2-location arc with no quest-resolution call. Either the DM never invoked the relationship/quest tools (record_decision / adjust_attitude / camp_scene / complete_quest evolves_to — DM adherence) or those engine writes never landed (engine). All those tool handlers + the attitude_value/quest-status writes live in server.py (adjust_attitude, record_decision, camp_scene, complete_quest, and the rule-of-three evolution _maybe_schedule_quest_evolution / evolves_to); the field definitions are in models.py; companion.py only surfaces approval CAUSES (approval_tags) for the DM to apply — it never writes state itself. The gate trips at >=10 beats (STRUCTURAL_MIN_BEATS); when validating authored campaigns run >=24 beats so the main quest has room to resolve, else the unresolved-arc sub-check false-REDs. 
FATAL; skipped in the combat-sprint lane." + "hint": "A >=10-beat session with a companion never engaged a core relationship/quest system: either no companion's attitude_value moved off 0 AND no camp/long_rest happened, or an active quest was left open across a >=2-location arc with no quest-resolution call. Either the DM never invoked the relationship/quest tools (record_decision / adjust_attitude / camp_scene / complete_quest evolves_to — DM adherence) or those engine writes never landed (engine). All those tool handlers + the attitude_value/quest-status writes live in server.py (adjust_attitude, record_decision, camp_scene, complete_quest, and the rule-of-three evolution _maybe_schedule_quest_evolution / evolves_to); the field definitions are in models.py; companion.py only surfaces approval CAUSES (approval_tags) for the DM to apply — it never writes state itself. The gate trips at >=10 beats (STRUCTURAL_MIN_BEATS). #1036 scope guard (Option A, mirrors #1030): on an AUTHORED campaign (start_adventure in the tool stream OR a non-empty state[\"scenes\"]) whose only open quest is the hook-seeded arc, the unresolved-arc sub-check (b) is demoted FATAL->WARN — the hook-seeded campaign arc is multi-session by design and authored adventures author no closable sub-quests, so it no longer false-caps a clean authored run to 2.5; the WARN message is still surfaced. STILL FATAL for: any NON-authored run (the original narrated-not-engaged failure), and an authored run that called add_quest and left it unresolved (a real dropped DM-opened thread). Clause (a) approval-frozen is FATAL ALWAYS. Skipped in the combat-sprint lane." 
}, "flat_arc": { "category": "DM_ADHERENCE", diff --git a/qa/assert_behavioral.py b/qa/assert_behavioral.py index cbf88397..59898de1 100644 --- a/qa/assert_behavioral.py +++ b/qa/assert_behavioral.py @@ -758,6 +758,28 @@ def _has_spells(c: dict) -> bool: unresolved_arc = bool(active_quests and multi_location_arc and not quest_resolution_engaged) + # AUTHORED-CAMPAIGN SCOPE GUARD for sub-check (b) — #1036 (Option A; mirrors #1030's + # WARN-vs-FATAL discipline for party_traveled / combat_not_left_active). The campaign-arc + # quest is SEEDED from the authored adventure `hook` and is multi-session by design; the + # authored adventures (e.g. embergloom-pact) author NO closable sub-quests, so the DM never + # calls complete_quest / set_quest_status and (b) `unresolved_arc` FATAL-capped a clean + # 25-beat authored run to 2.5 — a self-inflicted false-cap (the main quest can't legitimately + # resolve inside one session). An AUTHORED run is identifiable at the gate from signals + # already in scope: `start_adventure` is the cold-open call for authored runs (in the tool + # stream `_tally` always sees), and `state["scenes"]` is non-empty only for authored + # adventures (the seeded-campaign snapshot). When authored, (b) is demoted FATAL->WARN — + # the gate still APPENDS the WARN message (visibility preserved), but the run is not + # RED-capped on (b) alone. + # + # PRESERVE FATAL for the original failure class: + # - any NON-authored run (the proven 18-beat narrated-not-engaged failure), AND + # - an authored run where the DM ADDED a sub-quest (add_quest) and left it unresolved — + # a genuine dropped thread the DM itself opened, distinct from the hook-seeded campaign + # arc (add_quest is the engine quest-creation tool, server.py:add_quest; it is + # distinguishable from the hook-seeded quest at gate time, so we keep that case FATAL). 
+ is_authored_campaign = bool(tools.get("start_adventure") or state.get("scenes")) + dm_added_quest = bool(tools.get("add_quest")) + bad_bits = [] if approval_frozen_run: bad_bits.append( @@ -767,12 +789,20 @@ def _has_spells(c: dict) -> bool: bad_bits.append( f"{len(active_quests)} quest(s) still active at session end across a " f"{visited}-location arc with no quest-resolution call " - f"({[q.get('title') or q.get('id') or '?' for q in active_quests]})") + f"({[q.get('title') or q.get('id') or '?' for q in active_quests]})" + + (" [authored campaign + only the hook-seeded arc ⇒ WARN, not RED (#1036)]" + if (is_authored_campaign and not dm_added_quest) else "")) + + # Severity: clause (a) approval-frozen stays FATAL ALWAYS. Clause (b) unresolved_arc is + # FATAL unless it's an authored campaign whose ONLY open quest is the hook-seeded arc. + _unresolved_fatal = unresolved_arc and (not is_authored_campaign or dm_added_quest) + _structural_fatal = approval_frozen_run or _unresolved_fatal chk("structural_completeness", not bad_bits, f"a {session_beats}-beat session with a companion never engaged a core system: " + "; ".join(bad_bits) + " — the engine relationship/quest tools (record_decision approval_tags / " - "adjust_attitude / camp_scene / complete_quest evolves_to) were narrated, not used") + "adjust_attitude / camp_scene / complete_quest evolves_to) were narrated, not used", + fatal=_structural_fatal) # (c) FLAT ARC (WARN-FIRST). A LONG run (>= FELT_SHAPE_MIN_BEATS) that CLAIMS three acts # (the engine narrative_arc cursor OR contiguous act-tags) but whose arc never TURNED — no diff --git a/qa/gate_corpus/builder.py b/qa/gate_corpus/builder.py index f82cabf0..7ebc85e5 100644 --- a/qa/gate_corpus/builder.py +++ b/qa/gate_corpus/builder.py @@ -380,6 +380,43 @@ def case_structural_completeness(): return events, state, None, None +def case_structural_completeness_authored_warn(): + # GREEN fixture (#1036) — the Option A scope guard. 
SAME frozen-companion / active-quest / + # 2-location / no-resolution profile as case_structural_completeness ABOVE, EXCEPT the run is + # an AUTHORED campaign: it emits a `start_adventure` cold-open call (and a non-empty `scenes` + # in state). Under the #1036 guard the unresolved-arc sub-check (b) is demoted FATAL->WARN for + # an authored run whose only open quest is the hook-seeded arc (no add_quest), so the gate must + # exit GREEN with structural_completeness appearing as a [WARN] line — NOT a RED cap. This is + # the anti-score-gaming proof that an authored golden-spine run is no longer false-capped to 2.5. + # NOTE: clause (a) approval-frozen is NOT tripped here (a camp/long_rest engaged the relationship + # system) so (b) is the SOLE softened bit — isolating the #1036 change. Lives OUTSIDE the RED + # manifest (corpus is RED-only); a dedicated GREEN test in test_behavioral_gate_corpus.py asserts + # exit 0 + [WARN] structural_completeness. + state = _clean_player_state() + state["characters"]["cmp1"] = {"name": "Brother Toll", "kind": "companion", + "attitude_value": 0, "location_id": "loc_camp", "xp": 300} + state["party"].append("cmp1") + # Authored snapshot marker: a non-empty scenes list (only authored adventures carry this). + state["scenes"] = [{"id": "s1", "name": "The Embergloom Gate", "location_id": "loc_start"}] + state["quests"] = { + # The hook-seeded campaign arc — left active by design (multi-session). NO add_quest call, + # so it is the only-hook-seeded case the #1036 guard softens to WARN. + "q1": {"id": "q1", "title": "The Embergloom Pact", "status": "active", + "objectives": ["free the prisoners"], "completed_objectives": []}, + } + # start_adventure cold-open (authored signal in the tool stream) + a camp beat (engages the + # relationship system so clause (a) approval-frozen stays clean) + 12 DM text beats. 
+ events = ( + _roll() + + [_assistant_tool_use("t_adv", "mcp__engine__start_adventure", + {"adventure_id": "embergloom-pact"})] + + [_assistant_tool_use("t_camp", "mcp__engine__camp_scene", {})] + + [_user_tool_result("t_camp", json.dumps({"ok": True}))] + + [_assistant_text(f"The scene unfolds, beat {i}.") for i in range(12)] + ) + return events, state, None, None + + def case_xp_awarded_on_progression(): # chk A5: xp mode, session advanced (day>1 AND visited>=2 so the world floors pass), a # reward-worthy seam (a COMPLETED quest — NOT a dead monster, so xp_not_orphaned stays inert), @@ -427,6 +464,16 @@ def case_xp_awarded_on_progression(): ("structural_completeness", case_structural_completeness, "structural_completeness"), ] +# GREEN cases — the inverse guard. These fixtures must NOT trip their named check (the gate must +# exit 0 and surface the check as a [WARN], not a [FAIL]). They lock a SCOPE-GUARD that intentionally +# demotes a former FATAL to WARN, so a future edit that RE-promotes it to FATAL (re-introducing the +# false-cap) is caught. Asserted by test_behavioral_gate_corpus.py::test_green_case_*. Tuple shape: +# (case_dir, builder_fn, warn_check). Kept SEPARATE from _CASES_SPEC (which is RED-only by contract). +_GREEN_CASES_SPEC: list[tuple] = [ + ("structural_completeness_authored_warn", case_structural_completeness_authored_warn, + "structural_completeness"), +] + def _fatal_checks_in_gate() -> set[str]: """Parse the gate source for every FATAL chk(...) name (paren-balanced; mirrors the test).""" @@ -489,10 +536,33 @@ def build() -> dict: "reason": "no faithful minimal fixture constructed yet (auto-flagged by builder)", }) + # GREEN cases — known-GREEN bundles that lock a deliberate FATAL->WARN scope guard (the inverse + # of the RED corpus). Written to disk like the RED cases but recorded under a SEPARATE manifest + # key so the RED-only corpus test never mistakes them for known-REDs. 
+ green_cases: list[dict] = [] + for case_dir, fn, warn_check in _GREEN_CASES_SPEC: + run_events, state, chat, moves = fn() + _write_case(case_dir, run_events, state, chat, moves) + artifacts = ["run.jsonl", "state.json"] + if chat is not None: + artifacts.append("chat.jsonl") + if moves is not None: + artifacts.append("moves.jsonl") + green_cases.append({ + "case_dir": case_dir, + "warn_check": warn_check, + "artifacts": artifacts, + "note": ("must exit GREEN (rc 0) with warn_check as a [WARN] — locks a deliberate " + "FATAL->WARN scope guard so a re-promotion to FATAL is caught (#1036)."), + }) + manifest = { - "_doc": ("Behavioral-gate regression corpus. Each case is a minimal known-RED bundle " - "that must trip its expected_red_check. Regenerate with qa/gate_corpus/builder.py."), + "_doc": ("Behavioral-gate regression corpus. `cases` are minimal known-RED bundles that " + "must trip their expected_red_check; `green_cases` are known-GREEN bundles that " + "must NOT (they lock a FATAL->WARN scope guard). 
Regenerate with " + "qa/gate_corpus/builder.py."), "cases": manifest_cases, + "green_cases": green_cases, } _MANIFEST.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8") return manifest @@ -502,7 +572,10 @@ def build() -> dict: m = build() n_real = sum(1 for c in m["cases"] if not c.get("todo")) n_todo = sum(1 for c in m["cases"] if c.get("todo")) - print(f"wrote {n_real} corpus case(s) + {n_todo} TODO entr(y/ies) -> {_MANIFEST}") + n_green = len(m.get("green_cases", [])) + print(f"wrote {n_real} RED case(s) + {n_todo} TODO + {n_green} GREEN case(s) -> {_MANIFEST}") for c in m["cases"]: flag = " [TODO]" if c.get("todo") else "" - print(f" {c['case_dir']:32s} -> {c['expected_red_check']}{flag}") + print(f" RED {c['case_dir']:36s} -> {c['expected_red_check']}{flag}") + for c in m.get("green_cases", []): + print(f" GREEN {c['case_dir']:36s} -> {c['warn_check']} [WARN]") diff --git a/qa/gate_corpus/cases/structural_completeness_authored_warn/run.jsonl b/qa/gate_corpus/cases/structural_completeness_authored_warn/run.jsonl new file mode 100644 index 00000000..99265a54 --- /dev/null +++ b/qa/gate_corpus/cases/structural_completeness_authored_warn/run.jsonl @@ -0,0 +1,17 @@ +{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_roll", "name": "mcp__engine__roll", "input": {"sides": 20}}]}} +{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_roll", "content": [{"type": "text", "text": "{\"total\": 14}"}], "is_error": false}]}} +{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_adv", "name": "mcp__engine__start_adventure", "input": {"adventure_id": "embergloom-pact"}}]}} +{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_camp", "name": "mcp__engine__camp_scene", "input": {}}]}} +{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_camp", "content": [{"type": "text", "text": "{\"ok\": true}"}], "is_error": false}]}} +{"type": 
"assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 0."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 1."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 2."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 3."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 4."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 5."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 6."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 7."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 8."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 9."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 10."}]}} +{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 11."}]}} diff --git a/qa/gate_corpus/cases/structural_completeness_authored_warn/state.json b/qa/gate_corpus/cases/structural_completeness_authored_warn/state.json new file mode 100644 index 00000000..778277f8 --- /dev/null +++ b/qa/gate_corpus/cases/structural_completeness_authored_warn/state.json @@ -0,0 +1,53 @@ +{ + "party": [ + "pc1", + "cmp1" + ], + "leveling_mode": "xp", + "day": 2, + "time_of_day": "evening", + "current_location_id": "loc_camp", + "characters": { + "pc1": { + "name": "Dal", + "kind": "player", + "xp": 300, + "location_id": "loc_camp" + }, + "cmp1": { + "name": "Brother Toll", + "kind": "companion", + "attitude_value": 0, + "location_id": "loc_camp", + "xp": 300 + } + }, + "locations": { + "loc_start": { + "name": 
"Tavern", + "visited": true + }, + "loc_camp": { + "name": "Camp", + "visited": true + } + }, + "scenes": [ + { + "id": "s1", + "name": "The Embergloom Gate", + "location_id": "loc_start" + } + ], + "quests": { + "q1": { + "id": "q1", + "title": "The Embergloom Pact", + "status": "active", + "objectives": [ + "free the prisoners" + ], + "completed_objectives": [] + } + } +} \ No newline at end of file diff --git a/qa/gate_corpus/manifest.json b/qa/gate_corpus/manifest.json index 321c83d2..8fa082eb 100644 --- a/qa/gate_corpus/manifest.json +++ b/qa/gate_corpus/manifest.json @@ -1,5 +1,5 @@ { - "_doc": "Behavioral-gate regression corpus. Each case is a minimal known-RED bundle that must trip its expected_red_check. Regenerate with qa/gate_corpus/builder.py.", + "_doc": "Behavioral-gate regression corpus. `cases` are minimal known-RED bundles that must trip their expected_red_check; `green_cases` are known-GREEN bundles that must NOT (they lock a FATAL->WARN scope guard). Regenerate with qa/gate_corpus/builder.py.", "cases": [ { "case_dir": "dm_produced_output", @@ -188,5 +188,16 @@ ], "real_red_provenance": "" } + ], + "green_cases": [ + { + "case_dir": "structural_completeness_authored_warn", + "warn_check": "structural_completeness", + "artifacts": [ + "run.jsonl", + "state.json" + ], + "note": "must exit GREEN (rc 0) with warn_check as a [WARN] \u2014 locks a deliberate FATAL->WARN scope guard so a re-promotion to FATAL is caught (#1036)." 
+ } ] } diff --git a/qa/test_assert_behavioral.py b/qa/test_assert_behavioral.py index 7454756c..7d64380d 100644 --- a/qa/test_assert_behavioral.py +++ b/qa/test_assert_behavioral.py @@ -663,6 +663,66 @@ def test_structural_completeness_silent_in_combat_sprint(tmp_path): assert "structural_completeness" not in out +# ── #1036: authored-campaign scope guard for sub-check (b) unresolved_arc ────────── +# The campaign-arc quest is seeded from the authored adventure `hook` and is multi-session by +# design; authored adventures author NO closable sub-quests, so complete_quest is never called and +# (b) unresolved_arc FATAL-capped a clean authored run to 2.5. Option A (mirrors #1030): an AUTHORED +# run (start_adventure in the tool stream OR a non-empty state["scenes"]) whose only open quest is +# the hook-seeded arc demotes (b) FATAL->WARN. Clause (a) approval-frozen stays FATAL always; an +# authored run that called add_quest and left it open stays FATAL (a real dropped thread). + +def test_structural_completeness_warns_not_fatal_for_authored_campaign(tmp_path): + # AUTHORED run: a start_adventure cold-open + a camp beat (so clause (a) stays clean) + 12 DM + # beats, a companion frozen at 0, the hook-seeded quest still active across a 2-location arc, + # NO complete_quest, NO add_quest. Under #1036 this must be GREEN (rc 0) with the unresolved-arc + # sub-check surfaced as a [WARN], NOT a [FAIL] cap. Same profile that previously RED-capped to 2.5. 
+ events = (_dm_text_turns(12) + + _toolcall("start_adventure") # authored cold-open signal in the tool stream + + _toolcall("camp_scene")) # engage relationship system -> clause (a) clean + state = _frozen_run_state(approval_moved=False, active_quest=True) + rc, out = _run_gate(tmp_path, events, state) + assert rc == 0, out + assert "[WARN] structural_completeness" in out, out + assert "[FAIL] structural_completeness" not in out, out + + +def test_structural_completeness_still_fatal_for_non_authored_run(tmp_path): + # NON-authored run (no start_adventure, no scenes): the ORIGINAL failure class. Same frozen + # profile must STILL RED — the #1036 guard must not weaken the non-authored path. Camp engaged + # so clause (a) is clean and (b) unresolved_arc is the SOLE fatal -> still FATAL when non-authored. + events = _dm_text_turns(12) + _toolcall("camp_scene") + state = _frozen_run_state(approval_moved=False, active_quest=True) + rc, out = _run_gate(tmp_path, events, state) + assert rc == 1, out + assert "[FAIL] structural_completeness" in out, out + + +def test_structural_completeness_authored_but_add_quest_stays_fatal(tmp_path): + # AUTHORED run that ALSO called add_quest (the DM opened its OWN sub-quest) and left a quest + # unresolved — a genuine dropped thread, NOT just the hook-seeded arc. The #1036 guard keeps + # this FATAL even in authored mode (the dm_added_quest carve-out). Camp engaged -> (a) clean, + # so (b) is the sole fatal and it must still RED. 
+ events = (_dm_text_turns(12) + + _toolcall("start_adventure") + + _toolcall("camp_scene") + + _toolcall("add_quest")) + state = _frozen_run_state(approval_moved=False, active_quest=True) + rc, out = _run_gate(tmp_path, events, state) + assert rc == 1, out + assert "[FAIL] structural_completeness" in out, out + + +def test_structural_completeness_authored_via_scenes_state_warns(tmp_path): + # The secondary authored signal: a non-empty state["scenes"] (no start_adventure tool-call in + # this run's stream — e.g. a RESUMED authored session). Must also soften (b) to WARN/GREEN. + events = _dm_text_turns(12) + _toolcall("camp_scene") + state = _frozen_run_state(approval_moved=False, active_quest=True) + state["scenes"] = [{"id": "s1", "name": "The Embergloom Gate", "location_id": "loc_a"}] + rc, out = _run_gate(tmp_path, events, state) + assert rc == 0, out + assert "[WARN] structural_completeness" in out, out + + # ── flat_arc (WARN-first) — a >=24-beat 3-act run whose arc never turned ─────────── # A run that CLAIMS 3 acts (engine cursor or contiguous tags) but has felt_three_act False (no # real reversal+climax) is a flat fetch-quest shape, not a felt setup→reversal→climax. 
Shipped diff --git a/qa/test_behavioral_gate_corpus.py b/qa/test_behavioral_gate_corpus.py index d415028d..6db1d0ce 100644 --- a/qa/test_behavioral_gate_corpus.py +++ b/qa/test_behavioral_gate_corpus.py @@ -157,6 +157,58 @@ def test_corpus_case_trips_expected_red_check(case: dict): ) +def _warned_checks(output: str) -> set[str]: + """The set of CHECK names the gate printed as a [WARN] (a non-fatal advisory).""" + out: set[str] = set() + for line in output.splitlines(): + line = line.strip() + if line.startswith("[WARN]"): + rest = line[len("[WARN]"):].strip() + name = rest.split(" — ", 1)[0].split()[0] if rest else "" + if name: + out.add(name) + return out + + +_GREEN_CASES = [ + c for c in (json.loads(_MANIFEST.read_text(encoding="utf-8")).get("green_cases") or []) +] if _MANIFEST.exists() else [] +_GREEN_IDS = [c["case_dir"] for c in _GREEN_CASES] + + +@pytest.mark.parametrize("case", _GREEN_CASES, ids=_GREEN_IDS or ["__none__"]) +def test_green_case_warns_but_stays_green(case): + """Inverse of the RED corpus (#1036). Each GREEN case locks a deliberate FATAL->WARN scope + guard: the gate must (1) exit GREEN (rc 0 — the run is NOT RED-capped) and (2) still surface + its warn_check as a [WARN] line (visibility preserved), and (3) NOT name it as a [FAIL]. A + future edit that re-promotes the softened sub-check to FATAL (re-introducing the false-cap) + flips this case RED and fails here.""" + if not _GREEN_CASES: + pytest.skip("no green_cases in manifest") + case_dir = case["case_dir"] + warn_check = case["warn_check"] + case_path = _CASES_DIR / case_dir + assert case_path.is_dir(), f"green corpus case dir missing: {case_path}" + + rc, output = _run_gate(case_path) + + assert rc == 0, ( + f"green case {case_dir!r} expected GREEN (exit 0) — its {warn_check!r} sub-check must be " + f"a WARN, not a FATAL cap. 
The gate returned exit {rc}; a gate edit may have RE-PROMOTED " + f"{warn_check!r} to FATAL (re-introducing the #1036 false-cap).\n--- gate output ---\n{output}" + ) + failed = _failed_checks(output) + assert warn_check not in failed, ( + f"green case {case_dir!r} returned its {warn_check!r} as a [FAIL] — it must be a [WARN].\n" + f"--- gate output ---\n{output}" + ) + warned = _warned_checks(output) + assert warn_check in warned, ( + f"green case {case_dir!r}: {warn_check!r} must appear as a [WARN] (visibility preserved), " + f"but it was neither warned nor failed. Warned={sorted(warned)}.\n--- gate output ---\n{output}" + ) + + def test_manifest_covers_every_fatal_check(): """Coverage audit: every FATAL check declared by the gate is either exercised by a corpus case or explicitly marked TODO with a reason. A NEW fatal check added to the gate with no