Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion qa/BEHAVIORAL_GATE_TAXONOMY.json
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@
"category": "ENGINE_INVARIANT",
"likely_code_locations": ["servers/engine/server.py", "servers/engine/models.py", "servers/engine/companion.py"],
"retest": "bash qa/run_duo.sh duo-retest",
"hint": "A >=10-beat session with a companion never engaged a core relationship/quest system: either no companion's attitude_value moved off 0 AND no camp/long_rest happened, or an active quest was left open across a >=2-location arc with no quest-resolution call. Either the DM never invoked the relationship/quest tools (record_decision / adjust_attitude / camp_scene / complete_quest evolves_to — DM adherence) or those engine writes never landed (engine). All those tool handlers + the attitude_value/quest-status writes live in server.py (adjust_attitude, record_decision, camp_scene, complete_quest, and the rule-of-three evolution _maybe_schedule_quest_evolution / evolves_to); the field definitions are in models.py; companion.py only surfaces approval CAUSES (approval_tags) for the DM to apply — it never writes state itself. The gate trips at >=10 beats (STRUCTURAL_MIN_BEATS); when validating authored campaigns run >=24 beats so the main quest has room to resolve, else the unresolved-arc sub-check false-REDs. FATAL; skipped in the combat-sprint lane."
"hint": "A >=10-beat session with a companion never engaged a core relationship/quest system: either no companion's attitude_value moved off 0 AND no camp/long_rest happened, or an active quest was left open across a >=2-location arc with no quest-resolution call. Either the DM never invoked the relationship/quest tools (record_decision / adjust_attitude / camp_scene / complete_quest evolves_to — DM adherence) or those engine writes never landed (engine). All those tool handlers + the attitude_value/quest-status writes live in server.py (adjust_attitude, record_decision, camp_scene, complete_quest, and the rule-of-three evolution _maybe_schedule_quest_evolution / evolves_to); the field definitions are in models.py; companion.py only surfaces approval CAUSES (approval_tags) for the DM to apply — it never writes state itself. The gate trips at >=10 beats (STRUCTURAL_MIN_BEATS). #1036 scope guard (Option A, mirrors #1030): on an AUTHORED campaign (start_adventure in the tool stream OR a non-empty state[\"scenes\"]) whose only open quest is the hook-seeded arc, the unresolved-arc sub-check (b) is demoted FATAL->WARN — the hook-seeded campaign arc is multi-session by design and authored adventures author no closable sub-quests, so it no longer false-caps a clean authored run to 2.5; the WARN message is still surfaced. STILL FATAL for: any NON-authored run (the original narrated-not-engaged failure), and an authored run that called add_quest and left it unresolved (a real dropped DM-opened thread). Clause (a) approval-frozen is FATAL ALWAYS. Skipped in the combat-sprint lane."
},
"flat_arc": {
"category": "DM_ADHERENCE",
Expand Down
34 changes: 32 additions & 2 deletions qa/assert_behavioral.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,28 @@ def _has_spells(c: dict) -> bool:
unresolved_arc = bool(active_quests and multi_location_arc
and not quest_resolution_engaged)

# AUTHORED-CAMPAIGN SCOPE GUARD for sub-check (b) — #1036 (Option A; mirrors #1030's
# WARN-vs-FATAL discipline for party_traveled / combat_not_left_active). The campaign-arc
# quest is SEEDED from the authored adventure `hook` and is multi-session by design; the
# authored adventures (e.g. embergloom-pact) author NO closable sub-quests, so the DM never
# calls complete_quest / set_quest_status and (b) `unresolved_arc` FATAL-capped a clean
# 25-beat authored run to 2.5 — a self-inflicted false-cap (the main quest can't legitimately
# resolve inside one session). An AUTHORED run is identifiable at the gate from signals
# already in scope: `start_adventure` is the cold-open call for authored runs (in the tool
# stream `_tally` always sees), and `state["scenes"]` is non-empty only for authored
# adventures (the seeded-campaign snapshot). When authored, (b) is demoted FATAL->WARN —
# the gate still APPENDS the WARN message (visibility preserved), but the run is not
# RED-capped on (b) alone.
#
# PRESERVE FATAL for the original failure class:
# - any NON-authored run (the proven 18-beat narrated-not-engaged failure), AND
# - an authored run where the DM ADDED a sub-quest (add_quest) and left it unresolved —
# a genuine dropped thread the DM itself opened, distinct from the hook-seeded campaign
# arc (add_quest is the engine quest-creation tool, server.py:add_quest). It is told apart
# from the hook-seeded quest by an add_quest call in THIS run's tool stream, so we keep
# that case FATAL; a sub-quest opened in an earlier session of a resumed campaign is not
# detected this way.
is_authored_campaign = bool(tools.get("start_adventure") or state.get("scenes"))
dm_added_quest = bool(tools.get("add_quest"))

bad_bits = []
if approval_frozen_run:
bad_bits.append(
Expand All @@ -767,12 +789,20 @@ def _has_spells(c: dict) -> bool:
bad_bits.append(
f"{len(active_quests)} quest(s) still active at session end across a "
f"{visited}-location arc with no quest-resolution call "
f"({[q.get('title') or q.get('id') or '?' for q in active_quests]})")
f"({[q.get('title') or q.get('id') or '?' for q in active_quests]})"
+ (" [authored campaign + only the hook-seeded arc ⇒ WARN, not RED (#1036)]"
if (is_authored_campaign and not dm_added_quest) else ""))

# Severity: clause (a) approval-frozen stays FATAL ALWAYS. Clause (b) unresolved_arc is
# FATAL unless it's an authored campaign whose ONLY open quest is the hook-seeded arc.
_unresolved_fatal = unresolved_arc and (not is_authored_campaign or dm_added_quest)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Restrict authored demotion to the actual seeded quest

For resumed authored campaigns, state["scenes"] remains non-empty but the current transcript may not include the earlier add_quest call that created an active side quest. In that case dm_added_quest is false, so this line demotes unresolved_arc to a WARN for every active quest in the state, even when the open quest was a DM-created thread from a prior session rather than the hook-seeded campaign arc. That creates a false-green on the exact dropped-thread scenario the comment says should remain fatal; the demotion needs to verify the open quest is the single seeded hook (or otherwise track quest provenance), not just rely on this run’s tool counts.

Useful? React with 👍 / 👎.

_structural_fatal = approval_frozen_run or _unresolved_fatal
chk("structural_completeness", not bad_bits,
f"a {session_beats}-beat session with a companion never engaged a core system: "
+ "; ".join(bad_bits)
+ " — the engine relationship/quest tools (record_decision approval_tags / "
"adjust_attitude / camp_scene / complete_quest evolves_to) were narrated, not used")
"adjust_attitude / camp_scene / complete_quest evolves_to) were narrated, not used",
fatal=_structural_fatal)

# (c) FLAT ARC (WARN-FIRST). A LONG run (>= FELT_SHAPE_MIN_BEATS) that CLAIMS three acts
# (the engine narrative_arc cursor OR contiguous act-tags) but whose arc never TURNED — no
Expand Down
81 changes: 77 additions & 4 deletions qa/gate_corpus/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,43 @@ def case_structural_completeness():
return events, state, None, None


def case_structural_completeness_authored_warn():
    """Build the known-GREEN fixture for the #1036 authored-campaign scope guard.

    The bundle describes an AUTHORED run: the event stream carries a
    ``start_adventure`` cold-open call and the state carries a non-empty
    ``scenes`` list. One quest is left ``active`` and no ``add_quest`` or
    quest-resolution call is emitted, so the unresolved-arc sub-check (b) of
    ``structural_completeness`` is expected to surface as a [WARN] instead of a
    RED cap. A ``camp_scene`` call is included so that clause (a)
    (approval-frozen) is not the bit under test, even though the companion's
    ``attitude_value`` stays at 0.

    This case is registered outside the RED manifest; a dedicated GREEN test
    asserts exit 0 plus a [WARN] ``structural_completeness`` line.

    Returns:
        ``(events, state, None, None)`` — the run events, the state snapshot,
        and no chat / moves artifacts.
    """
    state = _clean_player_state()

    # Companion whose attitude never moved off 0, added to the party.
    state["characters"]["cmp1"] = {
        "name": "Brother Toll",
        "kind": "companion",
        "attitude_value": 0,
        "location_id": "loc_camp",
        "xp": 300,
    }
    state["party"].append("cmp1")

    # Authored snapshot marker: a non-empty scenes list.
    state["scenes"] = [
        {"id": "s1", "name": "The Embergloom Gate", "location_id": "loc_start"},
    ]

    # The hook-seeded campaign arc, deliberately left active; there is no
    # add_quest call anywhere in the event stream below.
    hook_arc = {
        "id": "q1",
        "title": "The Embergloom Pact",
        "status": "active",
        "objectives": ["free the prisoners"],
        "completed_objectives": [],
    }
    state["quests"] = {"q1": hook_arc}

    # Event order: roll, start_adventure cold-open (authored signal), a camp
    # beat with its tool result, then 12 DM text beats.
    events = [
        *_roll(),
        _assistant_tool_use(
            "t_adv",
            "mcp__engine__start_adventure",
            {"adventure_id": "embergloom-pact"},
        ),
        _assistant_tool_use("t_camp", "mcp__engine__camp_scene", {}),
        _user_tool_result("t_camp", json.dumps({"ok": True})),
        *(_assistant_text(f"The scene unfolds, beat {beat}.") for beat in range(12)),
    ]
    return events, state, None, None


def case_xp_awarded_on_progression():
# chk A5: xp mode, session advanced (day>1 AND visited>=2 so the world floors pass), a
# reward-worthy seam (a COMPLETED quest — NOT a dead monster, so xp_not_orphaned stays inert),
Expand Down Expand Up @@ -427,6 +464,16 @@ def case_xp_awarded_on_progression():
("structural_completeness", case_structural_completeness, "structural_completeness"),
]

# Known-GREEN fixtures — the inverse of the RED corpus. Each entry must NOT trip its
# named check: the gate has to exit 0 and report the check as a [WARN], never a [FAIL].
# They pin a scope guard that deliberately demotes a former FATAL to WARN, so an edit
# that re-promotes it to FATAL (bringing the false-cap back) is caught. Asserted by
# test_behavioral_gate_corpus.py::test_green_case_*.
# Entry shape: (case_dir, builder_fn, warn_check).
# Kept apart from _CASES_SPEC, which is RED-only by contract.
_GREEN_CASES_SPEC: list[tuple] = [
    (
        "structural_completeness_authored_warn",
        case_structural_completeness_authored_warn,
        "structural_completeness",
    ),
]


def _fatal_checks_in_gate() -> set[str]:
"""Parse the gate source for every FATAL chk(...) name (paren-balanced; mirrors the test)."""
Expand Down Expand Up @@ -489,10 +536,33 @@ def build() -> dict:
"reason": "no faithful minimal fixture constructed yet (auto-flagged by builder)",
})

# GREEN cases — known-GREEN bundles that lock a deliberate FATAL->WARN scope guard (the inverse
# of the RED corpus). Written to disk like the RED cases but recorded under a SEPARATE manifest
# key so the RED-only corpus test never mistakes them for known-REDs.
green_cases: list[dict] = []
for case_dir, fn, warn_check in _GREEN_CASES_SPEC:
run_events, state, chat, moves = fn()
_write_case(case_dir, run_events, state, chat, moves)
artifacts = ["run.jsonl", "state.json"]
if chat is not None:
artifacts.append("chat.jsonl")
if moves is not None:
artifacts.append("moves.jsonl")
green_cases.append({
"case_dir": case_dir,
"warn_check": warn_check,
"artifacts": artifacts,
"note": ("must exit GREEN (rc 0) with warn_check as a [WARN] — locks a deliberate "
"FATAL->WARN scope guard so a re-promotion to FATAL is caught (#1036)."),
})

manifest = {
"_doc": ("Behavioral-gate regression corpus. Each case is a minimal known-RED bundle "
"that must trip its expected_red_check. Regenerate with qa/gate_corpus/builder.py."),
"_doc": ("Behavioral-gate regression corpus. `cases` are minimal known-RED bundles that "
"must trip their expected_red_check; `green_cases` are known-GREEN bundles that "
"must NOT (they lock a FATAL->WARN scope guard). Regenerate with "
"qa/gate_corpus/builder.py."),
"cases": manifest_cases,
"green_cases": green_cases,
}
_MANIFEST.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")
return manifest
Expand All @@ -502,7 +572,10 @@ def build() -> dict:
m = build()
n_real = sum(1 for c in m["cases"] if not c.get("todo"))
n_todo = sum(1 for c in m["cases"] if c.get("todo"))
print(f"wrote {n_real} corpus case(s) + {n_todo} TODO entr(y/ies) -> {_MANIFEST}")
n_green = len(m.get("green_cases", []))
print(f"wrote {n_real} RED case(s) + {n_todo} TODO + {n_green} GREEN case(s) -> {_MANIFEST}")
for c in m["cases"]:
flag = " [TODO]" if c.get("todo") else ""
print(f" {c['case_dir']:32s} -> {c['expected_red_check']}{flag}")
print(f" RED {c['case_dir']:36s} -> {c['expected_red_check']}{flag}")
for c in m.get("green_cases", []):
print(f" GREEN {c['case_dir']:36s} -> {c['warn_check']} [WARN]")
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_roll", "name": "mcp__engine__roll", "input": {"sides": 20}}]}}
{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_roll", "content": [{"type": "text", "text": "{\"total\": 14}"}], "is_error": false}]}}
{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_adv", "name": "mcp__engine__start_adventure", "input": {"adventure_id": "embergloom-pact"}}]}}
{"type": "assistant", "message": {"content": [{"type": "tool_use", "id": "t_camp", "name": "mcp__engine__camp_scene", "input": {}}]}}
{"type": "user", "message": {"content": [{"type": "tool_result", "tool_use_id": "t_camp", "content": [{"type": "text", "text": "{\"ok\": true}"}], "is_error": false}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 0."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 1."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 2."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 3."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 4."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 5."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 6."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 7."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 8."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 9."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 10."}]}}
{"type": "assistant", "message": {"content": [{"type": "text", "text": "The scene unfolds, beat 11."}]}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{
"party": [
"pc1",
"cmp1"
],
"leveling_mode": "xp",
"day": 2,
"time_of_day": "evening",
"current_location_id": "loc_camp",
"characters": {
"pc1": {
"name": "Dal",
"kind": "player",
"xp": 300,
"location_id": "loc_camp"
},
"cmp1": {
"name": "Brother Toll",
"kind": "companion",
"attitude_value": 0,
"location_id": "loc_camp",
"xp": 300
}
},
"locations": {
"loc_start": {
"name": "Tavern",
"visited": true
},
"loc_camp": {
"name": "Camp",
"visited": true
}
},
"scenes": [
{
"id": "s1",
"name": "The Embergloom Gate",
"location_id": "loc_start"
}
],
"quests": {
"q1": {
"id": "q1",
"title": "The Embergloom Pact",
"status": "active",
"objectives": [
"free the prisoners"
],
"completed_objectives": []
}
}
}
13 changes: 12 additions & 1 deletion qa/gate_corpus/manifest.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"_doc": "Behavioral-gate regression corpus. Each case is a minimal known-RED bundle that must trip its expected_red_check. Regenerate with qa/gate_corpus/builder.py.",
"_doc": "Behavioral-gate regression corpus. `cases` are minimal known-RED bundles that must trip their expected_red_check; `green_cases` are known-GREEN bundles that must NOT (they lock a FATAL->WARN scope guard). Regenerate with qa/gate_corpus/builder.py.",
"cases": [
{
"case_dir": "dm_produced_output",
Expand Down Expand Up @@ -188,5 +188,16 @@
],
"real_red_provenance": ""
}
],
"green_cases": [
{
"case_dir": "structural_completeness_authored_warn",
"warn_check": "structural_completeness",
"artifacts": [
"run.jsonl",
"state.json"
],
"note": "must exit GREEN (rc 0) with warn_check as a [WARN] \u2014 locks a deliberate FATAL->WARN scope guard so a re-promotion to FATAL is caught (#1036)."
}
]
}
Loading
Loading