From eeba8baece1b4087f474b1eae2c7c0fc04b2c02d Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Fri, 1 May 2026 18:24:53 +0800 Subject: [PATCH 1/6] feat(drivers/codex): add gpt-5.5 to model list Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agent/drivers/codex.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/agent/drivers/codex.rs b/src/agent/drivers/codex.rs index c95d4df4..190d5528 100644 --- a/src/agent/drivers/codex.rs +++ b/src/agent/drivers/codex.rs @@ -230,6 +230,7 @@ impl RuntimeDriver for CodexDriver { async fn list_models(&self) -> anyhow::Result> { Ok(vec![ + ModelInfo::from_id("gpt-5.5".into()), ModelInfo::from_id("gpt-5.4".into()), ModelInfo::from_id("gpt-5.4-mini".into()), ModelInfo::from_id("gpt-5.3-codex".into()), @@ -1464,14 +1465,15 @@ mod tests { async fn test_codex_driver_list_models() { let driver = CodexDriver; let models = driver.list_models().await.unwrap(); - assert_eq!(models.len(), 7); - assert_eq!(models[0].id, "gpt-5.4"); - assert_eq!(models[1].id, "gpt-5.4-mini"); - assert_eq!(models[2].id, "gpt-5.3-codex"); - assert_eq!(models[3].id, "gpt-5.2-codex"); - assert_eq!(models[4].id, "gpt-5.2"); - assert_eq!(models[5].id, "gpt-5.1-codex-max"); - assert_eq!(models[6].id, "gpt-5.1-codex-mini"); + assert_eq!(models.len(), 8); + assert_eq!(models[0].id, "gpt-5.5"); + assert_eq!(models[1].id, "gpt-5.4"); + assert_eq!(models[2].id, "gpt-5.4-mini"); + assert_eq!(models[3].id, "gpt-5.3-codex"); + assert_eq!(models[4].id, "gpt-5.2-codex"); + assert_eq!(models[5].id, "gpt-5.2"); + assert_eq!(models[6].id, "gpt-5.1-codex-max"); + assert_eq!(models[7].id, "gpt-5.1-codex-mini"); } #[tokio::test] From 0e058f661eca34bea7025e1a1322f0c0dd2eb5a7 Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Fri, 1 May 2026 22:10:18 +0800 Subject: [PATCH 2/6] feat(prompt+bench): structural decision trigger + reproducible benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Replace the input-pattern enumeration in the Decision Inbox prompt section (PR-review phrasing, "should I X or Y", config-knob examples) with a four-property structural test: mutually-exclusive options + blocking + material consequence + delegated picker. The trigger is the shape of the agent's intended reply, not the asker's words. The PR-review case becomes the canonical example, not the rule. Why: the enumeration didn't scale. Verdict-shaped requests in triage, hiring, time-boxing, and compliance use neutral phrasing ("tell me which 3 to fix", "walk me through whether we need X") and were falling through to send_message. The structural rule generalizes to any new workflow without re-listing phrasings. Add bench/decision-trigger/ — a reproducible benchmark that spins up one isolated claude/sonnet agent per case in parallel, dispatches a DM, and classifies the response turn as decision (dispatch_decision) or chat (send_message). 15 cases across 8 work domains (PR review, vendor pick, architecture, status, triage, hiring, doc, compliance, time-box, naming). Current score: 15/15. The benchmark intentionally pauses non-bench agents during runs so the bench cohort isn't drowned in #all welcome messages. Side-effect-free prompts only — README documents the constraint. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + bench/decision-trigger/README.md | 86 ++++++++++++ bench/decision-trigger/cases.tsv | 16 +++ bench/decision-trigger/run.sh | 223 +++++++++++++++++++++++++++++++ src/agent/drivers/prompt.rs | 50 +++++-- 5 files changed, 364 insertions(+), 12 deletions(-) create mode 100644 bench/decision-trigger/README.md create mode 100644 bench/decision-trigger/cases.tsv create mode 100755 bench/decision-trigger/run.sh diff --git a/.gitignore b/.gitignore index d0391e23..f262b71d 100644 --- a/.gitignore +++ b/.gitignore @@ -74,3 +74,4 @@ node_modules .mcp.json .opencode.json .windsurfrules +/bench/decision-trigger/results/ diff --git a/bench/decision-trigger/README.md b/bench/decision-trigger/README.md new file mode 100644 index 00000000..4a32cf23 --- /dev/null +++ b/bench/decision-trigger/README.md @@ -0,0 +1,86 @@ +# Decision-trigger benchmark + +Evaluates whether the prompt in `src/agent/drivers/prompt.rs` causes agents to correctly route work between the **decision channel** (`dispatch_decision`) and the **chat channel** (`send_message`). + +The current rule is structural — a request is a decision when ALL FOUR hold: + +1. **Mutually exclusive** options +2. **Blocking** — the asker can't move until a pick lands +3. **Material consequence** — the pick commits resources or forecloses paths +4. **Delegated** — the asker is asking the agent to pick + +Cases that hit all four should produce `dispatch_decision`. Anything else should produce `send_message`. + +## What's measured + +| | Description | +|---|---| +| **Input** | 15 hand-curated prompts spanning 8 work domains (PR review, vendor pick, architecture, status, triage, hiring, doc edit, compliance, time-box, naming). | +| **Setup** | One isolated Chorus agent per case (claude/sonnet), so there's no session-context bleed between cases. All agents run in parallel. 
| +| **Signal** | Per-agent log scrape: did the agent call `dispatch_decision` or `send_message` in its response turn? | +| **Score** | Match rate vs. the `predicted` column in `cases.tsv`. | + +## Why one-agent-per-case in parallel + +Running cases sequentially through a single agent corrupts the test in two ways: +1. **Context bleed** — case N inherits memory of cases 1..N-1, so the agent's choice on case N is biased. +2. **Stale-session timeouts** — codex/opencode `--resume` silently fails after a few minutes idle (see TODOS.md). Sequential runs hit this gap; one agent per case dodges it entirely. + +Total wall time is `max(per_agent_turn) ≈ 2 min`, not `sum`. + +## Prerequisites + +- `chorus` binary built: `cargo build --bin chorus` +- Chorus server running with stdout/stderr captured to a log file +- Claude runtime authed (`chorus setup` confirms) +- `CHORUS_LOG` env var pointing to the server log (defaults to `/tmp/chorus-qa-server.log`) + +## Running + +```bash +# from repo root +./bench/decision-trigger/run.sh +``` + +Optional: +```bash +./bench/decision-trigger/run.sh http://localhost:3001 # explicit server URL +KEEP_AGENTS=1 ./bench/decision-trigger/run.sh # don't auto-delete agents on exit (for forensics) +CHORUS_LOG=/var/log/chorus.log ./bench/decision-trigger/run.sh +``` + +## Output + +Each run writes to `bench/decision-trigger/results//`: + +- `results.tsv` — per-case `id, agent, predicted, actual, match, prompt` +- `log-slice.txt` — the relevant slice of the server log for forensics + +Exit code is `0` if all cases match, `1` otherwise. + +## Cases (`cases.tsv`) + +Each row is `id predicted prompt`. To add a case: + +1. Append a new row. +2. Set `predicted` to `decision` or `chat` based on the structural test above. +3. Make the prompt **current-tense and unambiguous** about who is blocked. Retrospective phrasing ("should we have shipped X?") fails property #2 and is correctly classified as `chat`, so don't predict `decision` for it. 
+ +## Interpreting results + +A `match: 15/15` confirms the prompt rule is well-formed for general work. Anything below that needs investigation: + +- **`predicted=decision actual=chat`** — the agent missed a verdict-shaped request. Either the prompt is too restrictive, or the case wording is too soft. Check whether all four properties actually hold; if so, the rule needs a stronger trigger for that workflow class. +- **`predicted=chat actual=decision`** — the agent over-fired. The structural rule has a false positive. Tighten the trigger or improve the canonical example. +- **`actual=unknown`** — the agent didn't call either tool, or the log scrape missed the call. Check `log-slice.txt`. + +## Known limitations + +- Single-runtime test (claude/sonnet). Codex/opencode have known stale-session bugs and aren't included until those drivers ship the analogous `--resume` guard. +- Log-scrape classification is brittle to log format changes. If the `tool call agent=...` log line moves or renames, update the grep in `run.sh`. +- Per-agent agent boot time (~10-30s) dominates wall time for short tests. +- Cases must be **side-effect-free**. An agent given "edit X" or "fix typos in Y" will mutate the repo, leaving uncommitted changes. Frame action cases as "report what you'd change" or use a sandbox path the runner pre-stages and cleans up. + +## Provenance + +This benchmark was added in the PR that rewrote the prompt's decision trigger from input-pattern enumeration to a structural four-property test. See git history for context. diff --git a/bench/decision-trigger/cases.tsv b/bench/decision-trigger/cases.tsv new file mode 100644 index 00000000..ba67de51 --- /dev/null +++ b/bench/decision-trigger/cases.tsv @@ -0,0 +1,16 @@ +id predicted prompt +1 decision A teammate just opened a hypothetical PR adding a `recommended_key` field to the decision payload schema. The diff: add field to schema.sql, validator, and 4 tests. CI is green. They're waiting on your call before merging. 
Merge or hold? +2 decision A teammate just opened a hypothetical PR refactoring the auth middleware to use a typed enum instead of strings. They're blocked on your read before merging. What's your verdict? +3 chat Walk through the auth flow in this Chorus codebase and explain how it works at a high level. Just a learning request — I'm not building anything. +4 decision We need to pick our observability stack today. Choices: Datadog, New Relic, Honeycomb. We need to ship by Friday. What's your call? +5 chat We're evaluating Datadog vs New Relic for observability. Pull the pricing pages from their websites and summarize the key feature differences in a table. I'll decide myself once I have the data. +6 decision I'm torn between event-sourcing and CRUD for our new order service. We have ~2 engineers, 6 weeks, and need correctness over speed. I need to commit to one this week. What do you recommend? +7 chat How is the codex driver going in this Chorus codebase? Anything blocking its further evolution? +8 decision Look at this Chorus repo's TODOS.md (and recent commits if useful) and tell me which 3 issues to fix first this week. I'll act on whichever you pick. +9 decision Two final SRE candidates: Alice has 8 years SRE at AWS, deep K8s. Bob has 4 years but built our exact stack at his last startup, ships 3x faster. I need to send the offer letter today. Tell me who to extend the offer to. +10 chat Read docs/DEV.md in this repo and tell me about any typos or unclear wording you find. Don't edit the file — just report what you'd change. +11 decision Does our agent runtime need a privacy review (DPIA-equivalent) before we ship multi-tenant mode? Yes/no with reasoning. We need to decide today so the launch can proceed. +12 decision Walk me through whether our agent runtime needs a privacy review before we ship multi-tenant mode. The launch is gated on this answer. +13 decision We've spent 2 days investigating the codex --resume stale-session bug. The sprint ends tomorrow. 
Keep digging for a fix this sprint, or ship the workaround (skip resume on stale session) and file a follow-up? +14 decision Status check on the codex --resume stale-session investigation. We've been stuck for 2 days, sprint ends tomorrow. Where do we go from here? +15 chat Suggest a name for our new internal scheduler service. Something fun. Give me 5 options to riff on. diff --git a/bench/decision-trigger/run.sh b/bench/decision-trigger/run.sh new file mode 100755 index 00000000..d0d64958 --- /dev/null +++ b/bench/decision-trigger/run.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +# Decision-trigger benchmark — runs each case in an isolated agent in parallel, +# then classifies each agent's first reply turn as `decision` (dispatch_decision) +# or `chat` (send_message). Compares to the predicted column in cases.tsv. +# +# Usage: bench/decision-trigger/run.sh [server_url] +# +# Requires: chorus binary on PATH, server running, claude runtime authed. +set -euo pipefail + +SERVER_URL="${1:-http://localhost:3001}" +BENCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CASES="$BENCH_DIR/cases.tsv" +RUN_ID="$(date +%s)" +RESULTS_DIR="$BENCH_DIR/results/$RUN_ID" +mkdir -p "$RESULTS_DIR" + +# Resolve chorus binary (prefer release, fall back to debug, then PATH). +CHORUS="" +if [ -x "$BENCH_DIR/../../target/release/chorus" ]; then + CHORUS="$BENCH_DIR/../../target/release/chorus" +elif [ -x "$BENCH_DIR/../../target/debug/chorus" ]; then + CHORUS="$BENCH_DIR/../../target/debug/chorus" +elif command -v chorus >/dev/null 2>&1; then + CHORUS="chorus" +else + echo "error: chorus binary not found. build with 'cargo build --bin chorus' first." >&2 + exit 1 +fi + +# Locate the server log so we can scrape tool calls per agent. +# Caller can override with CHORUS_LOG=/path/to/server.log. +LOG="${CHORUS_LOG:-/tmp/chorus-qa-server.log}" +if [ ! -f "$LOG" ]; then + echo "warn: server log $LOG not found. set CHORUS_LOG to point to your server's stdout/stderr." 
>&2 + echo " classification needs the log to scrape per-agent tool calls." >&2 + exit 1 +fi + +# Use the no-proxy env for curl since Chorus listens on localhost. +CURL=(curl --noproxy '*' -sS -m 10) + +echo "== bench/decision-trigger run $RUN_ID ==" +echo " server: $SERVER_URL" +echo " log: $LOG" +echo " cases: $CASES" +echo " out: $RESULTS_DIR" + +# Pause any non-bench agents so they don't flood the bench cohort with welcome +# messages during boot. We only stop running ones; KEEP_OTHERS=1 disables this. +declare -a PAUSED_AGENTS=() +if [ "${KEEP_OTHERS:-0}" != "1" ]; then + while read -r name; do + PAUSED_AGENTS+=("$name") + done < <("${CURL[@]}" "$SERVER_URL/api/agents" | python3 -c " +import json, sys +d = json.load(sys.stdin) +for a in d: + if a['name'].startswith('bench-dt-'): + continue + if a['status'] in ('ready', 'working'): + print(a['name']) +") + if [ ${#PAUSED_AGENTS[@]} -gt 0 ]; then + echo + echo "[0/5] pausing ${#PAUSED_AGENTS[@]} non-bench agents to keep #all quiet..." + for a in "${PAUSED_AGENTS[@]}"; do + "$CHORUS" agent stop --server-url "$SERVER_URL" "$a" >/dev/null 2>&1 || true + echo " stopped $a" + done + fi +fi + +# Resume them on exit. +restore_agents() { + if [ ${#PAUSED_AGENTS[@]} -eq 0 ]; then return; fi + echo + echo "restoring ${#PAUSED_AGENTS[@]} paused agents..." + for a in "${PAUSED_AGENTS[@]}"; do + "$CHORUS" agent start --server-url "$SERVER_URL" "$a" >/dev/null 2>&1 || true + done +} +trap restore_agents EXIT + +# 1) Read cases (skip header), spawn one agent per case. +# Chorus appends a hash suffix to the requested name, so we read the assigned +# name from `chorus agent create`'s log output instead of guessing. +echo +echo "[1/5] spawning agents..." 
+declare -a IDS PREDICTS PROMPTS AGENTS +while IFS=$'\t' read -r id predicted prompt; do + [ "$id" = "id" ] && continue + IDS+=("$id"); PREDICTS+=("$predicted"); PROMPTS+=("$prompt") + base="bench-dt-${RUN_ID}-${id}" + out=$("$CHORUS" agent create \ + --runtime claude --model sonnet \ + --description "Decision-trigger bench, case $id. Each DM is one independent test prompt." \ + --server-url "$SERVER_URL" \ + "$base" 2>&1) + # Extract assigned name: "Agent @ created" + agent_name=$(echo "$out" | grep -oE '@[A-Za-z0-9_-]+ created' | head -1 | sed 's/^@//;s/ created$//') + if [ -z "$agent_name" ]; then + echo " failed to create $base; output:" >&2 + echo "$out" >&2 + exit 1 + fi + AGENTS+=("$agent_name") + echo " spawned $agent_name (case $id, predicted=$predicted)" +done < "$CASES" + +# 2) Wait for every agent to reach status=ready via API (avoids the +# intro-storm thundering herd in the log). +echo +echo "[2/5] waiting for agents to reach status=ready..." +deadline=$(( $(date +%s) + 300 )) +for agent in "${AGENTS[@]}"; do + while :; do + status=$("${CURL[@]}" "$SERVER_URL/api/agents" \ + | python3 -c "import json,sys; d=json.load(sys.stdin) +for a in d: + if a['name']=='$agent': print(a['status']); break +" 2>/dev/null || true) + case "$status" in + ready|asleep|working) break ;; + esac + [ "$(date +%s)" -gt "$deadline" ] && { echo " timeout waiting for $agent (status=$status)" >&2; exit 1; } + sleep 2 + done +done +echo " all ${#AGENTS[@]} agents ready" + +# 3) Mark log line, dispatch all DMs in rapid sequence. +echo +echo "[3/5] dispatching cases..." +START_LINE=$(wc -l < "$LOG") +for n in "${!IDS[@]}"; do + id="${IDS[$n]}"; agent="${AGENTS[$n]}"; prompt="${PROMPTS[$n]}" + marker="[bench-dt case $id]" + body="$marker $prompt" + "$CHORUS" send "dm:@${agent}" "$body" --server-url "$SERVER_URL" >/dev/null 2>&1 + echo " case $id → @$agent" +done + +# 4) Wait for each agent to complete its case turn (next Natural after marker). 
+echo +echo "[4/5] waiting for case turns to complete..." +deadline=$(( $(date +%s) + 600 )) +declare -a DONE +for n in "${!IDS[@]}"; do DONE[$n]=0; done +remaining=${#IDS[@]} +while [ "$remaining" -gt 0 ]; do + for n in "${!IDS[@]}"; do + [ "${DONE[$n]}" = "1" ] && continue + id="${IDS[$n]}"; agent="${AGENTS[$n]}" + marker="\[bench-dt case $id\]" + cur=$(wc -l < "$LOG") + slice=$(sed -n "$((START_LINE+1)),${cur}p" "$LOG") + if echo "$slice" | grep -qE "$marker" \ + && echo "$slice" | grep -q "${agent}.*reason=Natural"; then + DONE[$n]=1 + remaining=$(( remaining - 1 )) + echo " case $id done ($remaining left)" + fi + done + [ "$(date +%s)" -gt "$deadline" ] && { echo " timeout"; break; } + sleep 4 +done + +# Buffer for trailing tool-call logs. +sleep 5 + +# 5) Classify each case from the log slice and write results. +echo +echo "[5/5] classifying..." +RESULTS_TSV="$RESULTS_DIR/results.tsv" +echo -e "id\tagent\tpredicted\tactual\tmatch\tprompt" > "$RESULTS_TSV" +final_line=$(wc -l < "$LOG") +slice=$(sed -n "$((START_LINE+1)),${final_line}p" "$LOG") +match_count=0 +total=${#IDS[@]} +for n in "${!IDS[@]}"; do + id="${IDS[$n]}"; agent="${AGENTS[$n]}"; predicted="${PREDICTS[$n]}"; prompt="${PROMPTS[$n]}" + agent_lines=$(echo "$slice" | grep -F "$agent" || true) + # Look at log lines AFTER the marker arrived for this agent. + if echo "$agent_lines" | grep -q "dispatch_decision"; then + actual="decision" + elif echo "$agent_lines" | grep -q "send_message"; then + actual="chat" + else + actual="unknown" + fi + m="X"; [ "$actual" = "$predicted" ] && { m="OK"; match_count=$((match_count+1)); } + short_prompt=$(echo "$prompt" | head -c 80) + echo -e "${id}\t${agent}\t${predicted}\t${actual}\t${m}\t${short_prompt}" >> "$RESULTS_TSV" +done + +echo +echo "== results ==" +column -t -s$'\t' "$RESULTS_TSV" +echo +echo "match: $match_count/$total" + +# Save log slice for forensics. +echo "$slice" > "$RESULTS_DIR/log-slice.txt" + +# Cleanup unless KEEP_AGENTS=1. 
+if [ "${KEEP_AGENTS:-0}" = "1" ]; then + echo + echo "agents kept (KEEP_AGENTS=1):" + for agent in "${AGENTS[@]}"; do echo " $agent"; done +else + echo + echo "cleaning up agents..." + for agent in "${AGENTS[@]}"; do + "$CHORUS" agent delete --wipe --yes "$agent" --server-url "$SERVER_URL" >/dev/null 2>&1 || true + done +fi + +echo +echo "results: $RESULTS_TSV" +exit_code=0 +[ "$match_count" -lt "$total" ] && exit_code=1 +exit "$exit_code" diff --git a/src/agent/drivers/prompt.rs b/src/agent/drivers/prompt.rs index b01af4a3..d9845684 100644 --- a/src/agent/drivers/prompt.rs +++ b/src/agent/drivers/prompt.rs @@ -70,7 +70,7 @@ pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { "- For conversation (status updates, replies, info, follow-ups), use {send_cmd}. This is your conversational output channel." ), format!( - "- For verdicts on requests that ask you to PICK, JUDGE, or RECOMMEND between concrete alternatives (PR review outcome, A-vs-B implementation, config knob, \"should I X or Y\"), you MUST call {dispatch_decision_cmd} and end your turn — do NOT reply via {send_cmd}. The human picks; their pick arrives as your next session prompt. See the Decision Inbox section for triggers and payload." + "- For verdicts — when your reply would PICK, JUDGE, or RECOMMEND one of N mutually-exclusive paths the asker is blocked on (PR review, time-box call, vendor pick, hiring choice, compliance go/no-go) — you MUST call {dispatch_decision_cmd} and end your turn. Do NOT reply via {send_cmd}. The human picks; their pick arrives as your next session prompt. See the Decision Inbox section for the structural test and payload." 
), ]; critical_rules.extend(opts.extra_critical_rules.iter().cloned()); @@ -227,17 +227,20 @@ pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { prompt.push_str(&format!( "\n\n## Decision Inbox\n\n\ - Some incoming requests ask you to render a verdict or pick between concrete alternatives, not to act unilaterally. For these you MUST emit {dispatch_decision_cmd} — not a {send_cmd} reply. The tool returns a `decision_id`; end your turn cleanly. The human picks in their inbox; their pick arrives as your next session prompt with the picked option's full body, the original headline and question, and any human note. Read it and act.\n\n\ - **Triggers — when the incoming message does ANY of these, emit {dispatch_decision_cmd}:**\n\ - - Asks you to review a PR, diff, or commit and recommend an outcome (merge / approve+comment / request-changes / hold).\n\ - - Presents two or more concrete alternatives and asks you to pick.\n\ - - Asks you to resolve a config flag, knob, version pin, or policy choice with no obvious right answer.\n\ - - Uses phrasing like \"should I X or Y?\", \"merge or hold?\", \"approve, request changes, or comment?\", \"which option?\", \"what's your verdict?\".\n\n\ + Some incoming requests aren't conversational — they're verdicts where the asker is blocked on your pick. For these you MUST emit {dispatch_decision_cmd} — not a {send_cmd} reply. The tool returns a `decision_id`; end your turn cleanly. The human picks in their inbox; their pick arrives as your next session prompt with the picked option's full body, the original headline and question, and any human note. Read it and act.\n\n\ + **Trigger — apply this structural test before replying.** A request is a decision when ALL FOUR of these hold:\n\n\ + 1. **Mutually exclusive options** — picking one closes the others (merge / hold; vendor A / B / C; ship now / extend; offer to candidate X / Y).\n\ + 2. 
**Blocking** — the asker can't move forward until the pick lands.\n\ + 3. **Material consequence** — the pick commits resources, releases code, gates a launch, or forecloses paths. Not just \"what should I think about this\".\n\ + 4. **Delegated** — the asker is asking YOU to pick (or to recommend with strong enough signal that they'll act on it). Otherwise they'd pick themselves.\n\n\ + If all four hold, your reply IS a verdict — frame it as a decision payload with options and `recommended_key`. Do NOT post your verdict as a {send_cmd} reply.\n\n\ + **Canonical example:** a PR, diff, or commit review where you'd otherwise answer \"merge\" / \"request-changes\" / \"comment\". The human is blocked on the merge button, the options are exclusive, the pick gates landing, and they delegated to you. Decision.\n\n\ + **The trigger is the shape of YOUR reply, not the asker's phrasing.** Asks like \"what do you think about PR #X\", \"walk me through whether we need a DPIA\", \"status on the auth bug\", or \"tell me which 3 bugs to fix first\" can all be decisions even though they don't say \"merge or hold\" or \"X or Y\". 
Run the four-property test on your intended reply, not on the asker's words.\n\n\ **Not triggers — use {send_cmd} as normal:**\n\ - - Information requests (\"explain X\", \"how does Y work?\").\n\ - - Status updates, acknowledgments, progress reports.\n\ - - Open-ended brainstorming with no committed alternatives.\n\ - - Follow-up replies AFTER a decision has been resolved (the resume prompt is your input; reply via {send_cmd}).\n\n\ + - Information requests (\"explain X\", \"how does Y work?\") — fails properties 1 and 3.\n\ + - Status updates, acknowledgments, progress reports — fails property 1.\n\ + - Open-ended brainstorm or suggestion list with no committed alternatives — fails property 1.\n\ + - Follow-up replies AFTER a decision has resolved — your input is the resume prompt; you ARE the picker now, so reply via {send_cmd}.\n\n\ **Do not work around this rule.** If you have a strong opinion on a triggering request, frame it as a decision with options and `recommended_key` — do NOT post your verdict as a {send_cmd} reply. The human's act of picking is the work product; your analysis is the supporting context inside the decision.\n\n\ **Payload (all required):**\n\ - `headline` ≤80 chars — one-line summary carrying category and subject (e.g. \"PR review #121: archived-channel del/join fix\").\n\ @@ -387,7 +390,10 @@ mod tests { assert!(p.contains("`dispatch_decision`")); // Trigger-based mandatory framing, not "when you need" permission framing. assert!(p.contains("you MUST emit")); - assert!(p.contains("Triggers")); + // Structural framing: the rule teaches a four-property test, not an + // input-pattern enumeration. "Triggers" still appears in "Not triggers". + assert!(p.contains("Trigger")); + // PR/diff/commit lives only as the canonical example now. assert!(p.contains("PR, diff, or commit")); // Anti-loophole: no "things you can act on unilaterally" exclusion. 
assert!(!p.contains("act on unilaterally")); @@ -398,6 +404,22 @@ mod tests { assert!(p.contains("conversational output channel")); } + #[test] + fn decision_inbox_teaches_structural_four_property_test() { + // Replacement for input-pattern enumeration: the prompt must teach + // the four structural properties so agents generalize beyond the + // canonical PR-review example to triage, hiring, time-boxing, + // compliance, and any future verdict-shape workflow. + let p = build_system_prompt(&sample_spec(), &PromptOptions::default()); + assert!(p.contains("Mutually exclusive")); + assert!(p.contains("Blocking")); + assert!(p.contains("Material consequence")); + assert!(p.contains("Delegated")); + // The shift: agent runs the test on its own intended reply, not on + // the asker's input phrasing. This is what scales to new workflows. + assert!(p.contains("shape of YOUR reply")); + } + #[test] fn critical_rule_promotes_decision_over_send_for_verdicts() { let p = build_system_prompt(&sample_spec(), &PromptOptions::default()); @@ -411,6 +433,10 @@ mod tests { let crit = &p[crit_start..crit_end]; assert!(crit.contains("you MUST call `dispatch_decision`")); assert!(crit.contains("PICK, JUDGE, or RECOMMEND")); + // Structural framing: the rule names what the reply does (commits the + // asker to one of N mutually-exclusive paths), not what the asker says. + assert!(crit.contains("mutually-exclusive")); + assert!(crit.contains("blocked on")); } #[test] From 38bc5e93a2cb37357b6d51ad8c24672956f59c19 Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Fri, 1 May 2026 22:43:40 +0800 Subject: [PATCH 3/6] refactor(prompt): whole-prompt override + drop vestigial notification flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up changes building on the structural-rule rewrite: 1) Whole-prompt injectability for benchmark/A-B convenience. 
Adds CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE env var: when set to a readable file, the file's contents become the system prompt verbatim. Also adds PromptOptions.system_prompt_override for in-process tests/benches. Programmatic override wins over env var. Tool names must be pre-resolved in the override file (no template substitution). Lets the bench compare prompt variants without rebuilding the binary. 2) Drop include_stdin_notification_section + MessageNotificationStyle. The flag toggled between two phrasings of the same message-delivery contract — "you'll be restarted" vs "messages may arrive directly". The LLM doesn't need to distinguish; it just needs to know not to poll. One universal Message Notifications section now always emits, telling the agent to call check_messages at natural breakpoints. Updates all 5 driver call sites to use the simpler PromptOptions {..Default ::default()} pattern. Adds 4 prompt tests covering both override paths and asserting the conditional notification branching is gone. bench/decision-trigger/README.md gains an A/B section showing how to use the env var to compare prompt variants without recompiling. Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/decision-trigger/README.md | 21 ++++ src/agent/drivers/claude.rs | 5 +- src/agent/drivers/codex.rs | 5 +- src/agent/drivers/gemini.rs | 5 +- src/agent/drivers/kimi.rs | 5 +- src/agent/drivers/opencode.rs | 5 +- src/agent/drivers/prompt.rs | 160 +++++++++++++++++++++++-------- 7 files changed, 144 insertions(+), 62 deletions(-) diff --git a/bench/decision-trigger/README.md b/bench/decision-trigger/README.md index 4a32cf23..bfad67d4 100644 --- a/bench/decision-trigger/README.md +++ b/bench/decision-trigger/README.md @@ -49,6 +49,27 @@ KEEP_AGENTS=1 ./bench/decision-trigger/run.sh # don't auto-delete age CHORUS_LOG=/var/log/chorus.log ./bench/decision-trigger/run.sh ``` +## A/B testing prompt variants + +The whole system prompt is injectable via `CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE`. 
To compare a candidate prompt against the built-in: + +```bash +# 1. Save the current built-in prompt (e.g. by capturing what build_system_prompt +# produces from a unit test or a one-shot CLI helper) to baseline.md. +# 2. Write your candidate prompt to candidate.md. +# 3. For each variant, restart the chorus server with the env var pointing at it: + +CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE=$PWD/baseline.md chorus serve --port 3001 & +./bench/decision-trigger/run.sh # records run as bench/.../results//results.tsv +kill %1 + +CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE=$PWD/candidate.md chorus serve --port 3001 & +./bench/decision-trigger/run.sh +kill %1 +``` + +The override is a verbatim substitution — the file content becomes the system prompt. No template substitution, no merging. Tool names must already be resolved (use `mcp__chat__send_message` for the claude runtime, bare `send_message` for codex/kimi/gemini/opencode). + ## Output Each run writes to `bench/decision-trigger/results//`: diff --git a/src/agent/drivers/claude.rs b/src/agent/drivers/claude.rs index c02a0cb8..881dfe6c 100644 --- a/src/agent/drivers/claude.rs +++ b/src/agent/drivers/claude.rs @@ -592,10 +592,7 @@ impl ClaudeHandle { &self.spec, &super::prompt::PromptOptions { tool_prefix: "mcp__chat__".into(), - extra_critical_rules: Vec::new(), - post_startup_notes: Vec::new(), - include_stdin_notification_section: true, - message_notification_style: super::prompt::MessageNotificationStyle::Poll, + ..Default::default() }, ); args.push("--append-system-prompt".into()); diff --git a/src/agent/drivers/codex.rs b/src/agent/drivers/codex.rs index 190d5528..9b5234ba 100644 --- a/src/agent/drivers/codex.rs +++ b/src/agent/drivers/codex.rs @@ -670,13 +670,10 @@ impl CodexHandle { let standing_prompt = super::prompt::build_system_prompt( &self.spec, &super::prompt::PromptOptions { - tool_prefix: String::new(), - extra_critical_rules: Vec::new(), post_startup_notes: vec![ "**IMPORTANT**: Your process stays alive across 
turns. New messages may be delivered directly into the current session while you are working.".into(), ], - include_stdin_notification_section: true, - message_notification_style: super::prompt::MessageNotificationStyle::Direct, + ..Default::default() }, ); let (method, req_line) = match &resume_id { diff --git a/src/agent/drivers/gemini.rs b/src/agent/drivers/gemini.rs index ba48189f..4829a38b 100644 --- a/src/agent/drivers/gemini.rs +++ b/src/agent/drivers/gemini.rs @@ -116,13 +116,10 @@ async fn ensure_gemini_system_md(spec: &AgentSpec) -> anyhow::Result String { super::prompt::build_system_prompt( spec, &super::prompt::PromptOptions { - tool_prefix: String::new(), extra_critical_rules: vec![ "- Do NOT use shell commands to send or receive messages. The MCP tools handle everything.".into(), ], - post_startup_notes: Vec::new(), - include_stdin_notification_section: true, - message_notification_style: super::prompt::MessageNotificationStyle::Direct, + ..Default::default() }, ) } diff --git a/src/agent/drivers/opencode.rs b/src/agent/drivers/opencode.rs index 27925d13..a34a1ae0 100644 --- a/src/agent/drivers/opencode.rs +++ b/src/agent/drivers/opencode.rs @@ -80,13 +80,10 @@ fn spawn_opencode(spec: Arc, key: AgentKey) -> SpawnFut { let standing_prompt = super::prompt::build_system_prompt( &spec, &super::prompt::PromptOptions { - tool_prefix: String::new(), extra_critical_rules: vec![ "- Do NOT use shell commands to send or receive messages. 
The MCP tools handle everything.".into(), ], - post_startup_notes: Vec::new(), - include_stdin_notification_section: false, - message_notification_style: super::prompt::MessageNotificationStyle::Poll, + ..Default::default() }, ); let tmp_system_md = chorus_dir.join(format!( diff --git a/src/agent/drivers/prompt.rs b/src/agent/drivers/prompt.rs index d9845684..ac6d2e46 100644 --- a/src/agent/drivers/prompt.rs +++ b/src/agent/drivers/prompt.rs @@ -10,38 +10,46 @@ use crate::agent::drivers::AgentSpec; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum MessageNotificationStyle { - Poll, - Direct, -} +/// Env-var override for the entire system prompt. When set to a readable file +/// path, the file's contents become the system prompt verbatim — no template +/// substitution, no merging with the built-in builder. Lets a benchmark or A/B +/// harness swap the whole prompt without recompiling. +const SYSTEM_PROMPT_OVERRIDE_ENV: &str = "CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE"; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct PromptOptions { + /// Tool-name prefix. Empty by default (bare names: `send_message`). + /// Claude binds tools as `mcp__chat__send_message` and overrides this. pub tool_prefix: String, pub extra_critical_rules: Vec, pub post_startup_notes: Vec, - pub include_stdin_notification_section: bool, - pub message_notification_style: MessageNotificationStyle, + /// In-process whole-prompt override. Takes precedence over the env-var + /// override. Use for tests/benches that want to swap the prompt + /// programmatically without touching the filesystem. + pub system_prompt_override: Option, } -impl Default for PromptOptions { - fn default() -> Self { - Self { - // Default to bare tool names. Most runtimes (Codex, Kimi, Gemini, - // OpenCode) see the chat tools as bare `send_message` etc.; - // Claude binds them as `mcp__chat__send_message` and overrides - // this field at the call site. 
- tool_prefix: String::new(), - extra_critical_rules: Vec::new(), - post_startup_notes: Vec::new(), - include_stdin_notification_section: false, - message_notification_style: MessageNotificationStyle::Poll, +pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { + // Whole-prompt overrides bypass the builder entirely. Programmatic override + // wins; env-var fallback is for ops/bench convenience. + if let Some(ref text) = opts.system_prompt_override { + return text.clone(); + } + if let Ok(path) = std::env::var(SYSTEM_PROMPT_OVERRIDE_ENV) { + if !path.is_empty() { + match std::fs::read_to_string(&path) { + Ok(text) => return text, + Err(e) => { + tracing::warn!( + path = %path, + error = %e, + "{SYSTEM_PROMPT_OVERRIDE_ENV} set but file unreadable; falling back to built-in prompt" + ); + } + } } } -} -pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { let t = |name: &str| format!("{}{}", opts.tool_prefix, name); let send_cmd = format!("`{}`", t("send_message")); @@ -59,11 +67,9 @@ pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { spec.display_name.as_str() }; - let message_delivery_text = if opts.include_stdin_notification_section { - "New messages may be delivered to you automatically while your process stays alive." - } else { - "The daemon will automatically restart you when new messages arrive." - }; + // One universal line. The LLM doesn't need to know whether messages arrive + // via stdin, restart, or polling — it just needs to know not to poll itself. + let message_delivery_text = "New messages arrive automatically — do not poll for them."; let mut critical_rules: Vec = vec![ format!( @@ -256,21 +262,9 @@ pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { "\n\n## Capabilities\n\nYou can work with any files or tools on this computer — you are not confined to any directory.\nYou may develop a specialized role over time through your interactions. 
Embrace it." ); - if opts.include_stdin_notification_section { - match opts.message_notification_style { - MessageNotificationStyle::Direct => { - prompt.push_str(&format!( - "\n\n## Message Notifications\n\nWhile you are working, new messages may be delivered directly into your current session.\n\nHow to handle these:\n- Treat direct follow-up messages as new user input for the same live session.\n- Adapt if the new message changes priority or direction.\n- You do NOT need to poll just because direct follow-up delivery is available.\n- Use {check_cmd} only when you need to inspect other pending channels or recover broader context." - )); - } - MessageNotificationStyle::Poll => { - prompt.push_str(&format!( - "\n\n## Message Notifications\n\nWhile you are busy (executing tools, thinking, etc.), new messages may arrive. When this happens, you will receive a system notification like:\n\n`[System notification: You have N new message(s) waiting. Call {check_name} to read them when you're ready.]`\n\nHow to handle these:\n- Call {check_cmd} to check for new messages. You are encouraged to do this frequently — at natural breakpoints in your work, or whenever you see a notification.\n- If the new message is higher priority, you may pivot to it. If not, continue your current work.\n- {check_cmd} returns instantly with any pending messages (or \"no new messages\"). It is always safe to call.", - check_name = t("check_messages"), - )); - } - } - } + prompt.push_str(&format!( + "\n\n## Message Notifications\n\nWhile you are working, new messages may arrive. The runtime delivers them automatically — you do not need to poll. When you see a system notification or want to check at a natural breakpoint, call {check_cmd}; it returns instantly with any pending messages (or \"no new messages\") and is always safe to call. If a new message changes priority or direction, adapt; otherwise continue your current work." 
+ )); if let Some(ref persona) = spec.system_prompt { prompt.push_str(&format!("\n\n## Initial role\n{persona}")); @@ -449,4 +443,86 @@ mod tests { assert!(p.contains("`mcp__chat__dispatch_decision`")); assert!(!p.contains("`dispatch_decision`\n")); } + + #[test] + fn programmatic_override_replaces_entire_prompt() { + let opts = PromptOptions { + system_prompt_override: Some("# CUSTOM PROMPT\nshipping nothing else.".into()), + ..Default::default() + }; + let p = build_system_prompt(&sample_spec(), &opts); + assert_eq!(p, "# CUSTOM PROMPT\nshipping nothing else."); + // None of the built-in sections should leak through. + assert!(!p.contains("CRITICAL RULES")); + assert!(!p.contains("Decision Inbox")); + assert!(!p.contains("MEMORY.md")); + } + + #[test] + fn env_var_override_replaces_entire_prompt() { + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().join("prompt.md"); + let custom = "# ENV OVERRIDE\nthis is the whole prompt.\n"; + std::fs::write(&path, custom).expect("write"); + // Use a guard to scope the env var so other tests aren't affected. + let _guard = EnvVarGuard::set(SYSTEM_PROMPT_OVERRIDE_ENV, path.to_str().unwrap()); + let p = build_system_prompt(&sample_spec(), &PromptOptions::default()); + assert_eq!(p, custom); + } + + #[test] + fn programmatic_override_wins_over_env_var() { + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().join("prompt.md"); + std::fs::write(&path, "# FROM FILE\n").expect("write"); + let _guard = EnvVarGuard::set(SYSTEM_PROMPT_OVERRIDE_ENV, path.to_str().unwrap()); + let opts = PromptOptions { + system_prompt_override: Some("# FROM CODE\n".into()), + ..Default::default() + }; + let p = build_system_prompt(&sample_spec(), &opts); + assert_eq!(p, "# FROM CODE\n"); + } + + #[test] + fn no_more_message_notification_style_branching() { + // The Message Notifications section is now always emitted with a single + // universal body — no Direct/Poll variants. 
The LLM doesn't care how + // delivery happens, it just needs to know not to poll. + let p = build_system_prompt(&sample_spec(), &PromptOptions::default()); + assert!(p.contains("## Message Notifications")); + assert!(p.contains("delivers them automatically")); + assert!(p.contains("`check_messages`")); + } + + /// Process-wide env var guard. Tests that mutate env vars must not run in + /// parallel with each other or with tests that read the same var; cargo + /// runs lib tests in parallel by default. We serialize via a static mutex. + struct EnvVarGuard { + key: &'static str, + prev: Option, + _lock: std::sync::MutexGuard<'static, ()>, + } + impl EnvVarGuard { + fn set(key: &'static str, value: &str) -> Self { + static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let lock = LOCK.lock().unwrap_or_else(|p| p.into_inner()); + let prev = std::env::var(key).ok(); + // SAFETY: env mutation is serialized by the LOCK above; this guard + // restores the previous value on drop. + unsafe { std::env::set_var(key, value); } + Self { key, prev, _lock: lock } + } + } + impl Drop for EnvVarGuard { + fn drop(&mut self) { + // SAFETY: still inside the LOCK held by self._lock. + unsafe { + match self.prev.take() { + Some(v) => std::env::set_var(self.key, v), + None => std::env::remove_var(self.key), + } + } + } + } } From 4291061f8681acf2b9d944ef80081e04c5f1800d Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Sat, 2 May 2026 01:49:05 +0800 Subject: [PATCH 4/6] bench(decision-trigger): hard cases + multi-model matrix sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds harder benchmark cases and a multi-model matrix runner that exposes real differences between the structural-rule prompt and the model's own inference style. 
Hard cases (cases-hard.tsv, 15 scenarios): - Realistic narrative framings (P0 escalation, sprint capacity, vendor procurement, hiring under deadline, SOC2 compliance, time-box at sprint end, architecture review, VP briefing) - No verdict-flavored phrasing — no "merge or hold", no "what's your call", no "X or Y". Decisions must be inferred from situational context - Trap cases for chat (rhetorical frustration, retrospective, exploration, status update, info request, debug ask, facilitator role) Multi-model matrix: - models.tsv lists (runtime, model, tier, label) rows. Default ships with the two-per-family pattern: Anthropic best/efficiency, OpenAI best/ efficiency - run.sh now takes RUNTIME, MODEL, RUN_LABEL, CASES via env so it can be driven by the matrix runner - run-matrix.sh sweeps all rows in models.tsv, runs the bench once per model, collates a side-by-side matrix.tsv Baseline (cases-hard.tsv, structural-rule prompt): - claude/opus: 9/15 (conservative — implicit delegation reads as chat) - claude/sonnet: 15/15 (best — infers delegation from context) - codex/gpt-5.5: 14/15 (one hiring miss) - codex/gpt-5.4-mini: 13/15 (one mis-fire, one silent) All 4 models score 7/7 on chat cases. The discriminator is property #4 (Delegated) — whether the model treats "we need X by Y" as an implicit delegation. Same prompt, same cases, 9-15/15 spread by model. BASELINE.md captures this and lays out the implications for the next prompt iteration. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/decision-trigger/BASELINE.md | 43 ++++++++++ bench/decision-trigger/README.md | 44 +++++++++- bench/decision-trigger/cases-hard.tsv | 16 ++++ bench/decision-trigger/models.tsv | 5 ++ bench/decision-trigger/run-matrix.sh | 119 ++++++++++++++++++++++++++ bench/decision-trigger/run.sh | 28 ++++-- 6 files changed, 243 insertions(+), 12 deletions(-) create mode 100644 bench/decision-trigger/BASELINE.md create mode 100644 bench/decision-trigger/cases-hard.tsv create mode 100644 bench/decision-trigger/models.tsv create mode 100755 bench/decision-trigger/run-matrix.sh diff --git a/bench/decision-trigger/BASELINE.md b/bench/decision-trigger/BASELINE.md new file mode 100644 index 00000000..66878c82 --- /dev/null +++ b/bench/decision-trigger/BASELINE.md @@ -0,0 +1,43 @@ +# Decision-trigger benchmark — baseline results + +Recorded baseline scores for the structural-rule prompt (PR #133). Run with: + +```bash +CASES=$PWD/bench/decision-trigger/cases-hard.tsv ./bench/decision-trigger/run-matrix.sh +``` + +## Hard-cases matrix (cases-hard.tsv, 15 cases: 8 decision / 7 chat) + +| Model | Tier | Score | Notes | +|---|---|---|---| +| claude/opus | best | **9/15** | All 7 chat cases correct. 6 of 8 decision cases read as chat — Opus interprets property #4 (Delegated) strictly: implicit "we need" doesn't qualify. | +| claude/sonnet | efficiency | **15/15** | Infers delegation from situational context. Counterintuitively beats Opus. | +| codex/gpt-5.5 | best | **14/15** | Misses only the hiring case (case 5). | +| codex/gpt-5.4-mini | efficiency | **13/15** | 1 mis-fire (case 8 time-box → chat). 1 silent (case 1: no tool called at all). | + +All 4 models score 7/7 on chat cases (correct restraint). +The 8 decision cases are where the rule's "implicit delegation" reading differs by model. 
+ +## Easy-cases (cases.tsv, 15 cases with verdict-flavored phrasing) + +Both the OLD prompt (input-pattern enumeration) and the NEW prompt (structural rule) score **15/15** on the easy cases with claude/sonnet — they don't differentiate at this difficulty. + +## What the matrix tells us + +1. **The structural rule's behavior depends heavily on the model.** Same prompt, same cases, scores from 9/15 to 15/15. +2. **Larger ≠ better on this benchmark.** Opus 4.7 (the "best" Anthropic model) is more conservative than Sonnet 4.6 — it refuses to infer delegation from context and only fires on explicit asks. +3. **Chat cases are easy across the board.** All 4 models nailed all 7. Restraint is not the problem. +4. **Implicit-delegation is the hard part.** Cases like *"We need a responder lined up before the call"* (Acme P0) require the model to infer that the asker is delegating the pick. Sonnet and gpt-5.5 mostly do; Opus doesn't. + +## Implications for prompt design + +If we want consistent behavior across models, the rule needs either: +- A stronger nudge that "implicit delegation in workplace context = delegation" (cost: more chat false-positives on smaller models), OR +- Explicit per-tier prompt variants (cost: maintenance), OR +- Acceptance that this is a model-capability ceiling and pick the model accordingly (cost: model lock-in). + +This baseline lets us measure the next prompt iteration against a real signal instead of guessing. + +## How to update this baseline + +After any prompt change that affects routing, re-run the matrix and replace this file. Keep the previous version in git history so we can diff baselines over time. diff --git a/bench/decision-trigger/README.md b/bench/decision-trigger/README.md index bfad67d4..881196b3 100644 --- a/bench/decision-trigger/README.md +++ b/bench/decision-trigger/README.md @@ -35,20 +35,58 @@ Total wall time is `max(per_agent_turn) ≈ 2 min`, not `sum`. 
- Claude runtime authed (`chorus setup` confirms) - `CHORUS_LOG` env var pointing to the server log (defaults to `/tmp/chorus-qa-server.log`) +## Cases + +Two case files at different difficulty: + +| File | Style | What it measures | +|---|---|---| +| `cases.tsv` | Easy / smoke. Decision-shaped requests use verdict-flavored phrasing (*"merge or hold?"*, *"what do you recommend?"*, *"your call"*). | Sanity check: prompt teaches the rule at all. Both input-pattern and structural-rule prompts hit 15/15 on this. | +| `cases-hard.tsv` | Realistic narrative scenarios. Decision-shaped requests use **neutral phrasing** (no "recommend", no "verdict", no "X or Y"). Trap cases include rhetorical frustration, retrospectives, exploration, status updates, and facilitation asks. | Differentiates prompts that pattern-match input phrasing from prompts that test the structural shape of the agent's intended reply. | + +To use the harder set: +```bash +CASES=$PWD/bench/decision-trigger/cases-hard.tsv ./bench/decision-trigger/run.sh +``` + ## Running +Single run against the default model (`claude/sonnet`): ```bash -# from repo root ./bench/decision-trigger/run.sh ``` -Optional: +Pick a different runtime/model: +```bash +RUNTIME=codex MODEL=gpt-5.5 ./bench/decision-trigger/run.sh +``` + +Sweep all models in `models.tsv` and produce a side-by-side matrix: +```bash +./bench/decision-trigger/run-matrix.sh +CASES=$PWD/bench/decision-trigger/cases-hard.tsv ./bench/decision-trigger/run-matrix.sh +``` + +Common options: ```bash ./bench/decision-trigger/run.sh http://localhost:3001 # explicit server URL -KEEP_AGENTS=1 ./bench/decision-trigger/run.sh # don't auto-delete agents on exit (for forensics) +KEEP_AGENTS=1 ./bench/decision-trigger/run.sh # don't auto-delete agents on exit (forensics) CHORUS_LOG=/var/log/chorus.log ./bench/decision-trigger/run.sh ``` +## Models matrix + +`models.tsv` lists the (runtime, model, tier) combinations the matrix runner sweeps. 
Default ships with the two-per-family pattern: best + efficiency for Anthropic and OpenAI. + +| runtime | model | tier | resolves to | +|---|---|---|---| +| claude | opus | best | Claude Opus 4.7 | +| claude | sonnet | efficiency | Claude Sonnet 4.6 | +| codex | gpt-5.5 | best | GPT-5.5 | +| codex | gpt-5.4-mini | efficiency | GPT-5.4-mini | + +Add other rows (kimi, gemini, opencode) as Chorus drivers stabilize. Each row produces one column in the matrix output. + ## A/B testing prompt variants The whole system prompt is injectable via `CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE`. To compare a candidate prompt against the built-in: diff --git a/bench/decision-trigger/cases-hard.tsv b/bench/decision-trigger/cases-hard.tsv new file mode 100644 index 00000000..37c1fa93 --- /dev/null +++ b/bench/decision-trigger/cases-hard.tsv @@ -0,0 +1,16 @@ +id predicted prompt +1 decision Acme just filed a P0. Their CEO is calling our CEO at 4pm. Eng-lead Bob is on vacation. Sarah (sr eng) rolled off another P0 yesterday and is fried. New eng Mike has time but doesn't know the codebase. We need a responder lined up before the call. +2 decision Sprint capacity is one fix slot left this week. Bug A: five customers hit a login latency spike. Bug B: one customer can't export a CSV larger than 100MB. Bug C: a feature 3% of users hit crashes on a specific edge case. Sprint ends Friday. +3 decision Eng wants Postgres for transactional consistency. Infra wants DynamoDB for ops simplicity. Finance wants whichever is cheaper. New service, architecture review tomorrow morning. +4 decision The SOC2 auditor wants finding F-7 remediated by Friday. We don't have a fix yet. Three paths I can see: deploy a compensating control we'd remove later; request a 2-week extension; accept the finding and document it. +5 decision Two finalists for the senior platform role. Alice: 8 years at AWS, deep K8s, communication is curt. 
Bob: 4 years but built our exact stack at his last startup, ships fast, but bus-factor risk if he leaves. Offer letter goes out tomorrow. +6 decision My VP is briefing the CEO at 5pm on whether we ship the mobile launch. I need to give my VP an answer by 4pm. The launch has two known crashers in iOS 18 hitting 3% of users. Android is clean. +7 decision We need an observability stack. Datadog: $120K/yr with features we won't use. New Relic: $60K/yr with an OK UI. Honeycomb: $40K/yr with the best ergonomics for our debug-heavy workflow. Procurement closes Q2 budget on Friday. +8 decision Two days into investigating the codex --resume stale-session bug. Sprint ends tomorrow. The fix path is murky — could be a TTL issue in rmcp, could be our session-id encoding, could be the codex CLI itself. We have a workaround that just skips resume on stale session. +9 chat Why are we even using event-sourcing for this service? Feels like overkill for the actual usage pattern. +10 chat I shipped the auth fix yesterday. In hindsight, given what we now know about the migration timing, was that the right call? +11 chat I'm thinking about proposing we deprecate the v1 API at the next architecture sync. What are the tradeoffs I should weigh before bringing it up? +12 chat Quick update: auth refactor is at 60%, the team wants to pause and reassess after seeing how the MCP migration went. No blockers, just a heads up. +13 chat What's the latency budget our SLO commits to? I'm setting the timeout in the new client SDK and want to make sure I match it. +14 chat Something crashed around 3:42am. Can you look at logs/server.log and tell me what went wrong? +15 chat Eng wants Postgres for transactional consistency, Infra wants DynamoDB for ops simplicity. I'm facilitating the architecture review tomorrow. Walk me through the tradeoff matrix so I can run a clean discussion — I'm not the decision-maker, I'm just running the meeting. 
diff --git a/bench/decision-trigger/models.tsv b/bench/decision-trigger/models.tsv new file mode 100644 index 00000000..7d7e8ecf --- /dev/null +++ b/bench/decision-trigger/models.tsv @@ -0,0 +1,5 @@ +runtime model tier label +claude opus best claude-opus-4-7 +claude sonnet efficiency claude-sonnet-4-6 +codex gpt-5.5 best codex-gpt-5-5 +codex gpt-5.4-mini efficiency codex-gpt-5-4-mini diff --git a/bench/decision-trigger/run-matrix.sh b/bench/decision-trigger/run-matrix.sh new file mode 100755 index 00000000..59b48664 --- /dev/null +++ b/bench/decision-trigger/run-matrix.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# Run the decision-trigger bench across multiple (runtime, model) combos and +# collate into a single matrix. Reads models from bench/decision-trigger/models.tsv. +# +# Usage: +# ./bench/decision-trigger/run-matrix.sh # default: cases.tsv, all models in models.tsv +# CASES=cases-hard.tsv ./bench/decision-trigger/run-matrix.sh +# MODELS=/path/to/custom-models.tsv ./bench/decision-trigger/run-matrix.sh +# +# Output: +# bench/decision-trigger/results/matrix-/ +# matrix.tsv — case x model match grid +# -/results.tsv — per-model raw results +set -euo pipefail + +BENCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SERVER_URL="${1:-http://localhost:3001}" +MODELS="${MODELS:-$BENCH_DIR/models.tsv}" +CASES="${CASES:-$BENCH_DIR/cases.tsv}" +[ -f "$MODELS" ] || { echo "models file not found: $MODELS" >&2; exit 1; } +[ -f "$CASES" ] || { echo "cases file not found: $CASES" >&2; exit 1; } + +MATRIX_RUN_ID=$(date +%s) +OUT_DIR="$BENCH_DIR/results/matrix-${MATRIX_RUN_ID}" +mkdir -p "$OUT_DIR" + +echo "== bench/decision-trigger MATRIX run $MATRIX_RUN_ID ==" +echo " models: $MODELS" +echo " cases: $CASES" +echo " server: $SERVER_URL" +echo " out: $OUT_DIR" +echo + +# Read model rows (skip header). 
+declare -a RUNTIMES MODELS_LIST LABELS TIERS +while IFS=$'\t' read -r runtime model tier label; do + [ "$runtime" = "runtime" ] && continue + [ -z "$runtime" ] && continue + RUNTIMES+=("$runtime"); MODELS_LIST+=("$model"); TIERS+=("$tier"); LABELS+=("$label") +done < "$MODELS" + +if [ ${#RUNTIMES[@]} -eq 0 ]; then + echo "no models in $MODELS" >&2; exit 1 +fi + +echo "matrix has ${#RUNTIMES[@]} models:" +for n in "${!RUNTIMES[@]}"; do + echo " ${LABELS[$n]} (${TIERS[$n]}, ${RUNTIMES[$n]}/${MODELS_LIST[$n]})" +done +echo + +# Run the bench once per model. +declare -a MODEL_RESULT_PATHS +for n in "${!RUNTIMES[@]}"; do + runtime="${RUNTIMES[$n]}"; model="${MODELS_LIST[$n]}"; label="${LABELS[$n]}" + echo "----- [$((n+1))/${#RUNTIMES[@]}] $label ($runtime/$model) -----" + RUNTIME="$runtime" MODEL="$model" RUN_LABEL="$label" CASES="$CASES" \ + bash "$BENCH_DIR/run.sh" "$SERVER_URL" \ + > "$OUT_DIR/${label}.log" 2>&1 || { + echo " $label run failed; continuing matrix" + } + # Find the per-run results.tsv this run produced. + result_path=$(ls -t "$BENCH_DIR/results/" 2>/dev/null | grep -E "^[0-9]+-${label}\$" | head -1) + if [ -n "$result_path" ] && [ -f "$BENCH_DIR/results/$result_path/results.tsv" ]; then + cp "$BENCH_DIR/results/$result_path/results.tsv" "$OUT_DIR/${label}-results.tsv" + MODEL_RESULT_PATHS+=("$OUT_DIR/${label}-results.tsv") + score=$(awk -F'\t' 'NR>1 && $5=="OK"' "$OUT_DIR/${label}-results.tsv" | wc -l | tr -d ' ') + total=$(awk -F'\t' 'NR>1' "$OUT_DIR/${label}-results.tsv" | wc -l | tr -d ' ') + echo " $label: $score/$total" + else + echo " $label: no results.tsv" + MODEL_RESULT_PATHS+=("") + fi + echo +done + +# Build the matrix table. +MATRIX="$OUT_DIR/matrix.tsv" +{ + printf "case\tpredicted" + for label in "${LABELS[@]}"; do printf "\t%s" "$label"; done + printf "\tprompt\n" + + # Read cases (id, predicted, prompt). 
+ while IFS=$'\t' read -r id predicted prompt; do + [ "$id" = "id" ] && continue + [ -z "$id" ] && continue + short=$(echo "$prompt" | head -c 80) + printf "%s\t%s" "$id" "$predicted" + for n in "${!LABELS[@]}"; do + rp="${MODEL_RESULT_PATHS[$n]}" + if [ -z "$rp" ] || [ ! -f "$rp" ]; then + printf "\t-" + continue + fi + # Find this case's row in this model's results. + row=$(awk -F'\t' -v id="$id" 'NR>1 && $1==id {print $4 "/" $5}' "$rp" | head -1) + printf "\t%s" "${row:-?}" + done + printf "\t%s\n" "$short" + done < "$CASES" +} > "$MATRIX" + +echo "===== MATRIX =====" +column -t -s$'\t' "$MATRIX" +echo +echo "summary:" +for n in "${!LABELS[@]}"; do + rp="${MODEL_RESULT_PATHS[$n]}" + if [ -z "$rp" ] || [ ! -f "$rp" ]; then + echo " ${LABELS[$n]}: no data" + continue + fi + score=$(awk -F'\t' 'NR>1 && $5=="OK"' "$rp" | wc -l | tr -d ' ') + total=$(awk -F'\t' 'NR>1' "$rp" | wc -l | tr -d ' ') + echo " ${LABELS[$n]} (${TIERS[$n]}): $score/$total" +done +echo +echo "matrix saved to $MATRIX" diff --git a/bench/decision-trigger/run.sh b/bench/decision-trigger/run.sh index d0d64958..74d9585f 100755 --- a/bench/decision-trigger/run.sh +++ b/bench/decision-trigger/run.sh @@ -10,8 +10,15 @@ set -euo pipefail SERVER_URL="${1:-http://localhost:3001}" BENCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CASES="$BENCH_DIR/cases.tsv" -RUN_ID="$(date +%s)" +# Cases file: defaults to cases.tsv (easy/smoke). Override with CASES=cases-hard.tsv. +CASES="${CASES:-$BENCH_DIR/cases.tsv}" +[ -f "$CASES" ] || CASES="$BENCH_DIR/$(basename "$CASES")" +# Runtime + model: which agent to spin up per case. Defaults are the cheapest +# stable combo. The matrix runner sets these per sweep. 
+RUNTIME="${RUNTIME:-claude}" +MODEL="${MODEL:-sonnet}" +RUN_LABEL="${RUN_LABEL:-${RUNTIME}-${MODEL}}" +RUN_ID="$(date +%s)-${RUN_LABEL}" RESULTS_DIR="$BENCH_DIR/results/$RUN_ID" mkdir -p "$RESULTS_DIR" @@ -41,10 +48,12 @@ fi CURL=(curl --noproxy '*' -sS -m 10) echo "== bench/decision-trigger run $RUN_ID ==" -echo " server: $SERVER_URL" -echo " log: $LOG" -echo " cases: $CASES" -echo " out: $RESULTS_DIR" +echo " server: $SERVER_URL" +echo " log: $LOG" +echo " cases: $CASES" +echo " runtime: $RUNTIME" +echo " model: $MODEL" +echo " out: $RESULTS_DIR" # Pause any non-bench agents so they don't flood the bench cohort with welcome # messages during boot. We only stop running ones; KEEP_OTHERS=1 disables this. @@ -91,10 +100,11 @@ declare -a IDS PREDICTS PROMPTS AGENTS while IFS=$'\t' read -r id predicted prompt; do [ "$id" = "id" ] && continue IDS+=("$id"); PREDICTS+=("$predicted"); PROMPTS+=("$prompt") - base="bench-dt-${RUN_ID}-${id}" + base="bench-dt-${RUN_ID//[^a-zA-Z0-9]/-}-${id}" + # Names can't be too long; runtime+model is appended for forensics in case-N description. out=$("$CHORUS" agent create \ - --runtime claude --model sonnet \ - --description "Decision-trigger bench, case $id. Each DM is one independent test prompt." \ + --runtime "$RUNTIME" --model "$MODEL" \ + --description "Decision-trigger bench, case $id, ${RUNTIME}/${MODEL}. Each DM is one independent test prompt." \ --server-url "$SERVER_URL" \ "$base" 2>&1) # Extract assigned name: "Agent @ created" From 7ff39f2fcf73500f58b641d816e8f39d5454b7e0 Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Sat, 2 May 2026 02:48:04 +0800 Subject: [PATCH 5/6] bench(decision-trigger): A/B baseline OLD vs NEW prompt across 4 models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the actual head-to-head between the OLD prompt (input-pattern enumeration on main) and the NEW prompt (four-property structural test on this branch). 
Same 15 hard cases, same 4 models, parallel runner. Headline scores (cases-hard.tsv): Model Tier OLD NEW Δ claude/opus best 15/15 9/15 -6 claude/sonnet efficiency 14/15 15/15 +1 codex/gpt-5.5 best 14/15 14/15 0 codex/gpt-5.4-mini efficiency 12/15 13/15 +1 ------------------------------------------------- average 13.75 12.75 -1.0 Aggregate behavior: Decisions caught (32 max): OLD 30/32 (94%) vs NEW 23/32 (72%) Chat held back (28 max): OLD 25/28 (89%) vs NEW 28/28 (100%) The structural rewrite is NOT a strict win. NEW closes the retrospective false-positive (case 10: "in hindsight, was that the right call?" — OLD over-fires on sonnet/gpt-5.5/gpt-5.4-mini, NEW correctly chats on all). But NEW costs Opus 6 implicit-delegation decisions because Opus reads property #4 (Delegated) strictly: "we need X by Y" doesn't count as delegation without an explicit "you pick" clause. Sonnet, gpt-5.5, and gpt-5.4-mini are stable across both prompts — they infer delegation from situational context regardless of which rule is loaded. The Opus regression is model-specific. BASELINE.md captures the full per-case matrix, named winners and losers, known failure modes (gpt-5.4-mini case 1 silent under NEW; gpt-5.5 case 5 flips), and three iteration paths for the next prompt revision. Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/decision-trigger/BASELINE.md | 139 +++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 28 deletions(-) diff --git a/bench/decision-trigger/BASELINE.md b/bench/decision-trigger/BASELINE.md index 66878c82..1cd16565 100644 --- a/bench/decision-trigger/BASELINE.md +++ b/bench/decision-trigger/BASELINE.md @@ -1,43 +1,126 @@ -# Decision-trigger benchmark — baseline results +# Decision-trigger benchmark — A/B baseline (OLD vs NEW prompt) -Recorded baseline scores for the structural-rule prompt (PR #133). 
Run with: +Head-to-head between the **OLD** prompt (input-pattern enumeration, on `main` before PR #133) and the **NEW** prompt (four-property structural test, on PR #133 branch). Same 15 hard cases, same 4 models, same parallel runner. Captured 2026-05-02. -```bash -CASES=$PWD/bench/decision-trigger/cases-hard.tsv ./bench/decision-trigger/run-matrix.sh +## Headline scores (cases-hard.tsv) + +| Model | Tier | OLD | NEW | Δ | +|---|---|---|---|---| +| claude/opus | best | **15/15** | 9/15 | **−6** | +| claude/sonnet | efficiency | 14/15 | **15/15** | +1 | +| codex/gpt-5.5 | best | 14/15 | 14/15 | 0 | +| codex/gpt-5.4-mini | efficiency | 12/15 | **13/15** | +1 | +| **average** | | **13.75/15** | **12.75/15** | **−1.0** | + +NEW prompt **regresses on Opus by 6 points**, gains 1 on Sonnet, gains 1 on gpt-5.4-mini, washes on gpt-5.5. Net negative on average. + +## Aggregate behavior delta + +| | OLD prompt | NEW prompt | +|---|---|---| +| Decision-cases caught (max 32 = 8 cases × 4 models) | 30/32 (94%) | 23/32 (72%) | +| Chat-cases held back (max 28 = 7 cases × 4 models) | 25/28 (89%) | **28/28 (100%)** | + +OLD is better at firing decisions. NEW is better at restraint. Different tradeoff, not a strict win. 
+ +## Per-case breakdown + +``` +case predicted OLD-opus NEW-opus OLD-sonnet NEW-sonnet OLD-gpt5.5 NEW-gpt5.5 OLD-mini NEW-mini + 1 decision decision chat decision decision decision decision chat unknown + 2 decision decision chat decision decision decision decision decision decision + 3 decision decision chat decision decision decision decision decision decision + 4 decision decision chat decision decision decision decision decision decision + 5 decision decision chat decision decision decision chat decision decision + 6 decision decision decision decision decision decision decision decision decision + 7 decision decision chat decision decision decision decision decision decision + 8 decision decision decision decision decision decision decision chat chat + 9 chat chat chat chat chat chat chat chat chat +10 chat chat chat decision chat decision chat decision chat +11 chat chat chat chat chat chat chat chat chat +12 chat chat chat chat chat chat chat chat chat +13 chat chat chat chat chat chat chat chat chat +14 chat chat chat chat chat chat chat chat chat +15 chat chat chat chat chat chat chat chat chat ``` -## Hard-cases matrix (cases-hard.tsv, 15 cases: 8 decision / 7 chat) +## Where each prompt wins + +### NEW wins on case 10 (retrospective trap), 3 cells + +Case 10 prompt: *"I shipped the auth fix yesterday. In hindsight, given what we now know about the migration timing, was that the right call?"* + +| Model | OLD prompt | NEW prompt | +|---|---|---| +| sonnet | decision (over-fires) | chat ✓ | +| gpt-5.5 | decision (over-fires) | chat ✓ | +| gpt-5.4-mini | decision (over-fires) | chat ✓ | +| opus | chat ✓ | chat ✓ | + +The structural rule's properties #2 (Blocking) and #3 (Material consequence) explicitly fail for retrospectives — the PR already shipped, nothing is gated on the agent's verdict. The OLD prompt's input-pattern matching can't distinguish *"was that the right call?"* from a current verdict, so 3/4 models fire incorrectly. 
**NEW is genuinely better at restraint.** -| Model | Tier | Score | Notes | -|---|---|---|---| -| claude/opus | best | **9/15** | All 7 chat cases correct. 6 of 8 decision cases read as chat — Opus interprets property #4 (Delegated) strictly: implicit "we need" doesn't qualify. | -| claude/sonnet | efficiency | **15/15** | Infers delegation from situational context. Counterintuitively beats Opus. | -| codex/gpt-5.5 | best | **14/15** | Misses only the hiring case (case 5). | -| codex/gpt-5.4-mini | efficiency | **13/15** | 1 mis-fire (case 8 time-box → chat). 1 silent (case 1: no tool called at all). | +### OLD wins on Opus, 6 cells (cases 1, 2, 3, 4, 5, 7 — all implicit-delegation decisions) -All 4 models score 7/7 on chat cases (correct restraint). -The 8 decision cases are where the rule's "implicit delegation" reading differs by model. +Each of these cases presents mutually exclusive options + a deadline + situational context, but lacks an explicit *"you pick"* clause. -## Easy-cases (cases.tsv, 15 cases with verdict-flavored phrasing) +- **OLD prompt** enumerates *"presents two or more concrete alternatives and asks you to pick"*. Opus interprets "presents alternatives + deadline" as the trigger and fires. +- **NEW prompt** requires all four structural properties including #4 (Delegated). Opus reads *"we need a responder lined up before the call"* as the team's own action item, not a delegation to the agent, and refuses to fire. -Both the OLD prompt (input-pattern enumeration) and the NEW prompt (structural rule) score **15/15** on the easy cases with claude/sonnet — they don't differentiate at this difficulty. +Why is this Opus-specific? Sonnet, gpt-5.5, and gpt-5.4-mini all infer delegation from situational context regardless of which prompt is loaded. Opus is the only model that strictly waits for an explicit *"you pick"* under the NEW rule. 
**The NEW rule's strict interpretation of property #4 is exactly what trips Opus.** -## What the matrix tells us +### Stable across both prompts -1. **The structural rule's behavior depends heavily on the model.** Same prompt, same cases, scores from 9/15 to 15/15. -2. **Larger ≠ better on this benchmark.** Opus 4.7 (the "best" Anthropic model) is more conservative than Sonnet 4.6 — it refuses to infer delegation from context and only fires on explicit asks. -3. **Chat cases are easy across the board.** All 4 models nailed all 7. Restraint is not the problem. -4. **Implicit-delegation is the hard part.** Cases like *"We need a responder lined up before the call"* (Acme P0) require the model to infer that the asker is delegating the pick. Sonnet and gpt-5.5 mostly do; Opus doesn't. +- **Case 6** (*"I need to give my VP an answer by 4pm"* — explicit time-anchored ask): every model fires decision under both prompts. +- **Case 8** (sprint-end time-box with options laid out): every model except gpt-5.4-mini fires decision under both prompts. gpt-5.4-mini misses under both — model capability ceiling, not a prompt issue. +- **All chat cases except 10**: clean restraint across all 4 models, both prompts. -## Implications for prompt design +## Failure modes worth noting -If we want consistent behavior across models, the rule needs either: -- A stronger nudge that "implicit delegation in workplace context = delegation" (cost: more chat false-positives on smaller models), OR -- Explicit per-tier prompt variants (cost: maintenance), OR -- Acceptance that this is a model-capability ceiling and pick the model accordingly (cost: model lock-in). +1. **gpt-5.4-mini case 1 (NEW = `unknown`)** — the model didn't call any tool at all under the NEW prompt. It saw the prompt, the run completed `reason=Natural`, but no `dispatch_decision` and no `send_message`. Same case, OLD prompt → it correctly chose `chat` (which is wrong vs prediction but a real choice). 
The NEW prompt seems to have caused gpt-5.4-mini to freeze on this prompt — worth investigating. -This baseline lets us measure the next prompt iteration against a real signal instead of guessing. +2. **gpt-5.5 case 5 NEW = `chat`** — only OLD/NEW divergence on gpt-5.5. The hiring case under NEW landed in chat. Looking at the agent's actual reply would tell us why. + +## Conclusion + +**The structural rewrite is a tradeoff, not a strict win.** Average pass rate drops 1 point (13.75 → 12.75 / 15) across 4 models, but the loss is concentrated on a single model (Opus) and the gain is real signal (case 10 restraint). + +What it actually achieves: +- ✅ **Clean restraint on retrospectives.** The OLD prompt's input-pattern matching has a known false-positive on retrospective phrasing; the NEW rule closes it. +- ❌ **Loses on implicit-delegation decisions for Opus.** The strict reading of property #4 (Delegated) excludes the *"we need X by Y"* framings that real teams use all the time. Opus is the only model that takes this strictness literally. +- 〇 **Wash on Sonnet, gpt-5.5, gpt-5.4-mini.** Those models infer delegation from situational context regardless of which prompt is loaded. + +## Implications for the next iteration + +Three options for the prompt-rule tuning: + +1. **Soften property #4.** Add a clause like *"a request that lays out mutually exclusive alternatives plus a deadline counts as implicit delegation, even without an explicit 'you pick'."* Recovers Opus without losing Sonnet/gpt-5.5/gpt-5.4-mini. +2. **Accept the Opus regression.** Ship the NEW rule as-is — the chat-restraint gain is principled, and Opus users can be coached toward explicit phrasing. Trade decision-firing for false-positive avoidance. +3. **Split the prompt by tier.** Opus gets a more permissive trigger, Sonnet gets the strict one. Maintenance cost. + +This baseline lets us measure each iteration against real signal instead of guessing. Re-run after any prompt change that affects routing. 
+ +## Reproducing this report + +```bash +# OLD prompt baseline (main, port 3002 + bridge 4322 to coexist with a running NEW server): +git worktree add /tmp/chorus-main main +cd /tmp/chorus-main && cargo build --bin chorus +/tmp/chorus-main/target/debug/chorus serve --port 3002 --bridge-port 4322 \ + > /tmp/chorus-old.log 2>&1 & +CHORUS_LOG=/tmp/chorus-old.log \ + CASES=$PWD/bench/decision-trigger/cases-hard.tsv \ + ./bench/decision-trigger/run-matrix.sh http://localhost:3002 + +# NEW prompt baseline (PR #133 branch, port 3001): +cargo build --bin chorus +./target/debug/chorus serve --port 3001 > /tmp/chorus-new.log 2>&1 & +CHORUS_LOG=/tmp/chorus-new.log \ + CASES=$PWD/bench/decision-trigger/cases-hard.tsv \ + ./bench/decision-trigger/run-matrix.sh http://localhost:3001 +``` -## How to update this baseline +Each matrix takes ~45-50 min (4 models, parallel-per-model). Raw results live under `bench/decision-trigger/results/matrix-<timestamp>/`. -After any prompt change that affects routing, re-run the matrix and replace this file. Keep the previous version in git history so we can diff baselines over time. +Captured runs in this report: +- OLD: `matrix-1777658557/` +- NEW: `matrix-1777647089/` From ed49a8b1e085fd9c193e8417b6890d98350f2 Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Sat, 2 May 2026 11:55:57 +0800 Subject: [PATCH 6/6] chore: cargo fmt prompt.rs Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agent/drivers/prompt.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/agent/drivers/prompt.rs b/src/agent/drivers/prompt.rs index ac6d2e46..ed4d0291 100644 --- a/src/agent/drivers/prompt.rs +++ b/src/agent/drivers/prompt.rs @@ -510,8 +510,14 @@ mod tests { let prev = std::env::var(key).ok(); // SAFETY: env mutation is serialized by the LOCK above; this guard // restores the previous value on drop.
- unsafe { std::env::set_var(key, value); } - Self { key, prev, _lock: lock } + unsafe { + std::env::set_var(key, value); + } + Self { + key, + prev, + _lock: lock, + } } } impl Drop for EnvVarGuard {