From eeba8baece1b4087f474b1eae2c7c0fc04b2c02d Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Fri, 1 May 2026 18:24:53 +0800 Subject: [PATCH 1/6] feat(drivers/codex): add gpt-5.5 to model list Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agent/drivers/codex.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/agent/drivers/codex.rs b/src/agent/drivers/codex.rs index c95d4df4..190d5528 100644 --- a/src/agent/drivers/codex.rs +++ b/src/agent/drivers/codex.rs @@ -230,6 +230,7 @@ impl RuntimeDriver for CodexDriver { async fn list_models(&self) -> anyhow::Result> { Ok(vec![ + ModelInfo::from_id("gpt-5.5".into()), ModelInfo::from_id("gpt-5.4".into()), ModelInfo::from_id("gpt-5.4-mini".into()), ModelInfo::from_id("gpt-5.3-codex".into()), @@ -1464,14 +1465,15 @@ mod tests { async fn test_codex_driver_list_models() { let driver = CodexDriver; let models = driver.list_models().await.unwrap(); - assert_eq!(models.len(), 7); - assert_eq!(models[0].id, "gpt-5.4"); - assert_eq!(models[1].id, "gpt-5.4-mini"); - assert_eq!(models[2].id, "gpt-5.3-codex"); - assert_eq!(models[3].id, "gpt-5.2-codex"); - assert_eq!(models[4].id, "gpt-5.2"); - assert_eq!(models[5].id, "gpt-5.1-codex-max"); - assert_eq!(models[6].id, "gpt-5.1-codex-mini"); + assert_eq!(models.len(), 8); + assert_eq!(models[0].id, "gpt-5.5"); + assert_eq!(models[1].id, "gpt-5.4"); + assert_eq!(models[2].id, "gpt-5.4-mini"); + assert_eq!(models[3].id, "gpt-5.3-codex"); + assert_eq!(models[4].id, "gpt-5.2-codex"); + assert_eq!(models[5].id, "gpt-5.2"); + assert_eq!(models[6].id, "gpt-5.1-codex-max"); + assert_eq!(models[7].id, "gpt-5.1-codex-mini"); } #[tokio::test] From 0e058f661eca34bea7025e1a1322f0c0dd2eb5a7 Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Fri, 1 May 2026 22:10:18 +0800 Subject: [PATCH 2/6] feat(prompt+bench): structural decision trigger + reproducible benchmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Replace the input-pattern enumeration in the Decision Inbox prompt section (PR-review phrasing, "should I X or Y", config-knob examples) with a four-property structural test: mutually-exclusive options + blocking + material consequence + delegated picker. The trigger is the shape of the agent's intended reply, not the asker's words. The PR-review case becomes the canonical example, not the rule. Why: the enumeration didn't scale. Verdict-shaped requests in triage, hiring, time-boxing, and compliance use neutral phrasing ("tell me which 3 to fix", "walk me through whether we need X") and were falling through to send_message. The structural rule generalizes to any new workflow without re-listing phrasings. Add bench/decision-trigger/ — a reproducible benchmark that spins up one isolated claude/sonnet agent per case in parallel, dispatches a DM, and classifies the response turn as decision (dispatch_decision) or chat (send_message). 15 cases across 8 work domains (PR review, vendor pick, architecture, status, triage, hiring, doc, compliance, time-box, naming). Current score: 15/15. The benchmark intentionally pauses non-bench agents during runs so the bench cohort isn't drowned in #all welcome messages. Side-effect-free prompts only — README documents the constraint. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + bench/decision-trigger/README.md | 86 ++++++++++++ bench/decision-trigger/cases.tsv | 16 +++ bench/decision-trigger/run.sh | 223 +++++++++++++++++++++++++++++++ src/agent/drivers/prompt.rs | 50 +++++-- 5 files changed, 364 insertions(+), 12 deletions(-) create mode 100644 bench/decision-trigger/README.md create mode 100644 bench/decision-trigger/cases.tsv create mode 100755 bench/decision-trigger/run.sh diff --git a/.gitignore b/.gitignore index d0391e23..f262b71d 100644 --- a/.gitignore +++ b/.gitignore @@ -74,3 +74,4 @@ node_modules .mcp.json .opencode.json .windsurfrules +/bench/decision-trigger/results/ diff --git a/bench/decision-trigger/README.md b/bench/decision-trigger/README.md new file mode 100644 index 00000000..4a32cf23 --- /dev/null +++ b/bench/decision-trigger/README.md @@ -0,0 +1,86 @@ +# Decision-trigger benchmark + +Evaluates whether the prompt in `src/agent/drivers/prompt.rs` causes agents to correctly route work between the **decision channel** (`dispatch_decision`) and the **chat channel** (`send_message`). + +The current rule is structural — a request is a decision when ALL FOUR hold: + +1. **Mutually exclusive** options +2. **Blocking** — the asker can't move until a pick lands +3. **Material consequence** — the pick commits resources or forecloses paths +4. **Delegated** — the asker is asking the agent to pick + +Cases that hit all four should produce `dispatch_decision`. Anything else should produce `send_message`. + +## What's measured + +| | Description | +|---|---| +| **Input** | 15 hand-curated prompts spanning 8 work domains (PR review, vendor pick, architecture, status, triage, hiring, doc edit, compliance, time-box, naming). | +| **Setup** | One isolated Chorus agent per case (claude/sonnet), so there's no session-context bleed between cases. All agents run in parallel. 
| +| **Signal** | Per-agent log scrape: did the agent call `dispatch_decision` or `send_message` in its response turn? | +| **Score** | Match rate vs. the `predicted` column in `cases.tsv`. | + +## Why one-agent-per-case in parallel + +Running cases sequentially through a single agent corrupts the test in two ways: +1. **Context bleed** — case N inherits memory of cases 1..N-1, so the agent's choice on case N is biased. +2. **Stale-session timeouts** — codex/opencode `--resume` silently fails after a few minutes idle (see TODOS.md). Sequential runs hit this gap; one agent per case dodges it entirely. + +Total wall time is `max(per_agent_turn) ≈ 2 min`, not `sum`. + +## Prerequisites + +- `chorus` binary built: `cargo build --bin chorus` +- Chorus server running with stdout/stderr captured to a log file +- Claude runtime authed (`chorus setup` confirms) +- `CHORUS_LOG` env var pointing to the server log (defaults to `/tmp/chorus-qa-server.log`) + +## Running + +```bash +# from repo root +./bench/decision-trigger/run.sh +``` + +Optional: +```bash +./bench/decision-trigger/run.sh http://localhost:3001 # explicit server URL +KEEP_AGENTS=1 ./bench/decision-trigger/run.sh # don't auto-delete agents on exit (for forensics) +CHORUS_LOG=/var/log/chorus.log ./bench/decision-trigger/run.sh +``` + +## Output + +Each run writes to `bench/decision-trigger/results//`: + +- `results.tsv` — per-case `id, agent, predicted, actual, match, prompt` +- `log-slice.txt` — the relevant slice of the server log for forensics + +Exit code is `0` if all cases match, `1` otherwise. + +## Cases (`cases.tsv`) + +Each row is `id predicted prompt`. To add a case: + +1. Append a new row. +2. Set `predicted` to `decision` or `chat` based on the structural test above. +3. Make the prompt **current-tense and unambiguous** about who is blocked. Retrospective phrasing ("should we have shipped X?") fails property #2 and is correctly classified as `chat`, so don't predict `decision` for it. 
+ +## Interpreting results + +A `match: 15/15` confirms the prompt rule is well-formed for general work. Anything below that needs investigation: + +- **`predicted=decision actual=chat`** — the agent missed a verdict-shaped request. Either the prompt is too restrictive, or the case wording is too soft. Check whether all four properties actually hold; if so, the rule needs a stronger trigger for that workflow class. +- **`predicted=chat actual=decision`** — the agent over-fired. The structural rule has a false positive. Tighten the trigger or improve the canonical example. +- **`actual=unknown`** — the agent didn't call either tool, or the log scrape missed the call. Check `log-slice.txt`. + +## Known limitations + +- Single-runtime test (claude/sonnet). Codex/opencode have known stale-session bugs and aren't included until those drivers ship the analogous `--resume` guard. +- Log-scrape classification is brittle to log format changes. If the `tool call agent=...` log line moves or renames, update the grep in `run.sh`. +- Per-agent agent boot time (~10-30s) dominates wall time for short tests. +- Cases must be **side-effect-free**. An agent given "edit X" or "fix typos in Y" will mutate the repo, leaving uncommitted changes. Frame action cases as "report what you'd change" or use a sandbox path the runner pre-stages and cleans up. + +## Provenance + +This benchmark was added in the PR that rewrote the prompt's decision trigger from input-pattern enumeration to a structural four-property test. See git history for context. diff --git a/bench/decision-trigger/cases.tsv b/bench/decision-trigger/cases.tsv new file mode 100644 index 00000000..ba67de51 --- /dev/null +++ b/bench/decision-trigger/cases.tsv @@ -0,0 +1,16 @@ +id predicted prompt +1 decision A teammate just opened a hypothetical PR adding a `recommended_key` field to the decision payload schema. The diff: add field to schema.sql, validator, and 4 tests. CI is green. They're waiting on your call before merging. 
Merge or hold? +2 decision A teammate just opened a hypothetical PR refactoring the auth middleware to use a typed enum instead of strings. They're blocked on your read before merging. What's your verdict? +3 chat Walk through the auth flow in this Chorus codebase and explain how it works at a high level. Just a learning request — I'm not building anything. +4 decision We need to pick our observability stack today. Choices: Datadog, New Relic, Honeycomb. We need to ship by Friday. What's your call? +5 chat We're evaluating Datadog vs New Relic for observability. Pull the pricing pages from their websites and summarize the key feature differences in a table. I'll decide myself once I have the data. +6 decision I'm torn between event-sourcing and CRUD for our new order service. We have ~2 engineers, 6 weeks, and need correctness over speed. I need to commit to one this week. What do you recommend? +7 chat How is the codex driver going in this Chorus codebase? Anything blocking its further evolution? +8 decision Look at this Chorus repo's TODOS.md (and recent commits if useful) and tell me which 3 issues to fix first this week. I'll act on whichever you pick. +9 decision Two final SRE candidates: Alice has 8 years SRE at AWS, deep K8s. Bob has 4 years but built our exact stack at his last startup, ships 3x faster. I need to send the offer letter today. Tell me who to extend the offer to. +10 chat Read docs/DEV.md in this repo and tell me about any typos or unclear wording you find. Don't edit the file — just report what you'd change. +11 decision Does our agent runtime need a privacy review (DPIA-equivalent) before we ship multi-tenant mode? Yes/no with reasoning. We need to decide today so the launch can proceed. +12 decision Walk me through whether our agent runtime needs a privacy review before we ship multi-tenant mode. The launch is gated on this answer. +13 decision We've spent 2 days investigating the codex --resume stale-session bug. The sprint ends tomorrow. 
Keep digging for a fix this sprint, or ship the workaround (skip resume on stale session) and file a follow-up? +14 decision Status check on the codex --resume stale-session investigation. We've been stuck for 2 days, sprint ends tomorrow. Where do we go from here? +15 chat Suggest a name for our new internal scheduler service. Something fun. Give me 5 options to riff on. diff --git a/bench/decision-trigger/run.sh b/bench/decision-trigger/run.sh new file mode 100755 index 00000000..d0d64958 --- /dev/null +++ b/bench/decision-trigger/run.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +# Decision-trigger benchmark — runs each case in an isolated agent in parallel, +# then classifies each agent's first reply turn as `decision` (dispatch_decision) +# or `chat` (send_message). Compares to the predicted column in cases.tsv. +# +# Usage: bench/decision-trigger/run.sh [server_url] +# +# Requires: chorus binary on PATH, server running, claude runtime authed. +set -euo pipefail + +SERVER_URL="${1:-http://localhost:3001}" +BENCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +CASES="$BENCH_DIR/cases.tsv" +RUN_ID="$(date +%s)" +RESULTS_DIR="$BENCH_DIR/results/$RUN_ID" +mkdir -p "$RESULTS_DIR" + +# Resolve chorus binary (prefer release, fall back to debug, then PATH). +CHORUS="" +if [ -x "$BENCH_DIR/../../target/release/chorus" ]; then + CHORUS="$BENCH_DIR/../../target/release/chorus" +elif [ -x "$BENCH_DIR/../../target/debug/chorus" ]; then + CHORUS="$BENCH_DIR/../../target/debug/chorus" +elif command -v chorus >/dev/null 2>&1; then + CHORUS="chorus" +else + echo "error: chorus binary not found. build with 'cargo build --bin chorus' first." >&2 + exit 1 +fi + +# Locate the server log so we can scrape tool calls per agent. +# Caller can override with CHORUS_LOG=/path/to/server.log. +LOG="${CHORUS_LOG:-/tmp/chorus-qa-server.log}" +if [ ! -f "$LOG" ]; then + echo "warn: server log $LOG not found. set CHORUS_LOG to point to your server's stdout/stderr." 
>&2 + echo " classification needs the log to scrape per-agent tool calls." >&2 + exit 1 +fi + +# Use the no-proxy env for curl since Chorus listens on localhost. +CURL=(curl --noproxy '*' -sS -m 10) + +echo "== bench/decision-trigger run $RUN_ID ==" +echo " server: $SERVER_URL" +echo " log: $LOG" +echo " cases: $CASES" +echo " out: $RESULTS_DIR" + +# Pause any non-bench agents so they don't flood the bench cohort with welcome +# messages during boot. We only stop running ones; KEEP_OTHERS=1 disables this. +declare -a PAUSED_AGENTS=() +if [ "${KEEP_OTHERS:-0}" != "1" ]; then + while read -r name; do + PAUSED_AGENTS+=("$name") + done < <("${CURL[@]}" "$SERVER_URL/api/agents" | python3 -c " +import json, sys +d = json.load(sys.stdin) +for a in d: + if a['name'].startswith('bench-dt-'): + continue + if a['status'] in ('ready', 'working'): + print(a['name']) +") + if [ ${#PAUSED_AGENTS[@]} -gt 0 ]; then + echo + echo "[0/5] pausing ${#PAUSED_AGENTS[@]} non-bench agents to keep #all quiet..." + for a in "${PAUSED_AGENTS[@]}"; do + "$CHORUS" agent stop --server-url "$SERVER_URL" "$a" >/dev/null 2>&1 || true + echo " stopped $a" + done + fi +fi + +# Resume them on exit. +restore_agents() { + if [ ${#PAUSED_AGENTS[@]} -eq 0 ]; then return; fi + echo + echo "restoring ${#PAUSED_AGENTS[@]} paused agents..." + for a in "${PAUSED_AGENTS[@]}"; do + "$CHORUS" agent start --server-url "$SERVER_URL" "$a" >/dev/null 2>&1 || true + done +} +trap restore_agents EXIT + +# 1) Read cases (skip header), spawn one agent per case. +# Chorus appends a hash suffix to the requested name, so we read the assigned +# name from `chorus agent create`'s log output instead of guessing. +echo +echo "[1/5] spawning agents..." 
+declare -a IDS PREDICTS PROMPTS AGENTS +while IFS=$'\t' read -r id predicted prompt; do + [ "$id" = "id" ] && continue + IDS+=("$id"); PREDICTS+=("$predicted"); PROMPTS+=("$prompt") + base="bench-dt-${RUN_ID}-${id}" + out=$("$CHORUS" agent create \ + --runtime claude --model sonnet \ + --description "Decision-trigger bench, case $id. Each DM is one independent test prompt." \ + --server-url "$SERVER_URL" \ + "$base" 2>&1) + # Extract assigned name: "Agent @ created" + agent_name=$(echo "$out" | grep -oE '@[A-Za-z0-9_-]+ created' | head -1 | sed 's/^@//;s/ created$//') + if [ -z "$agent_name" ]; then + echo " failed to create $base; output:" >&2 + echo "$out" >&2 + exit 1 + fi + AGENTS+=("$agent_name") + echo " spawned $agent_name (case $id, predicted=$predicted)" +done < "$CASES" + +# 2) Wait for every agent to reach status=ready via API (avoids the +# intro-storm thundering herd in the log). +echo +echo "[2/5] waiting for agents to reach status=ready..." +deadline=$(( $(date +%s) + 300 )) +for agent in "${AGENTS[@]}"; do + while :; do + status=$("${CURL[@]}" "$SERVER_URL/api/agents" \ + | python3 -c "import json,sys; d=json.load(sys.stdin) +for a in d: + if a['name']=='$agent': print(a['status']); break +" 2>/dev/null || true) + case "$status" in + ready|asleep|working) break ;; + esac + [ "$(date +%s)" -gt "$deadline" ] && { echo " timeout waiting for $agent (status=$status)" >&2; exit 1; } + sleep 2 + done +done +echo " all ${#AGENTS[@]} agents ready" + +# 3) Mark log line, dispatch all DMs in rapid sequence. +echo +echo "[3/5] dispatching cases..." +START_LINE=$(wc -l < "$LOG") +for n in "${!IDS[@]}"; do + id="${IDS[$n]}"; agent="${AGENTS[$n]}"; prompt="${PROMPTS[$n]}" + marker="[bench-dt case $id]" + body="$marker $prompt" + "$CHORUS" send "dm:@${agent}" "$body" --server-url "$SERVER_URL" >/dev/null 2>&1 + echo " case $id → @$agent" +done + +# 4) Wait for each agent to complete its case turn (next Natural after marker). 
+echo +echo "[4/5] waiting for case turns to complete..." +deadline=$(( $(date +%s) + 600 )) +declare -a DONE +for n in "${!IDS[@]}"; do DONE[$n]=0; done +remaining=${#IDS[@]} +while [ "$remaining" -gt 0 ]; do + for n in "${!IDS[@]}"; do + [ "${DONE[$n]}" = "1" ] && continue + id="${IDS[$n]}"; agent="${AGENTS[$n]}" + marker="\[bench-dt case $id\]" + cur=$(wc -l < "$LOG") + slice=$(sed -n "$((START_LINE+1)),${cur}p" "$LOG") + if echo "$slice" | grep -qE "$marker" \ + && echo "$slice" | grep -q "${agent}.*reason=Natural"; then + DONE[$n]=1 + remaining=$(( remaining - 1 )) + echo " case $id done ($remaining left)" + fi + done + [ "$(date +%s)" -gt "$deadline" ] && { echo " timeout"; break; } + sleep 4 +done + +# Buffer for trailing tool-call logs. +sleep 5 + +# 5) Classify each case from the log slice and write results. +echo +echo "[5/5] classifying..." +RESULTS_TSV="$RESULTS_DIR/results.tsv" +echo -e "id\tagent\tpredicted\tactual\tmatch\tprompt" > "$RESULTS_TSV" +final_line=$(wc -l < "$LOG") +slice=$(sed -n "$((START_LINE+1)),${final_line}p" "$LOG") +match_count=0 +total=${#IDS[@]} +for n in "${!IDS[@]}"; do + id="${IDS[$n]}"; agent="${AGENTS[$n]}"; predicted="${PREDICTS[$n]}"; prompt="${PROMPTS[$n]}" + agent_lines=$(echo "$slice" | grep -F "$agent" || true) + # Look at log lines AFTER the marker arrived for this agent. + if echo "$agent_lines" | grep -q "dispatch_decision"; then + actual="decision" + elif echo "$agent_lines" | grep -q "send_message"; then + actual="chat" + else + actual="unknown" + fi + m="X"; [ "$actual" = "$predicted" ] && { m="OK"; match_count=$((match_count+1)); } + short_prompt=$(echo "$prompt" | head -c 80) + echo -e "${id}\t${agent}\t${predicted}\t${actual}\t${m}\t${short_prompt}" >> "$RESULTS_TSV" +done + +echo +echo "== results ==" +column -t -s$'\t' "$RESULTS_TSV" +echo +echo "match: $match_count/$total" + +# Save log slice for forensics. +echo "$slice" > "$RESULTS_DIR/log-slice.txt" + +# Cleanup unless KEEP_AGENTS=1. 
+if [ "${KEEP_AGENTS:-0}" = "1" ]; then + echo + echo "agents kept (KEEP_AGENTS=1):" + for agent in "${AGENTS[@]}"; do echo " $agent"; done +else + echo + echo "cleaning up agents..." + for agent in "${AGENTS[@]}"; do + "$CHORUS" agent delete --wipe --yes "$agent" --server-url "$SERVER_URL" >/dev/null 2>&1 || true + done +fi + +echo +echo "results: $RESULTS_TSV" +exit_code=0 +[ "$match_count" -lt "$total" ] && exit_code=1 +exit "$exit_code" diff --git a/src/agent/drivers/prompt.rs b/src/agent/drivers/prompt.rs index b01af4a3..d9845684 100644 --- a/src/agent/drivers/prompt.rs +++ b/src/agent/drivers/prompt.rs @@ -70,7 +70,7 @@ pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { "- For conversation (status updates, replies, info, follow-ups), use {send_cmd}. This is your conversational output channel." ), format!( - "- For verdicts on requests that ask you to PICK, JUDGE, or RECOMMEND between concrete alternatives (PR review outcome, A-vs-B implementation, config knob, \"should I X or Y\"), you MUST call {dispatch_decision_cmd} and end your turn — do NOT reply via {send_cmd}. The human picks; their pick arrives as your next session prompt. See the Decision Inbox section for triggers and payload." + "- For verdicts — when your reply would PICK, JUDGE, or RECOMMEND one of N mutually-exclusive paths the asker is blocked on (PR review, time-box call, vendor pick, hiring choice, compliance go/no-go) — you MUST call {dispatch_decision_cmd} and end your turn. Do NOT reply via {send_cmd}. The human picks; their pick arrives as your next session prompt. See the Decision Inbox section for the structural test and payload." 
), ]; critical_rules.extend(opts.extra_critical_rules.iter().cloned()); @@ -227,17 +227,20 @@ pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { prompt.push_str(&format!( "\n\n## Decision Inbox\n\n\ - Some incoming requests ask you to render a verdict or pick between concrete alternatives, not to act unilaterally. For these you MUST emit {dispatch_decision_cmd} — not a {send_cmd} reply. The tool returns a `decision_id`; end your turn cleanly. The human picks in their inbox; their pick arrives as your next session prompt with the picked option's full body, the original headline and question, and any human note. Read it and act.\n\n\ - **Triggers — when the incoming message does ANY of these, emit {dispatch_decision_cmd}:**\n\ - - Asks you to review a PR, diff, or commit and recommend an outcome (merge / approve+comment / request-changes / hold).\n\ - - Presents two or more concrete alternatives and asks you to pick.\n\ - - Asks you to resolve a config flag, knob, version pin, or policy choice with no obvious right answer.\n\ - - Uses phrasing like \"should I X or Y?\", \"merge or hold?\", \"approve, request changes, or comment?\", \"which option?\", \"what's your verdict?\".\n\n\ + Some incoming requests aren't conversational — they're verdicts where the asker is blocked on your pick. For these you MUST emit {dispatch_decision_cmd} — not a {send_cmd} reply. The tool returns a `decision_id`; end your turn cleanly. The human picks in their inbox; their pick arrives as your next session prompt with the picked option's full body, the original headline and question, and any human note. Read it and act.\n\n\ + **Trigger — apply this structural test before replying.** A request is a decision when ALL FOUR of these hold:\n\n\ + 1. **Mutually exclusive options** — picking one closes the others (merge / hold; vendor A / B / C; ship now / extend; offer to candidate X / Y).\n\ + 2. 
**Blocking** — the asker can't move forward until the pick lands.\n\ + 3. **Material consequence** — the pick commits resources, releases code, gates a launch, or forecloses paths. Not just \"what should I think about this\".\n\ + 4. **Delegated** — the asker is asking YOU to pick (or to recommend with strong enough signal that they'll act on it). Otherwise they'd pick themselves.\n\n\ + If all four hold, your reply IS a verdict — frame it as a decision payload with options and `recommended_key`. Do NOT post your verdict as a {send_cmd} reply.\n\n\ + **Canonical example:** a PR, diff, or commit review where you'd otherwise answer \"merge\" / \"request-changes\" / \"comment\". The human is blocked on the merge button, the options are exclusive, the pick gates landing, and they delegated to you. Decision.\n\n\ + **The trigger is the shape of YOUR reply, not the asker's phrasing.** Asks like \"what do you think about PR #X\", \"walk me through whether we need a DPIA\", \"status on the auth bug\", or \"tell me which 3 bugs to fix first\" can all be decisions even though they don't say \"merge or hold\" or \"X or Y\". 
Run the four-property test on your intended reply, not on the asker's words.\n\n\ **Not triggers — use {send_cmd} as normal:**\n\ - - Information requests (\"explain X\", \"how does Y work?\").\n\ - - Status updates, acknowledgments, progress reports.\n\ - - Open-ended brainstorming with no committed alternatives.\n\ - - Follow-up replies AFTER a decision has been resolved (the resume prompt is your input; reply via {send_cmd}).\n\n\ + - Information requests (\"explain X\", \"how does Y work?\") — fails properties 1 and 3.\n\ + - Status updates, acknowledgments, progress reports — fails property 1.\n\ + - Open-ended brainstorm or suggestion list with no committed alternatives — fails property 1.\n\ + - Follow-up replies AFTER a decision has resolved — your input is the resume prompt; you ARE the picker now, so reply via {send_cmd}.\n\n\ **Do not work around this rule.** If you have a strong opinion on a triggering request, frame it as a decision with options and `recommended_key` — do NOT post your verdict as a {send_cmd} reply. The human's act of picking is the work product; your analysis is the supporting context inside the decision.\n\n\ **Payload (all required):**\n\ - `headline` ≤80 chars — one-line summary carrying category and subject (e.g. \"PR review #121: archived-channel del/join fix\").\n\ @@ -387,7 +390,10 @@ mod tests { assert!(p.contains("`dispatch_decision`")); // Trigger-based mandatory framing, not "when you need" permission framing. assert!(p.contains("you MUST emit")); - assert!(p.contains("Triggers")); + // Structural framing: the rule teaches a four-property test, not an + // input-pattern enumeration. "Triggers" still appears in "Not triggers". + assert!(p.contains("Trigger")); + // PR/diff/commit lives only as the canonical example now. assert!(p.contains("PR, diff, or commit")); // Anti-loophole: no "things you can act on unilaterally" exclusion. 
assert!(!p.contains("act on unilaterally")); @@ -398,6 +404,22 @@ mod tests { assert!(p.contains("conversational output channel")); } + #[test] + fn decision_inbox_teaches_structural_four_property_test() { + // Replacement for input-pattern enumeration: the prompt must teach + // the four structural properties so agents generalize beyond the + // canonical PR-review example to triage, hiring, time-boxing, + // compliance, and any future verdict-shape workflow. + let p = build_system_prompt(&sample_spec(), &PromptOptions::default()); + assert!(p.contains("Mutually exclusive")); + assert!(p.contains("Blocking")); + assert!(p.contains("Material consequence")); + assert!(p.contains("Delegated")); + // The shift: agent runs the test on its own intended reply, not on + // the asker's input phrasing. This is what scales to new workflows. + assert!(p.contains("shape of YOUR reply")); + } + #[test] fn critical_rule_promotes_decision_over_send_for_verdicts() { let p = build_system_prompt(&sample_spec(), &PromptOptions::default()); @@ -411,6 +433,10 @@ mod tests { let crit = &p[crit_start..crit_end]; assert!(crit.contains("you MUST call `dispatch_decision`")); assert!(crit.contains("PICK, JUDGE, or RECOMMEND")); + // Structural framing: the rule names what the reply does (commits the + // asker to one of N mutually-exclusive paths), not what the asker says. + assert!(crit.contains("mutually-exclusive")); + assert!(crit.contains("blocked on")); } #[test] From 38bc5e93a2cb37357b6d51ad8c24672956f59c19 Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Fri, 1 May 2026 22:43:40 +0800 Subject: [PATCH 3/6] refactor(prompt): whole-prompt override + drop vestigial notification flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-up changes building on the structural-rule rewrite: 1) Whole-prompt injectability for benchmark/A-B convenience. 
Adds CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE env var: when set to a readable file, the file's contents become the system prompt verbatim. Also adds PromptOptions.system_prompt_override for in-process tests/benches. Programmatic override wins over env var. Tool names must be pre-resolved in the override file (no template substitution). Lets the bench compare prompt variants without rebuilding the binary. 2) Drop include_stdin_notification_section + MessageNotificationStyle. The flag toggled between two phrasings of the same message-delivery contract — "you'll be restarted" vs "messages may arrive directly". The LLM doesn't need to distinguish; it just needs to know not to poll. One universal Message Notifications section now always emits, telling the agent to call check_messages at natural breakpoints. Updates all 5 driver call sites to use the simpler PromptOptions {..Default ::default()} pattern. Adds 4 prompt tests covering both override paths and asserting the conditional notification branching is gone. bench/decision-trigger/README.md gains an A/B section showing how to use the env var to compare prompt variants without recompiling. Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/decision-trigger/README.md | 21 ++++ src/agent/drivers/claude.rs | 5 +- src/agent/drivers/codex.rs | 5 +- src/agent/drivers/gemini.rs | 5 +- src/agent/drivers/kimi.rs | 5 +- src/agent/drivers/opencode.rs | 5 +- src/agent/drivers/prompt.rs | 160 +++++++++++++++++++++++-------- 7 files changed, 144 insertions(+), 62 deletions(-) diff --git a/bench/decision-trigger/README.md b/bench/decision-trigger/README.md index 4a32cf23..bfad67d4 100644 --- a/bench/decision-trigger/README.md +++ b/bench/decision-trigger/README.md @@ -49,6 +49,27 @@ KEEP_AGENTS=1 ./bench/decision-trigger/run.sh # don't auto-delete age CHORUS_LOG=/var/log/chorus.log ./bench/decision-trigger/run.sh ``` +## A/B testing prompt variants + +The whole system prompt is injectable via `CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE`. 
To compare a candidate prompt against the built-in: + +```bash +# 1. Save the current built-in prompt (e.g. by capturing what build_system_prompt +# produces from a unit test or a one-shot CLI helper) to baseline.md. +# 2. Write your candidate prompt to candidate.md. +# 3. For each variant, restart the chorus server with the env var pointing at it: + +CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE=$PWD/baseline.md chorus serve --port 3001 & +./bench/decision-trigger/run.sh # records run as bench/.../results//results.tsv +kill %1 + +CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE=$PWD/candidate.md chorus serve --port 3001 & +./bench/decision-trigger/run.sh +kill %1 +``` + +The override is a verbatim substitution — the file content becomes the system prompt. No template substitution, no merging. Tool names must already be resolved (use `mcp__chat__send_message` for the claude runtime, bare `send_message` for codex/kimi/gemini/opencode). + ## Output Each run writes to `bench/decision-trigger/results//`: diff --git a/src/agent/drivers/claude.rs b/src/agent/drivers/claude.rs index c02a0cb8..881dfe6c 100644 --- a/src/agent/drivers/claude.rs +++ b/src/agent/drivers/claude.rs @@ -592,10 +592,7 @@ impl ClaudeHandle { &self.spec, &super::prompt::PromptOptions { tool_prefix: "mcp__chat__".into(), - extra_critical_rules: Vec::new(), - post_startup_notes: Vec::new(), - include_stdin_notification_section: true, - message_notification_style: super::prompt::MessageNotificationStyle::Poll, + ..Default::default() }, ); args.push("--append-system-prompt".into()); diff --git a/src/agent/drivers/codex.rs b/src/agent/drivers/codex.rs index 190d5528..9b5234ba 100644 --- a/src/agent/drivers/codex.rs +++ b/src/agent/drivers/codex.rs @@ -670,13 +670,10 @@ impl CodexHandle { let standing_prompt = super::prompt::build_system_prompt( &self.spec, &super::prompt::PromptOptions { - tool_prefix: String::new(), - extra_critical_rules: Vec::new(), post_startup_notes: vec![ "**IMPORTANT**: Your process stays alive across 
turns. New messages may be delivered directly into the current session while you are working.".into(), ], - include_stdin_notification_section: true, - message_notification_style: super::prompt::MessageNotificationStyle::Direct, + ..Default::default() }, ); let (method, req_line) = match &resume_id { diff --git a/src/agent/drivers/gemini.rs b/src/agent/drivers/gemini.rs index ba48189f..4829a38b 100644 --- a/src/agent/drivers/gemini.rs +++ b/src/agent/drivers/gemini.rs @@ -116,13 +116,10 @@ async fn ensure_gemini_system_md(spec: &AgentSpec) -> anyhow::Result String { super::prompt::build_system_prompt( spec, &super::prompt::PromptOptions { - tool_prefix: String::new(), extra_critical_rules: vec![ "- Do NOT use shell commands to send or receive messages. The MCP tools handle everything.".into(), ], - post_startup_notes: Vec::new(), - include_stdin_notification_section: true, - message_notification_style: super::prompt::MessageNotificationStyle::Direct, + ..Default::default() }, ) } diff --git a/src/agent/drivers/opencode.rs b/src/agent/drivers/opencode.rs index 27925d13..a34a1ae0 100644 --- a/src/agent/drivers/opencode.rs +++ b/src/agent/drivers/opencode.rs @@ -80,13 +80,10 @@ fn spawn_opencode(spec: Arc, key: AgentKey) -> SpawnFut { let standing_prompt = super::prompt::build_system_prompt( &spec, &super::prompt::PromptOptions { - tool_prefix: String::new(), extra_critical_rules: vec![ "- Do NOT use shell commands to send or receive messages. 
The MCP tools handle everything.".into(), ], - post_startup_notes: Vec::new(), - include_stdin_notification_section: false, - message_notification_style: super::prompt::MessageNotificationStyle::Poll, + ..Default::default() }, ); let tmp_system_md = chorus_dir.join(format!( diff --git a/src/agent/drivers/prompt.rs b/src/agent/drivers/prompt.rs index d9845684..ac6d2e46 100644 --- a/src/agent/drivers/prompt.rs +++ b/src/agent/drivers/prompt.rs @@ -10,38 +10,46 @@ use crate::agent::drivers::AgentSpec; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum MessageNotificationStyle { - Poll, - Direct, -} +/// Env-var override for the entire system prompt. When set to a readable file +/// path, the file's contents become the system prompt verbatim — no template +/// substitution, no merging with the built-in builder. Lets a benchmark or A/B +/// harness swap the whole prompt without recompiling. +const SYSTEM_PROMPT_OVERRIDE_ENV: &str = "CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE"; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct PromptOptions { + /// Tool-name prefix. Empty by default (bare names: `send_message`). + /// Claude binds tools as `mcp__chat__send_message` and overrides this. pub tool_prefix: String, pub extra_critical_rules: Vec, pub post_startup_notes: Vec, - pub include_stdin_notification_section: bool, - pub message_notification_style: MessageNotificationStyle, + /// In-process whole-prompt override. Takes precedence over the env-var + /// override. Use for tests/benches that want to swap the prompt + /// programmatically without touching the filesystem. + pub system_prompt_override: Option, } -impl Default for PromptOptions { - fn default() -> Self { - Self { - // Default to bare tool names. Most runtimes (Codex, Kimi, Gemini, - // OpenCode) see the chat tools as bare `send_message` etc.; - // Claude binds them as `mcp__chat__send_message` and overrides - // this field at the call site. 
- tool_prefix: String::new(), - extra_critical_rules: Vec::new(), - post_startup_notes: Vec::new(), - include_stdin_notification_section: false, - message_notification_style: MessageNotificationStyle::Poll, +pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { + // Whole-prompt overrides bypass the builder entirely. Programmatic override + // wins; env-var fallback is for ops/bench convenience. + if let Some(ref text) = opts.system_prompt_override { + return text.clone(); + } + if let Ok(path) = std::env::var(SYSTEM_PROMPT_OVERRIDE_ENV) { + if !path.is_empty() { + match std::fs::read_to_string(&path) { + Ok(text) => return text, + Err(e) => { + tracing::warn!( + path = %path, + error = %e, + "{SYSTEM_PROMPT_OVERRIDE_ENV} set but file unreadable; falling back to built-in prompt" + ); + } + } } } -} -pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { let t = |name: &str| format!("{}{}", opts.tool_prefix, name); let send_cmd = format!("`{}`", t("send_message")); @@ -59,11 +67,9 @@ pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { spec.display_name.as_str() }; - let message_delivery_text = if opts.include_stdin_notification_section { - "New messages may be delivered to you automatically while your process stays alive." - } else { - "The daemon will automatically restart you when new messages arrive." - }; + // One universal line. The LLM doesn't need to know whether messages arrive + // via stdin, restart, or polling — it just needs to know not to poll itself. + let message_delivery_text = "New messages arrive automatically — do not poll for them."; let mut critical_rules: Vec = vec![ format!( @@ -256,21 +262,9 @@ pub fn build_system_prompt(spec: &AgentSpec, opts: &PromptOptions) -> String { "\n\n## Capabilities\n\nYou can work with any files or tools on this computer — you are not confined to any directory.\nYou may develop a specialized role over time through your interactions. 
Embrace it." ); - if opts.include_stdin_notification_section { - match opts.message_notification_style { - MessageNotificationStyle::Direct => { - prompt.push_str(&format!( - "\n\n## Message Notifications\n\nWhile you are working, new messages may be delivered directly into your current session.\n\nHow to handle these:\n- Treat direct follow-up messages as new user input for the same live session.\n- Adapt if the new message changes priority or direction.\n- You do NOT need to poll just because direct follow-up delivery is available.\n- Use {check_cmd} only when you need to inspect other pending channels or recover broader context." - )); - } - MessageNotificationStyle::Poll => { - prompt.push_str(&format!( - "\n\n## Message Notifications\n\nWhile you are busy (executing tools, thinking, etc.), new messages may arrive. When this happens, you will receive a system notification like:\n\n`[System notification: You have N new message(s) waiting. Call {check_name} to read them when you're ready.]`\n\nHow to handle these:\n- Call {check_cmd} to check for new messages. You are encouraged to do this frequently — at natural breakpoints in your work, or whenever you see a notification.\n- If the new message is higher priority, you may pivot to it. If not, continue your current work.\n- {check_cmd} returns instantly with any pending messages (or \"no new messages\"). It is always safe to call.", - check_name = t("check_messages"), - )); - } - } - } + prompt.push_str(&format!( + "\n\n## Message Notifications\n\nWhile you are working, new messages may arrive. The runtime delivers them automatically — you do not need to poll. When you see a system notification or want to check at a natural breakpoint, call {check_cmd}; it returns instantly with any pending messages (or \"no new messages\") and is always safe to call. If a new message changes priority or direction, adapt; otherwise continue your current work." 
+ )); if let Some(ref persona) = spec.system_prompt { prompt.push_str(&format!("\n\n## Initial role\n{persona}")); @@ -449,4 +443,86 @@ mod tests { assert!(p.contains("`mcp__chat__dispatch_decision`")); assert!(!p.contains("`dispatch_decision`\n")); } + + #[test] + fn programmatic_override_replaces_entire_prompt() { + let opts = PromptOptions { + system_prompt_override: Some("# CUSTOM PROMPT\nshipping nothing else.".into()), + ..Default::default() + }; + let p = build_system_prompt(&sample_spec(), &opts); + assert_eq!(p, "# CUSTOM PROMPT\nshipping nothing else."); + // None of the built-in sections should leak through. + assert!(!p.contains("CRITICAL RULES")); + assert!(!p.contains("Decision Inbox")); + assert!(!p.contains("MEMORY.md")); + } + + #[test] + fn env_var_override_replaces_entire_prompt() { + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().join("prompt.md"); + let custom = "# ENV OVERRIDE\nthis is the whole prompt.\n"; + std::fs::write(&path, custom).expect("write"); + // Use a guard to scope the env var so other tests aren't affected. + let _guard = EnvVarGuard::set(SYSTEM_PROMPT_OVERRIDE_ENV, path.to_str().unwrap()); + let p = build_system_prompt(&sample_spec(), &PromptOptions::default()); + assert_eq!(p, custom); + } + + #[test] + fn programmatic_override_wins_over_env_var() { + let dir = tempfile::tempdir().expect("tempdir"); + let path = dir.path().join("prompt.md"); + std::fs::write(&path, "# FROM FILE\n").expect("write"); + let _guard = EnvVarGuard::set(SYSTEM_PROMPT_OVERRIDE_ENV, path.to_str().unwrap()); + let opts = PromptOptions { + system_prompt_override: Some("# FROM CODE\n".into()), + ..Default::default() + }; + let p = build_system_prompt(&sample_spec(), &opts); + assert_eq!(p, "# FROM CODE\n"); + } + + #[test] + fn no_more_message_notification_style_branching() { + // The Message Notifications section is now always emitted with a single + // universal body — no Direct/Poll variants. 
The LLM doesn't care how + // delivery happens, it just needs to know not to poll. + let p = build_system_prompt(&sample_spec(), &PromptOptions::default()); + assert!(p.contains("## Message Notifications")); + assert!(p.contains("delivers them automatically")); + assert!(p.contains("`check_messages`")); + } + + /// Process-wide env var guard. Tests that mutate env vars must not run in + /// parallel with each other or with tests that read the same var; cargo + /// runs lib tests in parallel by default. We serialize via a static mutex. + struct EnvVarGuard { + key: &'static str, + prev: Option, + _lock: std::sync::MutexGuard<'static, ()>, + } + impl EnvVarGuard { + fn set(key: &'static str, value: &str) -> Self { + static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + let lock = LOCK.lock().unwrap_or_else(|p| p.into_inner()); + let prev = std::env::var(key).ok(); + // SAFETY: env mutation is serialized by the LOCK above; this guard + // restores the previous value on drop. + unsafe { std::env::set_var(key, value); } + Self { key, prev, _lock: lock } + } + } + impl Drop for EnvVarGuard { + fn drop(&mut self) { + // SAFETY: still inside the LOCK held by self._lock. + unsafe { + match self.prev.take() { + Some(v) => std::env::set_var(self.key, v), + None => std::env::remove_var(self.key), + } + } + } + } } From 4291061f8681acf2b9d944ef80081e04c5f1800d Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Sat, 2 May 2026 01:49:05 +0800 Subject: [PATCH 4/6] bench(decision-trigger): hard cases + multi-model matrix sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds harder benchmark cases and a multi-model matrix runner that exposes real differences between the structural-rule prompt and the model's own inference style. 
Hard cases (cases-hard.tsv, 15 scenarios): - Realistic narrative framings (P0 escalation, sprint capacity, vendor procurement, hiring under deadline, SOC2 compliance, time-box at sprint end, architecture review, VP briefing) - No verdict-flavored phrasing — no "merge or hold", no "what's your call", no "X or Y". Decisions must be inferred from situational context - Trap cases for chat (rhetorical frustration, retrospective, exploration, status update, info request, debug ask, facilitator role) Multi-model matrix: - models.tsv lists (runtime, model, tier, label) rows. Default ships with the two-per-family pattern: Anthropic best/efficiency, OpenAI best/ efficiency - run.sh now takes RUNTIME, MODEL, RUN_LABEL, CASES via env so it can be driven by the matrix runner - run-matrix.sh sweeps all rows in models.tsv, runs the bench once per model, collates a side-by-side matrix.tsv Baseline (cases-hard.tsv, structural-rule prompt): - claude/opus: 9/15 (conservative — implicit delegation reads as chat) - claude/sonnet: 15/15 (best — infers delegation from context) - codex/gpt-5.5: 14/15 (one hiring miss) - codex/gpt-5.4-mini: 13/15 (one mis-fire, one silent) All 4 models score 7/7 on chat cases. The discriminator is property #4 (Delegated) — whether the model treats "we need X by Y" as an implicit delegation. Same prompt, same cases, 9-15/15 spread by model. BASELINE.md captures this and lays out the implications for the next prompt iteration. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/decision-trigger/BASELINE.md | 43 ++++++++++ bench/decision-trigger/README.md | 44 +++++++++- bench/decision-trigger/cases-hard.tsv | 16 ++++ bench/decision-trigger/models.tsv | 5 ++ bench/decision-trigger/run-matrix.sh | 119 ++++++++++++++++++++++++++ bench/decision-trigger/run.sh | 28 ++++-- 6 files changed, 243 insertions(+), 12 deletions(-) create mode 100644 bench/decision-trigger/BASELINE.md create mode 100644 bench/decision-trigger/cases-hard.tsv create mode 100644 bench/decision-trigger/models.tsv create mode 100755 bench/decision-trigger/run-matrix.sh diff --git a/bench/decision-trigger/BASELINE.md b/bench/decision-trigger/BASELINE.md new file mode 100644 index 00000000..66878c82 --- /dev/null +++ b/bench/decision-trigger/BASELINE.md @@ -0,0 +1,43 @@ +# Decision-trigger benchmark — baseline results + +Recorded baseline scores for the structural-rule prompt (PR #133). Run with: + +```bash +CASES=$PWD/bench/decision-trigger/cases-hard.tsv ./bench/decision-trigger/run-matrix.sh +``` + +## Hard-cases matrix (cases-hard.tsv, 15 cases: 8 decision / 7 chat) + +| Model | Tier | Score | Notes | +|---|---|---|---| +| claude/opus | best | **9/15** | All 7 chat cases correct. 6 of 8 decision cases read as chat — Opus interprets property #4 (Delegated) strictly: implicit "we need" doesn't qualify. | +| claude/sonnet | efficiency | **15/15** | Infers delegation from situational context. Counterintuitively beats Opus. | +| codex/gpt-5.5 | best | **14/15** | Misses only the hiring case (case 5). | +| codex/gpt-5.4-mini | efficiency | **13/15** | 1 mis-fire (case 8 time-box → chat). 1 silent (case 1: no tool called at all). | + +All 4 models score 7/7 on chat cases (correct restraint). +The 8 decision cases are where the rule's "implicit delegation" reading differs by model. 
+ +## Easy-cases (cases.tsv, 15 cases with verdict-flavored phrasing) + +Both the OLD prompt (input-pattern enumeration) and the NEW prompt (structural rule) score **15/15** on the easy cases with claude/sonnet — they don't differentiate at this difficulty. + +## What the matrix tells us + +1. **The structural rule's behavior depends heavily on the model.** Same prompt, same cases, scores from 9/15 to 15/15. +2. **Larger ≠ better on this benchmark.** Opus 4.7 (the "best" Anthropic model) is more conservative than Sonnet 4.6 — it refuses to infer delegation from context and only fires on explicit asks. +3. **Chat cases are easy across the board.** All 4 models nailed all 7. Restraint is not the problem. +4. **Implicit-delegation is the hard part.** Cases like *"We need a responder lined up before the call"* (Acme P0) require the model to infer that the asker is delegating the pick. Sonnet and gpt-5.5 mostly do; Opus doesn't. + +## Implications for prompt design + +If we want consistent behavior across models, the rule needs either: +- A stronger nudge that "implicit delegation in workplace context = delegation" (cost: more chat false-positives on smaller models), OR +- Explicit per-tier prompt variants (cost: maintenance), OR +- Acceptance that this is a model-capability ceiling and pick the model accordingly (cost: model lock-in). + +This baseline lets us measure the next prompt iteration against a real signal instead of guessing. + +## How to update this baseline + +After any prompt change that affects routing, re-run the matrix and replace this file. Keep the previous version in git history so we can diff baselines over time. diff --git a/bench/decision-trigger/README.md b/bench/decision-trigger/README.md index bfad67d4..881196b3 100644 --- a/bench/decision-trigger/README.md +++ b/bench/decision-trigger/README.md @@ -35,20 +35,58 @@ Total wall time is `max(per_agent_turn) ≈ 2 min`, not `sum`. 
- Claude runtime authed (`chorus setup` confirms) - `CHORUS_LOG` env var pointing to the server log (defaults to `/tmp/chorus-qa-server.log`) +## Cases + +Two case files at different difficulty: + +| File | Style | What it measures | +|---|---|---| +| `cases.tsv` | Easy / smoke. Decision-shaped requests use verdict-flavored phrasing (*"merge or hold?"*, *"what do you recommend?"*, *"your call"*). | Sanity check: prompt teaches the rule at all. Both input-pattern and structural-rule prompts hit 15/15 on this. | +| `cases-hard.tsv` | Realistic narrative scenarios. Decision-shaped requests use **neutral phrasing** (no "recommend", no "verdict", no "X or Y"). Trap cases include rhetorical frustration, retrospectives, exploration, status updates, and facilitation asks. | Differentiates prompts that pattern-match input phrasing from prompts that test the structural shape of the agent's intended reply. | + +To use the harder set: +```bash +CASES=$PWD/bench/decision-trigger/cases-hard.tsv ./bench/decision-trigger/run.sh +``` + ## Running +Single run against the default model (`claude/sonnet`): ```bash -# from repo root ./bench/decision-trigger/run.sh ``` -Optional: +Pick a different runtime/model: +```bash +RUNTIME=codex MODEL=gpt-5.5 ./bench/decision-trigger/run.sh +``` + +Sweep all models in `models.tsv` and produce a side-by-side matrix: +```bash +./bench/decision-trigger/run-matrix.sh +CASES=$PWD/bench/decision-trigger/cases-hard.tsv ./bench/decision-trigger/run-matrix.sh +``` + +Common options: ```bash ./bench/decision-trigger/run.sh http://localhost:3001 # explicit server URL -KEEP_AGENTS=1 ./bench/decision-trigger/run.sh # don't auto-delete agents on exit (for forensics) +KEEP_AGENTS=1 ./bench/decision-trigger/run.sh # don't auto-delete agents on exit (forensics) CHORUS_LOG=/var/log/chorus.log ./bench/decision-trigger/run.sh ``` +## Models matrix + +`models.tsv` lists the (runtime, model, tier) combinations the matrix runner sweeps. 
Default ships with the two-per-family pattern: best + efficiency for Anthropic and OpenAI. + +| runtime | model | tier | resolves to | +|---|---|---|---| +| claude | opus | best | Claude Opus 4.7 | +| claude | sonnet | efficiency | Claude Sonnet 4.6 | +| codex | gpt-5.5 | best | GPT-5.5 | +| codex | gpt-5.4-mini | efficiency | GPT-5.4-mini | + +Add other rows (kimi, gemini, opencode) as Chorus drivers stabilize. Each row produces one column in the matrix output. + ## A/B testing prompt variants The whole system prompt is injectable via `CHORUS_SYSTEM_PROMPT_OVERRIDE_FILE`. To compare a candidate prompt against the built-in: diff --git a/bench/decision-trigger/cases-hard.tsv b/bench/decision-trigger/cases-hard.tsv new file mode 100644 index 00000000..37c1fa93 --- /dev/null +++ b/bench/decision-trigger/cases-hard.tsv @@ -0,0 +1,16 @@ +id predicted prompt +1 decision Acme just filed a P0. Their CEO is calling our CEO at 4pm. Eng-lead Bob is on vacation. Sarah (sr eng) rolled off another P0 yesterday and is fried. New eng Mike has time but doesn't know the codebase. We need a responder lined up before the call. +2 decision Sprint capacity is one fix slot left this week. Bug A: five customers hit a login latency spike. Bug B: one customer can't export a CSV larger than 100MB. Bug C: a feature 3% of users hit crashes on a specific edge case. Sprint ends Friday. +3 decision Eng wants Postgres for transactional consistency. Infra wants DynamoDB for ops simplicity. Finance wants whichever is cheaper. New service, architecture review tomorrow morning. +4 decision The SOC2 auditor wants finding F-7 remediated by Friday. We don't have a fix yet. Three paths I can see: deploy a compensating control we'd remove later; request a 2-week extension; accept the finding and document it. +5 decision Two finalists for the senior platform role. Alice: 8 years at AWS, deep K8s, communication is curt. 
Bob: 4 years but built our exact stack at his last startup, ships fast, but bus-factor risk if he leaves. Offer letter goes out tomorrow. +6 decision My VP is briefing the CEO at 5pm on whether we ship the mobile launch. I need to give my VP an answer by 4pm. The launch has two known crashers in iOS 18 hitting 3% of users. Android is clean. +7 decision We need an observability stack. Datadog: $120K/yr with features we won't use. New Relic: $60K/yr with an OK UI. Honeycomb: $40K/yr with the best ergonomics for our debug-heavy workflow. Procurement closes Q2 budget on Friday. +8 decision Two days into investigating the codex --resume stale-session bug. Sprint ends tomorrow. The fix path is murky — could be a TTL issue in rmcp, could be our session-id encoding, could be the codex CLI itself. We have a workaround that just skips resume on stale session. +9 chat Why are we even using event-sourcing for this service? Feels like overkill for the actual usage pattern. +10 chat I shipped the auth fix yesterday. In hindsight, given what we now know about the migration timing, was that the right call? +11 chat I'm thinking about proposing we deprecate the v1 API at the next architecture sync. What are the tradeoffs I should weigh before bringing it up? +12 chat Quick update: auth refactor is at 60%, the team wants to pause and reassess after seeing how the MCP migration went. No blockers, just a heads up. +13 chat What's the latency budget our SLO commits to? I'm setting the timeout in the new client SDK and want to make sure I match it. +14 chat Something crashed around 3:42am. Can you look at logs/server.log and tell me what went wrong? +15 chat Eng wants Postgres for transactional consistency, Infra wants DynamoDB for ops simplicity. I'm facilitating the architecture review tomorrow. Walk me through the tradeoff matrix so I can run a clean discussion — I'm not the decision-maker, I'm just running the meeting. 
diff --git a/bench/decision-trigger/models.tsv b/bench/decision-trigger/models.tsv new file mode 100644 index 00000000..7d7e8ecf --- /dev/null +++ b/bench/decision-trigger/models.tsv @@ -0,0 +1,5 @@ +runtime model tier label +claude opus best claude-opus-4-7 +claude sonnet efficiency claude-sonnet-4-6 +codex gpt-5.5 best codex-gpt-5-5 +codex gpt-5.4-mini efficiency codex-gpt-5-4-mini diff --git a/bench/decision-trigger/run-matrix.sh b/bench/decision-trigger/run-matrix.sh new file mode 100755 index 00000000..59b48664 --- /dev/null +++ b/bench/decision-trigger/run-matrix.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# Run the decision-trigger bench across multiple (runtime, model) combos and +# collate into a single matrix. Reads models from bench/decision-trigger/models.tsv. +# +# Usage: +# ./bench/decision-trigger/run-matrix.sh # default: cases.tsv, all models in models.tsv +# CASES=cases-hard.tsv ./bench/decision-trigger/run-matrix.sh +# MODELS=/path/to/custom-models.tsv ./bench/decision-trigger/run-matrix.sh +# +# Output: +# bench/decision-trigger/results/matrix-/ +# matrix.tsv — case x model match grid +# -/results.tsv — per-model raw results +set -euo pipefail + +BENCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SERVER_URL="${1:-http://localhost:3001}" +MODELS="${MODELS:-$BENCH_DIR/models.tsv}" +CASES="${CASES:-$BENCH_DIR/cases.tsv}" +[ -f "$MODELS" ] || { echo "models file not found: $MODELS" >&2; exit 1; } +[ -f "$CASES" ] || { echo "cases file not found: $CASES" >&2; exit 1; } + +MATRIX_RUN_ID=$(date +%s) +OUT_DIR="$BENCH_DIR/results/matrix-${MATRIX_RUN_ID}" +mkdir -p "$OUT_DIR" + +echo "== bench/decision-trigger MATRIX run $MATRIX_RUN_ID ==" +echo " models: $MODELS" +echo " cases: $CASES" +echo " server: $SERVER_URL" +echo " out: $OUT_DIR" +echo + +# Read model rows (skip header). 
+declare -a RUNTIMES MODELS_LIST LABELS TIERS +while IFS=$'\t' read -r runtime model tier label; do + [ "$runtime" = "runtime" ] && continue + [ -z "$runtime" ] && continue + RUNTIMES+=("$runtime"); MODELS_LIST+=("$model"); TIERS+=("$tier"); LABELS+=("$label") +done < "$MODELS" + +if [ ${#RUNTIMES[@]} -eq 0 ]; then + echo "no models in $MODELS" >&2; exit 1 +fi + +echo "matrix has ${#RUNTIMES[@]} models:" +for n in "${!RUNTIMES[@]}"; do + echo " ${LABELS[$n]} (${TIERS[$n]}, ${RUNTIMES[$n]}/${MODELS_LIST[$n]})" +done +echo + +# Run the bench once per model. +declare -a MODEL_RESULT_PATHS +for n in "${!RUNTIMES[@]}"; do + runtime="${RUNTIMES[$n]}"; model="${MODELS_LIST[$n]}"; label="${LABELS[$n]}" + echo "----- [$((n+1))/${#RUNTIMES[@]}] $label ($runtime/$model) -----" + RUNTIME="$runtime" MODEL="$model" RUN_LABEL="$label" CASES="$CASES" \ + bash "$BENCH_DIR/run.sh" "$SERVER_URL" \ + > "$OUT_DIR/${label}.log" 2>&1 || { + echo " $label run failed; continuing matrix" + } + # Find the per-run results.tsv this run produced. + result_path=$(ls -t "$BENCH_DIR/results/" 2>/dev/null | grep -E "^[0-9]+-${label}\$" | head -1) + if [ -n "$result_path" ] && [ -f "$BENCH_DIR/results/$result_path/results.tsv" ]; then + cp "$BENCH_DIR/results/$result_path/results.tsv" "$OUT_DIR/${label}-results.tsv" + MODEL_RESULT_PATHS+=("$OUT_DIR/${label}-results.tsv") + score=$(awk -F'\t' 'NR>1 && $5=="OK"' "$OUT_DIR/${label}-results.tsv" | wc -l | tr -d ' ') + total=$(awk -F'\t' 'NR>1' "$OUT_DIR/${label}-results.tsv" | wc -l | tr -d ' ') + echo " $label: $score/$total" + else + echo " $label: no results.tsv" + MODEL_RESULT_PATHS+=("") + fi + echo +done + +# Build the matrix table. +MATRIX="$OUT_DIR/matrix.tsv" +{ + printf "case\tpredicted" + for label in "${LABELS[@]}"; do printf "\t%s" "$label"; done + printf "\tprompt\n" + + # Read cases (id, predicted, prompt). 
+ while IFS=$'\t' read -r id predicted prompt; do + [ "$id" = "id" ] && continue + [ -z "$id" ] && continue + short=$(echo "$prompt" | head -c 80) + printf "%s\t%s" "$id" "$predicted" + for n in "${!LABELS[@]}"; do + rp="${MODEL_RESULT_PATHS[$n]}" + if [ -z "$rp" ] || [ ! -f "$rp" ]; then + printf "\t-" + continue + fi + # Find this case's row in this model's results. + row=$(awk -F'\t' -v id="$id" 'NR>1 && $1==id {print $4 "/" $5}' "$rp" | head -1) + printf "\t%s" "${row:-?}" + done + printf "\t%s\n" "$short" + done < "$CASES" +} > "$MATRIX" + +echo "===== MATRIX =====" +column -t -s$'\t' "$MATRIX" +echo +echo "summary:" +for n in "${!LABELS[@]}"; do + rp="${MODEL_RESULT_PATHS[$n]}" + if [ -z "$rp" ] || [ ! -f "$rp" ]; then + echo " ${LABELS[$n]}: no data" + continue + fi + score=$(awk -F'\t' 'NR>1 && $5=="OK"' "$rp" | wc -l | tr -d ' ') + total=$(awk -F'\t' 'NR>1' "$rp" | wc -l | tr -d ' ') + echo " ${LABELS[$n]} (${TIERS[$n]}): $score/$total" +done +echo +echo "matrix saved to $MATRIX" diff --git a/bench/decision-trigger/run.sh b/bench/decision-trigger/run.sh index d0d64958..74d9585f 100755 --- a/bench/decision-trigger/run.sh +++ b/bench/decision-trigger/run.sh @@ -10,8 +10,15 @@ set -euo pipefail SERVER_URL="${1:-http://localhost:3001}" BENCH_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CASES="$BENCH_DIR/cases.tsv" -RUN_ID="$(date +%s)" +# Cases file: defaults to cases.tsv (easy/smoke). Override with CASES=cases-hard.tsv. +CASES="${CASES:-$BENCH_DIR/cases.tsv}" +[ -f "$CASES" ] || CASES="$BENCH_DIR/$(basename "$CASES")" +# Runtime + model: which agent to spin up per case. Defaults are the cheapest +# stable combo. The matrix runner sets these per sweep. 
+RUNTIME="${RUNTIME:-claude}" +MODEL="${MODEL:-sonnet}" +RUN_LABEL="${RUN_LABEL:-${RUNTIME}-${MODEL}}" +RUN_ID="$(date +%s)-${RUN_LABEL}" RESULTS_DIR="$BENCH_DIR/results/$RUN_ID" mkdir -p "$RESULTS_DIR" @@ -41,10 +48,12 @@ fi CURL=(curl --noproxy '*' -sS -m 10) echo "== bench/decision-trigger run $RUN_ID ==" -echo " server: $SERVER_URL" -echo " log: $LOG" -echo " cases: $CASES" -echo " out: $RESULTS_DIR" +echo " server: $SERVER_URL" +echo " log: $LOG" +echo " cases: $CASES" +echo " runtime: $RUNTIME" +echo " model: $MODEL" +echo " out: $RESULTS_DIR" # Pause any non-bench agents so they don't flood the bench cohort with welcome # messages during boot. We only stop running ones; KEEP_OTHERS=1 disables this. @@ -91,10 +100,11 @@ declare -a IDS PREDICTS PROMPTS AGENTS while IFS=$'\t' read -r id predicted prompt; do [ "$id" = "id" ] && continue IDS+=("$id"); PREDICTS+=("$predicted"); PROMPTS+=("$prompt") - base="bench-dt-${RUN_ID}-${id}" + base="bench-dt-${RUN_ID//[^a-zA-Z0-9]/-}-${id}" + # Names can't be too long; runtime+model is appended for forensics in case-N description. out=$("$CHORUS" agent create \ - --runtime claude --model sonnet \ - --description "Decision-trigger bench, case $id. Each DM is one independent test prompt." \ + --runtime "$RUNTIME" --model "$MODEL" \ + --description "Decision-trigger bench, case $id, ${RUNTIME}/${MODEL}. Each DM is one independent test prompt." \ --server-url "$SERVER_URL" \ "$base" 2>&1) # Extract assigned name: "Agent @ created" From 7ff39f2fcf73500f58b641d816e8f39d5454b7e0 Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Sat, 2 May 2026 02:48:04 +0800 Subject: [PATCH 5/6] bench(decision-trigger): A/B baseline OLD vs NEW prompt across 4 models MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the actual head-to-head between the OLD prompt (input-pattern enumeration on main) and the NEW prompt (four-property structural test on this branch). 
Same 15 hard cases, same 4 models, parallel runner. Headline scores (cases-hard.tsv): Model Tier OLD NEW Δ claude/opus best 15/15 9/15 -6 claude/sonnet efficiency 14/15 15/15 +1 codex/gpt-5.5 best 14/15 14/15 0 codex/gpt-5.4-mini efficiency 12/15 13/15 +1 ------------------------------------------------- average 13.75 12.75 -1.0 Aggregate behavior: Decisions caught (32 max): OLD 30/32 (94%) vs NEW 23/32 (72%) Chat held back (28 max): OLD 25/28 (89%) vs NEW 28/28 (100%) The structural rewrite is NOT a strict win. NEW closes the retrospective false-positive (case 10: "in hindsight, was that the right call?" — OLD over-fires on sonnet/gpt-5.5/gpt-5.4-mini, NEW correctly chats on all). But NEW costs Opus 6 implicit-delegation decisions because Opus reads property #4 (Delegated) strictly: "we need X by Y" doesn't count as delegation without an explicit "you pick" clause. Sonnet, gpt-5.5, and gpt-5.4-mini are stable across both prompts — they infer delegation from situational context regardless of which rule is loaded. The Opus regression is model-specific. BASELINE.md captures the full per-case matrix, named winners and losers, known failure modes (gpt-5.4-mini case 1 silent under NEW; gpt-5.5 case 5 flips), and three iteration paths for the next prompt revision. Co-Authored-By: Claude Opus 4.7 (1M context) --- bench/decision-trigger/BASELINE.md | 139 +++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 28 deletions(-) diff --git a/bench/decision-trigger/BASELINE.md b/bench/decision-trigger/BASELINE.md index 66878c82..1cd16565 100644 --- a/bench/decision-trigger/BASELINE.md +++ b/bench/decision-trigger/BASELINE.md @@ -1,43 +1,126 @@ -# Decision-trigger benchmark — baseline results +# Decision-trigger benchmark — A/B baseline (OLD vs NEW prompt) -Recorded baseline scores for the structural-rule prompt (PR #133). 
Run with: +Head-to-head between the **OLD** prompt (input-pattern enumeration, on `main` before PR #133) and the **NEW** prompt (four-property structural test, on PR #133 branch). Same 15 hard cases, same 4 models, same parallel runner. Captured 2026-05-02. -```bash -CASES=$PWD/bench/decision-trigger/cases-hard.tsv ./bench/decision-trigger/run-matrix.sh +## Headline scores (cases-hard.tsv) + +| Model | Tier | OLD | NEW | Δ | +|---|---|---|---|---| +| claude/opus | best | **15/15** | 9/15 | **−6** | +| claude/sonnet | efficiency | 14/15 | **15/15** | +1 | +| codex/gpt-5.5 | best | 14/15 | 14/15 | 0 | +| codex/gpt-5.4-mini | efficiency | 12/15 | **13/15** | +1 | +| **average** | | **13.75/15** | **12.75/15** | **−1.0** | + +NEW prompt **regresses on Opus by 6 points**, gains 1 on Sonnet, gains 1 on gpt-5.4-mini, washes on gpt-5.5. Net negative on average. + +## Aggregate behavior delta + +| | OLD prompt | NEW prompt | +|---|---|---| +| Decision-cases caught (max 32 = 8 cases × 4 models) | 30/32 (94%) | 23/32 (72%) | +| Chat-cases held back (max 28 = 7 cases × 4 models) | 25/28 (89%) | **28/28 (100%)** | + +OLD is better at firing decisions. NEW is better at restraint. Different tradeoff, not a strict win. 
+ +## Per-case breakdown + +``` +case predicted OLD-opus NEW-opus OLD-sonnet NEW-sonnet OLD-gpt5.5 NEW-gpt5.5 OLD-mini NEW-mini + 1 decision decision chat decision decision decision decision chat unknown + 2 decision decision chat decision decision decision decision decision decision + 3 decision decision chat decision decision decision decision decision decision + 4 decision decision chat decision decision decision decision decision decision + 5 decision decision chat decision decision decision chat decision decision + 6 decision decision decision decision decision decision decision decision decision + 7 decision decision chat decision decision decision decision decision decision + 8 decision decision decision decision decision decision decision chat chat + 9 chat chat chat chat chat chat chat chat chat +10 chat chat chat decision chat decision chat decision chat +11 chat chat chat chat chat chat chat chat chat +12 chat chat chat chat chat chat chat chat chat +13 chat chat chat chat chat chat chat chat chat +14 chat chat chat chat chat chat chat chat chat +15 chat chat chat chat chat chat chat chat chat ``` -## Hard-cases matrix (cases-hard.tsv, 15 cases: 8 decision / 7 chat) +## Where each prompt wins + +### NEW wins on case 10 (retrospective trap), 3 cells + +Case 10 prompt: *"I shipped the auth fix yesterday. In hindsight, given what we now know about the migration timing, was that the right call?"* + +| Model | OLD prompt | NEW prompt | +|---|---|---| +| sonnet | decision (over-fires) | chat ✓ | +| gpt-5.5 | decision (over-fires) | chat ✓ | +| gpt-5.4-mini | decision (over-fires) | chat ✓ | +| opus | chat ✓ | chat ✓ | + +The structural rule's properties #2 (Blocking) and #3 (Material consequence) explicitly fail for retrospectives — the PR already shipped, nothing is gated on the agent's verdict. The OLD prompt's input-pattern matching can't distinguish *"was that the right call?"* from a current verdict, so 3/4 models fire incorrectly. 
**NEW is genuinely better at restraint.** -| Model | Tier | Score | Notes | -|---|---|---|---| -| claude/opus | best | **9/15** | All 7 chat cases correct. 6 of 8 decision cases read as chat — Opus interprets property #4 (Delegated) strictly: implicit "we need" doesn't qualify. | -| claude/sonnet | efficiency | **15/15** | Infers delegation from situational context. Counterintuitively beats Opus. | -| codex/gpt-5.5 | best | **14/15** | Misses only the hiring case (case 5). | -| codex/gpt-5.4-mini | efficiency | **13/15** | 1 mis-fire (case 8 time-box → chat). 1 silent (case 1: no tool called at all). | +### OLD wins on Opus, 6 cells (cases 1, 2, 3, 4, 5, 7 — all implicit-delegation decisions) -All 4 models score 7/7 on chat cases (correct restraint). -The 8 decision cases are where the rule's "implicit delegation" reading differs by model. +Each of these cases presents mutually exclusive options + a deadline + situational context, but lacks an explicit *"you pick"* clause. -## Easy-cases (cases.tsv, 15 cases with verdict-flavored phrasing) +- **OLD prompt** enumerates *"presents two or more concrete alternatives and asks you to pick"*. Opus interprets "presents alternatives + deadline" as the trigger and fires. +- **NEW prompt** requires all four structural properties including #4 (Delegated). Opus reads *"we need a responder lined up before the call"* as the team's own action item, not a delegation to the agent, and refuses to fire. -Both the OLD prompt (input-pattern enumeration) and the NEW prompt (structural rule) score **15/15** on the easy cases with claude/sonnet — they don't differentiate at this difficulty. +Why is this Opus-specific? Sonnet, gpt-5.5, and gpt-5.4-mini all infer delegation from situational context regardless of which prompt is loaded. Opus is the only model that strictly waits for an explicit *"you pick"* under the NEW rule. 
**The NEW rule's strict interpretation of property #4 is exactly what trips Opus.** -## What the matrix tells us +### Stable across both prompts -1. **The structural rule's behavior depends heavily on the model.** Same prompt, same cases, scores from 9/15 to 15/15. -2. **Larger ≠ better on this benchmark.** Opus 4.7 (the "best" Anthropic model) is more conservative than Sonnet 4.6 — it refuses to infer delegation from context and only fires on explicit asks. -3. **Chat cases are easy across the board.** All 4 models nailed all 7. Restraint is not the problem. -4. **Implicit-delegation is the hard part.** Cases like *"We need a responder lined up before the call"* (Acme P0) require the model to infer that the asker is delegating the pick. Sonnet and gpt-5.5 mostly do; Opus doesn't. +- **Case 6** (*"I need to give my VP an answer by 4pm"* — explicit time-anchored ask): every model fires decision under both prompts. +- **Case 8** (sprint-end time-box with options laid out): every model except gpt-5.4-mini fires decision under both prompts. gpt-5.4-mini misses under both — model capability ceiling, not a prompt issue. +- **All chat cases except 10**: clean restraint across all 4 models, both prompts. -## Implications for prompt design +## Failure modes worth noting -If we want consistent behavior across models, the rule needs either: -- A stronger nudge that "implicit delegation in workplace context = delegation" (cost: more chat false-positives on smaller models), OR -- Explicit per-tier prompt variants (cost: maintenance), OR -- Acceptance that this is a model-capability ceiling and pick the model accordingly (cost: model lock-in). +1. **gpt-5.4-mini case 1 (NEW = `unknown`)** — the model didn't call any tool at all under the NEW prompt. It saw the prompt, the run completed `reason=Natural`, but no `dispatch_decision` and no `send_message`. Same case, OLD prompt → it correctly chose `chat` (which is wrong vs prediction but a real choice). 
The NEW prompt seems to have caused gpt-5.4-mini to freeze on this prompt — worth investigating. -This baseline lets us measure the next prompt iteration against a real signal instead of guessing. +2. **gpt-5.5 case 5 NEW = `chat`** — only OLD/NEW divergence on gpt-5.5. The hiring case under NEW landed in chat. Looking at the agent's actual reply would tell us why. + +## Conclusion + +**The structural rewrite is a tradeoff, not a strict win.** Average pass rate drops 1 point (13.75 → 12.75 / 15) across 4 models, but the loss is concentrated on a single model (Opus) and the gain is real signal (case 10 restraint). + +What it actually achieves: +- ✅ **Clean restraint on retrospectives.** The OLD prompt's input-pattern matching has a known false-positive on retrospective phrasing; the NEW rule closes it. +- ❌ **Loses on implicit-delegation decisions for Opus.** The strict reading of property #4 (Delegated) excludes the *"we need X by Y"* framings that real teams use all the time. Opus is the only model that takes this strictness literally. +- 〇 **Wash on Sonnet, gpt-5.5, gpt-5.4-mini.** Those models infer delegation from situational context regardless of which prompt is loaded. + +## Implications for the next iteration + +Three options for the prompt-rule tuning: + +1. **Soften property #4.** Add a clause like *"a request that lays out mutually exclusive alternatives plus a deadline counts as implicit delegation, even without an explicit 'you pick'."* Recovers Opus without losing Sonnet/gpt-5.5/gpt-5.4-mini. +2. **Accept the Opus regression.** Ship the NEW rule as-is — the chat-restraint gain is principled, and Opus users can be coached toward explicit phrasing. Trade decision-firing for false-positive avoidance. +3. **Split the prompt by tier.** Opus gets a more permissive trigger, Sonnet gets the strict one. Maintenance cost. + +This baseline lets us measure each iteration against real signal instead of guessing. Re-run after any prompt change that affects routing. 
+ +## Reproducing this report + +```bash +# OLD prompt baseline (main, port 3002 + bridge 4322 to coexist with a running NEW server): +git worktree add /tmp/chorus-main main +cd /tmp/chorus-main && cargo build --bin chorus +/tmp/chorus-main/target/debug/chorus serve --port 3002 --bridge-port 4322 \ + > /tmp/chorus-old.log 2>&1 & +CHORUS_LOG=/tmp/chorus-old.log \ + CASES=$PWD/bench/decision-trigger/cases-hard.tsv \ + ./bench/decision-trigger/run-matrix.sh http://localhost:3002 + +# NEW prompt baseline (PR #133 branch, port 3001): +cargo build --bin chorus +./target/debug/chorus serve --port 3001 > /tmp/chorus-new.log 2>&1 & +CHORUS_LOG=/tmp/chorus-new.log \ + CASES=$PWD/bench/decision-trigger/cases-hard.tsv \ + ./bench/decision-trigger/run-matrix.sh http://localhost:3001 +``` -## How to update this baseline +Each matrix takes ~45-50 min (4 models, parallel-per-model). Raw results live under `bench/decision-trigger/results/matrix-<timestamp>/`. -After any prompt change that affects routing, re-run the matrix and replace this file. Keep the previous version in git history so we can diff baselines over time. +Captured runs in this report: +- OLD: `matrix-1777658557/` +- NEW: `matrix-1777647089/` From ed49a8b1e085fd9c193e8417b6890d98350f2 Mon Sep 17 00:00:00 2001 From: Fullstop000 Date: Sat, 2 May 2026 11:55:57 +0800 Subject: [PATCH 6/6] chore: cargo fmt prompt.rs Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agent/drivers/prompt.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/agent/drivers/prompt.rs b/src/agent/drivers/prompt.rs index ac6d2e46..ed4d0291 100644 --- a/src/agent/drivers/prompt.rs +++ b/src/agent/drivers/prompt.rs @@ -510,8 +510,14 @@ mod tests { let prev = std::env::var(key).ok(); // SAFETY: env mutation is serialized by the LOCK above; this guard // restores the previous value on drop.
- unsafe { std::env::set_var(key, value); } - Self { key, prev, _lock: lock } + unsafe { + std::env::set_var(key, value); + } + Self { + key, + prev, + _lock: lock, + } } } impl Drop for EnvVarGuard {