From f9715cdd1970be0d39d6bd6b6e842ee195fed2b3 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 15:59:51 +0200 Subject: [PATCH 01/23] docs(spike): scope Codex CLI feasibility Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- AGENTS.md | 3 +- README.md | 1 + docs/PRD.md | 1 + docs/decision-log.md | 40 +++++ docs/product/CODEX-CLI-SPIKE-SCOPE.md | 137 +++++++++++++++++ package.json | 3 +- scripts/codex-cli-feasibility-probe.js | 180 +++++++++++++++++++++++ test/codex-cli-feasibility-probe.test.js | 79 ++++++++++ 8 files changed, 442 insertions(+), 2 deletions(-) create mode 100644 docs/product/CODEX-CLI-SPIKE-SCOPE.md create mode 100644 scripts/codex-cli-feasibility-probe.js create mode 100644 test/codex-cli-feasibility-probe.test.js diff --git a/AGENTS.md b/AGENTS.md index aaad35f..8bcc0dd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -30,7 +30,8 @@ Read in this order: Read in this order: 1. [docs/product/ROUTER-PHASE-PLAN.md](docs/product/ROUTER-PHASE-PLAN.md) 2. [docs/decision-log.md](docs/decision-log.md) -3. [docs/REPLAY-GUIDE.md](docs/REPLAY-GUIDE.md) +3. [docs/product/CODEX-CLI-SPIKE-SCOPE.md](docs/product/CODEX-CLI-SPIKE-SCOPE.md) +4. [docs/REPLAY-GUIDE.md](docs/REPLAY-GUIDE.md) ### I need security and risk context diff --git a/README.md b/README.md index 60ae06b..831703d 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting proces | `switchboard advise --surface openai-codex "your prompt"` | Returns an advisory routing recommendation for a selected surface without taking over execution. | You want a cross-surface recommendation or policy check before running a turn. | | `switchboard probe continuity` | Runs a continuity probe for prompt-driven turns and reports whether session continuity checks pass. | You want to verify non-interactive continuity behavior after changes. 
| | `switchboard probe continuity-interactive` | Runs the interactive continuity probe and verifies resume/session behavior across turns. | You want to validate interactive continuity and related checks. | +| `npm run switchboard:spike:codex-cli` | Inspects the local Codex CLI command surface and maps two routed turns to Codex `exec`/`resume --model` plans without making live model calls. | You want a product-aligned feasibility signal for Codex CLI route authority before building a deeper integration. | | `npm test` | Runs the full automated test suite for adapters, router, workflow, and CLI behavior. | You changed routing/workflow/docs and want a full regression check. | ### Interactive Mode Clarification diff --git a/docs/PRD.md b/docs/PRD.md index d226e44..b675048 100644 --- a/docs/PRD.md +++ b/docs/PRD.md @@ -16,6 +16,7 @@ The PRD content has been split into focused documents so each read is shorter an - Router contracts: [contracts/router-contracts.md](contracts/router-contracts.md) - MVP product scope: [product/MVP-PRD.md](product/MVP-PRD.md) - Router phase execution plan: [product/ROUTER-PHASE-PLAN.md](product/ROUTER-PHASE-PLAN.md) +- Codex CLI feasibility spike scope: [product/CODEX-CLI-SPIKE-SCOPE.md](product/CODEX-CLI-SPIKE-SCOPE.md) - Decision history: [decision-log.md](decision-log.md) - Replay and evaluation guide: [REPLAY-GUIDE.md](REPLAY-GUIDE.md) diff --git a/docs/decision-log.md b/docs/decision-log.md index e835fa3..2c84321 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -332,3 +332,43 @@ Consequences: Follow-up: - Next review milestone: (1) implement and live-verify improved advisory injection; (2) complete a client surface survey covering at minimum Cursor, GitHub Copilot Chat, Gemini CLI, and one gateway-backed path; (3) revisit Option C and Option D based on survey findings before broad adoption promotion. 
- Linked artifacts (logs, fixtures, docs, PRs): docs/product/MVP-PRD.md (Assumption B, Section 8 MVP defer list), docs/product/PRODUCT-PRD.md (Section 17.1 deferred items updated 2026-05-13), README.md (Interactive Mode Clarification section added 2026-05-13), src/switchboard/claude-hook-bridge.js (advisory injection comment) + +## Milestone 5 Spike: Codex CLI Feasibility + +Decision ID: DEC-2026-05-13-codex-cli-feasibility-spike +Related deferred item: PRODUCT-PRD.md Section 17.1 deferred items; milestone 5 second-surface proof +Status: committed +Date: 2026-05-13 +Owners: team + +Context: +- The current committed near-term path remains advisory injection inside running Claude sessions, but this alone does not answer whether Codex CLI can provide a better route-authority boundary. +- The product question is not whether OpenAI SDK calls can change models. The question is whether the Codex CLI user surface exposes a supported boundary where Switchboard can choose the execution target with low UX friction while preserving session continuity. + +Options considered: +- Option A: continue advisory hardening only and postpone all Codex feasibility testing. +- Option B: run a Codex CLI command-surface spike first, focused on supported `codex exec` and `codex exec resume` route authority. +- Option C: use OpenAI SDK calls as a proxy for Codex feasibility. + +Tradeoffs: +- Option A: lowest immediate effort, highest unresolved product-risk uncertainty. +- Option B: small incremental engineering overhead, answers the client-surface question directly, but initially proves command capability rather than live execution. +- Option C: easy to implement and live-test, but risks falsely validating the wrong surface because SDK calls do not represent Codex CLI session/tool/runtime behavior. 
+ +Verification signal: +- Expected signal: a reproducible probe that can show whether local Codex CLI exposes route-selected model authority at launch, non-interactive execution, or resume boundaries, and whether it should be treated as authoritative or advisory. +- Evidence observed: local Codex CLI help exposes `--model` for interactive launch, `codex exec --model`, and `codex exec resume --last --model`. A new probe (`scripts/codex-cli-feasibility-probe.js`) reports this as resume-boundary route authority, not in-session automatic switching. + +Decision: +- Chosen option: Option B. +- Scope of commitment: treat Codex CLI as a candidate for route authority at `exec`/`resume` boundaries and continue the spike with a live two-turn resume probe before making product promises. +- What remains intentionally deferred: claims of in-session automatic switching inside the Codex TUI; broad UX/product reframing decisions until live resume evidence is collected. + +Consequences: +- Near-term implementation impact: additive Codex CLI feasibility tooling; no change to Claude MVP promise. +- Test and replay impact: probe test coverage verifies that resume-boundary support is not misreported as in-session authority. +- Migration impact: low; probe is additive and does not alter core Claude workflow contracts. + +Follow-up: +- Next review milestone: use the written spike scope to run a live Codex CLI `exec` plus `exec resume --last --model ... --json` probe and inspect whether session continuity and route-selected model changes are visible in durable evidence. 
+- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, test/codex-cli-feasibility-probe.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md new file mode 100644 index 0000000..4608640 --- /dev/null +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -0,0 +1,137 @@ +# Codex CLI Feasibility Spike Scope + +## Status + +Status: active spike + +Decision record: `DEC-2026-05-13-codex-cli-feasibility-spike` in [../decision-log.md](../decision-log.md). + +## Purpose + +Determine whether Codex CLI can support Switchboard's core product promise better than the current Claude Code path: + +```text +Switchboard chooses the right execution target before a turn, +preserves continuity when switching is worth it, +and reduces the user's model-selection overhead. +``` + +This spike is not a product build. It is a bounded feasibility check. + +## Product Question + +Can Codex CLI provide a supported route-authority boundary where Switchboard can choose a model or execution target per turn while preserving enough session continuity to feel natural for software-delivery work? + +## What We Need To Verify + +1. **Route authority** + - Can Switchboard pass a route-selected model/profile into Codex CLI using supported CLI options? + - Does this work for both a new turn and a resumed session? + +2. **Continuity** + - Can a second routed Codex CLI turn resume the prior session after the first turn? + - Does continuity remain usable when the second turn uses a different route-selected model? + +3. **Evidence** + - Can the probe capture durable evidence that the selected model/profile changed between turns? + - Can the probe capture durable evidence that the second turn resumed the expected session? + - Is the evidence inspectable without reading raw implementation details? + +4. 
**User friction** + - Does the resulting workflow materially reduce model-selection overhead compared with manual Codex model choice? + - Does it avoid the Claude-style pattern of repeatedly exiting and re-entering an interactive session for each routed turn? + +5. **Boundary fit** + - Does the integration consume router contracts and target metadata rather than Codex-specific shortcuts? + - Does the result validate a reusable router boundary rather than a one-off Codex script? + +## Success Criteria + +The spike is successful only if all required criteria are met: + +1. A two-turn live probe can run through Codex CLI with two different Switchboard-selected models. +2. The second turn resumes the first turn's Codex session through a supported CLI mechanism. +3. The probe records both selected target IDs and resolved Codex models. +4. The probe records enough session evidence to show that continuity was preserved. +5. The workflow requires no manual model selection by the user after the prompt is provided. +6. The implementation remains a spike/probe and does not replace the Claude MVP path. + +## Partial Success + +The spike is partial, but still useful, if Codex CLI supports route-selected models only at `exec` or `resume` boundaries. + +That outcome would mean Codex CLI may support a non-interactive or wrapper-style Switchboard workflow, but it does not prove automatic switching inside a running interactive Codex TUI. + +## Failure Criteria + +The spike should be considered failed or blocked if any of the following are true: + +1. Codex CLI does not expose a supported model/profile option for resumed turns. +2. Resumed turns cannot preserve usable session continuity. +3. Route-selected model changes are accepted syntactically but cannot be verified from durable evidence. +4. The only viable path requires private, unsupported, or brittle Codex internals. +5. The resulting workflow has the same or worse cognitive overhead as manual model selection. 
+ +## Explicit Non-Goals + +This spike must not attempt to: + +1. Build a full Codex workflow product. +2. Replace the Claude Code MVP. +3. Implement automatic switching inside a running Codex TUI unless Codex exposes a supported mechanism. +4. Build a provider gateway or proxy. +5. Add cross-vendor routing beyond the Codex CLI surface. +6. Tune routing policy or target taxonomy. +7. Add persistent production state beyond probe evidence. +8. Make public product claims before live evidence is collected. + +## Probe Plan + +### Phase 1: Command-Surface Probe + +Status: implemented. + +Use [../../scripts/codex-cli-feasibility-probe.js](../../scripts/codex-cli-feasibility-probe.js) to inspect local Codex CLI help output and produce a capability report. + +Expected output: + +- `status: partial` when `codex exec --model` and `codex exec resume --last --model` are available. +- `authoritativeInsideRunningSession: false` unless a supported in-session mechanism is discovered. +- Two planned routed turns with different selected targets and models. + +### Phase 2: Live Resume Probe + +Status: next. + +Run two real Codex CLI turns: + +1. First turn: route an implementation/debugging prompt to a strong coding target and execute with `codex exec --model <model> --json`. +2. Second turn: route a summary/acknowledgement prompt to a cheap/fast target and execute with `codex exec resume --last --model <model> --json`. +3. Capture JSON/session evidence. +4. Evaluate model change, session continuity, and user friction against the success criteria above. + +### Phase 3: Product Decision + +Classify the result as one of: + +- `verified`: Codex CLI supports route-selected resumed turns with usable continuity and inspectable evidence. +- `partial`: Codex CLI supports route-selected command boundaries but not a low-friction interactive workflow. +- `blocked`: Codex CLI cannot support route authority with continuity through supported mechanisms. 
+ +## Stop Conditions + +Stop the spike when one of these is true: + +1. The success criteria are met and documented. +2. A failure criterion is met and documented. +3. The next step would require building production workflow infrastructure instead of collecting feasibility evidence. +4. The next step would require relying on unsupported Codex internals. + +## Deliverables + +The spike should produce: + +1. A command-surface capability report. +2. A live two-turn resume probe result, if feasible. +3. A short decision-log update classifying the result as `verified`, `partial`, or `blocked`. +4. A recommendation for whether Codex CLI should remain a serious candidate for the next product surface. diff --git a/package.json b/package.json index 581c330..960fb01 100644 --- a/package.json +++ b/package.json @@ -41,7 +41,8 @@ "switchboard:interactive": "node bin/switchboard.js --interactive", "switchboard:explain": "node bin/switchboard.js explain --json", "switchboard:continuity": "node bin/switchboard.js probe continuity --no-tools --inter-turn-delay-ms 1000", - "switchboard:continuity:interactive": "node bin/switchboard.js probe continuity-interactive --json" + "switchboard:continuity:interactive": "node bin/switchboard.js probe continuity-interactive --json", + "switchboard:spike:codex-cli": "node scripts/codex-cli-feasibility-probe.js" }, "repository": { "type": "git", diff --git a/scripts/codex-cli-feasibility-probe.js b/scripts/codex-cli-feasibility-probe.js new file mode 100644 index 0000000..31d934a --- /dev/null +++ b/scripts/codex-cli-feasibility-probe.js @@ -0,0 +1,180 @@ +#!/usr/bin/env node +import { spawnSync } from "node:child_process"; +import fs from "node:fs"; +import { fileURLToPath } from "node:url"; +import { routePrompt } from "../src/router/router.js"; +import { OPENAI_TARGETS_PATH } from "../src/switchboard/paths.js"; +import { getProfileModelMap, getTargetProfileMap } from "../src/adapters/model-mappings.js"; + +const TARGET_TO_PROFILE = 
getTargetProfileMap("openai-codex"); +const PROFILE_TO_MODEL = getProfileModelMap("openai-codex"); + +function readJson(filePath) { + // eslint-disable-next-line security/detect-non-literal-fs-filename -- probe reads the known targets path or explicit test fixture path. + return JSON.parse(fs.readFileSync(filePath, "utf8")); +} + +function getArg(args, flag) { + const idx = args.lastIndexOf(flag); + if (idx === -1 || idx + 1 >= args.length) return null; + return args[idx + 1]; +} + +function runHelp(codexBin, args) { + const result = spawnSync(codexBin, args, { + encoding: "utf8", + env: { ...process.env, NO_COLOR: "1" } + }); + return { + command: [codexBin, ...args].join(" "), + status: result.status, + stdout: result.stdout || "", + stderr: result.stderr || "", + ok: result.status === 0 + }; +} + +function hasOption(helpText, option) { + return new RegExp(`(^|\\n)\\s*(?:-[^\\n,]+,\\s*)?${option.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}(\\s|[<\\[])`).test(helpText); +} + +function hasCommand(helpText, command) { + return new RegExp(`(^|\\n)\\s*${command}\\b`).test(helpText); +} + +function routeTurnPlan({ input, session, targets }) { + const route = routePrompt({ + input, + session, + targets, + executionSupported: false + }); + const targetId = route.selectedTarget?.id || null; + const profile = targetId ? TARGET_TO_PROFILE[targetId] || null : null; + const model = profile ? PROFILE_TO_MODEL[profile] || null : null; + return { + input, + route: { + status: route.status, + mode: route.mode, + selectedTargetId: targetId, + targetClass: route.selectedTarget?.target_class || null, + shouldSwitch: route.shouldSwitch, + explanation: route.explanation + }, + codex: { + profile, + model, + execArgs: model ? ["exec", "--model", model, input] : null, + resumeArgs: model ? 
["exec", "resume", "--last", "--model", model, input] : null + } + }; +} + +export function runCodexCliFeasibilityProbe({ + codexBin = "codex", + targets = readJson(OPENAI_TARGETS_PATH).targets +} = {}) { + const rootHelp = runHelp(codexBin, ["--help"]); + const execHelp = runHelp(codexBin, ["exec", "--help"]); + const resumeHelp = runHelp(codexBin, ["exec", "resume", "--help"]); + + const rootText = `${rootHelp.stdout}\n${rootHelp.stderr}`; + const execText = `${execHelp.stdout}\n${execHelp.stderr}`; + const resumeText = `${resumeHelp.stdout}\n${resumeHelp.stderr}`; + const commandAvailable = rootHelp.ok; + + const capabilities = { + interactiveModelAtLaunch: commandAvailable && hasOption(rootText, "--model"), + execModelAtLaunch: execHelp.ok && hasOption(execText, "--model"), + execResumeCommand: execHelp.ok && hasCommand(execText, "resume"), + execResumeModelOverride: resumeHelp.ok && hasOption(resumeText, "--model"), + execResumeLastSession: resumeHelp.ok && hasOption(resumeText, "--last"), + jsonEvents: execHelp.ok && hasOption(execText, "--json") + }; + + const session = { + mode: "plan", + currentTargetId: null, + turnCount: 0, + routingOverride: "auto", + vendorClient: "openai-codex", + clientSurface: "codex-cli" + }; + + const first = routeTurnPlan({ + input: "Implement the retry logic with clear tests and error handling.", + session, + targets + }); + if (first.route.status === "ok") { + session.mode = first.route.mode; + session.currentTargetId = first.route.selectedTargetId; + session.turnCount += 1; + } + const second = routeTurnPlan({ + input: "Thanks, summarize the outcome briefly.", + session, + targets + }); + + const targetChanged = Boolean( + first.route.selectedTargetId && + second.route.selectedTargetId && + first.route.selectedTargetId !== second.route.selectedTargetId + ); + const resumeBoundaryRerouteSupported = Boolean( + targetChanged && + capabilities.execResumeCommand && + capabilities.execResumeModelOverride && + 
capabilities.execResumeLastSession + ); + + return { + status: !commandAvailable + ? "blocked" + : resumeBoundaryRerouteSupported + ? "partial" + : "advisory_only", + surface: "codex-cli", + verdict: { + authoritativeInsideRunningSession: false, + resumeBoundaryRerouteSupported, + nonInteractiveTurnRoutingSupported: Boolean(capabilities.execModelAtLaunch), + advisorySupported: commandAvailable, + targetChanged + }, + capabilities, + turnPlans: [first, second], + evidence: { + commands: [ + { command: rootHelp.command, status: rootHelp.status, ok: rootHelp.ok }, + { command: execHelp.command, status: execHelp.status, ok: execHelp.ok }, + { command: resumeHelp.command, status: resumeHelp.status, ok: resumeHelp.ok } + ], + interpretation: resumeBoundaryRerouteSupported + ? "Codex CLI appears to support route-selected model changes at exec/resume boundaries, not from inside an already-running interactive TUI." + : "Codex CLI did not expose enough local command capability to prove route-selected model changes at a resume boundary." + }, + limitations: [ + "This probe does not execute live model calls.", + "This probe does not prove model changes from inside an already-running Codex TUI session.", + "A follow-up live probe should run two non-interactive Codex turns with resume and different --model values, then inspect JSON/session evidence." + ] + }; +} + +async function main() { + const args = process.argv.slice(2); + const codexBin = getArg(args, "--codex-bin") || "codex"; + const result = runCodexCliFeasibilityProbe({ codexBin }); + process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); + process.exitCode = result.status === "blocked" ? 
1 : 0; +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((error) => { + process.stderr.write(`codex-cli-feasibility-probe failed: ${error.message}\n`); + process.exitCode = 1; + }); +} diff --git a/test/codex-cli-feasibility-probe.test.js b/test/codex-cli-feasibility-probe.test.js new file mode 100644 index 0000000..cf34360 --- /dev/null +++ b/test/codex-cli-feasibility-probe.test.js @@ -0,0 +1,79 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { runCodexCliFeasibilityProbe } from "../scripts/codex-cli-feasibility-probe.js"; + +function createTargets() { + return [ + { + id: "openai-quick", + label: "quick", + target_class: "cheap_fast", + capabilities: ["chat", "structured_output"], + privacy_tier: "external", + availability: "available" + }, + { + id: "openai-coder", + label: "best coder", + target_class: "strong_coding", + capabilities: [ + "chat", + "reasoning", + "structured_output", + "repo_context", + "file_read", + "file_edit", + "shell_execution", + "test_execution" + ], + privacy_tier: "external", + availability: "available" + } + ]; +} + +function createFakeCodexBin() { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-cli-probe-test-")); + const binPath = path.join(dir, "codex"); + fs.writeFileSync( + binPath, + `#!/usr/bin/env node +const args = process.argv.slice(2); +if (args.join(" ") === "--help") { + console.log("Commands:\\n exec Run Codex non-interactively\\n resume Resume a previous interactive session\\nOptions:\\n -m, --model \\n"); + process.exit(0); +} +if (args.join(" ") === "exec --help") { + console.log("Commands:\\n resume Resume a previous session\\nOptions:\\n -m, --model \\n --json\\n"); + process.exit(0); +} +if (args.join(" ") === "exec resume --help") { + console.log("Options:\\n --last\\n -m, --model \\n --json\\n"); + process.exit(0); +} +process.exit(2); +`, + "utf8" + ); 
+ fs.chmodSync(binPath, 0o755); + return binPath; +} + +test("codex CLI feasibility probe reports resume-boundary routing without claiming in-session authority", () => { + const result = runCodexCliFeasibilityProbe({ + codexBin: createFakeCodexBin(), + targets: createTargets() + }); + + assert.equal(result.surface, "codex-cli"); + assert.equal(result.status, "partial"); + assert.equal(result.verdict.authoritativeInsideRunningSession, false); + assert.equal(result.verdict.resumeBoundaryRerouteSupported, true); + assert.equal(result.verdict.nonInteractiveTurnRoutingSupported, true); + assert.equal(result.verdict.targetChanged, true); + assert.equal(result.turnPlans[0].route.selectedTargetId, "openai-coder"); + assert.equal(result.turnPlans[1].route.selectedTargetId, "openai-quick"); +}); From 9e62a93ce1f46191f12752ce8cc7fca8da1456c8 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 16:12:43 +0200 Subject: [PATCH 02/23] test(spike): verify Codex CLI resume routing Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- README.md | 1 + docs/decision-log.md | 13 +- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 12 +- package.json | 3 +- scripts/codex-cli-feasibility-probe.js | 238 +++++++++++++++++++++-- test/codex-cli-feasibility-probe.test.js | 34 ++++ 6 files changed, 280 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 831703d..68acb30 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting proces | `switchboard probe continuity` | Runs a continuity probe for prompt-driven turns and reports whether session continuity checks pass. | You want to verify non-interactive continuity behavior after changes. | | `switchboard probe continuity-interactive` | Runs the interactive continuity probe and verifies resume/session behavior across turns. 
| You want to validate interactive continuity and related checks. | | `npm run switchboard:spike:codex-cli` | Inspects the local Codex CLI command surface and maps two routed turns to Codex `exec`/`resume --model` plans without making live model calls. | You want a product-aligned feasibility signal for Codex CLI route authority before building a deeper integration. | +| `npm run switchboard:spike:codex-cli:live` | Runs the bounded two-turn Codex CLI resume probe with route-selected models and captures JSON/session evidence. | You are ready to collect live evidence for the Codex CLI feasibility spike. | | `npm test` | Runs the full automated test suite for adapters, router, workflow, and CLI behavior. | You changed routing/workflow/docs and want a full regression check. | ### Interactive Mode Clarification diff --git a/docs/decision-log.md b/docs/decision-log.md index 2c84321..0a06922 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -337,7 +337,7 @@ Follow-up: Decision ID: DEC-2026-05-13-codex-cli-feasibility-spike Related deferred item: PRODUCT-PRD.md Section 17.1 deferred items; milestone 5 second-surface proof -Status: committed +Status: committed; live resume boundary verified Date: 2026-05-13 Owners: team @@ -357,18 +357,19 @@ Tradeoffs: Verification signal: - Expected signal: a reproducible probe that can show whether local Codex CLI exposes route-selected model authority at launch, non-interactive execution, or resume boundaries, and whether it should be treated as authoritative or advisory. -- Evidence observed: local Codex CLI help exposes `--model` for interactive launch, `codex exec --model`, and `codex exec resume --last --model`. A new probe (`scripts/codex-cli-feasibility-probe.js`) reports this as resume-boundary route authority, not in-session automatic switching. +- Evidence observed: local Codex CLI help exposes `--model` for interactive launch, `codex exec --model`, and `codex exec resume --last --model`. 
The command-surface probe reports this as resume-boundary route authority, not in-session automatic switching. +- Live evidence observed on 2026-05-13: `npm run switchboard:spike:codex-cli:live` completed a two-turn probe. Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`; turn 2 resumed with `openai-quick` / `codex-fast` / `gpt-5.4-mini`; both turns reported shared thread/session evidence `019e21ad-1f30-72d0-bec0-08d275284eaf`. Decision: - Chosen option: Option B. -- Scope of commitment: treat Codex CLI as a candidate for route authority at `exec`/`resume` boundaries and continue the spike with a live two-turn resume probe before making product promises. -- What remains intentionally deferred: claims of in-session automatic switching inside the Codex TUI; broad UX/product reframing decisions until live resume evidence is collected. +- Scope of commitment: treat Codex CLI as a verified candidate for route authority at `exec`/`resume` boundaries and continue product evaluation against the written spike scope before changing the primary MVP path. +- What remains intentionally deferred: claims of in-session automatic switching inside the Codex TUI; broad UX/product reframing decisions beyond the verified non-interactive resume workflow. Consequences: - Near-term implementation impact: additive Codex CLI feasibility tooling; no change to Claude MVP promise. -- Test and replay impact: probe test coverage verifies that resume-boundary support is not misreported as in-session authority. +- Test and replay impact: probe test coverage verifies that resume-boundary support is not misreported as in-session authority, and live mode records selected targets, resolved models, final-message evidence, JSON event counts, and session/thread IDs. - Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Follow-up: -- Next review milestone: use the written spike scope to run a live Codex CLI `exec` plus `exec resume --last --model ... 
--json` probe and inspect whether session continuity and route-selected model changes are visible in durable evidence. +- Next review milestone: decide whether the verified Codex CLI `exec`/`resume` boundary is strong enough to justify a product workflow spike, or whether the lack of in-session automatic switching keeps this in the deferred candidate bucket. - Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, test/codex-cli-feasibility-probe.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 4608640..14cac97 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -2,7 +2,7 @@ ## Status -Status: active spike +Status: live resume boundary verified; product decision pending Decision record: `DEC-2026-05-13-codex-cli-feasibility-spike` in [../decision-log.md](../decision-log.md). @@ -101,7 +101,7 @@ Expected output: ### Phase 2: Live Resume Probe -Status: next. +Status: verified for `exec`/`resume` boundary continuity. Run two real Codex CLI turns: @@ -110,6 +110,14 @@ Run two real Codex CLI turns: 3. Capture JSON/session evidence. 4. Evaluate model change, session continuity, and user friction against the success criteria above. +Observed 2026-05-13: + +- First routed turn selected `openai-coder` / `codex-best-coder` / `gpt-5.5`. +- Second routed turn selected `openai-quick` / `codex-fast` / `gpt-5.4-mini`. +- `codex exec resume --last --model gpt-5.4-mini --json` completed successfully. +- Both turns reported shared thread/session evidence: `019e21ad-1f30-72d0-bec0-08d275284eaf`. +- This verifies route-selected model changes at Codex CLI `exec`/`resume` boundaries, not model changes inside an already-running interactive Codex TUI. 
+ ### Phase 3: Product Decision Classify the result as one of: diff --git a/package.json b/package.json index 960fb01..27fd82e 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,8 @@ "switchboard:explain": "node bin/switchboard.js explain --json", "switchboard:continuity": "node bin/switchboard.js probe continuity --no-tools --inter-turn-delay-ms 1000", "switchboard:continuity:interactive": "node bin/switchboard.js probe continuity-interactive --json", - "switchboard:spike:codex-cli": "node scripts/codex-cli-feasibility-probe.js" + "switchboard:spike:codex-cli": "node scripts/codex-cli-feasibility-probe.js", + "switchboard:spike:codex-cli:live": "node scripts/codex-cli-feasibility-probe.js --live" }, "repository": { "type": "git", diff --git a/scripts/codex-cli-feasibility-probe.js b/scripts/codex-cli-feasibility-probe.js index 31d934a..be81e0a 100644 --- a/scripts/codex-cli-feasibility-probe.js +++ b/scripts/codex-cli-feasibility-probe.js @@ -1,6 +1,8 @@ #!/usr/bin/env node import { spawnSync } from "node:child_process"; import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; import { fileURLToPath } from "node:url"; import { routePrompt } from "../src/router/router.js"; import { OPENAI_TARGETS_PATH } from "../src/switchboard/paths.js"; @@ -20,6 +22,10 @@ function getArg(args, flag) { return args[idx + 1]; } +function hasFlag(args, flag) { + return args.includes(flag); +} + function runHelp(codexBin, args) { const result = spawnSync(codexBin, args, { encoding: "utf8", @@ -71,9 +77,204 @@ function routeTurnPlan({ input, session, targets }) { }; } +function runCodexCommand(codexBin, args, { cwd = process.cwd() } = {}) { + const result = spawnSync(codexBin, args, { + cwd, + encoding: "utf8", + env: { ...process.env, NO_COLOR: "1" } + }); + return { + command: [codexBin, ...args].join(" "), + args, + status: result.status, + signal: result.signal, + stdout: result.stdout || "", + stderr: result.stderr || "", + ok: result.status === 0 
+ }; +} + +function parseJsonLines(text) { + const events = []; + for (const line of text.split(/\r?\n/)) { + const trimmed = line.trim(); + if (!trimmed) continue; + try { + events.push(JSON.parse(trimmed)); + } catch { + // Codex may print warnings before JSON events; keep raw output in evidence. + } + } + return events; +} + +function isUuid(value) { + if (typeof value !== "string" || value.length !== 36) return false; + const parts = value.split("-"); + if (parts.length !== 5) return false; + const lengths = [8, 4, 4, 4, 12]; + return parts.every((part, index) => part.length === lengths[index] && [...part].every((char) => /[0-9a-f]/i.test(char))); +} + +function collectSessionIds(value, ids = new Set()) { + if (!value || typeof value !== "object") return ids; + for (const [key, child] of Object.entries(value)) { + if (typeof child === "string" && /session/i.test(key) && isUuid(child)) { + ids.add(child); + } else if (child && typeof child === "object") { + collectSessionIds(child, ids); + } + } + return ids; +} + +function extractSessionIds(text, events) { + const ids = collectSessionIds({ events }); + for (const token of text.split(/[^0-9a-f-]+/i)) { + if (isUuid(token)) { + ids.add(token); + } + } + return [...ids]; +} + +function readIfExists(filePath) { + try { + // eslint-disable-next-line security/detect-non-literal-fs-filename -- probe reads its own temp output files. + return fs.readFileSync(filePath, "utf8"); + } catch (error) { + if (error.code === "ENOENT") return ""; + throw error; + } +} + +function tailText(text, maxLength = 1200) { + if (!text) return ""; + return text.length > maxLength ? 
text.slice(text.length - maxLength) : text; +} + +function summarizeLiveTurn({ label, plan, commandResult, outputPath }) { + const events = parseJsonLines(commandResult.stdout); + const text = `${commandResult.stdout}\n${commandResult.stderr}`; + const sessionIds = extractSessionIds(text, events); + const finalMessage = readIfExists(outputPath); + return { + label, + selectedTargetId: plan.route.selectedTargetId, + selectedProfile: plan.codex.profile, + selectedModel: plan.codex.model, + command: commandResult.command, + status: commandResult.status, + signal: commandResult.signal, + ok: commandResult.ok, + stdoutTail: tailText(commandResult.stdout), + stderrTail: tailText(commandResult.stderr), + outputPath, + finalMessageBytes: Buffer.byteLength(finalMessage, "utf8"), + jsonEventCount: events.length, + sessionIds + }; +} + +function runLiveResumeProbe({ codexBin, first, second, cwd = process.cwd() }) { + if (!first.codex.model || !second.codex.model) { + return { + status: "blocked", + reason: "The router did not resolve Codex models for both live turns.", + turns: [] + }; + } + + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-cli-feasibility-")); + const firstOutputPath = path.join(tempDir, "turn-1-final-message.txt"); + const secondOutputPath = path.join(tempDir, "turn-2-final-message.txt"); + + const firstResult = runCodexCommand( + codexBin, + [ + "exec", + "--model", + first.codex.model, + "--sandbox", + "read-only", + "--json", + "--output-last-message", + firstOutputPath, + "--cd", + cwd, + first.input + ], + { cwd } + ); + const firstTurn = summarizeLiveTurn({ + label: "first", + plan: first, + commandResult: firstResult, + outputPath: firstOutputPath + }); + + if (!firstResult.ok) { + return { + status: "blocked", + reason: "The first live Codex CLI turn failed.", + turns: [firstTurn] + }; + } + + const secondResult = runCodexCommand( + codexBin, + [ + "exec", + "resume", + "--last", + "--model", + second.codex.model, + "-c", + 
'sandbox_mode="read-only"', + "--json", + "--output-last-message", + secondOutputPath, + second.input + ], + { cwd } + ); + const secondTurn = summarizeLiveTurn({ + label: "second", + plan: second, + commandResult: secondResult, + outputPath: secondOutputPath + }); + + if (!secondResult.ok) { + return { + status: "blocked", + reason: "The resumed live Codex CLI turn failed.", + turns: [firstTurn, secondTurn] + }; + } + + const sharedSessionIds = firstTurn.sessionIds.filter((id) => secondTurn.sessionIds.includes(id)); + const modelChanged = first.codex.model !== second.codex.model; + const continuityEvidence = sharedSessionIds.length > 0 ? "shared_session_id" : "resume_last_success_without_session_id"; + + return { + status: modelChanged && sharedSessionIds.length > 0 ? "verified" : "partial", + reason: + sharedSessionIds.length > 0 + ? "The live resumed turn completed with a different route-selected model and shared session evidence." + : "The live resumed turn completed with a different route-selected model, but session continuity was not visible as a shared session id.", + turns: [firstTurn, secondTurn], + modelChanged, + continuityEvidence, + sharedSessionIds + }; +} + export function runCodexCliFeasibilityProbe({ codexBin = "codex", - targets = readJson(OPENAI_TARGETS_PATH).targets + targets = readJson(OPENAI_TARGETS_PATH).targets, + live = false, + cwd = process.cwd() } = {}) { const rootHelp = runHelp(codexBin, ["--help"]); const execHelp = runHelp(codexBin, ["exec", "--help"]); @@ -103,7 +304,7 @@ export function runCodexCliFeasibilityProbe({ }; const first = routeTurnPlan({ - input: "Implement the retry logic with clear tests and error handling.", + input: "Do not edit files or run commands. 
Briefly outline how you would implement retry logic with clear tests and error handling.", session, targets }); @@ -130,22 +331,29 @@ export function runCodexCliFeasibilityProbe({ capabilities.execResumeLastSession ); - return { - status: !commandAvailable + const surfaceStatus = !commandAvailable ? "blocked" : resumeBoundaryRerouteSupported ? "partial" - : "advisory_only", + : "advisory_only"; + const liveProbe = live && surfaceStatus !== "blocked" ? runLiveResumeProbe({ codexBin, first, second, cwd }) : null; + const status = liveProbe ? liveProbe.status : surfaceStatus; + + return { + status, surface: "codex-cli", + mode: live ? "live_resume" : "command_surface", verdict: { authoritativeInsideRunningSession: false, resumeBoundaryRerouteSupported, nonInteractiveTurnRoutingSupported: Boolean(capabilities.execModelAtLaunch), advisorySupported: commandAvailable, - targetChanged + targetChanged, + liveResumeVerified: liveProbe?.status === "verified" || false }, capabilities, turnPlans: [first, second], + liveProbe, evidence: { commands: [ { command: rootHelp.command, status: rootHelp.status, ok: rootHelp.ok }, @@ -156,18 +364,24 @@ export function runCodexCliFeasibilityProbe({ ? "Codex CLI appears to support route-selected model changes at exec/resume boundaries, not from inside an already-running interactive TUI." : "Codex CLI did not expose enough local command capability to prove route-selected model changes at a resume boundary." }, - limitations: [ - "This probe does not execute live model calls.", - "This probe does not prove model changes from inside an already-running Codex TUI session.", - "A follow-up live probe should run two non-interactive Codex turns with resume and different --model values, then inspect JSON/session evidence." - ] + limitations: live + ? [ + "This probe does not prove model changes from inside an already-running Codex TUI session.", + "A verified result requires shared session evidence in Codex CLI JSON/stdout/stderr output." 
+ ] + : [ + "This probe does not execute live model calls.", + "This probe does not prove model changes from inside an already-running Codex TUI session.", + "A follow-up live probe should run two non-interactive Codex turns with resume and different --model values, then inspect JSON/session evidence." + ] }; } async function main() { const args = process.argv.slice(2); const codexBin = getArg(args, "--codex-bin") || "codex"; - const result = runCodexCliFeasibilityProbe({ codexBin }); + const cwd = getArg(args, "--cwd") || process.cwd(); + const result = runCodexCliFeasibilityProbe({ codexBin, live: hasFlag(args, "--live"), cwd }); process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); process.exitCode = result.status === "blocked" ? 1 : 0; } diff --git a/test/codex-cli-feasibility-probe.test.js b/test/codex-cli-feasibility-probe.test.js index cf34360..90c54b0 100644 --- a/test/codex-cli-feasibility-probe.test.js +++ b/test/codex-cli-feasibility-probe.test.js @@ -41,6 +41,7 @@ function createFakeCodexBin() { fs.writeFileSync( binPath, `#!/usr/bin/env node +const fs = require("node:fs"); const args = process.argv.slice(2); if (args.join(" ") === "--help") { console.log("Commands:\\n exec Run Codex non-interactively\\n resume Resume a previous interactive session\\nOptions:\\n -m, --model \\n"); @@ -54,6 +55,20 @@ if (args.join(" ") === "exec resume --help") { console.log("Options:\\n --last\\n -m, --model \\n --json\\n"); process.exit(0); } +if (args[0] === "exec" && args[1] === "--model") { + const outputPath = args[args.indexOf("--output-last-message") + 1]; + fs.writeFileSync(outputPath, "implemented retry logic"); + console.log(JSON.stringify({ type: "session", session_id: "11111111-1111-4111-8111-111111111111" })); + console.log(JSON.stringify({ type: "turn_complete", model: args[2] })); + process.exit(0); +} +if (args[0] === "exec" && args[1] === "resume" && args.includes("--last")) { + const outputPath = args[args.indexOf("--output-last-message") + 1]; 
+ fs.writeFileSync(outputPath, "summarized outcome"); + console.log(JSON.stringify({ type: "session", session_id: "11111111-1111-4111-8111-111111111111" })); + console.log(JSON.stringify({ type: "turn_complete", model: args[args.indexOf("--model") + 1] })); + process.exit(0); +} process.exit(2); `, "utf8" @@ -77,3 +92,22 @@ test("codex CLI feasibility probe reports resume-boundary routing without claimi assert.equal(result.turnPlans[0].route.selectedTargetId, "openai-coder"); assert.equal(result.turnPlans[1].route.selectedTargetId, "openai-quick"); }); + +test("codex CLI live feasibility probe verifies resumed turns only with shared session evidence", () => { + const result = runCodexCliFeasibilityProbe({ + codexBin: createFakeCodexBin(), + targets: createTargets(), + live: true + }); + + assert.equal(result.mode, "live_resume"); + assert.equal(result.status, "verified"); + assert.equal(result.verdict.liveResumeVerified, true); + assert.equal(result.liveProbe.modelChanged, true); + assert.equal(result.liveProbe.continuityEvidence, "shared_session_id"); + assert.deepEqual(result.liveProbe.sharedSessionIds, ["11111111-1111-4111-8111-111111111111"]); + assert.equal(result.liveProbe.turns[0].selectedTargetId, "openai-coder"); + assert.equal(result.liveProbe.turns[1].selectedTargetId, "openai-quick"); + assert.equal(result.liveProbe.turns[0].finalMessageBytes > 0, true); + assert.equal(result.liveProbe.turns[1].finalMessageBytes > 0, true); +}); From 21b86dfc651fcb4973224daba00c4a57c5024af3 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 16:51:17 +0200 Subject: [PATCH 03/23] docs(spike): clarify Codex parity bar Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- docs/decision-log.md | 11 ++++---- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 36 ++++++++++++++++++++------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/docs/decision-log.md 
b/docs/decision-log.md index 0a06922..dc279db 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -337,13 +337,14 @@ Follow-up: Decision ID: DEC-2026-05-13-codex-cli-feasibility-spike Related deferred item: PRODUCT-PRD.md Section 17.1 deferred items; milestone 5 second-surface proof -Status: committed; live resume boundary verified +Status: committed; resume boundary verified; beyond-parity evidence pending Date: 2026-05-13 Owners: team Context: - The current committed near-term path remains advisory injection inside running Claude sessions, but this alone does not answer whether Codex CLI can provide a better route-authority boundary. - The product question is not whether OpenAI SDK calls can change models. The question is whether the Codex CLI user surface exposes a supported boundary where Switchboard can choose the execution target with low UX friction while preserving session continuity. +- The primary differentiator beyond Claude parity is in-session model switching. A command-boundary `exec`/`resume` workflow is useful evidence, but it is not enough to justify a product-direction change by itself. Options considered: - Option A: continue advisory hardening only and postpone all Codex feasibility testing. @@ -362,14 +363,14 @@ Verification signal: Decision: - Chosen option: Option B. -- Scope of commitment: treat Codex CLI as a verified candidate for route authority at `exec`/`resume` boundaries and continue product evaluation against the written spike scope before changing the primary MVP path. -- What remains intentionally deferred: claims of in-session automatic switching inside the Codex TUI; broad UX/product reframing decisions beyond the verified non-interactive resume workflow. +- Scope of commitment: treat Codex CLI as verified only for route authority at `exec`/`resume` boundaries. This is partial/parity evidence, not a verified beyond-parity product path. 
+- What remains intentionally deferred: claims of in-session automatic switching inside the Codex TUI; broad UX/product reframing decisions until a supported in-session switch mechanism is found and verified. Consequences: - Near-term implementation impact: additive Codex CLI feasibility tooling; no change to Claude MVP promise. - Test and replay impact: probe test coverage verifies that resume-boundary support is not misreported as in-session authority, and live mode records selected targets, resolved models, final-message evidence, JSON event counts, and session/thread IDs. -- Migration impact: low; probe is additive and does not alter core Claude workflow contracts. +- Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence. Follow-up: -- Next review milestone: decide whether the verified Codex CLI `exec`/`resume` boundary is strong enough to justify a product workflow spike, or whether the lack of in-session automatic switching keeps this in the deferred candidate bucket. +- Next review milestone: investigate whether Codex CLI exposes a supported in-session model switch mechanism. If no supported mechanism exists, keep Codex CLI in the deferred candidate bucket despite the verified `exec`/`resume` boundary. 
- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, test/codex-cli-feasibility-probe.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 14cac97..2961df3 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -2,7 +2,7 @@ ## Status -Status: live resume boundary verified; product decision pending +Status: resume boundary verified; beyond-parity evidence pending Decision record: `DEC-2026-05-13-codex-cli-feasibility-spike` in [../decision-log.md](../decision-log.md). @@ -20,13 +20,14 @@ This spike is not a product build. It is a bounded feasibility check. ## Product Question -Can Codex CLI provide a supported route-authority boundary where Switchboard can choose a model or execution target per turn while preserving enough session continuity to feel natural for software-delivery work? +Can Codex CLI provide a supported in-session route-authority boundary where Switchboard can choose a model or execution target during an ongoing user session, preserving continuity without forcing a command-boundary workflow? ## What We Need To Verify 1. **Route authority** - Can Switchboard pass a route-selected model/profile into Codex CLI using supported CLI options? - Does this work for both a new turn and a resumed session? + - Does Codex CLI expose any supported mechanism to change the model inside an already-running interactive session? 2. **Continuity** - Can a second routed Codex CLI turn resume the prior session after the first turn? @@ -40,6 +41,7 @@ Can Codex CLI provide a supported route-authority boundary where Switchboard can 4. **User friction** - Does the resulting workflow materially reduce model-selection overhead compared with manual Codex model choice? - Does it avoid the Claude-style pattern of repeatedly exiting and re-entering an interactive session for each routed turn? 
+ - Does it go beyond Claude parity by allowing the user to remain inside the same interactive session while Switchboard changes the route? 5. **Boundary fit** - Does the integration consume router contracts and target metadata rather than Codex-specific shortcuts? @@ -49,8 +51,8 @@ Can Codex CLI provide a supported route-authority boundary where Switchboard can The spike is successful only if all required criteria are met: -1. A two-turn live probe can run through Codex CLI with two different Switchboard-selected models. -2. The second turn resumes the first turn's Codex session through a supported CLI mechanism. +1. A supported Codex CLI mechanism allows Switchboard to change the selected model/profile inside an already-running interactive session. +2. A two-turn live probe can exercise that mechanism with two different Switchboard-selected models without requiring the user to exit or resume a separate command. 3. The probe records both selected target IDs and resolved Codex models. 4. The probe records enough session evidence to show that continuity was preserved. 5. The workflow requires no manual model selection by the user after the prompt is provided. @@ -60,7 +62,7 @@ The spike is successful only if all required criteria are met: The spike is partial, but still useful, if Codex CLI supports route-selected models only at `exec` or `resume` boundaries. -That outcome would mean Codex CLI may support a non-interactive or wrapper-style Switchboard workflow, but it does not prove automatic switching inside a running interactive Codex TUI. +That outcome means Codex CLI may support a non-interactive or wrapper-style Switchboard workflow, but it does not go beyond Claude parity for the primary product differentiator: automatic model switching inside a running interactive session. ## Failure Criteria @@ -71,6 +73,7 @@ The spike should be considered failed or blocked if any of the following are tru 3. 
Route-selected model changes are accepted syntactically but cannot be verified from durable evidence. 4. The only viable path requires private, unsupported, or brittle Codex internals. 5. The resulting workflow has the same or worse cognitive overhead as manual model selection. +6. Codex CLI cannot support in-session model changes beyond the same command-boundary pattern already available through Claude-style launch/resume flows. ## Explicit Non-Goals @@ -101,7 +104,7 @@ Expected output: ### Phase 2: Live Resume Probe -Status: verified for `exec`/`resume` boundary continuity. +Status: verified as partial/parity evidence for `exec`/`resume` boundary continuity. Run two real Codex CLI turns: @@ -117,13 +120,28 @@ Observed 2026-05-13: - `codex exec resume --last --model gpt-5.4-mini --json` completed successfully. - Both turns reported shared thread/session evidence: `019e21ad-1f30-72d0-bec0-08d275284eaf`. - This verifies route-selected model changes at Codex CLI `exec`/`resume` boundaries, not model changes inside an already-running interactive Codex TUI. +- Because in-session switching is the intended differentiator beyond Claude, this evidence is partial/parity evidence rather than a success condition for changing product direction. -### Phase 3: Product Decision +### Phase 3: In-Session Switch Probe + +Status: next; required for beyond-parity product direction. + +Investigate whether Codex CLI exposes a supported hook, command, control protocol, config reload behavior, or interactive-session API that can change the active model after an interactive session has started. + +Required evidence: + +1. The mechanism is documented, exposed in help output, or otherwise supportable without private internals. +2. A running interactive session accepts a route-selected model/profile change after the first user turn. +3. A second user turn runs under the new model while preserving the same interactive session continuity. +4. 
The user does not need to manually choose the model, exit the session, or start a separate `exec resume` command. +5. The probe records durable evidence for the session identity and model change. + +### Phase 4: Product Decision Classify the result as one of: -- `verified`: Codex CLI supports route-selected resumed turns with usable continuity and inspectable evidence. -- `partial`: Codex CLI supports route-selected command boundaries but not a low-friction interactive workflow. +- `verified`: Codex CLI exposes a supported mechanism for in-session route-selected model changes with usable continuity and inspectable evidence. +- `partial`: Codex CLI supports route-selected command boundaries but does not go beyond Claude parity for interactive use. - `blocked`: Codex CLI cannot support route authority with continuity through supported mechanisms. ## Stop Conditions From 89300dcc6d7c83b562a6785de6758fba6e56ff1f Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 17:21:17 +0200 Subject: [PATCH 04/23] docs(spike): record Codex app-server switch evidence Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- docs/decision-log.md | 11 ++++++----- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 13 +++++++++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/docs/decision-log.md b/docs/decision-log.md index dc279db..dc1824f 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -337,7 +337,7 @@ Follow-up: Decision ID: DEC-2026-05-13-codex-cli-feasibility-spike Related deferred item: PRODUCT-PRD.md Section 17.1 deferred items; milestone 5 second-surface proof -Status: committed; resume boundary verified; beyond-parity evidence pending +Status: committed; app-server in-session evidence observed Date: 2026-05-13 Owners: team @@ -360,17 +360,18 @@ Verification signal: - Expected signal: a reproducible probe that can show whether local Codex CLI exposes route-selected 
model authority at launch, non-interactive execution, or resume boundaries, and whether it should be treated as authoritative or advisory. - Evidence observed: local Codex CLI help exposes `--model` for interactive launch, `codex exec --model`, and `codex exec resume --last --model`. The command-surface probe reports this as resume-boundary route authority, not in-session automatic switching. - Live evidence observed on 2026-05-13: `npm run switchboard:spike:codex-cli:live` completed a two-turn probe. Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`; turn 2 resumed with `openai-quick` / `codex-fast` / `gpt-5.4-mini`; both turns reported shared thread/session evidence `019e21ad-1f30-72d0-bec0-08d275284eaf`. +- App-server evidence observed on 2026-05-13: generated Codex app-server protocol bindings expose `turn/start` with a `model` override documented as applying to "this turn and subsequent turns." A live stdio smoke test started one thread with `gpt-5.5`, completed a first turn, then completed a second `turn/start` on the same `threadId` / `sessionId` with `model: "gpt-5.4-mini"` and no `exec resume` boundary. Decision: - Chosen option: Option B. -- Scope of commitment: treat Codex CLI as verified only for route authority at `exec`/`resume` boundaries. This is partial/parity evidence, not a verified beyond-parity product path. -- What remains intentionally deferred: claims of in-session automatic switching inside the Codex TUI; broad UX/product reframing decisions until a supported in-session switch mechanism is found and verified. +- Scope of commitment: treat Codex CLI as verified for route authority at `exec`/`resume` boundaries and promising for in-session route authority through the experimental app-server protocol. +- What remains intentionally deferred: claims that the interactive Codex TUI itself can be hot-swapped; broad UX/product reframing decisions until the app-server protocol is judged supportable enough for a product surface. 
Consequences: - Near-term implementation impact: additive Codex CLI feasibility tooling; no change to Claude MVP promise. - Test and replay impact: probe test coverage verifies that resume-boundary support is not misreported as in-session authority, and live mode records selected targets, resolved models, final-message evidence, JSON event counts, and session/thread IDs. -- Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence. +- Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence, but the app-server protocol may justify a focused product-surface spike. Follow-up: -- Next review milestone: investigate whether Codex CLI exposes a supported in-session model switch mechanism. If no supported mechanism exists, keep Codex CLI in the deferred candidate bucket despite the verified `exec`/`resume` boundary. +- Next review milestone: decide whether the experimental Codex app-server protocol is supportable enough to be the Switchboard-controlled session surface. If yes, build a small repeatable app-server probe/harness; if no, keep Codex CLI in the deferred candidate bucket despite the positive protocol evidence. 
- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, test/codex-cli-feasibility-probe.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 2961df3..7e3899e 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -2,7 +2,7 @@ ## Status -Status: resume boundary verified; beyond-parity evidence pending +Status: app-server in-session evidence observed; supportability decision pending Decision record: `DEC-2026-05-13-codex-cli-feasibility-spike` in [../decision-log.md](../decision-log.md). @@ -124,7 +124,7 @@ Observed 2026-05-13: ### Phase 3: In-Session Switch Probe -Status: next; required for beyond-parity product direction. +Status: app-server protocol smoke test passed; supportability and product-surface fit pending. Investigate whether Codex CLI exposes a supported hook, command, control protocol, config reload behavior, or interactive-session API that can change the active model after an interactive session has started. @@ -136,6 +136,15 @@ Required evidence: 4. The user does not need to manually choose the model, exit the session, or start a separate `exec resume` command. 5. The probe records durable evidence for the session identity and model change. +Observed 2026-05-13: + +- Generated Codex app-server protocol bindings from `codex app-server generate-ts`. +- `turn/start` exposes `model?: string | null` with generated documentation: "Override the model for this turn and subsequent turns." +- A live app-server stdio smoke test started one thread with `gpt-5.5`, ran a first turn, then ran a second `turn/start` on the same `threadId` with `model: "gpt-5.4-mini"`. +- Both turns completed on the same `threadId` / `sessionId`: `019e21ec-89f0-7a03-b5f6-f8590818eb1b`. +- This did not require `codex exec resume`, manual model selection, or starting a separate command-boundary workflow. 
+- Caveat: the evidence is from the experimental Codex app-server protocol, not from a documented interactive TUI hook. The next decision is whether Switchboard can treat the app-server protocol as a supportable product surface. + ### Phase 4: Product Decision Classify the result as one of: From 311f2349dfbd1514cb7ebfa4c17efd654e4518be Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 17:33:15 +0200 Subject: [PATCH 05/23] test(spike): add Codex app-server switch probe Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- README.md | 1 + docs/decision-log.md | 10 +- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 19 +- package.json | 3 +- scripts/codex-app-server-switch-probe.js | 401 +++++++++++++++++++++ test/codex-app-server-switch-probe.test.js | 148 ++++++++ 6 files changed, 575 insertions(+), 7 deletions(-) create mode 100644 scripts/codex-app-server-switch-probe.js create mode 100644 test/codex-app-server-switch-probe.test.js diff --git a/README.md b/README.md index 68acb30..75145e0 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting proces | `switchboard probe continuity-interactive` | Runs the interactive continuity probe and verifies resume/session behavior across turns. | You want to validate interactive continuity and related checks. | | `npm run switchboard:spike:codex-cli` | Inspects the local Codex CLI command surface and maps two routed turns to Codex `exec`/`resume --model` plans without making live model calls. | You want a product-aligned feasibility signal for Codex CLI route authority before building a deeper integration. | | `npm run switchboard:spike:codex-cli:live` | Runs the bounded two-turn Codex CLI resume probe with route-selected models and captures JSON/session evidence. | You are ready to collect live evidence for the Codex CLI feasibility spike. 
| +| `npm run switchboard:spike:codex-app-server` | Runs the bounded app-server in-session switch probe with one thread and two route-selected `turn/start` model overrides. | You are evaluating whether Codex app-server can be a Switchboard-controlled session surface beyond `exec`/`resume` parity. | | `npm test` | Runs the full automated test suite for adapters, router, workflow, and CLI behavior. | You changed routing/workflow/docs and want a full regression check. | ### Interactive Mode Clarification diff --git a/docs/decision-log.md b/docs/decision-log.md index dc1824f..9804b07 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -337,7 +337,7 @@ Follow-up: Decision ID: DEC-2026-05-13-codex-cli-feasibility-spike Related deferred item: PRODUCT-PRD.md Section 17.1 deferred items; milestone 5 second-surface proof -Status: committed; app-server in-session evidence observed +Status: committed; app-server in-session probe implemented Date: 2026-05-13 Owners: team @@ -361,6 +361,8 @@ Verification signal: - Evidence observed: local Codex CLI help exposes `--model` for interactive launch, `codex exec --model`, and `codex exec resume --last --model`. The command-surface probe reports this as resume-boundary route authority, not in-session automatic switching. - Live evidence observed on 2026-05-13: `npm run switchboard:spike:codex-cli:live` completed a two-turn probe. Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`; turn 2 resumed with `openai-quick` / `codex-fast` / `gpt-5.4-mini`; both turns reported shared thread/session evidence `019e21ad-1f30-72d0-bec0-08d275284eaf`. - App-server evidence observed on 2026-05-13: generated Codex app-server protocol bindings expose `turn/start` with a `model` override documented as applying to "this turn and subsequent turns." 
A live stdio smoke test started one thread with `gpt-5.5`, completed a first turn, then completed a second `turn/start` on the same `threadId` / `sessionId` with `model: "gpt-5.4-mini"` and no `exec resume` boundary. +- Repeatable app-server probe: `npm run switchboard:spike:codex-app-server` now exercises the same one-thread, two-turn app-server path and records route-selected targets, requested models, thread/session IDs, turn completion, agent-message evidence, and any `model/rerouted` telemetry. +- Repeatable live app-server evidence observed on 2026-05-13: `npm run switchboard:spike:codex-app-server` returned `status: verified`. Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`; turn 2 selected `openai-quick` / `codex-fast` / `gpt-5.4-mini`; both turns completed on the same `threadId` / `sessionId` `019e21f7-0b2e-7730-9cbd-af5e5536ddbf`; `thread/read` returned `turnCount: 2`. No `model/rerouted` telemetry was emitted. Decision: - Chosen option: Option B. @@ -369,9 +371,9 @@ Decision: Consequences: - Near-term implementation impact: additive Codex CLI feasibility tooling; no change to Claude MVP promise. -- Test and replay impact: probe test coverage verifies that resume-boundary support is not misreported as in-session authority, and live mode records selected targets, resolved models, final-message evidence, JSON event counts, and session/thread IDs. +- Test and replay impact: probe test coverage verifies that resume-boundary support is not misreported as in-session authority, and app-server probe coverage verifies the one-thread, two-turn model-override harness without requiring live Codex calls in CI. - Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence, but the app-server protocol may justify a focused product-surface spike. 
Follow-up: -- Next review milestone: decide whether the experimental Codex app-server protocol is supportable enough to be the Switchboard-controlled session surface. If yes, build a small repeatable app-server probe/harness; if no, keep Codex CLI in the deferred candidate bucket despite the positive protocol evidence. -- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, test/codex-cli-feasibility-probe.test.js, README.md +- Next review milestone: decide whether the experimental Codex app-server protocol is supportable enough to be the Switchboard-controlled session surface. If no, keep Codex CLI in the deferred candidate bucket despite the positive protocol evidence. +- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 7e3899e..829020b 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -2,7 +2,7 @@ ## Status -Status: app-server in-session evidence observed; supportability decision pending +Status: app-server in-session probe implemented; supportability decision pending Decision record: `DEC-2026-05-13-codex-cli-feasibility-spike` in [../decision-log.md](../decision-log.md). @@ -124,7 +124,7 @@ Observed 2026-05-13: ### Phase 3: In-Session Switch Probe -Status: app-server protocol smoke test passed; supportability and product-surface fit pending. +Status: repeatable app-server probe implemented; supportability and product-surface fit pending. 
Investigate whether Codex CLI exposes a supported hook, command, control protocol, config reload behavior, or interactive-session API that can change the active model after an interactive session has started. @@ -145,6 +145,21 @@ Observed 2026-05-13: - This did not require `codex exec resume`, manual model selection, or starting a separate command-boundary workflow. - Caveat: the evidence is from the experimental Codex app-server protocol, not from a documented interactive TUI hook. The next decision is whether Switchboard can treat the app-server protocol as a supportable product surface. +Repeatable probe: + +- [../../scripts/codex-app-server-switch-probe.js](../../scripts/codex-app-server-switch-probe.js) starts `codex app-server --listen stdio://`, initializes the experimental protocol, starts one thread with the first routed model, and sends a second `turn/start` on the same thread with the second routed model. +- The probe reports selected target IDs, resolved Codex models, thread/session IDs, completed turn IDs, agent-message evidence, notification counts, and any observed `model/rerouted` notifications. +- The probe intentionally reports the second turn as an accepted model override on the same app-server thread. It does not overclaim provider-side backend telemetry unless Codex emits explicit model telemetry. + +Repeatable live result on 2026-05-13: + +- `npm run switchboard:spike:codex-app-server` returned `status: verified`. +- First turn selected `openai-coder` / `codex-best-coder` / `gpt-5.5`. +- Second turn selected `openai-quick` / `codex-fast` / `gpt-5.4-mini`. +- Both turns completed on the same `threadId` / `sessionId`: `019e21f7-0b2e-7730-9cbd-af5e5536ddbf`. +- `thread/read` returned `turnCount: 2`. +- No `model/rerouted` telemetry was emitted, so the durable evidence remains accepted turn-level model override plus same-thread completion rather than provider-side backend model attestation. 
+ ### Phase 4: Product Decision Classify the result as one of: diff --git a/package.json b/package.json index 27fd82e..153ac58 100644 --- a/package.json +++ b/package.json @@ -43,7 +43,8 @@ "switchboard:continuity": "node bin/switchboard.js probe continuity --no-tools --inter-turn-delay-ms 1000", "switchboard:continuity:interactive": "node bin/switchboard.js probe continuity-interactive --json", "switchboard:spike:codex-cli": "node scripts/codex-cli-feasibility-probe.js", - "switchboard:spike:codex-cli:live": "node scripts/codex-cli-feasibility-probe.js --live" + "switchboard:spike:codex-cli:live": "node scripts/codex-cli-feasibility-probe.js --live", + "switchboard:spike:codex-app-server": "node scripts/codex-app-server-switch-probe.js" }, "repository": { "type": "git", diff --git a/scripts/codex-app-server-switch-probe.js b/scripts/codex-app-server-switch-probe.js new file mode 100644 index 0000000..07b4cfb --- /dev/null +++ b/scripts/codex-app-server-switch-probe.js @@ -0,0 +1,401 @@ +#!/usr/bin/env node +/* eslint-disable security/detect-non-literal-fs-filename */ +import { spawn } from "node:child_process"; +import fs from "node:fs"; +import { setTimeout, clearTimeout } from "node:timers"; +import { fileURLToPath } from "node:url"; +import { routePrompt } from "../src/router/router.js"; +import { OPENAI_TARGETS_PATH } from "../src/switchboard/paths.js"; +import { getProfileModelMap, getTargetProfileMap } from "../src/adapters/model-mappings.js"; + +const TARGET_TO_PROFILE = getTargetProfileMap("openai-codex"); +const PROFILE_TO_MODEL = getProfileModelMap("openai-codex"); +const DEFAULT_TIMEOUT_MS = 120000; + +function readJson(filePath) { + return JSON.parse(fs.readFileSync(filePath, "utf8")); +} + +function getArg(args, flag) { + const idx = args.lastIndexOf(flag); + if (idx === -1 || idx + 1 >= args.length) return null; + return args[idx + 1]; +} + +function textInput(text) { + return [{ type: "text", text, text_elements: [] }]; +} + +function tailText(text, 
maxLength = 1600) { + if (!text) return ""; + return text.length > maxLength ? text.slice(text.length - maxLength) : text; +} + +function routeTurnPlan({ input, session, targets }) { + const route = routePrompt({ + input, + session, + targets, + executionSupported: false + }); + const targetId = route.selectedTarget?.id || null; + const profile = targetId ? TARGET_TO_PROFILE[targetId] || null : null; + const model = profile ? PROFILE_TO_MODEL[profile] || null : null; + return { + input, + route: { + status: route.status, + mode: route.mode, + selectedTargetId: targetId, + targetClass: route.selectedTarget?.target_class || null, + shouldSwitch: route.shouldSwitch, + explanation: route.explanation + }, + codex: { profile, model } + }; +} + +function createTurnPlans(targets) { + const session = { + mode: "plan", + currentTargetId: null, + turnCount: 0, + routingOverride: "auto", + vendorClient: "openai-codex", + clientSurface: "codex-app-server" + }; + + const first = routeTurnPlan({ + input: + "Do not edit files or run commands. Briefly outline how you would implement retry logic with clear tests and error handling. 
End with one short sentence saying first turn complete.", + session, + targets + }); + if (first.route.status === "ok") { + session.mode = first.route.mode; + session.currentTargetId = first.route.selectedTargetId; + session.turnCount += 1; + } + + const second = routeTurnPlan({ + input: "Thanks, summarize the outcome briefly.", + session, + targets + }); + return [first, second]; +} + +function createDeferred() { + let resolve; + let reject; + const promise = new Promise((promiseResolve, promiseReject) => { + resolve = promiseResolve; + reject = promiseReject; + }); + return { promise, resolve, reject }; +} + +function withTimeout(promise, ms, label) { + let timer; + const timeout = new Promise((_, reject) => { + timer = setTimeout(() => reject(new Error(`${label} timed out after ${ms}ms`)), ms); + }); + return Promise.race([promise, timeout]).finally(() => clearTimeout(timer)); +} + +class JsonLineClient { + constructor({ codexBin, timeoutMs }) { + this.timeoutMs = timeoutMs; + this.nextId = 1; + this.buffer = ""; + this.stdout = ""; + this.stderr = ""; + this.notifications = []; + this.pending = new Map(); + this.waiters = []; + this.closed = false; + this.child = spawn(codexBin, ["app-server", "--listen", "stdio://"], { + env: { ...process.env, NO_COLOR: "1" }, + stdio: ["pipe", "pipe", "pipe"] + }); + this.child.stdout.setEncoding("utf8"); + this.child.stderr.setEncoding("utf8"); + this.child.stdout.on("data", (chunk) => this.handleStdout(chunk)); + this.child.stderr.on("data", (chunk) => { + this.stderr += chunk; + }); + this.child.on("error", (error) => { + this.stderr += `${error.message}\n`; + this.rejectOpenWork(error); + }); + this.child.on("exit", (code, signal) => { + this.rejectOpenWork(new Error(`codex app-server exited with code ${code ?? "null"} and signal ${signal ?? 
"null"}`)); + }); + } + + rejectOpenWork(error) { + this.closed = true; + for (const pending of this.pending.values()) pending.reject(error); + this.pending.clear(); + for (const waiter of this.waiters) waiter.reject(error); + this.waiters = []; + } + + handleStdout(chunk) { + this.stdout += chunk; + this.buffer += chunk; + let newlineIndex = this.buffer.indexOf("\n"); + while (newlineIndex !== -1) { + const line = this.buffer.slice(0, newlineIndex).trim(); + this.buffer = this.buffer.slice(newlineIndex + 1); + if (line) this.handleLine(line); + newlineIndex = this.buffer.indexOf("\n"); + } + } + + handleLine(line) { + let message; + try { + message = JSON.parse(line); + } catch { + return; + } + + if (Object.prototype.hasOwnProperty.call(message, "id")) { + const pending = this.pending.get(message.id); + if (!pending) return; + this.pending.delete(message.id); + if (message.error) { + pending.reject(new Error(JSON.stringify(message.error))); + } else { + pending.resolve(message.result); + } + return; + } + + this.notifications.push(message); + const remaining = []; + for (const waiter of this.waiters) { + if (waiter.predicate(message)) { + waiter.resolve(message); + } else { + remaining.push(waiter); + } + } + this.waiters = remaining; + } + + request(method, params) { + if (this.closed) throw new Error("codex app-server is closed"); + const id = this.nextId; + this.nextId += 1; + const pending = createDeferred(); + this.pending.set(id, pending); + this.child.stdin.write(`${JSON.stringify({ jsonrpc: "2.0", id, method, params })}\n`); + return withTimeout(pending.promise, this.timeoutMs, method); + } + + notify(method, params = undefined) { + if (this.closed) throw new Error("codex app-server is closed"); + const message = params === undefined ? 
{ method } : { method, params }; + this.child.stdin.write(`${JSON.stringify(message)}\n`); + } + + waitForNotification(predicate, label) { + const existing = this.notifications.find(predicate); + if (existing) return Promise.resolve(existing); + const waiter = createDeferred(); + waiter.predicate = predicate; + this.waiters.push(waiter); + return withTimeout(waiter.promise, this.timeoutMs, label); + } + + close() { + if (this.child.exitCode !== null || this.child.killed) return; + this.child.kill("SIGTERM"); + } +} + +function summarizeTurn({ label, plan, response, completedNotification, messages }) { + return { + label, + selectedTargetId: plan.route.selectedTargetId, + selectedProfile: plan.codex.profile, + requestedModel: plan.codex.model, + turnId: response?.turn?.id || completedNotification?.params?.turn?.id || null, + completed: Boolean(completedNotification), + agentMessages: messages + }; +} + +function collectAgentMessages(notifications, turnId) { + return notifications + .filter((message) => message.method === "item/completed" && message.params?.turnId === turnId) + .map((message) => message.params?.item) + .filter((item) => item?.type === "agentMessage" && typeof item.text === "string") + .map((item) => item.text); +} + +async function runTurn(client, { label, threadId, plan, cwd }) { + const response = await client.request("turn/start", { + threadId, + input: textInput(plan.input), + model: plan.codex.model, + cwd + }); + const turnId = response?.turn?.id; + if (!turnId) { + throw new Error(`${label} turn/start completed without a turn id`); + } + const completedNotification = await client.waitForNotification( + (message) => message.method === "turn/completed" && message.params?.threadId === threadId && message.params?.turn?.id === turnId, + `${label} turn/completed` + ); + const messages = collectAgentMessages(client.notifications, turnId); + return summarizeTurn({ label, plan, response, completedNotification, messages }); +} + +async function 
maybeReadThread(client, threadId) { + try { + const result = await client.request("thread/read", { threadId, includeTurns: true }); + return { + ok: true, + turnCount: Array.isArray(result?.thread?.turns) ? result.thread.turns.length : null, + itemCount: Array.isArray(result?.thread?.items) ? result.thread.items.length : null + }; + } catch (error) { + return { ok: false, error: error.message }; + } +} + +export async function runCodexAppServerSwitchProbe({ + codexBin = "codex", + targets = readJson(OPENAI_TARGETS_PATH).targets, + cwd = process.cwd(), + timeoutMs = DEFAULT_TIMEOUT_MS +} = {}) { + const [first, second] = createTurnPlans(targets); + const targetChanged = Boolean( + first.route.selectedTargetId && + second.route.selectedTargetId && + first.route.selectedTargetId !== second.route.selectedTargetId + ); + const modelChanged = Boolean(first.codex.model && second.codex.model && first.codex.model !== second.codex.model); + + if (!targetChanged || !modelChanged) { + return { + status: "blocked", + surface: "codex-app-server", + reason: "The router did not produce two different Codex targets/models for the app-server switch probe.", + turnPlans: [first, second] + }; + } + + const client = new JsonLineClient({ codexBin, timeoutMs }); + try { + const initialize = await client.request("initialize", { + clientInfo: { + name: "switchboard-codex-app-server-spike", + title: "Switchboard Codex app-server spike", + version: "0.0.0" + }, + capabilities: { experimentalApi: true } + }); + client.notify("initialized"); + + const threadStart = await client.request("thread/start", { + model: first.codex.model, + cwd, + approvalPolicy: "never", + sandbox: "read-only" + }); + const threadId = threadStart?.thread?.id || null; + const sessionId = threadStart?.thread?.sessionId || null; + + if (!threadId) { + return { + status: "blocked", + surface: "codex-app-server", + reason: "thread/start completed without a thread id.", + initialize, + threadStart + }; + } + + const 
firstTurn = await runTurn(client, { label: "first", threadId, plan: first, cwd }); + const secondTurn = await runTurn(client, { label: "second", threadId, plan: second, cwd }); + const threadRead = await maybeReadThread(client, threadId); + + const sameThreadCompleted = firstTurn.completed && secondTurn.completed; + const requestedModelOverrideAccepted = secondTurn.requestedModel === second.codex.model && sameThreadCompleted; + const modelRerouted = client.notifications.filter((message) => message.method === "model/rerouted"); + const status = requestedModelOverrideAccepted ? "verified" : "partial"; + + return { + status, + surface: "codex-app-server", + mode: "live_app_server", + verdict: { + appServerModelOverrideAccepted: requestedModelOverrideAccepted, + sameThreadCompleted, + targetChanged, + modelChanged, + backendModelTelemetryObserved: modelRerouted.length > 0, + interactiveTuiHotSwapProven: false + }, + thread: { + threadId, + sessionId, + threadStartModel: threadStart.model || first.codex.model, + threadStartModelProvider: threadStart.modelProvider || null + }, + turns: [firstTurn, secondTurn], + threadRead, + modelRerouted, + notificationCounts: client.notifications.reduce((counts, message) => { + counts[message.method] = (counts[message.method] || 0) + 1; + return counts; + }, {}), + evidence: { + command: `${codexBin} app-server --listen stdio://`, + interpretation: + "Codex app-server accepted a second turn/start model override on the same thread. This is in-session app-server evidence, not proof of an interactive TUI hot-swap hook." + }, + limitations: [ + "The generated app-server protocol is experimental.", + "The probe verifies accepted turn-level model override requests and same-thread completion, but it does not currently observe provider-side backend model telemetry unless Codex emits model/rerouted.", + "This does not prove that the Codex interactive TUI itself can be hot-swapped." 
+ ], + stderrTail: tailText(client.stderr), + stdoutTail: tailText(client.stdout) + }; + } catch (error) { + return { + status: "blocked", + surface: "codex-app-server", + reason: error.message, + turnPlans: [first, second], + stderrTail: tailText(client.stderr), + stdoutTail: tailText(client.stdout) + }; + } finally { + client.close(); + } +} + +async function main() { + const args = process.argv.slice(2); + const codexBin = getArg(args, "--codex-bin") || "codex"; + const cwd = getArg(args, "--cwd") || process.cwd(); + const timeoutMs = Number(getArg(args, "--timeout-ms") || DEFAULT_TIMEOUT_MS); + const result = await runCodexAppServerSwitchProbe({ codexBin, cwd, timeoutMs }); + process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); + process.exitCode = result.status === "blocked" ? 1 : 0; +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((error) => { + process.stderr.write(`codex-app-server-switch-probe failed: ${error.message}\n`); + process.exitCode = 1; + }); +} diff --git a/test/codex-app-server-switch-probe.test.js b/test/codex-app-server-switch-probe.test.js new file mode 100644 index 0000000..8eed49b --- /dev/null +++ b/test/codex-app-server-switch-probe.test.js @@ -0,0 +1,148 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { runCodexAppServerSwitchProbe } from "../scripts/codex-app-server-switch-probe.js"; + +function createTargets() { + return [ + { + id: "openai-quick", + label: "quick", + target_class: "cheap_fast", + capabilities: ["chat", "structured_output"], + privacy_tier: "external", + availability: "available" + }, + { + id: "openai-coder", + label: "best coder", + target_class: "strong_coding", + capabilities: [ + "chat", + "reasoning", + "structured_output", + "repo_context", + "file_read", + "file_edit", + "shell_execution", + "test_execution" + ], + privacy_tier: "external", + 
availability: "available" + } + ]; +} + +function createFakeCodexBin() { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-probe-test-")); + const binPath = path.join(dir, "codex"); + fs.writeFileSync( + binPath, + `#!/usr/bin/env node +const readline = require("node:readline"); + +let nextTurn = 1; +const thread = { + id: "thread-123", + sessionId: "session-abc", + turns: [] +}; + +function write(message) { + process.stdout.write(JSON.stringify(message) + "\\n"); +} + +function respond(id, result) { + write({ jsonrpc: "2.0", id, result }); +} + +function makeTurn(model) { + return { id: "turn-" + nextTurn++, status: "completed", model }; +} + +if (process.argv.slice(2).join(" ") !== "app-server --listen stdio://") { + process.exit(2); +} + +const rl = readline.createInterface({ input: process.stdin }); +rl.on("line", (line) => { + if (!line.trim()) return; + const message = JSON.parse(line); + if (message.method === "initialize") { + respond(message.id, { userAgent: "fake-codex", codexHome: "/tmp/fake-codex", platformFamily: "unix", platformOs: "macos" }); + return; + } + if (message.method === "initialized") return; + if (message.method === "thread/start") { + respond(message.id, { + thread, + model: message.params.model, + modelProvider: "openai", + serviceTier: null, + cwd: message.params.cwd, + instructionSources: [], + approvalPolicy: "never", + approvalsReviewer: "auto", + sandbox: "read-only", + reasoningEffort: null + }); + write({ method: "thread/started", params: { thread } }); + return; + } + if (message.method === "turn/start") { + const turn = makeTurn(message.params.model); + thread.turns.push(turn); + respond(message.id, { turn }); + write({ method: "turn/started", params: { threadId: message.params.threadId, turn } }); + write({ + method: "item/completed", + params: { + threadId: message.params.threadId, + turnId: turn.id, + completedAtMs: Date.now(), + item: { type: "agentMessage", id: "item-" + turn.id, text: turn.id + " 
complete", phase: null, memoryCitation: null } + } + }); + write({ method: "turn/completed", params: { threadId: message.params.threadId, turn } }); + return; + } + if (message.method === "thread/read") { + respond(message.id, { thread }); + return; + } + respond(message.id, {}); +}); +`, + "utf8" + ); + fs.chmodSync(binPath, 0o755); + return binPath; +} + +test("codex app-server switch probe verifies accepted model override on one thread", async () => { + const result = await runCodexAppServerSwitchProbe({ + codexBin: createFakeCodexBin(), + targets: createTargets(), + timeoutMs: 5000 + }); + + assert.equal(result.surface, "codex-app-server"); + assert.equal(result.status, "verified"); + assert.equal(result.verdict.appServerModelOverrideAccepted, true); + assert.equal(result.verdict.sameThreadCompleted, true); + assert.equal(result.verdict.targetChanged, true); + assert.equal(result.verdict.modelChanged, true); + assert.equal(result.verdict.interactiveTuiHotSwapProven, false); + assert.equal(result.thread.threadId, "thread-123"); + assert.equal(result.thread.sessionId, "session-abc"); + assert.equal(result.turns[0].selectedTargetId, "openai-coder"); + assert.equal(result.turns[0].requestedModel, "gpt-5.5"); + assert.equal(result.turns[1].selectedTargetId, "openai-quick"); + assert.equal(result.turns[1].requestedModel, "gpt-5.4-mini"); + assert.deepEqual(result.turns[0].agentMessages, ["turn-1 complete"]); + assert.deepEqual(result.turns[1].agentMessages, ["turn-2 complete"]); + assert.equal(result.threadRead.ok, true); + assert.equal(result.threadRead.turnCount, 2); +}); From 2457e1b5781d86e6ec3e0f6362b6f5620c969e82 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 17:36:30 +0200 Subject: [PATCH 06/23] docs(spike): define Codex app-server supportability gates Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- docs/decision-log.md | 2 +- 
docs/product/CODEX-CLI-SPIKE-SCOPE.md | 201 ++++++++++++++++++++++++++ 2 files changed, 202 insertions(+), 1 deletion(-) diff --git a/docs/decision-log.md b/docs/decision-log.md index 9804b07..c5c4a5c 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -375,5 +375,5 @@ Consequences: - Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence, but the app-server protocol may justify a focused product-surface spike. Follow-up: -- Next review milestone: decide whether the experimental Codex app-server protocol is supportable enough to be the Switchboard-controlled session surface. If no, keep Codex CLI in the deferred candidate bucket despite the positive protocol evidence. +- Next review milestone: work through the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Start with Gate 1, public surface, then Gate 2, protocol stability, before building any product workflow. - Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 829020b..289015b 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -64,6 +64,207 @@ The spike is partial, but still useful, if Codex CLI supports route-selected mod That outcome means Codex CLI may support a non-interactive or wrapper-style Switchboard workflow, but it does not go beyond Claude parity for the primary product differentiator: automatic model switching inside a running interactive session. 
+## App-Server Supportability Review + +The app-server result is technically promising, but it should not graduate into product direction until these supportability gates are checked off. Treat this as a product/platform risk review, not an implementation backlog. + +Status legend: + +- `[ ]` Not checked yet. +- `[~]` Partially checked; evidence exists but a material caveat remains. +- `[x]` Checked with durable evidence in this spike. + +### Gate 1: Public Surface + +Status: `[~]` + +Question: Is `codex app-server` an intended external integration surface rather than an internal-only or experimental protocol? + +Evidence needed: + +- `codex app-server --help` exposes the command and relevant subcommands. +- Generated protocol schemas are available through supported CLI commands. +- Documentation, release notes, schema comments, or maintainer statements describe intended use and stability. + +Current evidence: + +- `codex app-server generate-ts` and `generate-json-schema` are available locally. +- Generated protocol bindings include `thread/start`, `turn/start`, and `thread/read`. +- Caveat: the protocol currently appears experimental, and no durable external support policy has been recorded in this spike. + +Pass condition: + +- We can point to a supported CLI command, generated/versioned protocol artifact, or official statement that makes app-server reasonable to depend on for an integration spike. + +Fail condition: + +- The only evidence is private implementation behavior or generated files with no indication that external clients may rely on them. + +### Gate 2: Protocol Stability + +Status: `[ ]` + +Question: Are the required methods and fields stable enough to build against without excessive breakage risk? + +Evidence needed: + +- Required methods: `initialize`, `thread/start`, `turn/start`, `thread/read`. 
+- Required fields: `thread.id`, `thread.sessionId`, `ThreadStartParams.model`, `TurnStartParams.model`, `TurnStartParams.input`, `ThreadReadParams.includeTurns`. +- Version/capability marker for the experimental API, or a documented compatibility/deprecation story. +- Snapshot or fixture that records the minimum protocol shape Switchboard depends on. + +Pass condition: + +- The probe can validate the required protocol shape and fail clearly when it changes. + +Fail condition: + +- Required fields are absent, unstable across versions, or only inferable from raw rollout internals. + +### Gate 3: User Install And Auth Path + +Status: `[ ]` + +Question: Can a normal Switchboard user run the app-server path without fragile local setup? + +Evidence needed: + +- Codex CLI version requirement. +- Authentication requirement and failure mode. +- Whether app-server is available in the same Codex CLI users install for normal interactive use. +- Clear diagnostic when Codex is missing, unauthenticated, or too old. + +Pass condition: + +- A user can install/login/run a preflight command and get actionable output before Switchboard attempts a routed session. + +Fail condition: + +- The path depends on developer-only builds, hidden flags, or undocumented local state. + +### Gate 4: Process Lifecycle Safety + +Status: `[ ]` + +Question: Can Switchboard safely own a long-running app-server process? + +Evidence needed: + +- Start/stop behavior. +- Clean shutdown after idle or user exit. +- Behavior after app-server crash. +- Behavior after interrupted turn. +- Handling for malformed JSON, protocol errors, stderr warnings, and child process spawn failure. + +Pass condition: + +- A lifecycle probe demonstrates start, two turns, interruption or shutdown, and clear error reporting. + +Fail condition: + +- A failed or interrupted app-server leaves Switchboard unable to recover without manual cleanup. 
+ +### Gate 5: Continuity And Session Semantics + +Status: `[x]` + +Question: Does the app-server path preserve one continuous session while Switchboard changes the route-selected model? + +Evidence needed: + +- Two turns complete on the same `threadId` / `sessionId`. +- `thread/read` can inspect the resulting thread and report both turns. +- No `codex exec resume` boundary is required. + +Current evidence: + +- `npm run switchboard:spike:codex-app-server` returned `status: verified` on 2026-05-13. +- Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`. +- Turn 2 selected `openai-quick` / `codex-fast` / `gpt-5.4-mini`. +- Both turns completed on `019e21f7-0b2e-7730-9cbd-af5e5536ddbf`. +- `thread/read` returned `turnCount: 2`. + +Pass condition: + +- Already met for the current local Codex CLI version. + +Fail condition: + +- Future live probes cannot preserve same-thread continuity when `turn/start.model` changes. + +### Gate 6: Model Evidence + +Status: `[~]` + +Question: Can we prove the second turn actually used the route-selected model, not merely that Codex accepted a model override request? + +Evidence needed: + +- Per-turn model/provider metadata from `turn/start`, `turn/completed`, `thread/read`, `model/rerouted`, logs, or another supportable app-server notification. +- Clear distinction between requested model, accepted model, rerouted model, and unavailable telemetry. + +Current evidence: + +- The app-server accepted `turn/start` with `model: "gpt-5.4-mini"` on the second turn and completed the turn on the same thread. +- No `model/rerouted` notification was emitted in the repeatable live probe. + +Pass condition: + +- We can capture per-turn selected or effective model metadata, or explicitly decide that accepted override plus same-thread completion is enough for the product risk tolerance. + +Fail condition: + +- We cannot capture backend model telemetry and the product requires provider-side attestation before making claims. 
+ +### Gate 7: Product Fit + +Status: `[~]` + +Question: Does the app-server path reduce cognitive overhead enough to justify a Codex product surface beyond Claude parity? + +Evidence needed: + +- User can stay in one Switchboard-controlled session loop. +- Switchboard routes before each prompt without manual model selection. +- The workflow avoids repeated exit/resume commands. +- UX implications are documented, including that this is not the stock Codex interactive TUI unless we build a wrapper surface. + +Current evidence: + +- The app-server probe avoids `exec resume` and manual model selection. +- Caveat: turning this into a usable product likely means Switchboard owns the session UI or loop. + +Pass condition: + +- Product accepts a Switchboard-controlled Codex session surface as the differentiated workflow. + +Fail condition: + +- Product requires hot-swapping inside the stock Codex TUI specifically, and app-server cannot provide that. + +### Supportability Decision Rule + +Codex app-server can graduate from promising spike to serious product-surface candidate only if: + +1. Gates 1, 2, 3, 4, 5, and 7 pass. +2. Gate 6 either passes or is explicitly accepted as a known risk. +3. No gate requires private Codex internals or raw rollout-file parsing as the primary mechanism. + +If Gate 7 fails because the required user experience must be the stock Codex TUI, classify Codex as `partial` despite the app-server evidence. + +### Next Check Order + +Work through the gates in this order: + +1. Gate 1: Public Surface. +2. Gate 2: Protocol Stability. +3. Gate 6: Model Evidence. +4. Gate 3: User Install And Auth Path. +5. Gate 4: Process Lifecycle Safety. +6. Gate 7: Product Fit. + +Gate 5 is already checked for the current local Codex CLI version. 
+ ## Failure Criteria The spike should be considered failed or blocked if any of the following are true: From 51f6411ec15593eb968d537162379263bd273bb4 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 17:41:25 +0200 Subject: [PATCH 07/23] docs(spike): check Codex app-server public surface Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- docs/decision-log.md | 1 + docs/product/CODEX-CLI-SPIKE-SCOPE.md | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/docs/decision-log.md b/docs/decision-log.md index c5c4a5c..a5eb4ad 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -363,6 +363,7 @@ Verification signal: - App-server evidence observed on 2026-05-13: generated Codex app-server protocol bindings expose `turn/start` with a `model` override documented as applying to "this turn and subsequent turns." A live stdio smoke test started one thread with `gpt-5.5`, completed a first turn, then completed a second `turn/start` on the same `threadId` / `sessionId` with `model: "gpt-5.4-mini"` and no `exec resume` boundary. - Repeatable app-server probe: `npm run switchboard:spike:codex-app-server` now exercises the same one-thread, two-turn app-server path and records route-selected targets, requested models, thread/session IDs, turn completion, agent-message evidence, and any `model/rerouted` telemetry. - Repeatable live app-server evidence observed on 2026-05-13: `npm run switchboard:spike:codex-app-server` returned `status: verified`. Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`; turn 2 selected `openai-quick` / `codex-fast` / `gpt-5.4-mini`; both turns completed on the same `threadId` / `sessionId` `019e21f7-0b2e-7730-9cbd-af5e5536ddbf`; `thread/read` returned `turnCount: 2`. No `model/rerouted` telemetry was emitted. 
+- Gate 1 public-surface review completed on 2026-05-13 against local `codex-cli 0.130.0`, local app-server help output, generated stable TypeScript bindings, and OpenAI's public `openai/codex` app-server README. Result: pass for a bounded integration spike. Caveat: `codex app-server` and schema generation are still labeled experimental, so this is not a production stability claim. Decision: - Chosen option: Option B. diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 289015b..691f61b 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -76,7 +76,7 @@ Status legend: ### Gate 1: Public Surface -Status: `[~]` +Status: `[x]` Question: Is `codex app-server` an intended external integration surface rather than an internal-only or experimental protocol? @@ -88,13 +88,23 @@ Evidence needed: Current evidence: -- `codex app-server generate-ts` and `generate-json-schema` are available locally. -- Generated protocol bindings include `thread/start`, `turn/start`, and `thread/read`. -- Caveat: the protocol currently appears experimental, and no durable external support policy has been recorded in this spike. +- Local Codex CLI version checked on 2026-05-13: `codex-cli 0.130.0`. +- `codex app-server --help` exposes `app-server` as an available command with `stdio://`, websocket, unix socket, and `off` listen modes. The command is labeled `[experimental]`. +- `codex app-server generate-ts --help` and `codex app-server generate-json-schema --help` expose schema generation commands. Both are labeled `[experimental]`, and both support an `--experimental` flag to include experimental methods and fields. +- OpenAI's public `openai/codex` repository documents `codex app-server` as the interface Codex uses to power rich interfaces such as the Codex VS Code extension. 
+- The same README documents JSON-RPC-like app-server protocol semantics, lifecycle, initialization, schema generation, and client identification. +- The README states that generated TypeScript and JSON Schema artifacts are specific to the Codex version used to generate them and match that version. +- The README also distinguishes stable surface from experimental surface. It says stable-only output is the default, while experimental methods and fields require opt-in at schema-generation time or runtime initialization. +- Generated stable TypeScript bindings include the Switchboard-relevant methods and fields: + - `ClientRequest` includes `thread/start`, `turn/start`, and `thread/read`. + - `ThreadStartParams` includes `model?: string | null`. + - `TurnStartParams` includes `model?: string | null` with the comment "Override the model for this turn and subsequent turns." + - `ThreadReadParams` includes `includeTurns: boolean`. +- Caveat: the command and schema tooling remain labeled experimental, and the README says experimental methods and fields have no backwards-compatible guarantees. This gate therefore passes only for a bounded integration spike, not as a production stability claim. Pass condition: -- We can point to a supported CLI command, generated/versioned protocol artifact, or official statement that makes app-server reasonable to depend on for an integration spike. +- Met for spike purposes. We can point to a public OpenAI repository README, a local CLI command surface, and generated protocol artifacts that make app-server reasonable to depend on for the next feasibility step. 
Fail condition: From e2459df25474e239ace79771d85bb68ecfea2ff1 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 17:47:39 +0200 Subject: [PATCH 08/23] test(spike): check Codex app-server protocol shape Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- README.md | 1 + docs/decision-log.md | 5 +- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 15 +- package.json | 1 + scripts/codex-app-server-protocol-check.js | 185 +++++++++++++++++++ test/codex-app-server-protocol-check.test.js | 70 +++++++ 6 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 scripts/codex-app-server-protocol-check.js create mode 100644 test/codex-app-server-protocol-check.test.js diff --git a/README.md b/README.md index 75145e0..c29d763 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting proces | `switchboard probe continuity-interactive` | Runs the interactive continuity probe and verifies resume/session behavior across turns. | You want to validate interactive continuity and related checks. | | `npm run switchboard:spike:codex-cli` | Inspects the local Codex CLI command surface and maps two routed turns to Codex `exec`/`resume --model` plans without making live model calls. | You want a product-aligned feasibility signal for Codex CLI route authority before building a deeper integration. | | `npm run switchboard:spike:codex-cli:live` | Runs the bounded two-turn Codex CLI resume probe with route-selected models and captures JSON/session evidence. | You are ready to collect live evidence for the Codex CLI feasibility spike. | +| `npm run switchboard:spike:codex-app-server:protocol` | Generates Codex app-server TypeScript bindings and verifies the minimum protocol shape Switchboard depends on. | You are checking whether Codex app-server protocol changes would break the feasibility spike. 
| | `npm run switchboard:spike:codex-app-server` | Runs the bounded app-server in-session switch probe with one thread and two route-selected `turn/start` model overrides. | You are evaluating whether Codex app-server can be a Switchboard-controlled session surface beyond `exec`/`resume` parity. | | `npm test` | Runs the full automated test suite for adapters, router, workflow, and CLI behavior. | You changed routing/workflow/docs and want a full regression check. | diff --git a/docs/decision-log.md b/docs/decision-log.md index a5eb4ad..88b92b3 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -364,6 +364,7 @@ Verification signal: - Repeatable app-server probe: `npm run switchboard:spike:codex-app-server` now exercises the same one-thread, two-turn app-server path and records route-selected targets, requested models, thread/session IDs, turn completion, agent-message evidence, and any `model/rerouted` telemetry. - Repeatable live app-server evidence observed on 2026-05-13: `npm run switchboard:spike:codex-app-server` returned `status: verified`. Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`; turn 2 selected `openai-quick` / `codex-fast` / `gpt-5.4-mini`; both turns completed on the same `threadId` / `sessionId` `019e21f7-0b2e-7730-9cbd-af5e5536ddbf`; `thread/read` returned `turnCount: 2`. No `model/rerouted` telemetry was emitted. - Gate 1 public-surface review completed on 2026-05-13 against local `codex-cli 0.130.0`, local app-server help output, generated stable TypeScript bindings, and OpenAI's public `openai/codex` app-server README. Result: pass for a bounded integration spike. Caveat: `codex app-server` and schema generation are still labeled experimental, so this is not a production stability claim. +- Gate 2 protocol-stability review completed on 2026-05-13 with `scripts/codex-app-server-protocol-check.js`. 
The check generates Codex app-server TypeScript bindings and verifies the minimum method and field surface Switchboard depends on: `initialize`, `thread/start`, `turn/start`, `thread/read`, `InitializeCapabilities.experimentalApi`, `ThreadStartParams.model`, `TurnStartParams.threadId`, `TurnStartParams.input`, `TurnStartParams.model`, `ThreadReadParams.threadId`, and `ThreadReadParams.includeTurns`. Result: pass for the installed Codex CLI version; future Codex releases still require this check as a compatibility guard. Decision: - Chosen option: Option B. @@ -376,5 +377,5 @@ Consequences: - Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence, but the app-server protocol may justify a focused product-surface spike. Follow-up: -- Next review milestone: work through the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Start with Gate 1, public surface, then Gate 2, protocol stability, before building any product workflow. -- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, README.md +- Next review milestone: continue the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Gate 6, model evidence, is next because it determines whether Switchboard can prove the effective model or must explicitly accept accepted-override evidence as the product risk boundary. 
+- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, scripts/codex-app-server-protocol-check.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, test/codex-app-server-protocol-check.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 691f61b..2e02a80 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -112,20 +112,29 @@ Fail condition: ### Gate 2: Protocol Stability -Status: `[ ]` +Status: `[x]` Question: Are the required methods and fields stable enough to build against without excessive breakage risk? Evidence needed: - Required methods: `initialize`, `thread/start`, `turn/start`, `thread/read`. -- Required fields: `thread.id`, `thread.sessionId`, `ThreadStartParams.model`, `TurnStartParams.model`, `TurnStartParams.input`, `ThreadReadParams.includeTurns`. +- Required fields: `InitializeCapabilities.experimentalApi`, `ThreadStartParams.model`, `TurnStartParams.threadId`, `TurnStartParams.input`, `TurnStartParams.model`, `ThreadReadParams.threadId`, `ThreadReadParams.includeTurns`. - Version/capability marker for the experimental API, or a documented compatibility/deprecation story. - Snapshot or fixture that records the minimum protocol shape Switchboard depends on. +Current evidence: + +- [../../scripts/codex-app-server-protocol-check.js](../../scripts/codex-app-server-protocol-check.js) generates app-server TypeScript bindings with `codex app-server generate-ts --out <dir>` and validates the minimum protocol shape Switchboard depends on. +- The check verifies required client methods in `ClientRequest.ts`: `initialize`, `thread/start`, `turn/start`, and `thread/read`.
+- The check verifies required generated fields: `InitializeCapabilities.experimentalApi`, `ThreadStartParams.model`, `TurnStartParams.threadId`, `TurnStartParams.input`, `TurnStartParams.model`, `ThreadReadParams.threadId`, and `ThreadReadParams.includeTurns`. +- [../../test/codex-app-server-protocol-check.test.js](../../test/codex-app-server-protocol-check.test.js) records a fixture for the minimum protocol shape and verifies the checker fails clearly if `TurnStartParams.model` disappears. +- `npm run switchboard:spike:codex-app-server:protocol` returned `status: verified` on 2026-05-13 against freshly generated bindings from local `codex-cli 0.130.0`. +- Caveat: this checks schema shape for the installed Codex CLI version. It does not promise backwards compatibility across future Codex releases; it gives Switchboard a fast failure signal when the app-server protocol changes. + Pass condition: -- The probe can validate the required protocol shape and fail clearly when it changes. +- Met for spike purposes. The protocol check validates the required shape and fails clearly when required methods or fields change. 
Fail condition: diff --git a/package.json b/package.json index 153ac58..60e2e07 100644 --- a/package.json +++ b/package.json @@ -44,6 +44,7 @@ "switchboard:continuity:interactive": "node bin/switchboard.js probe continuity-interactive --json", "switchboard:spike:codex-cli": "node scripts/codex-cli-feasibility-probe.js", "switchboard:spike:codex-cli:live": "node scripts/codex-cli-feasibility-probe.js --live", + "switchboard:spike:codex-app-server:protocol": "node scripts/codex-app-server-protocol-check.js", "switchboard:spike:codex-app-server": "node scripts/codex-app-server-switch-probe.js" }, "repository": { diff --git a/scripts/codex-app-server-protocol-check.js b/scripts/codex-app-server-protocol-check.js new file mode 100644 index 0000000..695116c --- /dev/null +++ b/scripts/codex-app-server-protocol-check.js @@ -0,0 +1,185 @@ +#!/usr/bin/env node +/* eslint-disable security/detect-non-literal-fs-filename */ +import { spawn } from "node:child_process"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { setTimeout, clearTimeout } from "node:timers"; +import { fileURLToPath } from "node:url"; + +const REQUIRED_METHODS = ["initialize", "thread/start", "turn/start", "thread/read"]; +const REQUIRED_FILES = { + ClientRequest: "ClientRequest.ts", + InitializeCapabilities: "InitializeCapabilities.ts", + ThreadStartParams: "ThreadStartParams.ts", + TurnStartParams: "TurnStartParams.ts", + ThreadReadParams: "ThreadReadParams.ts" +}; + +function getArg(args, flag) { + const idx = args.lastIndexOf(flag); + if (idx === -1 || idx + 1 >= args.length) return null; + return args[idx + 1]; +} + +function fileExists(filePath) { + try { + return fs.statSync(filePath).isFile(); + } catch { + return false; + } +} + +function readFile(filePath) { + return fs.readFileSync(filePath, "utf8"); +} + +function findBindingFile(rootDir, filename) { + const candidates = [path.join(rootDir, "v2", filename), path.join(rootDir, filename)]; + return 
candidates.find(fileExists) || path.join(rootDir, filename); +} + +function ensureBindingRoot(rootDir) { + if (!fileExists(findBindingFile(rootDir, REQUIRED_FILES.ClientRequest))) { + throw new Error(`Could not find generated app-server TypeScript bindings in ${rootDir}`); + } +} + +function hasMethod(clientRequestSource, method) { + return clientRequestSource.includes(`"method": "${method}"`); +} + +function hasOptionalStringField(source, fieldName) { + const compact = source.replace(/\s+/g, ""); + return compact.includes(`${fieldName}?:string|null`); +} + +function hasBooleanField(source, fieldName) { + const compact = source.replace(/\s+/g, ""); + return compact.includes(`${fieldName}:boolean`); +} + +export function validateCodexAppServerProtocolShape(rootDir) { + ensureBindingRoot(rootDir); + const files = Object.fromEntries( + Object.entries(REQUIRED_FILES).map(([key, filename]) => { + const filePath = findBindingFile(rootDir, filename); + if (!fileExists(filePath)) { + return [key, { filePath, ok: false, reason: "missing" }]; + } + return [key, { filePath, ok: true, source: readFile(filePath) }]; + }) + ); + + const failures = []; + for (const [key, file] of Object.entries(files)) { + if (!file.ok) failures.push(`${key}: missing ${file.filePath}`); + } + + const methods = Object.fromEntries( + REQUIRED_METHODS.map((method) => { + const ok = files.ClientRequest.ok && hasMethod(files.ClientRequest.source, method); + if (!ok) failures.push(`ClientRequest: missing method ${method}`); + return [method, ok]; + }) + ); + + const fields = { + "InitializeCapabilities.experimentalApi": + files.InitializeCapabilities.ok && files.InitializeCapabilities.source.includes("experimentalApi: boolean"), + "ThreadStartParams.model": files.ThreadStartParams.ok && hasOptionalStringField(files.ThreadStartParams.source, "model"), + "TurnStartParams.threadId": files.TurnStartParams.ok && files.TurnStartParams.source.includes("threadId: string"), + "TurnStartParams.input": 
files.TurnStartParams.ok && files.TurnStartParams.source.includes("input: Array<"), + "TurnStartParams.model": files.TurnStartParams.ok && hasOptionalStringField(files.TurnStartParams.source, "model"), + "ThreadReadParams.threadId": files.ThreadReadParams.ok && files.ThreadReadParams.source.includes("threadId: string"), + "ThreadReadParams.includeTurns": files.ThreadReadParams.ok && hasBooleanField(files.ThreadReadParams.source, "includeTurns") + }; + + for (const [field, ok] of Object.entries(fields)) { + if (!ok) failures.push(`missing field ${field}`); + } + + return { + status: failures.length === 0 ? "verified" : "failed", + surface: "codex-app-server-protocol", + bindingDir: rootDir, + filePaths: Object.fromEntries(Object.entries(files).map(([key, file]) => [key, file.filePath])), + methods, + fields, + failures + }; +} + +function runCommand(command, args, { timeoutMs }) { + return new Promise((resolve, reject) => { + const child = spawn(command, args, { + env: { ...process.env, NO_COLOR: "1" }, + stdio: ["ignore", "pipe", "pipe"] + }); + let stdout = ""; + let stderr = ""; + const timer = setTimeout(() => { + child.kill("SIGTERM"); + reject(new Error(`${command} ${args.join(" ")} timed out after ${timeoutMs}ms`)); + }, timeoutMs); + child.stdout.setEncoding("utf8"); + child.stderr.setEncoding("utf8"); + child.stdout.on("data", (chunk) => { + stdout += chunk; + }); + child.stderr.on("data", (chunk) => { + stderr += chunk; + }); + child.on("error", (error) => { + clearTimeout(timer); + reject(error); + }); + child.on("exit", (code, signal) => { + clearTimeout(timer); + if (code === 0) { + resolve({ stdout, stderr }); + } else { + reject(new Error(`${command} exited with code ${code ?? "null"} and signal ${signal ?? 
"null"}: ${stderr}`)); + } + }); + }); +} + +export async function runCodexAppServerProtocolCheck({ + codexBin = "codex", + bindingsDir = null, + timeoutMs = 30000 +} = {}) { + const generatedDir = bindingsDir || fs.mkdtempSync(path.join(os.tmpdir(), "switchboard-codex-app-server-ts-")); + const generated = !bindingsDir; + const evidence = {}; + if (generated) { + evidence.generateTs = await runCommand(codexBin, ["app-server", "generate-ts", "--out", generatedDir], { timeoutMs }); + } + return { + generated, + ...validateCodexAppServerProtocolShape(generatedDir), + evidence: { + command: generated ? `${codexBin} app-server generate-ts --out ${generatedDir}` : null, + generatedDir, + stderrTail: evidence.generateTs?.stderr?.slice(-1600) || "" + } + }; +} + +async function main() { + const args = process.argv.slice(2); + const codexBin = getArg(args, "--codex-bin") || "codex"; + const bindingsDir = getArg(args, "--bindings-dir"); + const timeoutMs = Number(getArg(args, "--timeout-ms") || 30000); + const result = await runCodexAppServerProtocolCheck({ codexBin, bindingsDir, timeoutMs }); + process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); + process.exitCode = result.status === "verified" ? 
0 : 1; +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((error) => { + process.stderr.write(`codex-app-server-protocol-check failed: ${error.message}\n`); + process.exitCode = 1; + }); +} diff --git a/test/codex-app-server-protocol-check.test.js b/test/codex-app-server-protocol-check.test.js new file mode 100644 index 0000000..f8540cd --- /dev/null +++ b/test/codex-app-server-protocol-check.test.js @@ -0,0 +1,70 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { validateCodexAppServerProtocolShape } from "../scripts/codex-app-server-protocol-check.js"; + +function writeFixture(files) { + const root = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-protocol-check-test-")); + const dir = path.join(root, "v2"); + fs.mkdirSync(dir); + for (const [filename, source] of Object.entries(files)) { + fs.writeFileSync(path.join(dir, filename), source, "utf8"); + } + return root; +} + +function validFixture() { + return writeFixture({ + "ClientRequest.ts": ` + export type ClientRequest = + { "method": "initialize", id: string, params: unknown } | + { "method": "thread/start", id: string, params: ThreadStartParams } | + { "method": "turn/start", id: string, params: TurnStartParams } | + { "method": "thread/read", id: string, params: ThreadReadParams }; + `, + "InitializeCapabilities.ts": "export type InitializeCapabilities = { experimentalApi: boolean };", + "ThreadStartParams.ts": "export type ThreadStartParams = { model?: string | null };", + "TurnStartParams.ts": ` + export type TurnStartParams = { + threadId: string, + input: Array, + model?: string | null + }; + `, + "ThreadReadParams.ts": "export type ThreadReadParams = { threadId: string, includeTurns: boolean };" + }); +} + +test("protocol check verifies the minimum app-server surface Switchboard depends on", () => { + const result = 
validateCodexAppServerProtocolShape(validFixture()); + + assert.equal(result.status, "verified"); + assert.deepEqual(result.failures, []); + assert.equal(result.methods.initialize, true); + assert.equal(result.methods["thread/start"], true); + assert.equal(result.methods["turn/start"], true); + assert.equal(result.methods["thread/read"], true); + assert.equal(result.fields["InitializeCapabilities.experimentalApi"], true); + assert.equal(result.fields["ThreadStartParams.model"], true); + assert.equal(result.fields["TurnStartParams.threadId"], true); + assert.equal(result.fields["TurnStartParams.input"], true); + assert.equal(result.fields["TurnStartParams.model"], true); + assert.equal(result.fields["ThreadReadParams.threadId"], true); + assert.equal(result.fields["ThreadReadParams.includeTurns"], true); +}); + +test("protocol check fails clearly when a required model override field disappears", () => { + const root = validFixture(); + fs.writeFileSync( + path.join(root, "v2", "TurnStartParams.ts"), + "export type TurnStartParams = { threadId: string, input: Array };", + "utf8" + ); + + const result = validateCodexAppServerProtocolShape(root); + + assert.equal(result.status, "failed"); + assert.match(result.failures.join("\n"), /TurnStartParams\.model/); +}); From c9650cce68913b15102dda3fa849936cc4bd3295 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Wed, 13 May 2026 17:51:49 +0200 Subject: [PATCH 09/23] test(spike): record Codex model evidence limits Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- docs/decision-log.md | 3 +- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 8 ++- scripts/codex-app-server-switch-probe.js | 75 ++++++++++++++++++++-- test/codex-app-server-switch-probe.test.js | 21 ++++++ 4 files changed, 100 insertions(+), 7 deletions(-) diff --git a/docs/decision-log.md b/docs/decision-log.md index 88b92b3..53e167a 100644 --- a/docs/decision-log.md +++ 
b/docs/decision-log.md @@ -365,6 +365,7 @@ Verification signal: - Repeatable live app-server evidence observed on 2026-05-13: `npm run switchboard:spike:codex-app-server` returned `status: verified`. Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`; turn 2 selected `openai-quick` / `codex-fast` / `gpt-5.4-mini`; both turns completed on the same `threadId` / `sessionId` `019e21f7-0b2e-7730-9cbd-af5e5536ddbf`; `thread/read` returned `turnCount: 2`. No `model/rerouted` telemetry was emitted. - Gate 1 public-surface review completed on 2026-05-13 against local `codex-cli 0.130.0`, local app-server help output, generated stable TypeScript bindings, and OpenAI's public `openai/codex` app-server README. Result: pass for a bounded integration spike. Caveat: `codex app-server` and schema generation are still labeled experimental, so this is not a production stability claim. - Gate 2 protocol-stability review completed on 2026-05-13 with `scripts/codex-app-server-protocol-check.js`. The check generates Codex app-server TypeScript bindings and verifies the minimum method and field surface Switchboard depends on: `initialize`, `thread/start`, `turn/start`, `thread/read`, `InitializeCapabilities.experimentalApi`, `ThreadStartParams.model`, `TurnStartParams.threadId`, `TurnStartParams.input`, `TurnStartParams.model`, `ThreadReadParams.threadId`, and `ThreadReadParams.includeTurns`. Result: pass for the installed Codex CLI version; future Codex releases still require this check as a compatibility guard. +- Gate 6 model-evidence review completed on 2026-05-13 with an expanded `scripts/codex-app-server-switch-probe.js`. The live probe now separates requested model evidence from observed effective/backend model telemetry. Result: partial. A live run requested `gpt-5.5` then `gpt-5.4-mini` on the same app-server thread, but observed no effective model field in turn payloads, `thread/read`, raw response items, `model/rerouted`, or `model/verification`. 
Product must either accept requested-override plus same-thread completion as sufficient, or keep backend model attestation as an unresolved requirement. Decision: - Chosen option: Option B. @@ -377,5 +378,5 @@ Consequences: - Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence, but the app-server protocol may justify a focused product-surface spike. Follow-up: -- Next review milestone: continue the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Gate 6, model evidence, is next because it determines whether Switchboard can prove the effective model or must explicitly accept accepted-override evidence as the product risk boundary. +- Next review milestone: continue the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Gate 3, user install and auth path, is next. - Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, scripts/codex-app-server-protocol-check.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, test/codex-app-server-protocol-check.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 2e02a80..1a6d254 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -225,7 +225,13 @@ Evidence needed: Current evidence: - The app-server accepted `turn/start` with `model: "gpt-5.4-mini"` on the second turn and completed the turn on the same thread. -- No `model/rerouted` notification was emitted in the repeatable live probe. 
+- [../../scripts/codex-app-server-switch-probe.js](../../scripts/codex-app-server-switch-probe.js) now records model evidence separately from route requests: requested models, turn payload model fields, `thread/read` model fields, raw response item model fields, `model/rerouted`, and `model/verification`. +- `npm run switchboard:spike:codex-app-server` returned `status: verified` on 2026-05-13 with two requested models on one thread: `gpt-5.5` then `gpt-5.4-mini`. +- The same live run reported `backendModelTelemetryObserved: false`. +- `turn/start` responses and `turn/completed` notifications contained no effective model field. +- `thread/read` returned both turns but no per-turn or item-level model field for the executed model. +- No `rawResponseItem/completed`, `model/rerouted`, or `model/verification` model telemetry was emitted in the live run. +- Caveat: Gate 6 remains partial. We can prove requested model override plus same-thread completion, but not provider-side effective model attestation. 
Pass condition: diff --git a/scripts/codex-app-server-switch-probe.js b/scripts/codex-app-server-switch-probe.js index 07b4cfb..f6e84e2 100644 --- a/scripts/codex-app-server-switch-probe.js +++ b/scripts/codex-app-server-switch-probe.js @@ -217,11 +217,15 @@ class JsonLineClient { } function summarizeTurn({ label, plan, response, completedNotification, messages }) { + const responseModel = response?.model || response?.turn?.model || null; + const completedModel = completedNotification?.params?.model || completedNotification?.params?.turn?.model || null; return { label, selectedTargetId: plan.route.selectedTargetId, selectedProfile: plan.codex.profile, requestedModel: plan.codex.model, + responseModel, + completedModel, turnId: response?.turn?.id || completedNotification?.params?.turn?.id || null, completed: Boolean(completedNotification), agentMessages: messages @@ -258,16 +262,73 @@ async function runTurn(client, { label, threadId, plan, cwd }) { async function maybeReadThread(client, threadId) { try { const result = await client.request("thread/read", { threadId, includeTurns: true }); + const turns = Array.isArray(result?.thread?.turns) + ? result.thread.turns.map((turn) => ({ + turnId: turn?.id || null, + model: turn?.model || null, + itemModels: Array.isArray(turn?.items) + ? turn.items + .map((item) => item?.model) + .filter((model) => typeof model === "string") + : [] + })) + : []; return { ok: true, turnCount: Array.isArray(result?.thread?.turns) ? result.thread.turns.length : null, - itemCount: Array.isArray(result?.thread?.items) ? result.thread.items.length : null + itemCount: Array.isArray(result?.thread?.items) ? 
result.thread.items.length : null, + turns }; } catch (error) { return { ok: false, error: error.message }; } } +function collectModelEvidence({ notifications, turns, threadRead }) { + const rerouted = notifications.filter((message) => message.method === "model/rerouted"); + const verification = notifications.filter((message) => message.method === "model/verification"); + const rawResponseModels = notifications + .filter((message) => message.method === "rawResponseItem/completed") + .map((message) => ({ + turnId: message.params?.turnId || null, + model: message.params?.item?.model || null, + type: message.params?.item?.type || null + })) + .filter((entry) => entry.model); + const turnPayloadModels = turns + .flatMap((turn) => [ + { turnId: turn.turnId, source: "turn/start response", model: turn.responseModel }, + { turnId: turn.turnId, source: "turn/completed notification", model: turn.completedModel } + ]) + .filter((entry) => entry.model); + const threadReadModels = threadRead?.turns + ? threadRead.turns.flatMap((turn) => [ + ...(turn.model ? 
[{ turnId: turn.turnId, source: "thread/read turn", model: turn.model }] : []), + ...turn.itemModels.map((model) => ({ turnId: turn.turnId, source: "thread/read item", model })) + ]) + : []; + + return { + requestedModels: turns.map((turn) => ({ + turnId: turn.turnId, + selectedTargetId: turn.selectedTargetId, + selectedProfile: turn.selectedProfile, + requestedModel: turn.requestedModel + })), + turnPayloadModels, + threadReadModels, + rawResponseModels, + rerouted, + verification, + backendModelTelemetryObserved: + turnPayloadModels.length > 0 || + threadReadModels.length > 0 || + rawResponseModels.length > 0 || + rerouted.length > 0 || + verification.length > 0 + }; +} + export async function runCodexAppServerSwitchProbe({ codexBin = "codex", targets = readJson(OPENAI_TARGETS_PATH).targets, @@ -328,7 +389,11 @@ export async function runCodexAppServerSwitchProbe({ const sameThreadCompleted = firstTurn.completed && secondTurn.completed; const requestedModelOverrideAccepted = secondTurn.requestedModel === second.codex.model && sameThreadCompleted; - const modelRerouted = client.notifications.filter((message) => message.method === "model/rerouted"); + const modelEvidence = collectModelEvidence({ + notifications: client.notifications, + turns: [firstTurn, secondTurn], + threadRead + }); const status = requestedModelOverrideAccepted ? 
"verified" : "partial"; return { @@ -340,7 +405,7 @@ export async function runCodexAppServerSwitchProbe({ sameThreadCompleted, targetChanged, modelChanged, - backendModelTelemetryObserved: modelRerouted.length > 0, + backendModelTelemetryObserved: modelEvidence.backendModelTelemetryObserved, interactiveTuiHotSwapProven: false }, thread: { @@ -351,7 +416,7 @@ export async function runCodexAppServerSwitchProbe({ }, turns: [firstTurn, secondTurn], threadRead, - modelRerouted, + modelEvidence, notificationCounts: client.notifications.reduce((counts, message) => { counts[message.method] = (counts[message.method] || 0) + 1; return counts; @@ -363,7 +428,7 @@ export async function runCodexAppServerSwitchProbe({ }, limitations: [ "The generated app-server protocol is experimental.", - "The probe verifies accepted turn-level model override requests and same-thread completion, but it does not currently observe provider-side backend model telemetry unless Codex emits model/rerouted.", + "The probe verifies accepted turn-level model override requests and same-thread completion, but it does not currently observe provider-side backend model telemetry unless Codex emits model/rerouted, model/verification, raw response model metadata, or turn/thread model payload fields.", "This does not prove that the Codex interactive TUI itself can be hot-swapped." 
], stderrTail: tailText(client.stderr), diff --git a/test/codex-app-server-switch-probe.test.js b/test/codex-app-server-switch-probe.test.js index 8eed49b..2505fe9 100644 --- a/test/codex-app-server-switch-probe.test.js +++ b/test/codex-app-server-switch-probe.test.js @@ -96,6 +96,18 @@ rl.on("line", (line) => { thread.turns.push(turn); respond(message.id, { turn }); write({ method: "turn/started", params: { threadId: message.params.threadId, turn } }); + if (turn.id === "turn-2") { + write({ + method: "model/rerouted", + params: { + threadId: message.params.threadId, + turnId: turn.id, + fromModel: "gpt-5.5", + toModel: message.params.model, + reason: "highRiskCyberActivity" + } + }); + } write({ method: "item/completed", params: { @@ -134,6 +146,7 @@ test("codex app-server switch probe verifies accepted model override on one thre assert.equal(result.verdict.sameThreadCompleted, true); assert.equal(result.verdict.targetChanged, true); assert.equal(result.verdict.modelChanged, true); + assert.equal(result.verdict.backendModelTelemetryObserved, true); assert.equal(result.verdict.interactiveTuiHotSwapProven, false); assert.equal(result.thread.threadId, "thread-123"); assert.equal(result.thread.sessionId, "session-abc"); @@ -145,4 +158,12 @@ test("codex app-server switch probe verifies accepted model override on one thre assert.deepEqual(result.turns[1].agentMessages, ["turn-2 complete"]); assert.equal(result.threadRead.ok, true); assert.equal(result.threadRead.turnCount, 2); + assert.deepEqual(result.modelEvidence.turnPayloadModels, [ + { turnId: "turn-1", source: "turn/start response", model: "gpt-5.5" }, + { turnId: "turn-1", source: "turn/completed notification", model: "gpt-5.5" }, + { turnId: "turn-2", source: "turn/start response", model: "gpt-5.4-mini" }, + { turnId: "turn-2", source: "turn/completed notification", model: "gpt-5.4-mini" } + ]); + assert.equal(result.modelEvidence.rerouted.length, 1); + 
assert.equal(result.modelEvidence.rerouted[0].params.toModel, "gpt-5.4-mini"); }); From bfa44edcac88d4f49919283cd826ec843dc77dad Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 08:22:00 +0200 Subject: [PATCH 10/23] test(spike): add Codex app-server preflight Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- README.md | 1 + docs/decision-log.md | 5 +- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 12 +- package.json | 1 + scripts/codex-app-server-preflight.js | 354 ++++++++++++++++++++++++ test/codex-app-server-preflight.test.js | 131 +++++++++ 6 files changed, 500 insertions(+), 4 deletions(-) create mode 100644 scripts/codex-app-server-preflight.js create mode 100644 test/codex-app-server-preflight.test.js diff --git a/README.md b/README.md index c29d763..b894dea 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting proces | `switchboard probe continuity-interactive` | Runs the interactive continuity probe and verifies resume/session behavior across turns. | You want to validate interactive continuity and related checks. | | `npm run switchboard:spike:codex-cli` | Inspects the local Codex CLI command surface and maps two routed turns to Codex `exec`/`resume --model` plans without making live model calls. | You want a product-aligned feasibility signal for Codex CLI route authority before building a deeper integration. | | `npm run switchboard:spike:codex-cli:live` | Runs the bounded two-turn Codex CLI resume probe with route-selected models and captures JSON/session evidence. | You are ready to collect live evidence for the Codex CLI feasibility spike. | +| `npm run switchboard:spike:codex-app-server:preflight` | Verifies the local Codex CLI version, app-server command availability, login status, and redacted app-server auth evidence before a routed session starts. 
| You want to check whether a normal user install can support the Codex app-server spike path. | | `npm run switchboard:spike:codex-app-server:protocol` | Generates Codex app-server TypeScript bindings and verifies the minimum protocol shape Switchboard depends on. | You are checking whether Codex app-server protocol changes would break the feasibility spike. | | `npm run switchboard:spike:codex-app-server` | Runs the bounded app-server in-session switch probe with one thread and two route-selected `turn/start` model overrides. | You are evaluating whether Codex app-server can be a Switchboard-controlled session surface beyond `exec`/`resume` parity. | | `npm test` | Runs the full automated test suite for adapters, router, workflow, and CLI behavior. | You changed routing/workflow/docs and want a full regression check. | diff --git a/docs/decision-log.md b/docs/decision-log.md index 53e167a..1027366 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -365,6 +365,7 @@ Verification signal: - Repeatable live app-server evidence observed on 2026-05-13: `npm run switchboard:spike:codex-app-server` returned `status: verified`. Turn 1 selected `openai-coder` / `codex-best-coder` / `gpt-5.5`; turn 2 selected `openai-quick` / `codex-fast` / `gpt-5.4-mini`; both turns completed on the same `threadId` / `sessionId` `019e21f7-0b2e-7730-9cbd-af5e5536ddbf`; `thread/read` returned `turnCount: 2`. No `model/rerouted` telemetry was emitted. - Gate 1 public-surface review completed on 2026-05-13 against local `codex-cli 0.130.0`, local app-server help output, generated stable TypeScript bindings, and OpenAI's public `openai/codex` app-server README. Result: pass for a bounded integration spike. Caveat: `codex app-server` and schema generation are still labeled experimental, so this is not a production stability claim. - Gate 2 protocol-stability review completed on 2026-05-13 with `scripts/codex-app-server-protocol-check.js`. 
The check generates Codex app-server TypeScript bindings and verifies the minimum method and field surface Switchboard depends on: `initialize`, `thread/start`, `turn/start`, `thread/read`, `InitializeCapabilities.experimentalApi`, `ThreadStartParams.model`, `TurnStartParams.threadId`, `TurnStartParams.input`, `TurnStartParams.model`, `ThreadReadParams.threadId`, and `ThreadReadParams.includeTurns`. Result: pass for the installed Codex CLI version; future Codex releases still require this check as a compatibility guard. +- Gate 3 user-install and auth-path review completed on 2026-05-13 with `scripts/codex-app-server-preflight.js`. The preflight checks the Codex CLI version, app-server command availability, `codex login status`, and redacted app-server auth/account evidence before Switchboard attempts a routed session. Result: pass for the local `codex-cli 0.130.0` install. Caveat: app-server remains experimental, so the preflight is a compatibility guard rather than a stability guarantee. - Gate 6 model-evidence review completed on 2026-05-13 with an expanded `scripts/codex-app-server-switch-probe.js`. The live probe now separates requested model evidence from observed effective/backend model telemetry. Result: partial. A live run requested `gpt-5.5` then `gpt-5.4-mini` on the same app-server thread, but observed no effective model field in turn payloads, `thread/read`, raw response items, `model/rerouted`, or `model/verification`. Product must either accept requested-override plus same-thread completion as sufficient, or keep backend model attestation as an unresolved requirement. Decision: @@ -378,5 +379,5 @@ Consequences: - Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence, but the app-server protocol may justify a focused product-surface spike. 
Follow-up: -- Next review milestone: continue the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Gate 3, user install and auth path, is next. -- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, scripts/codex-app-server-protocol-check.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, test/codex-app-server-protocol-check.test.js, README.md +- Next review milestone: continue the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Gate 4, process lifecycle safety, is next. +- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, scripts/codex-app-server-protocol-check.js, scripts/codex-app-server-preflight.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, test/codex-app-server-protocol-check.test.js, test/codex-app-server-preflight.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 1a6d254..184d625 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -142,7 +142,7 @@ Fail condition: ### Gate 3: User Install And Auth Path -Status: `[ ]` +Status: `[x]` Question: Can a normal Switchboard user run the app-server path without fragile local setup? @@ -153,9 +153,17 @@ Evidence needed: - Whether app-server is available in the same Codex CLI users install for normal interactive use. - Clear diagnostic when Codex is missing, unauthenticated, or too old. 
+Evidence: + +- [../../scripts/codex-app-server-preflight.js](../../scripts/codex-app-server-preflight.js) verifies the local Codex CLI version, checks `codex app-server --help`, checks `codex login status`, starts `codex app-server --listen stdio://`, initializes the experimental app-server protocol, and reads redacted auth/account evidence through `getAuthStatus` and `account/read`. +- [../../test/codex-app-server-preflight.test.js](../../test/codex-app-server-preflight.test.js) covers the normal install path and the actionable failure modes for too-old Codex CLI, missing app-server support, and missing auth. +- `npm run switchboard:spike:codex-app-server:preflight` returned `status: verified` on 2026-05-13 against local `codex-cli 0.130.0`. +- Live preflight evidence: `codex --version` reported `0.130.0`; `codex app-server --help` exposed the experimental app-server command; `codex login status` reported a ChatGPT login; app-server auth returned `authMethod: "chatgpt"` and a redacted ChatGPT account. +- Caveat: this validates the local installed Codex CLI path. The app-server command remains experimental, and the minimum version should stay pinned to the earliest version Switchboard has actually verified. + Pass condition: -- A user can install/login/run a preflight command and get actionable output before Switchboard attempts a routed session. +- Met for spike purposes. A user can install/login/run a preflight command and get actionable output before Switchboard attempts a routed session. 
Fail condition: diff --git a/package.json b/package.json index 60e2e07..f29e15f 100644 --- a/package.json +++ b/package.json @@ -44,6 +44,7 @@ "switchboard:continuity:interactive": "node bin/switchboard.js probe continuity-interactive --json", "switchboard:spike:codex-cli": "node scripts/codex-cli-feasibility-probe.js", "switchboard:spike:codex-cli:live": "node scripts/codex-cli-feasibility-probe.js --live", + "switchboard:spike:codex-app-server:preflight": "node scripts/codex-app-server-preflight.js", "switchboard:spike:codex-app-server:protocol": "node scripts/codex-app-server-protocol-check.js", "switchboard:spike:codex-app-server": "node scripts/codex-app-server-switch-probe.js" }, diff --git a/scripts/codex-app-server-preflight.js b/scripts/codex-app-server-preflight.js new file mode 100644 index 0000000..7ce4307 --- /dev/null +++ b/scripts/codex-app-server-preflight.js @@ -0,0 +1,354 @@ +#!/usr/bin/env node +import { spawn } from "node:child_process"; +import { setTimeout, clearTimeout } from "node:timers"; +import { fileURLToPath } from "node:url"; + +const DEFAULT_MIN_VERSION = "0.130.0"; +const DEFAULT_TIMEOUT_MS = 30000; + +function getArg(args, flag) { + const idx = args.lastIndexOf(flag); + if (idx === -1 || idx + 1 >= args.length) return null; + return args[idx + 1]; +} + +function tailText(text, maxLength = 1600) { + if (!text) return ""; + return text.length > maxLength ? text.slice(text.length - maxLength) : text; +} + +function redact(value) { + if (typeof value === "string") { + if (value.includes("@")) return "[redacted-email]"; + if (value.length > 16) return "[redacted]"; + return value; + } + if (Array.isArray(value)) return value.map(redact); + if (!value || typeof value !== "object") return value; + return Object.fromEntries( + Object.entries(value).map(([key, entry]) => { + if (/token|email|accountId|userId/i.test(key)) return [key, entry == null ? 
entry : "[redacted]"]; + return [key, redact(entry)]; + }) + ); +} + +export function parseCodexVersion(output) { + const match = String(output).match(/codex-cli\s+(\d+)\.(\d+)\.(\d+)/); + if (!match) return null; + return match.slice(1, 4).map((part) => Number(part)); +} + +export function compareVersions(actual, minimum) { + for (let i = 0; i < 3; i += 1) { + if (actual[i] > minimum[i]) return 1; + if (actual[i] < minimum[i]) return -1; + } + return 0; +} + +function formatVersion(version) { + return Array.isArray(version) ? version.join(".") : null; +} + +function runCommand(command, args, { timeoutMs, input = null }) { + return new Promise((resolve) => { + const child = spawn(command, args, { + env: { ...process.env, NO_COLOR: "1" }, + stdio: ["pipe", "pipe", "pipe"] + }); + let stdout = ""; + let stderr = ""; + const timer = setTimeout(() => { + child.kill("SIGTERM"); + resolve({ + ok: false, + code: null, + signal: "timeout", + stdout, + stderr, + error: `${command} ${args.join(" ")} timed out after ${timeoutMs}ms` + }); + }, timeoutMs); + child.stdout.setEncoding("utf8"); + child.stderr.setEncoding("utf8"); + child.stdout.on("data", (chunk) => { + stdout += chunk; + }); + child.stderr.on("data", (chunk) => { + stderr += chunk; + }); + child.on("error", (error) => { + clearTimeout(timer); + resolve({ ok: false, code: null, signal: null, stdout, stderr, error: error.message }); + }); + child.on("exit", (code, signal) => { + clearTimeout(timer); + resolve({ ok: code === 0, code, signal, stdout, stderr, error: null }); + }); + if (input) child.stdin.write(input); + child.stdin.end(); + }); +} + +function createDeferred() { + let resolve; + let reject; + const promise = new Promise((promiseResolve, promiseReject) => { + resolve = promiseResolve; + reject = promiseReject; + }); + return { promise, resolve, reject }; +} + +function withTimeout(promise, ms, label) { + let timer; + const timeout = new Promise((_, reject) => { + timer = setTimeout(() => reject(new 
Error(`${label} timed out after ${ms}ms`)), ms); + }); + return Promise.race([promise, timeout]).finally(() => clearTimeout(timer)); +} + +class JsonLineClient { + constructor({ codexBin, timeoutMs }) { + this.timeoutMs = timeoutMs; + this.nextId = 1; + this.buffer = ""; + this.stdout = ""; + this.stderr = ""; + this.pending = new Map(); + this.closed = false; + this.child = spawn(codexBin, ["app-server", "--listen", "stdio://"], { + env: { ...process.env, NO_COLOR: "1" }, + stdio: ["pipe", "pipe", "pipe"] + }); + this.child.stdout.setEncoding("utf8"); + this.child.stderr.setEncoding("utf8"); + this.child.stdout.on("data", (chunk) => this.handleStdout(chunk)); + this.child.stderr.on("data", (chunk) => { + this.stderr += chunk; + }); + this.child.on("error", (error) => this.rejectOpenWork(error)); + this.child.on("exit", (code, signal) => { + this.rejectOpenWork( + new Error( + `codex app-server exited with code ${code ?? "null"} and signal ${signal ?? "null"}: ${tailText(this.stderr)}` + ) + ); + }); + } + + rejectOpenWork(error) { + if (this.closed) return; + this.closed = true; + for (const pending of this.pending.values()) pending.reject(error); + this.pending.clear(); + } + + handleStdout(chunk) { + this.stdout += chunk; + this.buffer += chunk; + let newlineIndex = this.buffer.indexOf("\n"); + while (newlineIndex !== -1) { + const line = this.buffer.slice(0, newlineIndex).trim(); + this.buffer = this.buffer.slice(newlineIndex + 1); + if (line) this.handleLine(line); + newlineIndex = this.buffer.indexOf("\n"); + } + } + + handleLine(line) { + let message; + try { + message = JSON.parse(line); + } catch { + return; + } + if (!Object.prototype.hasOwnProperty.call(message, "id")) return; + const pending = this.pending.get(message.id); + if (!pending) return; + this.pending.delete(message.id); + if (message.error) { + pending.reject(new Error(JSON.stringify(message.error))); + } else { + pending.resolve(message.result); + } + } + + request(method, params) { + if 
(this.closed) throw new Error("codex app-server is closed"); + const id = this.nextId; + this.nextId += 1; + const pending = createDeferred(); + this.pending.set(id, pending); + this.child.stdin.write(`${JSON.stringify({ jsonrpc: "2.0", id, method, params })}\n`); + return withTimeout(pending.promise, this.timeoutMs, method); + } + + notify(method, params = undefined) { + if (this.closed) throw new Error("codex app-server is closed"); + const message = params === undefined ? { method } : { method, params }; + this.child.stdin.write(`${JSON.stringify(message)}\n`); + } + + close() { + if (this.child.exitCode !== null || this.child.killed) return; + this.child.kill("SIGTERM"); + } +} + +function diagnostic(code, message, action) { + return { code, message, action }; +} + +async function checkAppServerAuth({ codexBin, timeoutMs }) { + const client = new JsonLineClient({ codexBin, timeoutMs }); + try { + await client.request("initialize", { + clientInfo: { + name: "switchboard-codex-app-server-preflight", + title: "Switchboard Codex App Server Preflight", + version: "0.0.0" + }, + capabilities: { experimentalApi: true } + }); + client.notify("initialized"); + const authStatus = await client.request("getAuthStatus", { + includeToken: false, + refreshToken: false + }); + const accountStatus = await client.request("account/read", { + refreshToken: false + }); + return { + ok: Boolean(authStatus?.authMethod || accountStatus?.account), + authStatus: redact(authStatus), + accountStatus: redact(accountStatus), + stderrTail: tailText(client.stderr) + }; + } finally { + client.close(); + } +} + +export async function runCodexAppServerPreflight({ + codexBin = "codex", + minVersion = DEFAULT_MIN_VERSION, + timeoutMs = DEFAULT_TIMEOUT_MS +} = {}) { + const minimumVersion = parseCodexVersion(`codex-cli ${minVersion}`); + const checks = {}; + const diagnostics = []; + + const versionResult = await runCommand(codexBin, ["--version"], { timeoutMs }); + const actualVersion = 
parseCodexVersion(versionResult.stdout || versionResult.stderr); + checks.version = { + ok: versionResult.ok && Boolean(actualVersion) && compareVersions(actualVersion, minimumVersion) >= 0, + command: `${codexBin} --version`, + required: minVersion, + actual: formatVersion(actualVersion), + stderrTail: tailText(versionResult.stderr) + }; + if (!versionResult.ok) { + diagnostics.push( + diagnostic( + "codex-missing", + "Codex CLI could not be executed.", + "Install Codex CLI and ensure `codex` is on PATH before starting Switchboard's Codex app-server path." + ) + ); + } else if (!actualVersion) { + diagnostics.push( + diagnostic("codex-version-unreadable", "Codex CLI version output was not recognized.", "Run `codex --version` and verify it reports `codex-cli x.y.z`.") + ); + } else if (!checks.version.ok) { + diagnostics.push( + diagnostic( + "codex-too-old", + `Codex CLI ${checks.version.actual} is older than the validated minimum ${minVersion}.`, + "Update Codex CLI before using Switchboard's Codex app-server path." + ) + ); + } + + const appServerHelp = await runCommand(codexBin, ["app-server", "--help"], { timeoutMs }); + checks.appServerCommand = { + ok: appServerHelp.ok && appServerHelp.stdout.includes("app-server"), + command: `${codexBin} app-server --help`, + experimental: appServerHelp.stdout.includes("[experimental]"), + stderrTail: tailText(appServerHelp.stderr) + }; + if (!checks.appServerCommand.ok) { + diagnostics.push( + diagnostic( + "app-server-unavailable", + "Codex CLI did not expose the app-server command.", + "Install a Codex CLI build that includes `codex app-server`; the spike is validated against codex-cli 0.130.0." + ) + ); + } + + const loginStatus = await runCommand(codexBin, ["login", "status"], { timeoutMs }); + checks.loginStatusCommand = { + ok: loginStatus.ok, + command: `${codexBin} login status`, + summary: loginStatus.ok ? 
tailText(`${loginStatus.stdout}${loginStatus.stderr}`.trim(), 200) : null, + stderrTail: tailText(loginStatus.stderr) + }; + + if (checks.version.ok && checks.appServerCommand.ok) { + try { + const appServerAuth = await checkAppServerAuth({ codexBin, timeoutMs }); + checks.appServerAuth = appServerAuth; + if (!appServerAuth.ok) { + diagnostics.push( + diagnostic( + "codex-unauthenticated", + "Codex app-server did not report an authenticated account.", + "Run `codex login` or `codex login --with-api-key`, then rerun the Switchboard Codex app-server preflight." + ) + ); + } + } catch (error) { + checks.appServerAuth = { + ok: false, + error: error.message + }; + diagnostics.push( + diagnostic( + "app-server-auth-check-failed", + "Codex app-server auth check failed before a routed session could start.", + "Run `codex login status` and `codex app-server --help`; if both look healthy, rerun the preflight with a longer timeout." + ) + ); + } + } + + const requiredChecks = [checks.version, checks.appServerCommand, checks.appServerAuth].filter(Boolean); + const status = diagnostics.length === 0 && requiredChecks.every((check) => check.ok) ? "verified" : "failed"; + return { + status, + surface: "codex-app-server-preflight", + codexBin, + minVersion, + checks, + diagnostics + }; +} + +async function main() { + const args = process.argv.slice(2); + const codexBin = getArg(args, "--codex-bin") || "codex"; + const minVersion = getArg(args, "--min-version") || DEFAULT_MIN_VERSION; + const timeoutMs = Number(getArg(args, "--timeout-ms") || DEFAULT_TIMEOUT_MS); + const result = await runCodexAppServerPreflight({ codexBin, minVersion, timeoutMs }); + process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); + process.exitCode = result.status === "verified" ? 
0 : 1; +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((error) => { + process.stderr.write(`codex-app-server-preflight failed: ${error.message}\n`); + process.exitCode = 1; + }); +} diff --git a/test/codex-app-server-preflight.test.js b/test/codex-app-server-preflight.test.js new file mode 100644 index 0000000..e48f145 --- /dev/null +++ b/test/codex-app-server-preflight.test.js @@ -0,0 +1,131 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { runCodexAppServerPreflight, parseCodexVersion, compareVersions } from "../scripts/codex-app-server-preflight.js"; + +function makeFakeCodex({ version = "0.130.0", auth = "authenticated", appServer = true } = {}) { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-preflight-test-")); + const bin = path.join(dir, "codex"); + const source = `#!/usr/bin/env node +const appServer = ${JSON.stringify(appServer)}; +const auth = ${JSON.stringify(auth)}; +const args = process.argv.slice(2); +if (args.join(" ") === "--version") { + console.log("codex-cli ${version}"); + process.exit(0); +} +if (args.join(" ") === "app-server --help") { + if (!appServer) { + console.error("unrecognized subcommand app-server"); + process.exit(2); + } + console.log("[experimental] Run the app server or related tooling"); + console.log("Usage: codex app-server [OPTIONS] [COMMAND]"); + process.exit(0); +} +if (args.join(" ") === "login status") { + if (auth === "authenticated") { + console.log("Logged in using ChatGPT"); + process.exit(0); + } + console.error("Not logged in"); + process.exit(1); +} +if (args.join(" ") === "app-server --listen stdio://") { + if (!appServer) process.exit(2); + let buffer = ""; + process.stdin.setEncoding("utf8"); + process.stdin.on("data", (chunk) => { + buffer += chunk; + let newline = buffer.indexOf("\\n"); + while (newline !== -1) { + const line = 
buffer.slice(0, newline).trim(); + buffer = buffer.slice(newline + 1); + if (line) handle(JSON.parse(line)); + newline = buffer.indexOf("\\n"); + } + }); + function write(id, result) { + process.stdout.write(JSON.stringify({ jsonrpc: "2.0", id, result }) + "\\n"); + } + function handle(message) { + if (!message.id) return; + if (message.method === "initialize") write(message.id, {}); + if (message.method === "getAuthStatus") { + write(message.id, auth === "authenticated" + ? { authMethod: "chatgpt", authToken: null, requiresOpenaiAuth: false } + : { authMethod: null, authToken: null, requiresOpenaiAuth: true }); + } + if (message.method === "account/read") { + write(message.id, auth === "authenticated" + ? { account: { type: "chatgpt", email: "person@example.com", planType: "plus" }, requiresOpenaiAuth: false } + : { account: null, requiresOpenaiAuth: true }); + } + } + return; +} +console.error("unexpected args: " + args.join(" ")); +process.exit(2); +`; + fs.writeFileSync(bin, source, { encoding: "utf8", mode: 0o755 }); + return bin; +} + +test("parses and compares Codex CLI versions", () => { + assert.deepEqual(parseCodexVersion("codex-cli 0.130.0"), [0, 130, 0]); + assert.equal(parseCodexVersion("not codex"), null); + assert.equal(compareVersions([0, 130, 1], [0, 130, 0]), 1); + assert.equal(compareVersions([0, 130, 0], [0, 130, 0]), 0); + assert.equal(compareVersions([0, 129, 9], [0, 130, 0]), -1); +}); + +test("preflight verifies a normal Codex install with app-server auth", async () => { + const result = await runCodexAppServerPreflight({ + codexBin: makeFakeCodex(), + timeoutMs: 1000 + }); + + assert.equal(result.status, "verified"); + assert.equal(result.checks.version.actual, "0.130.0"); + assert.equal(result.checks.appServerCommand.ok, true); + assert.equal(result.checks.loginStatusCommand.ok, true); + assert.equal(result.checks.appServerAuth.ok, true); + assert.deepEqual(result.diagnostics, []); + 
assert.equal(result.checks.appServerAuth.accountStatus.account.email, "[redacted]"); +}); + +test("preflight fails clearly when Codex CLI is too old", async () => { + const result = await runCodexAppServerPreflight({ + codexBin: makeFakeCodex({ version: "0.129.0" }), + timeoutMs: 1000 + }); + + assert.equal(result.status, "failed"); + assert.equal(result.diagnostics[0].code, "codex-too-old"); + assert.match(result.diagnostics[0].action, /Update Codex CLI/); +}); + +test("preflight fails clearly when app-server auth is missing", async () => { + const result = await runCodexAppServerPreflight({ + codexBin: makeFakeCodex({ auth: "missing" }), + timeoutMs: 1000 + }); + + assert.equal(result.status, "failed"); + assert.equal(result.checks.appServerAuth.ok, false); + assert.equal(result.diagnostics.at(-1).code, "codex-unauthenticated"); + assert.match(result.diagnostics.at(-1).action, /codex login/); +}); + +test("preflight fails clearly when app-server is unavailable", async () => { + const result = await runCodexAppServerPreflight({ + codexBin: makeFakeCodex({ appServer: false }), + timeoutMs: 1000 + }); + + assert.equal(result.status, "failed"); + assert.equal(result.checks.appServerCommand.ok, false); + assert.equal(result.diagnostics.at(-1).code, "app-server-unavailable"); +}); From 1fbb167d5257aa6535bba83112db2c19ab7b2747 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 08:30:52 +0200 Subject: [PATCH 11/23] test(spike): verify Codex app-server lifecycle Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- README.md | 1 + docs/decision-log.md | 5 +- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 12 +- package.json | 1 + scripts/codex-app-server-lifecycle-probe.js | 343 ++++++++++++++++++ test/codex-app-server-lifecycle-probe.test.js | 122 +++++++ test/codex-app-server-preflight.test.js | 8 +- 7 files changed, 484 insertions(+), 8 deletions(-) create mode 100644 
scripts/codex-app-server-lifecycle-probe.js create mode 100644 test/codex-app-server-lifecycle-probe.test.js diff --git a/README.md b/README.md index b894dea..1b79ce8 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting proces | `npm run switchboard:spike:codex-cli:live` | Runs the bounded two-turn Codex CLI resume probe with route-selected models and captures JSON/session evidence. | You are ready to collect live evidence for the Codex CLI feasibility spike. | | `npm run switchboard:spike:codex-app-server:preflight` | Verifies the local Codex CLI version, app-server command availability, login status, and redacted app-server auth evidence before a routed session starts. | You want to check whether a normal user install can support the Codex app-server spike path. | | `npm run switchboard:spike:codex-app-server:protocol` | Generates Codex app-server TypeScript bindings and verifies the minimum protocol shape Switchboard depends on. | You are checking whether Codex app-server protocol changes would break the feasibility spike. | +| `npm run switchboard:spike:codex-app-server:lifecycle` | Starts Codex app-server, verifies protocol-error handling, completes one turn, interrupts a second turn, captures stderr/malformed-output evidence, and shuts the process down. | You are checking whether Switchboard can safely own the app-server process lifecycle. | | `npm run switchboard:spike:codex-app-server` | Runs the bounded app-server in-session switch probe with one thread and two route-selected `turn/start` model overrides. | You are evaluating whether Codex app-server can be a Switchboard-controlled session surface beyond `exec`/`resume` parity. | | `npm test` | Runs the full automated test suite for adapters, router, workflow, and CLI behavior. | You changed routing/workflow/docs and want a full regression check. 
| diff --git a/docs/decision-log.md b/docs/decision-log.md index 1027366..b153205 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -366,6 +366,7 @@ Verification signal: - Gate 1 public-surface review completed on 2026-05-13 against local `codex-cli 0.130.0`, local app-server help output, generated stable TypeScript bindings, and OpenAI's public `openai/codex` app-server README. Result: pass for a bounded integration spike. Caveat: `codex app-server` and schema generation are still labeled experimental, so this is not a production stability claim. - Gate 2 protocol-stability review completed on 2026-05-13 with `scripts/codex-app-server-protocol-check.js`. The check generates Codex app-server TypeScript bindings and verifies the minimum method and field surface Switchboard depends on: `initialize`, `thread/start`, `turn/start`, `thread/read`, `InitializeCapabilities.experimentalApi`, `ThreadStartParams.model`, `TurnStartParams.threadId`, `TurnStartParams.input`, `TurnStartParams.model`, `ThreadReadParams.threadId`, and `ThreadReadParams.includeTurns`. Result: pass for the installed Codex CLI version; future Codex releases still require this check as a compatibility guard. - Gate 3 user-install and auth-path review completed on 2026-05-13 with `scripts/codex-app-server-preflight.js`. The preflight checks the Codex CLI version, app-server command availability, `codex login status`, and redacted app-server auth/account evidence before Switchboard attempts a routed session. Result: pass for the local `codex-cli 0.130.0` install. Caveat: app-server remains experimental, so the preflight is a compatibility guard rather than a stability guarantee. +- Gate 4 process-lifecycle review completed on 2026-05-14 with `scripts/codex-app-server-lifecycle-probe.js`. 
The live probe initialized app-server, confirmed a protocol error did not kill the process, completed one turn, interrupted a second turn after receiving `turn/started`, captured stderr warnings, and shut down cleanly with exit code `0`. Deterministic fake-process tests cover malformed stdout, app-server crash, and child process spawn failure. Result: pass for the local `codex-cli 0.130.0` install. - Gate 6 model-evidence review completed on 2026-05-13 with an expanded `scripts/codex-app-server-switch-probe.js`. The live probe now separates requested model evidence from observed effective/backend model telemetry. Result: partial. A live run requested `gpt-5.5` then `gpt-5.4-mini` on the same app-server thread, but observed no effective model field in turn payloads, `thread/read`, raw response items, `model/rerouted`, or `model/verification`. Product must either accept requested-override plus same-thread completion as sufficient, or keep backend model attestation as an unresolved requirement. Decision: @@ -379,5 +380,5 @@ Consequences: - Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence, but the app-server protocol may justify a focused product-surface spike. Follow-up: -- Next review milestone: continue the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Gate 4, process lifecycle safety, is next. 
-- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, scripts/codex-app-server-protocol-check.js, scripts/codex-app-server-preflight.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, test/codex-app-server-protocol-check.test.js, test/codex-app-server-preflight.test.js, README.md +- Next review milestone: continue the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Gate 7, product fit, is next. +- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, scripts/codex-app-server-protocol-check.js, scripts/codex-app-server-preflight.js, scripts/codex-app-server-lifecycle-probe.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, test/codex-app-server-protocol-check.test.js, test/codex-app-server-preflight.test.js, test/codex-app-server-lifecycle-probe.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index 184d625..f9537b9 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -171,7 +171,7 @@ Fail condition: ### Gate 4: Process Lifecycle Safety -Status: `[ ]` +Status: `[x]` Question: Can Switchboard safely own a long-running app-server process? @@ -183,9 +183,17 @@ Evidence needed: - Behavior after interrupted turn. - Handling for malformed JSON, protocol errors, stderr warnings, and child process spawn failure. 
+Evidence: + +- [../../scripts/codex-app-server-lifecycle-probe.js](../../scripts/codex-app-server-lifecycle-probe.js) starts `codex app-server --listen stdio://`, initializes the app-server protocol, verifies an unsupported request returns a bounded protocol error, starts a thread, completes one turn, starts and interrupts a second turn after `turn/started`, captures stderr warning output, ignores malformed JSON lines, and waits for shutdown. +- [../../test/codex-app-server-lifecycle-probe.test.js](../../test/codex-app-server-lifecycle-probe.test.js) covers the lifecycle harness with a fake app-server, including malformed stdout, stderr warnings, protocol errors, interrupted turns, app-server crash, and child process spawn failure. +- `npm run switchboard:spike:codex-app-server:lifecycle` returned `status: verified` on 2026-05-14 against local `codex-cli 0.130.0`. +- Live lifecycle evidence: initialize passed; unsupported request returned a protocol error without killing the server; one turn completed; a second turn was interrupted successfully after waiting for `turn/started`; shutdown returned exit code `0`. +- Caveat: real malformed-output and crash behavior are not induced against the live Codex app-server; those failure modes are covered by deterministic fake-process tests because the real server should not normally emit malformed JSON or crash on demand. + Pass condition: -- A lifecycle probe demonstrates start, two turns, interruption or shutdown, and clear error reporting. +- Met for spike purposes. A lifecycle probe demonstrates start, two turns, interruption, shutdown, and clear error reporting for protocol, stderr, malformed-output, crash, and spawn-failure paths. 
Fail condition: diff --git a/package.json b/package.json index f29e15f..82f8bcf 100644 --- a/package.json +++ b/package.json @@ -46,6 +46,7 @@ "switchboard:spike:codex-cli:live": "node scripts/codex-cli-feasibility-probe.js --live", "switchboard:spike:codex-app-server:preflight": "node scripts/codex-app-server-preflight.js", "switchboard:spike:codex-app-server:protocol": "node scripts/codex-app-server-protocol-check.js", + "switchboard:spike:codex-app-server:lifecycle": "node scripts/codex-app-server-lifecycle-probe.js", "switchboard:spike:codex-app-server": "node scripts/codex-app-server-switch-probe.js" }, "repository": { diff --git a/scripts/codex-app-server-lifecycle-probe.js b/scripts/codex-app-server-lifecycle-probe.js new file mode 100644 index 0000000..63afb8c --- /dev/null +++ b/scripts/codex-app-server-lifecycle-probe.js @@ -0,0 +1,343 @@ +#!/usr/bin/env node +import { spawn } from "node:child_process"; +import { setTimeout, clearTimeout } from "node:timers"; +import { fileURLToPath } from "node:url"; + +const DEFAULT_TIMEOUT_MS = 120000; + +function getArg(args, flag) { + const idx = args.lastIndexOf(flag); + if (idx === -1 || idx + 1 >= args.length) return null; + return args[idx + 1]; +} + +function tailText(text, maxLength = 1600) { + if (!text) return ""; + return text.length > maxLength ? 
text.slice(text.length - maxLength) : text; +} + +function textInput(text) { + return [{ type: "text", text, text_elements: [] }]; +} + +function createDeferred() { + let resolve; + let reject; + const promise = new Promise((promiseResolve, promiseReject) => { + resolve = promiseResolve; + reject = promiseReject; + }); + return { promise, resolve, reject }; +} + +function withTimeout(promise, ms, label) { + let timer; + const timeout = new Promise((_, reject) => { + timer = setTimeout(() => reject(new Error(`${label} timed out after ${ms}ms`)), ms); + }); + return Promise.race([promise, timeout]).finally(() => clearTimeout(timer)); +} + +class JsonLineClient { + constructor({ codexBin, timeoutMs }) { + this.timeoutMs = timeoutMs; + this.nextId = 1; + this.buffer = ""; + this.stdout = ""; + this.stderr = ""; + this.notifications = []; + this.malformedLines = []; + this.pending = new Map(); + this.waiters = []; + this.closed = false; + this.exit = null; + this.exitDeferred = createDeferred(); + this.child = spawn(codexBin, ["app-server", "--listen", "stdio://"], { + env: { ...process.env, NO_COLOR: "1" }, + stdio: ["pipe", "pipe", "pipe"] + }); + this.child.stdout.setEncoding("utf8"); + this.child.stderr.setEncoding("utf8"); + this.child.stdout.on("data", (chunk) => this.handleStdout(chunk)); + this.child.stderr.on("data", (chunk) => { + this.stderr += chunk; + }); + this.child.on("error", (error) => { + this.stderr += `${error.message}\n`; + this.rejectOpenWork(error); + this.exitDeferred.resolve({ code: null, signal: null, error: error.message }); + }); + this.child.on("exit", (code, signal) => { + this.exit = { code, signal }; + this.rejectOpenWork(new Error(`codex app-server exited with code ${code ?? "null"} and signal ${signal ?? 
"null"}`)); + this.exitDeferred.resolve(this.exit); + }); + } + + rejectOpenWork(error) { + if (this.closed) return; + this.closed = true; + for (const pending of this.pending.values()) pending.reject(error); + this.pending.clear(); + for (const waiter of this.waiters) waiter.reject(error); + this.waiters = []; + } + + handleStdout(chunk) { + this.stdout += chunk; + this.buffer += chunk; + let newlineIndex = this.buffer.indexOf("\n"); + while (newlineIndex !== -1) { + const line = this.buffer.slice(0, newlineIndex).trim(); + this.buffer = this.buffer.slice(newlineIndex + 1); + if (line) this.handleLine(line); + newlineIndex = this.buffer.indexOf("\n"); + } + } + + handleLine(line) { + let message; + try { + message = JSON.parse(line); + } catch { + this.malformedLines.push(line); + return; + } + + if (Object.prototype.hasOwnProperty.call(message, "id")) { + const pending = this.pending.get(message.id); + if (!pending) return; + this.pending.delete(message.id); + if (message.error) { + pending.reject(new Error(JSON.stringify(message.error))); + } else { + pending.resolve(message.result); + } + return; + } + + this.notifications.push(message); + const remaining = []; + for (const waiter of this.waiters) { + if (waiter.predicate(message)) { + waiter.resolve(message); + } else { + remaining.push(waiter); + } + } + this.waiters = remaining; + } + + request(method, params) { + if (this.closed) throw new Error("codex app-server is closed"); + const id = this.nextId; + this.nextId += 1; + const pending = createDeferred(); + this.pending.set(id, pending); + this.child.stdin.write(`${JSON.stringify({ jsonrpc: "2.0", id, method, params })}\n`); + return withTimeout(pending.promise, this.timeoutMs, method); + } + + notify(method, params = undefined) { + if (this.closed) throw new Error("codex app-server is closed"); + const message = params === undefined ? 
{ method } : { method, params }; + this.child.stdin.write(`${JSON.stringify(message)}\n`); + } + + waitForNotification(predicate, label) { + const existing = this.notifications.find(predicate); + if (existing) return Promise.resolve(existing); + const waiter = createDeferred(); + waiter.predicate = predicate; + this.waiters.push(waiter); + return withTimeout(waiter.promise, this.timeoutMs, label); + } + + async closeAndWait() { + if (this.child.exitCode === null && !this.child.killed) { + this.child.kill("SIGTERM"); + } + const exit = await withTimeout(this.exitDeferred.promise, 5000, "codex app-server shutdown"); + return { + ok: exit.signal === "SIGTERM" || exit.code === 0 || exit.code === null, + exit + }; + } +} + +async function expectProtocolError(client) { + try { + await client.request("switchboard/unsupported-lifecycle-probe", {}); + return { ok: false, message: "unsupported method unexpectedly succeeded" }; + } catch (error) { + return { ok: true, message: error.message }; + } +} + +async function runCompletedTurn(client, { threadId, label, cwd }) { + const response = await client.request("turn/start", { + threadId, + input: textInput(`Do not edit files or run commands. Reply with exactly: ${label} lifecycle complete.`), + model: "gpt-5.4-mini", + cwd + }); + const turnId = response?.turn?.id; + if (!turnId) throw new Error(`${label} turn/start completed without a turn id`); + const completed = await client.waitForNotification( + (message) => message.method === "turn/completed" && message.params?.turn?.id === turnId, + `${label} turn/completed` + ); + return { + ok: Boolean(completed), + turnId, + status: completed?.params?.turn?.status || response?.turn?.status || null + }; +} + +async function runInterruptedTurn(client, { threadId, cwd }) { + const response = await client.request("turn/start", { + threadId, + input: textInput("Do not edit files or run commands. 
Wait briefly, then say interrupted lifecycle complete."), + model: "gpt-5.4-mini", + cwd + }); + const turnId = response?.turn?.id; + if (!turnId) throw new Error("interrupt turn/start completed without a turn id"); + try { + await client.waitForNotification( + (message) => message.method === "turn/started" && message.params?.turn?.id === turnId, + "interrupt turn/started" + ); + const interrupt = await client.request("turn/interrupt", { threadId, turnId }); + const completion = await client.waitForNotification( + (message) => message.method === "turn/completed" && message.params?.turn?.id === turnId, + "interrupted turn/completed" + ); + return { + ok: true, + turnId, + interrupt, + status: completion?.params?.turn?.status || null + }; + } catch (error) { + return { + ok: false, + turnId, + error: error.message + }; + } +} + +export async function runCodexAppServerLifecycleProbe({ + codexBin = "codex", + cwd = process.cwd(), + timeoutMs = DEFAULT_TIMEOUT_MS, + includeInterrupt = true +} = {}) { + const client = new JsonLineClient({ codexBin, timeoutMs }); + const checks = {}; + try { + checks.initialize = { + ok: Boolean( + await client.request("initialize", { + clientInfo: { + name: "switchboard-codex-app-server-lifecycle-probe", + title: "Switchboard Codex app-server lifecycle probe", + version: "0.0.0" + }, + capabilities: { experimentalApi: true } + }) + ) + }; + client.notify("initialized"); + + checks.protocolError = await expectProtocolError(client); + + const threadStart = await client.request("thread/start", { + model: "gpt-5.4-mini", + cwd, + approvalPolicy: "never", + sandbox: "read-only" + }); + const threadId = threadStart?.thread?.id || null; + checks.threadStart = { + ok: Boolean(threadId), + threadId, + sessionId: threadStart?.thread?.sessionId || null + }; + if (!threadId) throw new Error("thread/start completed without a thread id"); + + checks.firstTurn = await runCompletedTurn(client, { threadId, label: "first", cwd }); + checks.secondTurn = 
includeInterrupt + ? await runInterruptedTurn(client, { threadId, cwd }) + : await runCompletedTurn(client, { threadId, label: "second", cwd }); + + checks.stderrWarnings = { + ok: true, + observed: Boolean(client.stderr.trim()), + stderrTail: tailText(client.stderr) + }; + checks.malformedJson = { + ok: true, + ignoredLineCount: client.malformedLines.length + }; + } catch (error) { + checks.runtimeError = { + ok: false, + message: error.message + }; + } finally { + checks.shutdown = await client.closeAndWait().catch((error) => ({ ok: false, error: error.message })); + } + + const required = [ + checks.initialize, + checks.protocolError, + checks.threadStart, + checks.firstTurn, + checks.secondTurn, + checks.stderrWarnings, + checks.malformedJson, + checks.shutdown + ].filter(Boolean); + const verified = required.every((check) => check.ok) && !checks.runtimeError; + return { + status: verified ? "verified" : "blocked", + surface: "codex-app-server-lifecycle", + mode: "live_app_server_lifecycle", + checks, + notificationCounts: client.notifications.reduce((counts, message) => { + counts[message.method] = (counts[message.method] || 0) + 1; + return counts; + }, {}), + evidence: { + command: `${codexBin} app-server --listen stdio://`, + interpretation: + "Switchboard can own the app-server process lifecycle when startup, turn execution, protocol errors, stderr, malformed output, interruption or shutdown, and process exit produce bounded outcomes." + }, + limitations: [ + "The generated app-server protocol is experimental.", + "Live malformed-output and crash behavior are covered by deterministic fake app-server tests; the real app-server should not normally emit malformed JSON or crash on demand." 
+ ], + stderrTail: tailText(client.stderr), + stdoutTail: tailText(client.stdout) + }; +} + +async function main() { + const args = process.argv.slice(2); + const codexBin = getArg(args, "--codex-bin") || "codex"; + const cwd = getArg(args, "--cwd") || process.cwd(); + const timeoutMs = Number(getArg(args, "--timeout-ms") || DEFAULT_TIMEOUT_MS); + const includeInterrupt = !args.includes("--no-interrupt"); + const result = await runCodexAppServerLifecycleProbe({ codexBin, cwd, timeoutMs, includeInterrupt }); + process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); + process.exitCode = result.status === "verified" ? 0 : 1; +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((error) => { + process.stderr.write(`codex-app-server-lifecycle-probe failed: ${error.message}\n`); + process.exitCode = 1; + }); +} diff --git a/test/codex-app-server-lifecycle-probe.test.js b/test/codex-app-server-lifecycle-probe.test.js new file mode 100644 index 0000000..70fafd4 --- /dev/null +++ b/test/codex-app-server-lifecycle-probe.test.js @@ -0,0 +1,122 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { runCodexAppServerLifecycleProbe } from "../scripts/codex-app-server-lifecycle-probe.js"; + +function createFakeCodexBin({ crashOnTurn = false } = {}) { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-lifecycle-test-")); + const binPath = path.join(dir, "codex"); + fs.writeFileSync( + binPath, + `#!/usr/bin/env node +const readline = require("node:readline"); + +let nextTurn = 1; +const thread = { id: "thread-life", sessionId: "session-life", turns: [] }; + +function write(message) { + process.stdout.write(JSON.stringify(message) + "\\n"); +} + +function respond(id, result) { + write({ jsonrpc: "2.0", id, result }); +} + +if (process.argv.slice(2).join(" ") !== "app-server --listen stdio://") { + process.exit(2); +} + 
+process.stderr.write("fake stderr warning\\n"); +process.stdout.write("not-json-from-server\\n"); + +const rl = readline.createInterface({ input: process.stdin }); +rl.on("line", (line) => { + if (!line.trim()) return; + const message = JSON.parse(line); + if (message.method === "initialize") { + respond(message.id, { userAgent: "fake-codex" }); + return; + } + if (message.method === "initialized") return; + if (message.method === "switchboard/unsupported-lifecycle-probe") { + write({ jsonrpc: "2.0", id: message.id, error: { code: -32601, message: "method not found" } }); + return; + } + if (message.method === "thread/start") { + respond(message.id, { thread, model: message.params.model, modelProvider: "openai" }); + write({ method: "thread/started", params: { thread } }); + return; + } + if (message.method === "turn/start") { + if (${JSON.stringify(crashOnTurn)}) { + process.stderr.write("simulated app-server crash\\n"); + process.exit(7); + } + const turn = { id: "turn-" + nextTurn++, status: "inProgress" }; + thread.turns.push(turn); + respond(message.id, { turn }); + write({ method: "turn/started", params: { threadId: message.params.threadId, turn } }); + if (turn.id === "turn-1") { + const completed = { ...turn, status: "completed" }; + write({ method: "turn/completed", params: { threadId: message.params.threadId, turn: completed } }); + } + return; + } + if (message.method === "turn/interrupt") { + respond(message.id, {}); + const interrupted = { id: message.params.turnId, status: "interrupted" }; + write({ method: "turn/completed", params: { threadId: message.params.threadId, turn: interrupted } }); + return; + } + respond(message.id, {}); +}); +`, + "utf8" + ); + fs.chmodSync(binPath, 0o755); + return binPath; +} + +test("lifecycle probe verifies process ownership and recovery signals", async () => { + const result = await runCodexAppServerLifecycleProbe({ + codexBin: createFakeCodexBin(), + timeoutMs: 5000 + }); + + assert.equal(result.status, 
"verified"); + assert.equal(result.checks.initialize.ok, true); + assert.equal(result.checks.protocolError.ok, true); + assert.equal(result.checks.threadStart.threadId, "thread-life"); + assert.equal(result.checks.firstTurn.status, "completed"); + assert.equal(result.checks.secondTurn.status, "interrupted"); + assert.equal(result.checks.stderrWarnings.observed, true); + assert.equal(result.checks.malformedJson.ignoredLineCount, 1); + assert.equal(result.checks.shutdown.ok, true); +}); + +test("lifecycle probe reports app-server crash without hanging", async () => { + const result = await runCodexAppServerLifecycleProbe({ + codexBin: createFakeCodexBin({ crashOnTurn: true }), + timeoutMs: 1000 + }); + + assert.equal(result.status, "blocked"); + assert.equal(result.checks.runtimeError.ok, false); + assert.match(result.checks.runtimeError.message, /exited with code 7/); + assert.match(result.stderrTail, /simulated app-server crash/); + assert.equal(result.checks.shutdown.ok, false); + assert.equal(result.checks.shutdown.exit.code, 7); +}); + +test("lifecycle probe reports child process spawn failure", async () => { + const result = await runCodexAppServerLifecycleProbe({ + codexBin: path.join(os.tmpdir(), "missing-codex-for-lifecycle"), + timeoutMs: 1000 + }); + + assert.equal(result.status, "blocked"); + assert.match(result.checks.runtimeError.message, /ENOENT|spawn/); + assert.equal(result.checks.shutdown.ok, true); +}); diff --git a/test/codex-app-server-preflight.test.js b/test/codex-app-server-preflight.test.js index e48f145..2350409 100644 --- a/test/codex-app-server-preflight.test.js +++ b/test/codex-app-server-preflight.test.js @@ -84,7 +84,7 @@ test("parses and compares Codex CLI versions", () => { test("preflight verifies a normal Codex install with app-server auth", async () => { const result = await runCodexAppServerPreflight({ codexBin: makeFakeCodex(), - timeoutMs: 1000 + timeoutMs: 5000 }); assert.equal(result.status, "verified"); @@ -99,7 +99,7 @@ 
test("preflight verifies a normal Codex install with app-server auth", async () test("preflight fails clearly when Codex CLI is too old", async () => { const result = await runCodexAppServerPreflight({ codexBin: makeFakeCodex({ version: "0.129.0" }), - timeoutMs: 1000 + timeoutMs: 5000 }); assert.equal(result.status, "failed"); @@ -110,7 +110,7 @@ test("preflight fails clearly when Codex CLI is too old", async () => { test("preflight fails clearly when app-server auth is missing", async () => { const result = await runCodexAppServerPreflight({ codexBin: makeFakeCodex({ auth: "missing" }), - timeoutMs: 1000 + timeoutMs: 5000 }); assert.equal(result.status, "failed"); @@ -122,7 +122,7 @@ test("preflight fails clearly when app-server auth is missing", async () => { test("preflight fails clearly when app-server is unavailable", async () => { const result = await runCodexAppServerPreflight({ codexBin: makeFakeCodex({ appServer: false }), - timeoutMs: 1000 + timeoutMs: 5000 }); assert.equal(result.status, "failed"); From 6f918411404c9d31b6913d157bc566f0a42953ca Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 08:39:20 +0200 Subject: [PATCH 12/23] docs(readme): clarify experimental usage paths Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- README.md | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 101 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1b79ce8..4b3d6e9 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,12 @@ [![npm version](https://img.shields.io/npm/v/model-switchboard.svg)](https://www.npmjs.com/package/model-switchboard) [![npm downloads](https://img.shields.io/npm/dm/model-switchboard.svg)](https://www.npmjs.com/package/model-switchboard) 
[![CI](https://github.com/hannasdev/model-switchboard/actions/workflows/ci.yml/badge.svg)](https://github.com/hannasdev/model-switchboard/actions/workflows/ci.yml) [![Release](https://github.com/hannasdev/model-switchboard/actions/workflows/release.yml/badge.svg)](https://github.com/hannasdev/model-switchboard/actions/workflows/release.yml) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/hannasdev/model-switchboard/badge)](https://securityscorecards.dev/viewer/?uri=github.com/hannasdev/model-switchboard) [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/12820/badge)](https://www.bestpractices.dev/projects/12820) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) -Model Switchboard is a routing layer for AI-assisted software delivery. +Model Switchboard is an experimental routing layer for AI-assisted software delivery. It keeps coding sessions moving by choosing model and effort settings before each turn, so you do not have to make that call manually every time. +The project is still exploring the product shape for automatic model hot-swapping. It is useful today as a Claude Code routing wrapper and as a Codex feasibility spike, but it is not yet a polished replacement for an existing AI coding workflow. + ## Get, Provide Feedback, and Contribute - Obtain the software: @@ -31,7 +33,7 @@ Model Switchboard reduces that overhead with consistent routing decisions and a ## Current Product Slice -The current MVP is a Claude Code workflow integration powered by a separable router core. +The current MVP is a Claude Code workflow integration powered by a separable router core. The Codex work is an active spike to test whether Switchboard can go beyond advisory routing and actually control per-turn model changes inside one continuous session. High-level flow: @@ -40,11 +42,104 @@ High-level flow: 3. Switchboard launches or resumes Claude with matching model and effort settings for that launch. 4. 
Route context, session state, and hook evidence are recorded for explainability, replay, and governance. +## Usage Paths + +Switchboard currently has three distinct paths. They are intentionally not equivalent. + +### Claude Code Wrapper + +This is the most complete path today. + +Use: + +```bash +switchboard "your prompt" +switchboard --interactive +switchboard explain +``` + +What it allows: + +- Routes each prompt before launching or resuming Claude. +- Applies model and effort choices at Claude launch/resume boundaries. +- Records local routing evidence for explainability and replay. + +Advantages: + +- Most productized workflow in this repository. +- Uses the existing Claude Code user experience. +- Good fit for prompt-by-prompt routing with auditability. + +Does not yet support: + +- Automatic model changes inside an already-running Claude interactive session. +- Eliminating the cognitive overhead of model choice during a long-lived stock Claude TUI session. + +### Advisory Cross-Surface Routing + +This path gives a recommendation without taking over execution. + +Use: + +```bash +switchboard advise --surface openai-codex "your prompt" +``` + +What it allows: + +- Asks Switchboard what it would choose for a target surface. +- Lets you keep using another client manually. + +Advantages: + +- Low-risk way to test routing policy across vendors or clients. +- Does not require Switchboard to own the session process. + +Does not yet support: + +- Automatic execution. +- Automatic in-session model switching. +- Reducing all model-selection overhead, because the user still has to apply the recommendation. + +### Codex App-Server Spike + +This is the experimental hot-swapping path. 
+ +Use: + +```bash +npm run switchboard:spike:codex-app-server:preflight +npm run switchboard:spike:codex-app-server:protocol +npm run switchboard:spike:codex-app-server:lifecycle +npm run switchboard:spike:codex-app-server +``` + +What it allows: + +- Starts `codex app-server --listen stdio://`. +- Creates one Codex app-server thread. +- Sends multiple `turn/start` requests on that same thread. +- Requests different models on different turns without a `codex exec resume` boundary. + +Advantages: + +- This is the only current path that suggests Switchboard could go beyond Claude parity. +- It demonstrates a possible Switchboard-owned session surface with per-turn model override. +- It preserves one app-server thread/session while route-selected model requests change. + +Does not yet support: + +- A polished end-user UI. +- Hot-swapping inside the stock Codex TUI. +- Production stability guarantees, because the app-server surface is still experimental. +- Provider-side backend model attestation; current evidence proves requested model overrides and same-thread completion, not a durable backend model field. + ## What It Is Not -- Not a replacement for your coding client. +- Not a finished replacement for your coding client. - Not a general-purpose agent runtime. -- Not a cross-vendor orchestration product in this MVP phase. +- Not a claim that stock Claude or stock Codex TUI sessions can be hot-swapped today. +- Not a production-grade cross-vendor orchestration product in this MVP phase. ## Security & Code Quality @@ -60,6 +155,8 @@ See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting proces ## Primary Commands +The commands below mix productized MVP commands and spike commands. Commands containing `spike` are feasibility evidence for the Codex direction, not polished product UX. 
+ | Command | What It Does | Use It When | | --- | --- | --- | | `switchboard "your prompt"` | Routes a single prompt, chooses target/effort, then launches or resumes Claude for that turn. | You want normal prompt-driven usage with routing applied automatically. | From 8d24843682c33096df781034e4cf24318fade83d Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 08:41:25 +0200 Subject: [PATCH 13/23] docs(readme): prioritize early adopter usage Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- README.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 4b3d6e9..9587d12 100644 --- a/README.md +++ b/README.md @@ -8,16 +8,6 @@ It keeps coding sessions moving by choosing model and effort settings before eac The project is still exploring the product shape for automatic model hot-swapping. It is useful today as a Claude Code routing wrapper and as a Codex feasibility spike, but it is not yet a polished replacement for an existing AI coding workflow. -## Get, Provide Feedback, and Contribute - -- Obtain the software: - - GitHub repository: https://github.com/hannasdev/model-switchboard - - npm package: https://www.npmjs.com/package/model-switchboard -- Provide feedback (bug reports and enhancements): - - Issues: https://github.com/hannasdev/model-switchboard/issues -- Contribute to the project: - - Contribution guide: [CONTRIBUTING.md](CONTRIBUTING.md) - ## Why It Exists Choosing the right model repeatedly is a real cognitive tax. A single coding session can shift between quick clarifications, planning, implementation, and debugging, each with different cost and quality needs. @@ -141,18 +131,6 @@ Does not yet support: - Not a claim that stock Claude or stock Codex TUI sessions can be hot-swapped today. 
- Not a production-grade cross-vendor orchestration product in this MVP phase. -## Security & Code Quality - -This project prioritizes security for AI-related software: - -- **Vulnerability Scanning**: Automated dependency scanning via `npm audit` in CI on pull requests and pushes to `main`, plus [Snyk](https://snyk.io) scans on pushes to `main` and a daily schedule when `SNYK_TOKEN` is configured -- **Static Analysis**: ESLint with security plugin to detect common vulnerabilities -- **Responsible Disclosure**: Follow the [Security Policy](SECURITY.md) to report vulnerabilities privately -- **Test Coverage**: Comprehensive test suite validates security-relevant code paths -- **Developer Knowledge**: Core team has expertise in secure software design and threat modeling - -See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting process and security practices. - ## Primary Commands The commands below mix productized MVP commands and spike commands. Commands containing `spike` are feasibility evidence for the Codex direction, not polished product UX. @@ -191,3 +169,25 @@ If you want per-turn re-routing and potential target/model changes, run prompts `npm test` For detailed command documentation, environment variables, and output formats, see [CLI Reference](docs/CLI-REFERENCE.md). 
+ +## Get, Provide Feedback, and Contribute + +- Obtain the software: + - GitHub repository: https://github.com/hannasdev/model-switchboard + - npm package: https://www.npmjs.com/package/model-switchboard +- Provide feedback (bug reports and enhancements): + - Issues: https://github.com/hannasdev/model-switchboard/issues +- Contribute to the project: + - Contribution guide: [CONTRIBUTING.md](CONTRIBUTING.md) + +## Security & Code Quality + +This project prioritizes security for AI-related software: + +- **Vulnerability Scanning**: Automated dependency scanning via `npm audit` in CI on pull requests and pushes to `main`, plus [Snyk](https://snyk.io) scans on pushes to `main` and a daily schedule when `SNYK_TOKEN` is configured +- **Static Analysis**: ESLint with security plugin to detect common vulnerabilities +- **Responsible Disclosure**: Follow the [Security Policy](SECURITY.md) to report vulnerabilities privately +- **Test Coverage**: Comprehensive test suite validates security-relevant code paths +- **Developer Knowledge**: Core team has expertise in secure software design and threat modeling + +See [SECURITY.md](SECURITY.md) for details on the vulnerability reporting process and security practices. From 2a7e0d053e6e026a780936bc2b567e906c5a4392 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 08:42:36 +0200 Subject: [PATCH 14/23] docs(readme): frame routing as product goal Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9587d12..329b0f2 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Model Switchboard is an experimental routing layer for AI-assisted software delivery. -It keeps coding sessions moving by choosing model and effort settings before each turn, so you do not have to make that call manually every time. 
+Its goal is to keep coding sessions moving by choosing model and effort settings before each turn, so you do not have to make that call manually every time. The project is still exploring the product shape for automatic model hot-swapping. It is useful today as a Claude Code routing wrapper and as a Codex feasibility spike, but it is not yet a polished replacement for an existing AI coding workflow. @@ -12,7 +12,7 @@ The project is still exploring the product shape for automatic model hot-swappin Choosing the right model repeatedly is a real cognitive tax. A single coding session can shift between quick clarifications, planning, implementation, and debugging, each with different cost and quality needs. -Model Switchboard reduces that overhead with consistent routing decisions and a short explanation of why a route was selected. +Model Switchboard explores reducing that overhead with consistent routing decisions and a short explanation of why a route was selected. ## Core Value From c8b68a845d563468e1738c0d74a8601cfde81794 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 08:45:38 +0200 Subject: [PATCH 15/23] docs(spike): close Codex app-server product gate Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- docs/decision-log.md | 8 +++++--- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 19 ++++++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/docs/decision-log.md b/docs/decision-log.md index b153205..7a5cc46 100644 --- a/docs/decision-log.md +++ b/docs/decision-log.md @@ -368,11 +368,13 @@ Verification signal: - Gate 3 user-install and auth-path review completed on 2026-05-13 with `scripts/codex-app-server-preflight.js`. The preflight checks the Codex CLI version, app-server command availability, `codex login status`, and redacted app-server auth/account evidence before Switchboard attempts a routed session. 
Result: pass for the local `codex-cli 0.130.0` install. Caveat: app-server remains experimental, so the preflight is a compatibility guard rather than a stability guarantee. - Gate 4 process-lifecycle review completed on 2026-05-14 with `scripts/codex-app-server-lifecycle-probe.js`. The live probe initialized app-server, confirmed a protocol error did not kill the process, completed one turn, interrupted a second turn after receiving `turn/started`, captured stderr warnings, and shut down cleanly with exit code `0`. Deterministic fake-process tests cover malformed stdout, app-server crash, and child process spawn failure. Result: pass for the local `codex-cli 0.130.0` install. - Gate 6 model-evidence review completed on 2026-05-13 with an expanded `scripts/codex-app-server-switch-probe.js`. The live probe now separates requested model evidence from observed effective/backend model telemetry. Result: partial. A live run requested `gpt-5.5` then `gpt-5.4-mini` on the same app-server thread, but observed no effective model field in turn payloads, `thread/read`, raw response items, `model/rerouted`, or `model/verification`. Product must either accept requested-override plus same-thread completion as sufficient, or keep backend model attestation as an unresolved requirement. +- Gate 7 product-fit review completed on 2026-05-14 after README reframing. Result: pass for the experimental spike. Product accepts a Switchboard-owned Codex app-server session surface as the differentiated workflow, while explicitly not claiming stock Codex TUI hot-swapping or polished replacement UX. +- Gate 6 risk disposition completed on 2026-05-14. For publication, product accepts requested model override plus same-thread completion as enough evidence for an experimental hot-swapping feasibility claim, while preserving backend model attestation as a known unresolved risk. Decision: - Chosen option: Option B. 
-- Scope of commitment: treat Codex CLI as verified for route authority at `exec`/`resume` boundaries and promising for in-session route authority through the experimental app-server protocol. -- What remains intentionally deferred: claims that the interactive Codex TUI itself can be hot-swapped; broad UX/product reframing decisions until the app-server protocol is judged supportable enough for a product surface. +- Scope of commitment: treat Codex CLI as verified for route authority at `exec`/`resume` boundaries and publish Codex app-server as an experimental in-session route-authority surface. +- What remains intentionally deferred: claims that the interactive Codex TUI itself can be hot-swapped, a polished Switchboard-owned Codex UX, production stability guarantees for the experimental app-server protocol, and backend model attestation. Consequences: - Near-term implementation impact: additive Codex CLI feasibility tooling; no change to Claude MVP promise. @@ -380,5 +382,5 @@ Consequences: - Migration impact: low; probe is additive and does not alter core Claude workflow contracts. Codex should not displace the Claude MVP path based only on command-boundary evidence, but the app-server protocol may justify a focused product-surface spike. Follow-up: -- Next review milestone: continue the app-server supportability gates in docs/product/CODEX-CLI-SPIKE-SCOPE.md. Gate 7, product fit, is next. +- Next review milestone: open a draft PR for the Codex CLI/app-server feasibility spike and review publish readiness. 
- Linked artifacts (logs, fixtures, docs, PRs): docs/product/CODEX-CLI-SPIKE-SCOPE.md, scripts/codex-cli-feasibility-probe.js, scripts/codex-app-server-switch-probe.js, scripts/codex-app-server-protocol-check.js, scripts/codex-app-server-preflight.js, scripts/codex-app-server-lifecycle-probe.js, test/codex-cli-feasibility-probe.test.js, test/codex-app-server-switch-probe.test.js, test/codex-app-server-protocol-check.test.js, test/codex-app-server-preflight.test.js, test/codex-app-server-lifecycle-probe.test.js, README.md diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index f9537b9..e0244cd 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -2,7 +2,7 @@ ## Status -Status: app-server in-session probe implemented; supportability decision pending +Status: app-server in-session probe implemented; supportability gates complete for experimental publication Decision record: `DEC-2026-05-13-codex-cli-feasibility-spike` in [../decision-log.md](../decision-log.md). @@ -259,7 +259,7 @@ Fail condition: ### Gate 7: Product Fit -Status: `[~]` +Status: `[x]` Question: Does the app-server path reduce cognitive overhead enough to justify a Codex product surface beyond Claude parity? @@ -273,11 +273,14 @@ Evidence needed: Current evidence: - The app-server probe avoids `exec resume` and manual model selection. -- Caveat: turning this into a usable product likely means Switchboard owns the session UI or loop. +- The README now presents three distinct usage paths for early adopters: Claude Code wrapper, advisory cross-surface routing, and Codex app-server spike. +- Product framing now explicitly treats Codex app-server as an experimental Switchboard-owned session surface, not as hot-swapping inside the stock Codex TUI. +- The publish position accepts that a polished Codex user experience is future work; current evidence is enough to publish the spike as an exploration of the hot-swapping concept. 
+- Caveat: turning this into a polished, usable product likely means Switchboard owns the session UI or loop. Pass condition: -- Product accepts a Switchboard-controlled Codex session surface as the differentiated workflow. +- Met for spike purposes. Product accepts a Switchboard-controlled Codex app-server session surface as the differentiated experimental workflow. Fail condition: @@ -291,6 +294,12 @@ Codex app-server can graduate from promising spike to serious product-surface ca 2. Gate 6 either passes or is explicitly accepted as a known risk. 3. No gate requires private Codex internals or raw rollout-file parsing as the primary mechanism. +Current decision: + +- Gates 1, 2, 3, 4, 5, and 7 pass for the experimental spike. +- Gate 6 is explicitly accepted as a known risk for publication: the current evidence proves requested model override plus same-thread completion, but not provider-side backend model attestation. +- The Codex app-server path is publishable as an experimental hot-swapping feasibility surface, not as a production replacement for Claude Code or the stock Codex TUI. + If Gate 7 fails because the required user experience must be the stock Codex TUI, classify Codex as `partial` despite the app-server evidence. ### Next Check Order @@ -366,7 +375,7 @@ Observed 2026-05-13: ### Phase 3: In-Session Switch Probe -Status: repeatable app-server probe implemented; supportability and product-surface fit pending. +Status: repeatable app-server probe implemented; supportability and product-surface fit accepted for experimental publication. Investigate whether Codex CLI exposes a supported hook, command, control protocol, config reload behavior, or interactive-session API that can change the active model after an interactive session has started. 
From a9ee1d674208923fd18d629b2607b9d7a86c7967 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 08:52:18 +0200 Subject: [PATCH 16/23] fix(spike): address Codex probe review comments Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- package.json | 1 + scripts/codex-app-server-preflight.js | 2 +- test/codex-app-server-preflight.test.js | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 82f8bcf..8d629aa 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,7 @@ }, "files": [ "bin/", + "scripts/", "src/", "README.md" ], diff --git a/scripts/codex-app-server-preflight.js b/scripts/codex-app-server-preflight.js index 7ce4307..3356565 100644 --- a/scripts/codex-app-server-preflight.js +++ b/scripts/codex-app-server-preflight.js @@ -292,7 +292,7 @@ export async function runCodexAppServerPreflight({ checks.loginStatusCommand = { ok: loginStatus.ok, command: `${codexBin} login status`, - summary: loginStatus.ok ? tailText(`${loginStatus.stdout}${loginStatus.stderr}`.trim(), 200) : null, + summary: loginStatus.ok ? 
tailText(redact(`${loginStatus.stdout}${loginStatus.stderr}`.trim()), 200) : null, stderrTail: tailText(loginStatus.stderr) }; diff --git a/test/codex-app-server-preflight.test.js b/test/codex-app-server-preflight.test.js index 2350409..62914b3 100644 --- a/test/codex-app-server-preflight.test.js +++ b/test/codex-app-server-preflight.test.js @@ -27,7 +27,7 @@ if (args.join(" ") === "app-server --help") { } if (args.join(" ") === "login status") { if (auth === "authenticated") { - console.log("Logged in using ChatGPT"); + console.log("Logged in as person@example.com using ChatGPT"); process.exit(0); } console.error("Not logged in"); @@ -91,6 +91,7 @@ test("preflight verifies a normal Codex install with app-server auth", async () assert.equal(result.checks.version.actual, "0.130.0"); assert.equal(result.checks.appServerCommand.ok, true); assert.equal(result.checks.loginStatusCommand.ok, true); + assert.equal(result.checks.loginStatusCommand.summary, "[redacted-email]"); assert.equal(result.checks.appServerAuth.ok, true); assert.deepEqual(result.diagnostics, []); assert.equal(result.checks.appServerAuth.accountStatus.account.email, "[redacted]"); From cb13d7409e1c082c8084358a5a13b73c3846ed3a Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 09:06:16 +0200 Subject: [PATCH 17/23] fix: verify lifecycle protocol errors Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- scripts/codex-app-server-lifecycle-probe.js | 19 +++++++++++++++++-- test/codex-app-server-lifecycle-probe.test.js | 19 +++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/scripts/codex-app-server-lifecycle-probe.js b/scripts/codex-app-server-lifecycle-probe.js index 63afb8c..9570f2e 100644 --- a/scripts/codex-app-server-lifecycle-probe.js +++ b/scripts/codex-app-server-lifecycle-probe.js @@ -30,6 +30,14 @@ function createDeferred() { return { promise, resolve, 
reject }; } +function createJsonRpcError(error) { + const rpcError = new Error(error?.message || JSON.stringify(error)); + rpcError.code = error?.code; + rpcError.data = error?.data; + rpcError.jsonRpcError = error; + return rpcError; +} + function withTimeout(promise, ms, label) { let timer; const timeout = new Promise((_, reject) => { @@ -109,7 +117,7 @@ class JsonLineClient { if (!pending) return; this.pending.delete(message.id); if (message.error) { - pending.reject(new Error(JSON.stringify(message.error))); + pending.reject(createJsonRpcError(message.error)); } else { pending.resolve(message.result); } @@ -170,7 +178,14 @@ async function expectProtocolError(client) { await client.request("switchboard/unsupported-lifecycle-probe", {}); return { ok: false, message: "unsupported method unexpectedly succeeded" }; } catch (error) { - return { ok: true, message: error.message }; + if (error.code === -32601) { + return { ok: true, code: error.code, message: error.message }; + } + return { + ok: false, + code: error.code ?? 
null, + message: `unsupported method did not return JSON-RPC method-not-found error: ${error.message}` + }; } } diff --git a/test/codex-app-server-lifecycle-probe.test.js b/test/codex-app-server-lifecycle-probe.test.js index 70fafd4..7ba8c3c 100644 --- a/test/codex-app-server-lifecycle-probe.test.js +++ b/test/codex-app-server-lifecycle-probe.test.js @@ -5,7 +5,7 @@ import os from "node:os"; import path from "node:path"; import { runCodexAppServerLifecycleProbe } from "../scripts/codex-app-server-lifecycle-probe.js"; -function createFakeCodexBin({ crashOnTurn = false } = {}) { +function createFakeCodexBin({ crashOnTurn = false, ignoreUnsupportedMethod = false } = {}) { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-lifecycle-test-")); const binPath = path.join(dir, "codex"); fs.writeFileSync( @@ -41,6 +41,7 @@ rl.on("line", (line) => { } if (message.method === "initialized") return; if (message.method === "switchboard/unsupported-lifecycle-probe") { + if (${JSON.stringify(ignoreUnsupportedMethod)}) return; write({ jsonrpc: "2.0", id: message.id, error: { code: -32601, message: "method not found" } }); return; } @@ -88,6 +89,7 @@ test("lifecycle probe verifies process ownership and recovery signals", async () assert.equal(result.status, "verified"); assert.equal(result.checks.initialize.ok, true); assert.equal(result.checks.protocolError.ok, true); + assert.equal(result.checks.protocolError.code, -32601); assert.equal(result.checks.threadStart.threadId, "thread-life"); assert.equal(result.checks.firstTurn.status, "completed"); assert.equal(result.checks.secondTurn.status, "interrupted"); @@ -96,10 +98,23 @@ test("lifecycle probe verifies process ownership and recovery signals", async () assert.equal(result.checks.shutdown.ok, true); }); +test("lifecycle probe does not accept protocol-error timeouts as method-not-found", async () => { + const result = await runCodexAppServerLifecycleProbe({ + codexBin: createFakeCodexBin({ 
ignoreUnsupportedMethod: true }), + timeoutMs: 1000 + }); + + assert.equal(result.status, "blocked"); + assert.equal(result.checks.protocolError.ok, false); + assert.equal(result.checks.protocolError.code, null); + assert.match(result.checks.protocolError.message, /did not return JSON-RPC method-not-found error/); + assert.match(result.checks.protocolError.message, /timed out/); +}); + test("lifecycle probe reports app-server crash without hanging", async () => { const result = await runCodexAppServerLifecycleProbe({ codexBin: createFakeCodexBin({ crashOnTurn: true }), - timeoutMs: 1000 + timeoutMs: 5000 }); assert.equal(result.status, "blocked"); From 550bc38c66aa5ee051005cec2a6390f5a0b2062a Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 09:13:37 +0200 Subject: [PATCH 18/23] fix: tighten codex probe evidence Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- scripts/codex-app-server-preflight.js | 2 +- scripts/codex-app-server-switch-probe.js | 3 ++- test/codex-app-server-preflight.test.js | 13 ++++++++++++- test/codex-app-server-switch-probe.test.js | 18 ++++++++++++++++-- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/scripts/codex-app-server-preflight.js b/scripts/codex-app-server-preflight.js index 3356565..4c1765f 100644 --- a/scripts/codex-app-server-preflight.js +++ b/scripts/codex-app-server-preflight.js @@ -293,7 +293,7 @@ export async function runCodexAppServerPreflight({ ok: loginStatus.ok, command: `${codexBin} login status`, summary: loginStatus.ok ? 
tailText(redact(`${loginStatus.stdout}${loginStatus.stderr}`.trim()), 200) : null, - stderrTail: tailText(loginStatus.stderr) + stderrTail: tailText(redact(loginStatus.stderr)) }; if (checks.version.ok && checks.appServerCommand.ok) { diff --git a/scripts/codex-app-server-switch-probe.js b/scripts/codex-app-server-switch-probe.js index f6e84e2..aa6f9d5 100644 --- a/scripts/codex-app-server-switch-probe.js +++ b/scripts/codex-app-server-switch-probe.js @@ -411,7 +411,8 @@ export async function runCodexAppServerSwitchProbe({ thread: { threadId, sessionId, - threadStartModel: threadStart.model || first.codex.model, + requestedThreadStartModel: first.codex.model, + threadStartModel: threadStart.model || null, threadStartModelProvider: threadStart.modelProvider || null }, turns: [firstTurn, secondTurn], diff --git a/test/codex-app-server-preflight.test.js b/test/codex-app-server-preflight.test.js index 62914b3..03b2f74 100644 --- a/test/codex-app-server-preflight.test.js +++ b/test/codex-app-server-preflight.test.js @@ -5,7 +5,7 @@ import os from "node:os"; import path from "node:path"; import { runCodexAppServerPreflight, parseCodexVersion, compareVersions } from "../scripts/codex-app-server-preflight.js"; -function makeFakeCodex({ version = "0.130.0", auth = "authenticated", appServer = true } = {}) { +function makeFakeCodex({ version = "0.130.0", auth = "authenticated", appServer = true, loginStatusStderr = "" } = {}) { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-preflight-test-")); const bin = path.join(dir, "codex"); const source = `#!/usr/bin/env node @@ -26,6 +26,7 @@ if (args.join(" ") === "app-server --help") { process.exit(0); } if (args.join(" ") === "login status") { + if (${JSON.stringify(loginStatusStderr)}) process.stderr.write(${JSON.stringify(loginStatusStderr)}); if (auth === "authenticated") { console.log("Logged in as person@example.com using ChatGPT"); process.exit(0); @@ -97,6 +98,16 @@ test("preflight verifies a normal 
Codex install with app-server auth", async () assert.equal(result.checks.appServerAuth.accountStatus.account.email, "[redacted]"); }); +test("preflight redacts login-status stderr before returning diagnostics", async () => { + const result = await runCodexAppServerPreflight({ + codexBin: makeFakeCodex({ loginStatusStderr: "warning for person@example.com account id user-123456789012345\n" }), + timeoutMs: 5000 + }); + + assert.equal(result.status, "verified"); + assert.equal(result.checks.loginStatusCommand.stderrTail, "[redacted-email]"); +}); + test("preflight fails clearly when Codex CLI is too old", async () => { const result = await runCodexAppServerPreflight({ codexBin: makeFakeCodex({ version: "0.129.0" }), diff --git a/test/codex-app-server-switch-probe.test.js b/test/codex-app-server-switch-probe.test.js index 2505fe9..c044c48 100644 --- a/test/codex-app-server-switch-probe.test.js +++ b/test/codex-app-server-switch-probe.test.js @@ -35,7 +35,7 @@ function createTargets() { ]; } -function createFakeCodexBin() { +function createFakeCodexBin({ omitThreadStartModel = false } = {}) { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-probe-test-")); const binPath = path.join(dir, "codex"); fs.writeFileSync( @@ -78,7 +78,7 @@ rl.on("line", (line) => { if (message.method === "thread/start") { respond(message.id, { thread, - model: message.params.model, + ...(${JSON.stringify(omitThreadStartModel)} ? 
{} : { model: message.params.model }), modelProvider: "openai", serviceTier: null, cwd: message.params.cwd, @@ -150,6 +150,8 @@ test("codex app-server switch probe verifies accepted model override on one thre assert.equal(result.verdict.interactiveTuiHotSwapProven, false); assert.equal(result.thread.threadId, "thread-123"); assert.equal(result.thread.sessionId, "session-abc"); + assert.equal(result.thread.requestedThreadStartModel, "gpt-5.5"); + assert.equal(result.thread.threadStartModel, "gpt-5.5"); assert.equal(result.turns[0].selectedTargetId, "openai-coder"); assert.equal(result.turns[0].requestedModel, "gpt-5.5"); assert.equal(result.turns[1].selectedTargetId, "openai-quick"); @@ -167,3 +169,15 @@ test("codex app-server switch probe verifies accepted model override on one thre assert.equal(result.modelEvidence.rerouted.length, 1); assert.equal(result.modelEvidence.rerouted[0].params.toModel, "gpt-5.4-mini"); }); + +test("codex app-server switch probe does not synthesize omitted thread-start model", async () => { + const result = await runCodexAppServerSwitchProbe({ + codexBin: createFakeCodexBin({ omitThreadStartModel: true }), + targets: createTargets(), + timeoutMs: 5000 + }); + + assert.equal(result.status, "verified"); + assert.equal(result.thread.requestedThreadStartModel, "gpt-5.5"); + assert.equal(result.thread.threadStartModel, null); +}); From 7913e1f246fe82c8990b80929ed0920d9873f5d7 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 09:20:56 +0200 Subject: [PATCH 19/23] fix: require observed codex model evidence Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- scripts/codex-app-server-switch-probe.js | 32 +++++++++++++++++++++- test/codex-app-server-switch-probe.test.js | 23 ++++++++++++++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/scripts/codex-app-server-switch-probe.js b/scripts/codex-app-server-switch-probe.js 
index aa6f9d5..2596caa 100644 --- a/scripts/codex-app-server-switch-probe.js +++ b/scripts/codex-app-server-switch-probe.js @@ -329,6 +329,30 @@ function collectModelEvidence({ notifications, turns, threadRead }) { }; } +function observedModelsForTurn(modelEvidence, turnId) { + if (!turnId) return []; + const entries = [ + ...modelEvidence.turnPayloadModels, + ...modelEvidence.threadReadModels, + ...modelEvidence.rawResponseModels, + ...modelEvidence.rerouted.map((message) => ({ + turnId: message.params?.turnId || null, + model: message.params?.toModel || null + })), + ...modelEvidence.verification.map((message) => ({ + turnId: message.params?.turnId || null, + model: message.params?.model || message.params?.toModel || null + })) + ]; + return [ + ...new Set( + entries + .filter((entry) => entry.turnId === turnId && typeof entry.model === "string") + .map((entry) => entry.model) + ) + ]; +} + export async function runCodexAppServerSwitchProbe({ codexBin = "codex", targets = readJson(OPENAI_TARGETS_PATH).targets, @@ -388,12 +412,17 @@ export async function runCodexAppServerSwitchProbe({ const threadRead = await maybeReadThread(client, threadId); const sameThreadCompleted = firstTurn.completed && secondTurn.completed; - const requestedModelOverrideAccepted = secondTurn.requestedModel === second.codex.model && sameThreadCompleted; const modelEvidence = collectModelEvidence({ notifications: client.notifications, turns: [firstTurn, secondTurn], threadRead }); + const secondTurnObservedModels = observedModelsForTurn(modelEvidence, secondTurn.turnId); + const requestedModelOverrideAccepted = + sameThreadCompleted && + secondTurn.requestedModel === second.codex.model && + secondTurnObservedModels.length > 0 && + secondTurnObservedModels.every((model) => model === second.codex.model); const status = requestedModelOverrideAccepted ? 
"verified" : "partial"; return { @@ -406,6 +435,7 @@ export async function runCodexAppServerSwitchProbe({ targetChanged, modelChanged, backendModelTelemetryObserved: modelEvidence.backendModelTelemetryObserved, + secondTurnObservedModels, interactiveTuiHotSwapProven: false }, thread: { diff --git a/test/codex-app-server-switch-probe.test.js b/test/codex-app-server-switch-probe.test.js index c044c48..e200eb3 100644 --- a/test/codex-app-server-switch-probe.test.js +++ b/test/codex-app-server-switch-probe.test.js @@ -35,7 +35,7 @@ function createTargets() { ]; } -function createFakeCodexBin({ omitThreadStartModel = false } = {}) { +function createFakeCodexBin({ omitThreadStartModel = false, secondTurnObservedModel = null } = {}) { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-probe-test-")); const binPath = path.join(dir, "codex"); fs.writeFileSync( @@ -92,7 +92,10 @@ rl.on("line", (line) => { return; } if (message.method === "turn/start") { - const turn = makeTurn(message.params.model); + const observedModel = nextTurn === 2 && ${JSON.stringify(secondTurnObservedModel)} + ? 
${JSON.stringify(secondTurnObservedModel)} + : message.params.model; + const turn = makeTurn(observedModel); thread.turns.push(turn); respond(message.id, { turn }); write({ method: "turn/started", params: { threadId: message.params.threadId, turn } }); @@ -147,6 +150,7 @@ test("codex app-server switch probe verifies accepted model override on one thre assert.equal(result.verdict.targetChanged, true); assert.equal(result.verdict.modelChanged, true); assert.equal(result.verdict.backendModelTelemetryObserved, true); + assert.deepEqual(result.verdict.secondTurnObservedModels, ["gpt-5.4-mini"]); assert.equal(result.verdict.interactiveTuiHotSwapProven, false); assert.equal(result.thread.threadId, "thread-123"); assert.equal(result.thread.sessionId, "session-abc"); @@ -181,3 +185,18 @@ test("codex app-server switch probe does not synthesize omitted thread-start mod assert.equal(result.thread.requestedThreadStartModel, "gpt-5.5"); assert.equal(result.thread.threadStartModel, null); }); + +test("codex app-server switch probe rejects conflicting observed second-turn model", async () => { + const result = await runCodexAppServerSwitchProbe({ + codexBin: createFakeCodexBin({ secondTurnObservedModel: "gpt-5.5" }), + targets: createTargets(), + timeoutMs: 5000 + }); + + assert.equal(result.status, "partial"); + assert.equal(result.verdict.appServerModelOverrideAccepted, false); + assert.deepEqual(result.verdict.secondTurnObservedModels, ["gpt-5.5", "gpt-5.4-mini"]); + assert.equal(result.turns[1].requestedModel, "gpt-5.4-mini"); + assert.equal(result.turns[1].responseModel, "gpt-5.5"); + assert.equal(result.turns[1].completedModel, "gpt-5.5"); +}); From 61fcf389127639e34714d3fbdf62b637377f6e1d Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 09:36:28 +0200 Subject: [PATCH 20/23] fix: clarify codex probe verification Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- 
scripts/codex-app-server-lifecycle-probe.js | 2 +- scripts/codex-app-server-switch-probe.js | 9 ++++-- test/codex-app-server-lifecycle-probe.test.js | 24 ++++++++++++-- test/codex-app-server-switch-probe.test.js | 32 +++++++++++++++++-- 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/scripts/codex-app-server-lifecycle-probe.js b/scripts/codex-app-server-lifecycle-probe.js index 9570f2e..e4cf1b6 100644 --- a/scripts/codex-app-server-lifecycle-probe.js +++ b/scripts/codex-app-server-lifecycle-probe.js @@ -167,7 +167,7 @@ class JsonLineClient { } const exit = await withTimeout(this.exitDeferred.promise, 5000, "codex app-server shutdown"); return { - ok: exit.signal === "SIGTERM" || exit.code === 0 || exit.code === null, + ok: exit.signal === "SIGTERM" || exit.code === 0 || Boolean(exit.error), exit }; } diff --git a/scripts/codex-app-server-switch-probe.js b/scripts/codex-app-server-switch-probe.js index 2596caa..278112a 100644 --- a/scripts/codex-app-server-switch-probe.js +++ b/scripts/codex-app-server-switch-probe.js @@ -418,11 +418,14 @@ export async function runCodexAppServerSwitchProbe({ threadRead }); const secondTurnObservedModels = observedModelsForTurn(modelEvidence, secondTurn.turnId); + const observedSecondTurnModelAccepted = + secondTurnObservedModels.length > 0 && secondTurnObservedModels.every((model) => model === second.codex.model); + const observedSecondTurnModelConflict = + secondTurnObservedModels.length > 0 && secondTurnObservedModels.some((model) => model !== second.codex.model); const requestedModelOverrideAccepted = sameThreadCompleted && secondTurn.requestedModel === second.codex.model && - secondTurnObservedModels.length > 0 && - secondTurnObservedModels.every((model) => model === second.codex.model); + !observedSecondTurnModelConflict; const status = requestedModelOverrideAccepted ? 
"verified" : "partial"; return { @@ -436,6 +439,8 @@ export async function runCodexAppServerSwitchProbe({ modelChanged, backendModelTelemetryObserved: modelEvidence.backendModelTelemetryObserved, secondTurnObservedModels, + observedSecondTurnModelAccepted, + observedSecondTurnModelConflict, interactiveTuiHotSwapProven: false }, thread: { diff --git a/test/codex-app-server-lifecycle-probe.test.js b/test/codex-app-server-lifecycle-probe.test.js index 7ba8c3c..80f89f2 100644 --- a/test/codex-app-server-lifecycle-probe.test.js +++ b/test/codex-app-server-lifecycle-probe.test.js @@ -5,7 +5,7 @@ import os from "node:os"; import path from "node:path"; import { runCodexAppServerLifecycleProbe } from "../scripts/codex-app-server-lifecycle-probe.js"; -function createFakeCodexBin({ crashOnTurn = false, ignoreUnsupportedMethod = false } = {}) { +function createFakeCodexBin({ crashOnTurn = false, ignoreUnsupportedMethod = false, signalOnTurn = null } = {}) { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-lifecycle-test-")); const binPath = path.join(dir, "codex"); fs.writeFileSync( @@ -51,6 +51,11 @@ rl.on("line", (line) => { return; } if (message.method === "turn/start") { + if (${JSON.stringify(signalOnTurn)}) { + process.stderr.write("simulated app-server signal exit\\n"); + process.kill(process.pid, ${JSON.stringify(signalOnTurn)}); + return; + } if (${JSON.stringify(crashOnTurn)}) { process.stderr.write("simulated app-server crash\\n"); process.exit(7); @@ -101,7 +106,7 @@ test("lifecycle probe verifies process ownership and recovery signals", async () test("lifecycle probe does not accept protocol-error timeouts as method-not-found", async () => { const result = await runCodexAppServerLifecycleProbe({ codexBin: createFakeCodexBin({ ignoreUnsupportedMethod: true }), - timeoutMs: 1000 + timeoutMs: 3000 }); assert.equal(result.status, "blocked"); @@ -125,6 +130,21 @@ test("lifecycle probe reports app-server crash without hanging", async () => { 
assert.equal(result.checks.shutdown.exit.code, 7); }); +test("lifecycle probe reports signal-only app-server exits as failed shutdowns", async () => { + const result = await runCodexAppServerLifecycleProbe({ + codexBin: createFakeCodexBin({ signalOnTurn: "SIGKILL" }), + timeoutMs: 5000 + }); + + assert.equal(result.status, "blocked"); + assert.equal(result.checks.runtimeError.ok, false); + assert.match(result.checks.runtimeError.message, /signal SIGKILL/); + assert.match(result.stderrTail, /simulated app-server signal exit/); + assert.equal(result.checks.shutdown.ok, false); + assert.equal(result.checks.shutdown.exit.code, null); + assert.equal(result.checks.shutdown.exit.signal, "SIGKILL"); +}); + test("lifecycle probe reports child process spawn failure", async () => { const result = await runCodexAppServerLifecycleProbe({ codexBin: path.join(os.tmpdir(), "missing-codex-for-lifecycle"), diff --git a/test/codex-app-server-switch-probe.test.js b/test/codex-app-server-switch-probe.test.js index e200eb3..b0759c6 100644 --- a/test/codex-app-server-switch-probe.test.js +++ b/test/codex-app-server-switch-probe.test.js @@ -35,7 +35,12 @@ function createTargets() { ]; } -function createFakeCodexBin({ omitThreadStartModel = false, secondTurnObservedModel = null } = {}) { +function createFakeCodexBin({ + emitModelTelemetry = true, + omitThreadStartModel = false, + omitTurnModels = false, + secondTurnObservedModel = null +} = {}) { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-probe-test-")); const binPath = path.join(dir, "codex"); fs.writeFileSync( @@ -59,7 +64,9 @@ function respond(id, result) { } function makeTurn(model) { - return { id: "turn-" + nextTurn++, status: "completed", model }; + const turn = { id: "turn-" + nextTurn++, status: "completed" }; + if (!${JSON.stringify(omitTurnModels)}) turn.model = model; + return turn; } if (process.argv.slice(2).join(" ") !== "app-server --listen stdio://") { @@ -99,7 +106,7 @@ rl.on("line", (line) => { 
thread.turns.push(turn); respond(message.id, { turn }); write({ method: "turn/started", params: { threadId: message.params.threadId, turn } }); - if (turn.id === "turn-2") { + if (turn.id === "turn-2" && ${JSON.stringify(emitModelTelemetry)}) { write({ method: "model/rerouted", params: { @@ -151,6 +158,8 @@ test("codex app-server switch probe verifies accepted model override on one thre assert.equal(result.verdict.modelChanged, true); assert.equal(result.verdict.backendModelTelemetryObserved, true); assert.deepEqual(result.verdict.secondTurnObservedModels, ["gpt-5.4-mini"]); + assert.equal(result.verdict.observedSecondTurnModelAccepted, true); + assert.equal(result.verdict.observedSecondTurnModelConflict, false); assert.equal(result.verdict.interactiveTuiHotSwapProven, false); assert.equal(result.thread.threadId, "thread-123"); assert.equal(result.thread.sessionId, "session-abc"); @@ -186,6 +195,21 @@ test("codex app-server switch probe does not synthesize omitted thread-start mod assert.equal(result.thread.threadStartModel, null); }); +test("codex app-server switch probe verifies requested override when backend model telemetry is absent", async () => { + const result = await runCodexAppServerSwitchProbe({ + codexBin: createFakeCodexBin({ emitModelTelemetry: false, omitTurnModels: true }), + targets: createTargets(), + timeoutMs: 5000 + }); + + assert.equal(result.status, "verified"); + assert.equal(result.verdict.appServerModelOverrideAccepted, true); + assert.equal(result.verdict.backendModelTelemetryObserved, false); + assert.deepEqual(result.verdict.secondTurnObservedModels, []); + assert.equal(result.verdict.observedSecondTurnModelAccepted, false); + assert.equal(result.verdict.observedSecondTurnModelConflict, false); +}); + test("codex app-server switch probe rejects conflicting observed second-turn model", async () => { const result = await runCodexAppServerSwitchProbe({ codexBin: createFakeCodexBin({ secondTurnObservedModel: "gpt-5.5" }), @@ -196,6 +220,8 
@@ test("codex app-server switch probe rejects conflicting observed second-turn mod assert.equal(result.status, "partial"); assert.equal(result.verdict.appServerModelOverrideAccepted, false); assert.deepEqual(result.verdict.secondTurnObservedModels, ["gpt-5.5", "gpt-5.4-mini"]); + assert.equal(result.verdict.observedSecondTurnModelAccepted, false); + assert.equal(result.verdict.observedSecondTurnModelConflict, true); assert.equal(result.turns[1].requestedModel, "gpt-5.4-mini"); assert.equal(result.turns[1].responseModel, "gpt-5.5"); assert.equal(result.turns[1].completedModel, "gpt-5.5"); From 4925e2c70698eb5d828b9f7ef5d90a111de1b05b Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 09:46:57 +0200 Subject: [PATCH 21/23] fix: bound codex cli spike commands Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- docs/product/CODEX-CLI-SPIKE-SCOPE.md | 12 ++++---- scripts/codex-cli-feasibility-probe.js | 35 +++++++++++++++--------- test/codex-cli-feasibility-probe.test.js | 21 +++++++++++++- 3 files changed, 48 insertions(+), 20 deletions(-) diff --git a/docs/product/CODEX-CLI-SPIKE-SCOPE.md b/docs/product/CODEX-CLI-SPIKE-SCOPE.md index e0244cd..0545ce4 100644 --- a/docs/product/CODEX-CLI-SPIKE-SCOPE.md +++ b/docs/product/CODEX-CLI-SPIKE-SCOPE.md @@ -49,13 +49,13 @@ Can Codex CLI provide a supported in-session route-authority boundary where Swit ## Success Criteria -The spike is successful only if all required criteria are met: +The spike is successful if all required criteria are met for a supported Codex control surface: -1. A supported Codex CLI mechanism allows Switchboard to change the selected model/profile inside an already-running interactive session. -2. A two-turn live probe can exercise that mechanism with two different Switchboard-selected models without requiring the user to exit or resume a separate command. -3. 
The probe records both selected target IDs and resolved Codex models. -4. The probe records enough session evidence to show that continuity was preserved. -5. The workflow requires no manual model selection by the user after the prompt is provided. +1. Switchboard can request two different route-selected Codex models across a two-turn flow. +2. The two-turn live probe preserves session continuity without requiring manual model selection by the user after the prompt is provided. +3. The probe records selected target IDs, resolved Codex models, and enough session evidence to make the continuity claim inspectable. +4. The result clearly states the control-surface boundary: stock Codex CLI `exec`/`resume`, stock interactive TUI, or Switchboard-owned Codex app-server session. +5. Any unsupported stronger claim, such as hot-swapping inside the stock Codex interactive TUI, is explicitly called out as not proven. 6. The implementation remains a spike/probe and does not replace the Claude MVP path. ## Partial Success diff --git a/scripts/codex-cli-feasibility-probe.js b/scripts/codex-cli-feasibility-probe.js index be81e0a..8d307d8 100644 --- a/scripts/codex-cli-feasibility-probe.js +++ b/scripts/codex-cli-feasibility-probe.js @@ -10,6 +10,7 @@ import { getProfileModelMap, getTargetProfileMap } from "../src/adapters/model-m const TARGET_TO_PROFILE = getTargetProfileMap("openai-codex"); const PROFILE_TO_MODEL = getProfileModelMap("openai-codex"); +const DEFAULT_TIMEOUT_MS = 120000; function readJson(filePath) { // eslint-disable-next-line security/detect-non-literal-fs-filename -- probe reads the known targets path or explicit test fixture path. 
@@ -26,16 +27,19 @@ function hasFlag(args, flag) { return args.includes(flag); } -function runHelp(codexBin, args) { +function runHelp(codexBin, args, { timeoutMs = DEFAULT_TIMEOUT_MS } = {}) { const result = spawnSync(codexBin, args, { encoding: "utf8", - env: { ...process.env, NO_COLOR: "1" } + env: { ...process.env, NO_COLOR: "1" }, + timeout: timeoutMs }); return { command: [codexBin, ...args].join(" "), status: result.status, + signal: result.signal, stdout: result.stdout || "", stderr: result.stderr || "", + error: result.error ? result.error.message : null, ok: result.status === 0 }; } @@ -77,11 +81,12 @@ function routeTurnPlan({ input, session, targets }) { }; } -function runCodexCommand(codexBin, args, { cwd = process.cwd() } = {}) { +function runCodexCommand(codexBin, args, { cwd = process.cwd(), timeoutMs = DEFAULT_TIMEOUT_MS } = {}) { const result = spawnSync(codexBin, args, { cwd, encoding: "utf8", - env: { ...process.env, NO_COLOR: "1" } + env: { ...process.env, NO_COLOR: "1" }, + timeout: timeoutMs }); return { command: [codexBin, ...args].join(" "), @@ -90,6 +95,7 @@ function runCodexCommand(codexBin, args, { cwd = process.cwd() } = {}) { signal: result.signal, stdout: result.stdout || "", stderr: result.stderr || "", + error: result.error ? 
result.error.message : null, ok: result.status === 0 }; } @@ -166,6 +172,7 @@ function summarizeLiveTurn({ label, plan, commandResult, outputPath }) { command: commandResult.command, status: commandResult.status, signal: commandResult.signal, + error: commandResult.error, ok: commandResult.ok, stdoutTail: tailText(commandResult.stdout), stderrTail: tailText(commandResult.stderr), @@ -176,7 +183,7 @@ function summarizeLiveTurn({ label, plan, commandResult, outputPath }) { }; } -function runLiveResumeProbe({ codexBin, first, second, cwd = process.cwd() }) { +function runLiveResumeProbe({ codexBin, first, second, cwd = process.cwd(), timeoutMs = DEFAULT_TIMEOUT_MS }) { if (!first.codex.model || !second.codex.model) { return { status: "blocked", @@ -204,7 +211,7 @@ function runLiveResumeProbe({ codexBin, first, second, cwd = process.cwd() }) { cwd, first.input ], - { cwd } + { cwd, timeoutMs } ); const firstTurn = summarizeLiveTurn({ label: "first", @@ -236,7 +243,7 @@ function runLiveResumeProbe({ codexBin, first, second, cwd = process.cwd() }) { secondOutputPath, second.input ], - { cwd } + { cwd, timeoutMs } ); const secondTurn = summarizeLiveTurn({ label: "second", @@ -274,11 +281,12 @@ export function runCodexCliFeasibilityProbe({ codexBin = "codex", targets = readJson(OPENAI_TARGETS_PATH).targets, live = false, - cwd = process.cwd() + cwd = process.cwd(), + timeoutMs = DEFAULT_TIMEOUT_MS } = {}) { - const rootHelp = runHelp(codexBin, ["--help"]); - const execHelp = runHelp(codexBin, ["exec", "--help"]); - const resumeHelp = runHelp(codexBin, ["exec", "resume", "--help"]); + const rootHelp = runHelp(codexBin, ["--help"], { timeoutMs }); + const execHelp = runHelp(codexBin, ["exec", "--help"], { timeoutMs }); + const resumeHelp = runHelp(codexBin, ["exec", "resume", "--help"], { timeoutMs }); const rootText = `${rootHelp.stdout}\n${rootHelp.stderr}`; const execText = `${execHelp.stdout}\n${execHelp.stderr}`; @@ -336,7 +344,7 @@ export function 
runCodexCliFeasibilityProbe({ : resumeBoundaryRerouteSupported ? "partial" : "advisory_only"; - const liveProbe = live && surfaceStatus !== "blocked" ? runLiveResumeProbe({ codexBin, first, second, cwd }) : null; + const liveProbe = live && surfaceStatus !== "blocked" ? runLiveResumeProbe({ codexBin, first, second, cwd, timeoutMs }) : null; const status = liveProbe ? liveProbe.status : surfaceStatus; return { @@ -381,7 +389,8 @@ async function main() { const args = process.argv.slice(2); const codexBin = getArg(args, "--codex-bin") || "codex"; const cwd = getArg(args, "--cwd") || process.cwd(); - const result = runCodexCliFeasibilityProbe({ codexBin, live: hasFlag(args, "--live"), cwd }); + const timeoutMs = Number(getArg(args, "--timeout-ms") || DEFAULT_TIMEOUT_MS); + const result = runCodexCliFeasibilityProbe({ codexBin, live: hasFlag(args, "--live"), cwd, timeoutMs }); process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); process.exitCode = result.status === "blocked" ? 1 : 0; } diff --git a/test/codex-cli-feasibility-probe.test.js b/test/codex-cli-feasibility-probe.test.js index 90c54b0..a6657ad 100644 --- a/test/codex-cli-feasibility-probe.test.js +++ b/test/codex-cli-feasibility-probe.test.js @@ -35,7 +35,7 @@ function createTargets() { ]; } -function createFakeCodexBin() { +function createFakeCodexBin({ hangLiveExec = false } = {}) { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-cli-probe-test-")); const binPath = path.join(dir, "codex"); fs.writeFileSync( @@ -56,6 +56,10 @@ if (args.join(" ") === "exec resume --help") { process.exit(0); } if (args[0] === "exec" && args[1] === "--model") { + if (${JSON.stringify(hangLiveExec)}) { + setInterval(() => {}, 1000); + return; + } const outputPath = args[args.indexOf("--output-last-message") + 1]; fs.writeFileSync(outputPath, "implemented retry logic"); console.log(JSON.stringify({ type: "session", session_id: "11111111-1111-4111-8111-111111111111" })); @@ -111,3 +115,18 @@ test("codex CLI live 
feasibility probe verifies resumed turns only with shared s assert.equal(result.liveProbe.turns[0].finalMessageBytes > 0, true); assert.equal(result.liveProbe.turns[1].finalMessageBytes > 0, true); }); + +test("codex CLI live feasibility probe times out stalled Codex commands", () => { + const result = runCodexCliFeasibilityProbe({ + codexBin: createFakeCodexBin({ hangLiveExec: true }), + targets: createTargets(), + live: true, + timeoutMs: 1000 + }); + + assert.equal(result.status, "blocked"); + assert.equal(result.liveProbe.status, "blocked"); + assert.equal(result.liveProbe.turns[0].ok, false); + assert.equal(result.liveProbe.turns[0].error.includes("ETIMEDOUT"), true); + assert.equal(result.liveProbe.turns[0].signal, "SIGTERM"); +}); From 4fb88595c0e40dfe8fc7899560b9bc7b6581a030 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 10:05:50 +0200 Subject: [PATCH 22/23] fix: harden codex app-server diagnostics Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- scripts/codex-app-server-preflight.js | 4 +-- scripts/codex-app-server-switch-probe.js | 2 +- test/codex-app-server-preflight.test.js | 37 +++++++++++++++++++++- test/codex-app-server-switch-probe.test.js | 19 +++++++++++ 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/scripts/codex-app-server-preflight.js b/scripts/codex-app-server-preflight.js index 4c1765f..ee8ff34 100644 --- a/scripts/codex-app-server-preflight.js +++ b/scripts/codex-app-server-preflight.js @@ -224,7 +224,7 @@ async function checkAppServerAuth({ codexBin, timeoutMs }) { ok: Boolean(authStatus?.authMethod || accountStatus?.account), authStatus: redact(authStatus), accountStatus: redact(accountStatus), - stderrTail: tailText(client.stderr) + stderrTail: tailText(redact(client.stderr)) }; } finally { client.close(); @@ -312,7 +312,7 @@ export async function runCodexAppServerPreflight({ } catch (error) { 
checks.appServerAuth = { ok: false, - error: error.message + error: redact(error.message) }; diagnostics.push( diagnostic( diff --git a/scripts/codex-app-server-switch-probe.js b/scripts/codex-app-server-switch-probe.js index 278112a..5753178 100644 --- a/scripts/codex-app-server-switch-probe.js +++ b/scripts/codex-app-server-switch-probe.js @@ -491,7 +491,7 @@ async function main() { const timeoutMs = Number(getArg(args, "--timeout-ms") || DEFAULT_TIMEOUT_MS); const result = await runCodexAppServerSwitchProbe({ codexBin, cwd, timeoutMs }); process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); - process.exitCode = result.status === "blocked" ? 1 : 0; + process.exitCode = result.status === "verified" ? 0 : 1; } if (process.argv[1] === fileURLToPath(import.meta.url)) { diff --git a/test/codex-app-server-preflight.test.js b/test/codex-app-server-preflight.test.js index 03b2f74..4ff1045 100644 --- a/test/codex-app-server-preflight.test.js +++ b/test/codex-app-server-preflight.test.js @@ -5,7 +5,14 @@ import os from "node:os"; import path from "node:path"; import { runCodexAppServerPreflight, parseCodexVersion, compareVersions } from "../scripts/codex-app-server-preflight.js"; -function makeFakeCodex({ version = "0.130.0", auth = "authenticated", appServer = true, loginStatusStderr = "" } = {}) { +function makeFakeCodex({ + version = "0.130.0", + auth = "authenticated", + appServer = true, + appServerAuthStderr = "", + exitOnAuth = false, + loginStatusStderr = "" +} = {}) { const dir = fs.mkdtempSync(path.join(os.tmpdir(), "codex-app-server-preflight-test-")); const bin = path.join(dir, "codex"); const source = `#!/usr/bin/env node @@ -36,6 +43,7 @@ if (args.join(" ") === "login status") { } if (args.join(" ") === "app-server --listen stdio://") { if (!appServer) process.exit(2); + if (${JSON.stringify(appServerAuthStderr)}) process.stderr.write(${JSON.stringify(appServerAuthStderr)}); let buffer = ""; process.stdin.setEncoding("utf8"); process.stdin.on("data", 
(chunk) => { @@ -55,6 +63,7 @@ if (args.join(" ") === "app-server --listen stdio://") { if (!message.id) return; if (message.method === "initialize") write(message.id, {}); if (message.method === "getAuthStatus") { + if (${JSON.stringify(exitOnAuth)}) process.exit(7); write(message.id, auth === "authenticated" ? { authMethod: "chatgpt", authToken: null, requiresOpenaiAuth: false } : { authMethod: null, authToken: null, requiresOpenaiAuth: true }); @@ -98,6 +107,32 @@ test("preflight verifies a normal Codex install with app-server auth", async () assert.equal(result.checks.appServerAuth.accountStatus.account.email, "[redacted]"); }); +test("preflight redacts app-server auth stderr before returning diagnostics", async () => { + const result = await runCodexAppServerPreflight({ + codexBin: makeFakeCodex({ appServerAuthStderr: "auth warning for person@example.com account id user-123456789012345\n" }), + timeoutMs: 5000 + }); + + assert.equal(result.status, "verified"); + assert.equal(result.checks.appServerAuth.stderrTail, "[redacted-email]"); +}); + +test("preflight redacts app-server auth errors before returning diagnostics", async () => { + const result = await runCodexAppServerPreflight({ + codexBin: makeFakeCodex({ + appServerAuthStderr: "fatal auth warning for person@example.com account id user-123456789012345\n", + exitOnAuth: true + }), + timeoutMs: 5000 + }); + + assert.equal(result.status, "failed"); + assert.equal(result.checks.appServerAuth.ok, false); + assert.equal(result.checks.appServerAuth.error.includes("person@example.com"), false); + assert.equal(result.checks.appServerAuth.error.includes("user-123456789012345"), false); + assert.equal(result.checks.appServerAuth.error, "[redacted-email]"); +}); + test("preflight redacts login-status stderr before returning diagnostics", async () => { const result = await runCodexAppServerPreflight({ codexBin: makeFakeCodex({ loginStatusStderr: "warning for person@example.com account id user-123456789012345\n" }), 
diff --git a/test/codex-app-server-switch-probe.test.js b/test/codex-app-server-switch-probe.test.js index b0759c6..b3fe381 100644 --- a/test/codex-app-server-switch-probe.test.js +++ b/test/codex-app-server-switch-probe.test.js @@ -1,10 +1,14 @@ import test from "node:test"; import assert from "node:assert/strict"; +import { spawnSync } from "node:child_process"; import fs from "node:fs"; import os from "node:os"; import path from "node:path"; +import { fileURLToPath } from "node:url"; import { runCodexAppServerSwitchProbe } from "../scripts/codex-app-server-switch-probe.js"; +const SWITCH_PROBE_SCRIPT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../scripts/codex-app-server-switch-probe.js"); + function createTargets() { return [ { @@ -226,3 +230,18 @@ test("codex app-server switch probe rejects conflicting observed second-turn mod assert.equal(result.turns[1].responseModel, "gpt-5.5"); assert.equal(result.turns[1].completedModel, "gpt-5.5"); }); + +test("codex app-server switch probe CLI exits non-zero for partial results", () => { + const result = spawnSync( + process.execPath, + [SWITCH_PROBE_SCRIPT, "--codex-bin", createFakeCodexBin({ secondTurnObservedModel: "gpt-5.5" }), "--timeout-ms", "5000"], + { + encoding: "utf8" + } + ); + + assert.equal(result.status, 1); + const output = JSON.parse(result.stdout); + assert.equal(output.status, "partial"); + assert.equal(output.verdict.appServerModelOverrideAccepted, false); +}); From a96c48b2eee14c8522c0e1db97f248535013e7c4 Mon Sep 17 00:00:00 2001 From: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> Date: Thu, 14 May 2026 10:19:29 +0200 Subject: [PATCH 23/23] fix: fail partial live codex cli probes Signed-off-by: Hanna Rosengren <4538260+hannasoderstromdev@users.noreply.github.com> --- scripts/codex-cli-feasibility-probe.js | 5 +++-- test/codex-cli-feasibility-probe.test.js | 26 ++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git 
a/scripts/codex-cli-feasibility-probe.js b/scripts/codex-cli-feasibility-probe.js index 8d307d8..da36563 100644 --- a/scripts/codex-cli-feasibility-probe.js +++ b/scripts/codex-cli-feasibility-probe.js @@ -387,12 +387,13 @@ export function runCodexCliFeasibilityProbe({ async function main() { const args = process.argv.slice(2); + const live = hasFlag(args, "--live"); const codexBin = getArg(args, "--codex-bin") || "codex"; const cwd = getArg(args, "--cwd") || process.cwd(); const timeoutMs = Number(getArg(args, "--timeout-ms") || DEFAULT_TIMEOUT_MS); - const result = runCodexCliFeasibilityProbe({ codexBin, live: hasFlag(args, "--live"), cwd, timeoutMs }); + const result = runCodexCliFeasibilityProbe({ codexBin, live, cwd, timeoutMs }); process.stdout.write(`${JSON.stringify(result, null, 2)}\n`); - process.exitCode = result.status === "blocked" ? 1 : 0; + process.exitCode = live ? (result.status === "verified" ? 0 : 1) : result.status === "blocked" ? 1 : 0; } if (process.argv[1] === fileURLToPath(import.meta.url)) { diff --git a/test/codex-cli-feasibility-probe.test.js b/test/codex-cli-feasibility-probe.test.js index a6657ad..1ebf07f 100644 --- a/test/codex-cli-feasibility-probe.test.js +++ b/test/codex-cli-feasibility-probe.test.js @@ -1,10 +1,14 @@ import test from "node:test"; import assert from "node:assert/strict"; +import { spawnSync } from "node:child_process"; import fs from "node:fs"; import os from "node:os"; import path from "node:path"; +import { fileURLToPath } from "node:url"; import { runCodexCliFeasibilityProbe } from "../scripts/codex-cli-feasibility-probe.js"; +const CLI_PROBE_SCRIPT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../scripts/codex-cli-feasibility-probe.js"); + function createTargets() { return [ { @@ -35,7 +39,7 @@ function createTargets() { ]; } -function createFakeCodexBin({ hangLiveExec = false } = {}) { +function createFakeCodexBin({ hangLiveExec = false, omitResumeSession = false } = {}) { const dir = 
fs.mkdtempSync(path.join(os.tmpdir(), "codex-cli-probe-test-")); const binPath = path.join(dir, "codex"); fs.writeFileSync( @@ -69,7 +73,9 @@ if (args[0] === "exec" && args[1] === "--model") { if (args[0] === "exec" && args[1] === "resume" && args.includes("--last")) { const outputPath = args[args.indexOf("--output-last-message") + 1]; fs.writeFileSync(outputPath, "summarized outcome"); - console.log(JSON.stringify({ type: "session", session_id: "11111111-1111-4111-8111-111111111111" })); + if (!${JSON.stringify(omitResumeSession)}) { + console.log(JSON.stringify({ type: "session", session_id: "11111111-1111-4111-8111-111111111111" })); + } console.log(JSON.stringify({ type: "turn_complete", model: args[args.indexOf("--model") + 1] })); process.exit(0); } @@ -130,3 +136,19 @@ test("codex CLI live feasibility probe times out stalled Codex commands", () => assert.equal(result.liveProbe.turns[0].error.includes("ETIMEDOUT"), true); assert.equal(result.liveProbe.turns[0].signal, "SIGTERM"); }); + +test("codex CLI live feasibility probe CLI exits non-zero for partial session evidence", () => { + const result = spawnSync( + process.execPath, + [CLI_PROBE_SCRIPT, "--live", "--codex-bin", createFakeCodexBin({ omitResumeSession: true }), "--timeout-ms", "5000"], + { + encoding: "utf8" + } + ); + + assert.equal(result.status, 1); + const output = JSON.parse(result.stdout); + assert.equal(output.status, "partial"); + assert.equal(output.liveProbe.continuityEvidence, "resume_last_success_without_session_id"); + assert.deepEqual(output.liveProbe.sharedSessionIds, []); +});