coder · ammar-agent · Feb 12, 2026 · chatgpt-codex-connector · Feb 12, 2026
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
@@ -172,7 +172,7 @@ jobs:
 
           # Manual override (workflow_dispatch input)
           if [[ -n "${{ github.event.inputs.test_filter }}" ]]; then
-            TEST_INTEGRATION=1 bun x jest --coverage --maxWorkers=100% --silent ${{ github.event.inputs.test_filter }}
+            bun x jest --coverage --maxWorkers=100% --silent ${{ github.event.inputs.test_filter }}
             exit 0
           fi
 
@@ -185,11 +185,11 @@ jobs:
           # Backend changed: run ALL integration tests (includes tests/ui)
           if [[ "$BACKEND" == "true" ]]; then
             echo "Backend changes detected - running all integration tests"
-            TEST_INTEGRATION=1 bun x jest --coverage --maxWorkers=100% --silent tests/
+            bun x jest --coverage --maxWorkers=100% --silent tests/
           else
             # Browser-only changes: run tests/ui
             echo "Browser changes detected - running tests/ui"
-            TEST_INTEGRATION=1 bun x jest --coverage --maxWorkers=100% --silent tests/ui/
+            bun x jest --coverage --maxWorkers=100% --silent tests/ui/
           fi
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -296,7 +296,6 @@ jobs:
           bun x jest --silent tests/ipc/runtime/backgroundBashDirect.test.ts tests/ipc/runtime/executeBash.test.ts tests/ipc/projects/create.test.ts tests/ipc/agents/planCommands.test.ts
         env:
           BACKEND: ${{ needs.changes.outputs.backend }}
-          TEST_INTEGRATION: "1"
 
   smoke-server:
     name: Smoke / Server

diff --git a/.mux/skills/tests/SKILL.md b/.mux/skills/tests/SKILL.md
@@ -14,12 +14,13 @@ Two types of tests are preferred:
 
-Unit tests are located colocated with the source they test as ".test.ts[x]" files.
+Unit tests are colocated with the source they test as ".test.ts[x]" files.
 
-Integration tests are located in `tests/`, with these primary harnesses:
+Integration tests are located in `tests/`, organized into semantic subfolders:
 
-- `tests/ipc` — tests that rely on the IPC and are focussed on ensuring backend behavior.
-- `tests/ui` — frontend integration tests that use the real IPC and happy-dom Full App rendering.
-- `tests/e2e` - end-to-end tests using Playwright which are needed to verify browser behavior that
-  can't be easily tested with happy-dom.
+- `tests/ipc/<area>/` — backend tests via IPC, grouped by oRPC namespace: `workspace/`, `streaming/`, `runtime/`, `terminal/`, `projects/`, `agents/`, `config/`, `providers/`.
+- `tests/ui/<area>/` — frontend integration tests using happy-dom Full App rendering, grouped by product area: `workspaces/`, `chat/`, `agents/`, `review/`, `tasks/`, `compaction/`, `config/`, `layout/`, `git/`, `runtime/`, `gateway/`.
+- `tests/e2e/` — end-to-end tests using Playwright for behavior that can't be easily tested with happy-dom.
+
+Shared helpers stay at the harness root (e.g., `tests/ipc/setup.ts`, `tests/ui/helpers.ts`).
 
 Additionally, we have stories in `src/browser/stories` that are primarily used for human visual
 verification of UI changes.
@@ -52,16 +53,16 @@ the production code, then verify the test passes.
 
 ## Test Runtime
 
-All tests in `tests/` are run under `bun x jest` with `TEST_INTEGRATION=1` set.
+All tests in `tests/` are run under `bun x jest`. Tests that live in `src/` run under `bun test` (generally these are unit tests).
 
-Otherwise, tests that live in `src/` run under `bun test` (generally these are unit tests).
+Tests that call real AI APIs set `jest.setTimeout(600_000)` at file level and use `validateApiKeys()` in a `beforeAll` to fail fast when keys are missing. No `TEST_INTEGRATION` env var gating is needed — the Makefile controls which paths run; only the Ollama suite remains opt-in via `TEST_OLLAMA=1`.
 
 ## Runtime & Checks
 
 - Never kill the running Mux process; rely on the following for local validation:
   - `make typecheck`
   - `make static-check` (includes typecheck, lint, fmt-check, and docs link validation)
-  - targeted test invocations (e.g. `bun x jest tests/ipc/sendMessage.test.ts -t "pattern"`)
+  - targeted test invocations (e.g. `bun x jest tests/ipc/streaming/sendMessage.basic.test.ts -t "pattern"`)
 - Only wait on CI to pass when local, targeted checks pass.
 - Prefer surgical test invocations over running full suites.
 - Keep utils pure or parameterize external effects for easier testing.
@@ -91,8 +92,8 @@ Otherwise, tests that live in `src/` run under `bun test` (generally these are u
   - Backend API calls are fine for setup/teardown or to avoid expensive operations.
   - Consider moving the test to `tests/ipc` if backend logic needs granular testing.
 - Never bypass the UI in these tests; e.g. do not call `updatePersistedState` to change UI state—go through the UI to trigger the desired behavior.
-- These tests require `TEST_INTEGRATION=1`; use `shouldRunIntegrationTests()` guard.
 - Only call `validateApiKeys()` in tests that actually make AI API calls. Pure UI interaction tests (clicking buttons, selecting items) don't need API keys.
+- Tests that call real AI APIs should add `jest.setTimeout(600_000)` at file level.
 
 ### Happy-dom Limitations
 

diff --git a/Makefile b/Makefile
@@ -335,7 +335,7 @@ check-deadcode: node_modules/.installed ## Check for potential dead code (manual
 ## Testing
 test-integration: node_modules/.installed build-main ## Run all tests (unit + integration)
 	@bun test src
-	@TEST_INTEGRATION=1 bun x jest tests
+	@bun x jest tests
 
 test-unit: node_modules/.installed build-main ## Run unit tests
 	@bun test src

diff --git a/jest.config.js b/jest.config.js
@@ -41,6 +41,6 @@ module.exports = {
   maxWorkers: "50%",
   // Force exit after tests complete to avoid hanging on lingering handles
   forceExit: true,
-  // 10 minute timeout for integration tests, 10s for unit tests
-  testTimeout: process.env.TEST_INTEGRATION === "1" ? 600000 : 10000,
+  // 2 min default covers non-AI integration tests. AI tests set jest.setTimeout(600_000).
+  testTimeout: 120000,
 };
diff --git a/src/node/services/streamManager.test.ts b/src/node/services/streamManager.test.ts
@@ -7,17 +7,12 @@ import { APICallError, RetryError, type ModelMessage } from "ai";
 import type { HistoryService } from "./historyService";
 import { createTestHistoryService } from "./testHistoryService";
 import { createAnthropic } from "@ai-sdk/anthropic";
-import { shouldRunIntegrationTests, validateApiKeys } from "../../../tests/testUtils";
 import { DisposableTempDir } from "@/node/services/tempDir";
 import { createRuntime } from "@/node/runtime/runtimeFactory";
 
-// Skip integration tests if TEST_INTEGRATION is not set
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
-
-// Validate API keys before running tests
-if (shouldRunIntegrationTests()) {
-  validateApiKeys(["ANTHROPIC_API_KEY"]);
-}
+// The "with real API" section calls Anthropic — gate on key availability
+const hasAnthropicKey = Boolean(process.env.ANTHROPIC_API_KEY);
+const describeWithApi = hasAnthropicKey ? describe : describe.skip;
 
 // Real HistoryService backed by a temp directory (created fresh per test)
 let historyService: HistoryService;
@@ -266,8 +261,8 @@ describe("StreamManager - Concurrent Stream Prevention", () => {
     streamManager.on("error", () => undefined);
   });
 
-  // Integration test - requires API key and TEST_INTEGRATION=1
-  describeIntegration("with real API", () => {
+  // Integration test - requires ANTHROPIC_API_KEY
+  describeWithApi("with real API", () => {
     test("should prevent concurrent streams for the same workspace", async () => {
       const workspaceId = "test-workspace-concurrent";
       const anthropic = createAnthropic({ apiKey: process.env.ANTHROPIC_API_KEY });

diff --git a/tests/ipc/agents/planCommands.test.ts b/tests/ipc/agents/planCommands.test.ts
@@ -9,18 +9,15 @@
 
 import * as fs from "fs/promises";
 import * as path from "path";
-import { shouldRunIntegrationTests, createTestEnvironment, cleanupTestEnvironment } from "../setup";
+import { createTestEnvironment, cleanupTestEnvironment } from "../setup";
 import type { TestEnvironment } from "../setup";
 import { createTempGitRepo, cleanupTempGitRepo, generateBranchName } from "../helpers";
 import { detectDefaultTrunkBranch } from "../../../src/node/git";
 import { getPlanFilePath } from "../../../src/common/utils/planStorage";
 import { createMuxMessage } from "../../../src/common/types/message";
 import { expandTilde } from "../../../src/node/runtime/tildeExpansion";
 
-// Skip all tests if TEST_INTEGRATION is not set
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
-
-describeIntegration("Plan Commands Integration", () => {
+describe("Plan Commands Integration", () => {
   let env: TestEnvironment;
   let repoPath: string;
 

diff --git a/tests/ipc/compaction1MRetry.integration.test.ts b/tests/ipc/compaction1MRetry.integration.test.ts
@@ -1,3 +1,5 @@
+jest.setTimeout(600_000);
+
 /**
  * Integration test: Compaction 1M context retry.
  *
@@ -9,19 +11,12 @@
  * the compaction should succeed rather than returning context_exceeded.
  */
 
-import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "./setup";
+import { setupWorkspace } from "./setup";
 import { createStreamCollector, resolveOrpcClient } from "./helpers";
 import { HistoryService } from "../../src/node/services/historyService";
 import { createMuxMessage } from "../../src/common/types/message";
 import { KNOWN_MODELS } from "../../src/common/constants/knownModels";
 
-// Skip all tests if TEST_INTEGRATION is not set
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
-
-if (shouldRunIntegrationTests()) {
-  validateApiKeys(["ANTHROPIC_API_KEY"]);
-}
-
 // ~1 token ≈ 4 chars in English text. To exceed 200k tokens we need ~800k chars.
 // Use ~260k tokens of padding to comfortably exceed the 200k default context.
 const TOKENS_PER_CHAR = 0.25; // conservative estimate
@@ -40,7 +35,7 @@ function buildFillerText(charCount: number): string {
   return base.repeat(repeats).slice(0, charCount);
 }
 
-describeIntegration("compaction 1M context retry", () => {
+describe("compaction 1M context retry", () => {
   // Compaction with 1M retry can take a while — summarizing 250k+ tokens of content
   const TEST_TIMEOUT_MS = 120_000;
 

diff --git a/tests/ipc/config/mcpConfig.test.ts b/tests/ipc/config/mcpConfig.test.ts
@@ -1,12 +1,6 @@
 import * as fs from "fs/promises";
 import * as path from "path";
-import {
-  shouldRunIntegrationTests,
-  cleanupTestEnvironment,
-  createTestEnvironment,
-  setupWorkspace,
-  validateApiKeys,
-} from "../setup";
+import { cleanupTestEnvironment, createTestEnvironment, setupWorkspace } from "../setup";
 import {
   createTempGitRepo,
   cleanupTempGitRepo,
@@ -19,7 +13,7 @@ import {
 } from "../helpers";
 import type { StreamCollector } from "../streamCollector";
 
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
+jest.setTimeout(600_000);
 
 const CHROME_DEVTOOLS_MCP_VERSION = "0.12.1";
 const CHROME_DEVTOOLS_MCP_NPX = `npx -y chrome-devtools-mcp@${CHROME_DEVTOOLS_MCP_VERSION}`;
@@ -31,9 +25,6 @@ const TEST_SCREENSHOT_MCP_SERVER_PATH = path.join(
   "mcp-screenshot-server.js"
 );
 const TEST_SCREENSHOT_MCP_SERVER_COMMAND = `node "${TEST_SCREENSHOT_MCP_SERVER_PATH}"`;
-if (shouldRunIntegrationTests()) {
-  validateApiKeys(["ANTHROPIC_API_KEY"]);
-}
 
 // Shared types for MCP content parsing
 type MediaItem = { type: "media"; data: string; mediaType: string };
@@ -131,7 +122,7 @@ function assertValidScreenshotResult(
   return { mediaItems, textItems };
 }
 
-describeIntegration("MCP global configuration", () => {
+describe("MCP global configuration", () => {
   test.concurrent("add, list, and remove MCP servers", async () => {
     const env = await createTestEnvironment();
     const repoPath = await createTempGitRepo();
@@ -294,7 +285,7 @@ describeIntegration("MCP global configuration", () => {
   });
 });
 
-describeIntegration("MCP server integration with model", () => {
+describe("MCP server integration with model", () => {
   // Test matrix for image format handling
   const imageFormatCases = [
     {

diff --git a/tests/ipc/config/modelNotFound.test.ts b/tests/ipc/config/modelNotFound.test.ts
@@ -1,16 +1,12 @@
-import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "../setup";
+import { setupWorkspace } from "../setup";
 import { sendMessageWithModel, createStreamCollector, modelString } from "../helpers";
 import type { StreamErrorMessage } from "@/common/orpc/types";
 
-// Skip all tests if TEST_INTEGRATION is not set
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
+jest.setTimeout(600_000);
 
-// Validate API keys before running tests
+// Requires ANTHROPIC_API_KEY and OPENAI_API_KEY in the environment.
-if (shouldRunIntegrationTests()) {
-  validateApiKeys(["ANTHROPIC_API_KEY", "OPENAI_API_KEY"]);
-}
 
-describeIntegration("model_not_found error handling", () => {
+describe("model_not_found error handling", () => {
   test.concurrent(
     "should classify Anthropic 404 as model_not_found (not retryable)",
     async () => {

diff --git a/tests/ipc/doubleRegister.test.ts b/tests/ipc/doubleRegister.test.ts
@@ -1,9 +1,7 @@
-import { shouldRunIntegrationTests, createTestEnvironment, cleanupTestEnvironment } from "./setup";
+import { createTestEnvironment, cleanupTestEnvironment } from "./setup";
 import { resolveOrpcClient } from "./helpers";
 
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
-
-describeIntegration("Service double registration", () => {
+describe("Service double registration", () => {
   test.concurrent(
     "should not throw when register() is called multiple times",
     async () => {

diff --git a/tests/ipc/projects/create.test.ts b/tests/ipc/projects/create.test.ts
@@ -12,12 +12,10 @@ import * as fs from "fs/promises";
 import * as path from "path";
 import { getMuxHome, getMuxProjectsDir } from "../../../src/common/constants/paths";
 import * as os from "os";
-import { shouldRunIntegrationTests, createTestEnvironment, cleanupTestEnvironment } from "../setup";
+import { createTestEnvironment, cleanupTestEnvironment } from "../setup";
 import { resolveOrpcClient } from "../helpers";
 
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
-
-describeIntegration("PROJECT_CREATE IPC Handler", () => {
+describe("PROJECT_CREATE IPC Handler", () => {
   test.concurrent("should resolve bare project name to mux projects dir", async () => {
     const env = await createTestEnvironment();
     const bareName = `mux-test-bare-${Date.now()}`;

diff --git a/tests/ipc/projects/nameGeneration.test.ts b/tests/ipc/projects/nameGeneration.test.ts
@@ -7,13 +7,11 @@
  * - Model selection fallback works correctly
  */
 
-import { shouldRunIntegrationTests } from "../../testUtils";
 import { createTestEnvironment, cleanupTestEnvironment, type TestEnvironment } from "../setup";
 
-// Skip if integration tests are disabled (requires real API keys)
+// Requires real API keys: this suite generates names with a real LLM.
-const describeIfIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
 
-describeIfIntegration("Name generation with real LLM", () => {
+describe("Name generation with real LLM", () => {
   let env: TestEnvironment;
 
   beforeAll(async () => {

diff --git a/tests/ipc/projects/refactor.test.ts b/tests/ipc/projects/refactor.test.ts
@@ -1,12 +1,10 @@
 import * as fs from "fs/promises";
 import * as path from "path";
 import * as os from "os";
-import { shouldRunIntegrationTests, createTestEnvironment, cleanupTestEnvironment } from "../setup";
+import { createTestEnvironment, cleanupTestEnvironment } from "../setup";
 import { resolveOrpcClient } from "../helpers";
 
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
-
-describeIntegration("ProjectService IPC Handlers", () => {
+describe("ProjectService IPC Handlers", () => {
   test.concurrent("should list projects including the created one", async () => {
     const env = await createTestEnvironment();
     const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "mux-project-service-test-"));

diff --git a/tests/ipc/providers/anthropicCacheStrategy.test.ts b/tests/ipc/providers/anthropicCacheStrategy.test.ts
@@ -1,18 +1,19 @@
-import { setupWorkspace, shouldRunIntegrationTests } from "../setup";
+import { setupWorkspace } from "../setup";
 import { sendMessageWithModel, createStreamCollector } from "../helpers";
 
-// Skip tests unless TEST_INTEGRATION=1 AND required API keys are present
+// Skip tests unless ANTHROPIC_API_KEY is present
 const hasAnthropicKey = Boolean(process.env.ANTHROPIC_API_KEY);
-const shouldRunSuite = shouldRunIntegrationTests() && hasAnthropicKey;
-const describeIntegration = shouldRunSuite ? describe : describe.skip;
+const describeSuite = hasAnthropicKey ? describe : describe.skip;
 const TEST_TIMEOUT_MS = 45000; // 45s total: setup + 2 messages at 15s each
 
-if (shouldRunIntegrationTests() && !shouldRunSuite) {
+if (!hasAnthropicKey) {
   // eslint-disable-next-line no-console
   console.warn("Skipping Anthropic cache strategy integration tests: missing ANTHROPIC_API_KEY");
 }
 
-describeIntegration("Anthropic cache strategy integration", () => {
+jest.setTimeout(600_000);
+
+describeSuite("Anthropic cache strategy integration", () => {
   test(
     "should apply cache control to messages, system prompt, and tools for Anthropic models",
     async () => {

diff --git a/tests/ipc/providers/ollama.test.ts b/tests/ipc/providers/ollama.test.ts
@@ -1,4 +1,4 @@
-import { setupWorkspace, shouldRunIntegrationTests } from "../setup";
+import { setupWorkspace } from "../setup";
 import {
   sendMessageWithModel,
   createStreamCollector,
@@ -10,8 +10,10 @@ import {
 import { spawn } from "child_process";
 import { loadTokenizerModules } from "../../../src/node/utils/main/tokenizer";
 
-// Skip all tests if TEST_INTEGRATION or TEST_OLLAMA is not set
-const shouldRunOllamaTests = shouldRunIntegrationTests() && process.env.TEST_OLLAMA === "1";
+jest.setTimeout(600_000);
+
+// Skip all tests unless TEST_OLLAMA=1 (requires local Ollama service)
+const shouldRunOllamaTests = process.env.TEST_OLLAMA === "1";
 const describeOllama = shouldRunOllamaTests ? describe : describe.skip;
 
 // Ollama doesn't require API keys - it's a local service

diff --git a/tests/ipc/providers/openaiPreviousResponseIdRecovery.test.ts b/tests/ipc/providers/openaiPreviousResponseIdRecovery.test.ts
@@ -1,3 +1,5 @@
+jest.setTimeout(600_000);
+
 /**
  * OpenAI previousResponseId recovery integration test.
  *
@@ -6,7 +8,7 @@
  */
 
 import { randomBytes } from "crypto";
-import { setupWorkspace, shouldRunIntegrationTests, validateApiKeys } from "../setup";
+import { setupWorkspace } from "../setup";
 import {
   sendMessageWithModel,
   createStreamCollector,
@@ -17,13 +19,7 @@ import { KNOWN_MODELS } from "../../../src/common/constants/knownModels";
 import type { ToolPolicy } from "../../../src/common/utils/tools/toolPolicy";
 import { createMuxMessage } from "../../../src/common/types/message";
 
-// Skip all tests if TEST_INTEGRATION is not set
-const describeIntegration = shouldRunIntegrationTests() ? describe : describe.skip;
-
-// Validate API keys before running tests
+// Requires OPENAI_API_KEY in the environment.
-if (shouldRunIntegrationTests()) {
-  validateApiKeys(["OPENAI_API_KEY"]);
-}
 
 const OPENAI_MODEL = modelString("openai", KNOWN_MODELS.GPT.providerModelId);
 const DISABLE_TOOLS: ToolPolicy = [{ regex_match: ".*", action: "disable" }];
@@ -32,7 +28,7 @@ function createInvalidResponseId(): string {
   return `resp_${randomBytes(12).toString("hex")}`;
 }
 
-describeIntegration("OpenAI previousResponseId recovery", () => {
+describe("OpenAI previousResponseId recovery", () => {
   configureTestRetries(3);
 
   test.concurrent(