From ab8a8441c5164504fc6b8a77d210cd6839fbc21a Mon Sep 17 00:00:00 2001 From: fengtian Date: Sat, 11 Apr 2026 20:07:23 +0800 Subject: [PATCH 1/2] fix(runner): prevent ghost sessions from orphaned spawn webhooks spawnSession() registers a tracked session entry before the child's "Session started" webhook arrives. When the 15s webhook timeout fires, only pidToAwaiter / pidToErrorAwaiter are cleared; the pidToTrackedSession entry is left in place and the detached child keeps running. If the child eventually starts and reports its webhook, onHappySessionWebhook() still finds the stale tracking entry and promotes the orphan into a normal runner-managed session, surfacing a message-less "ghost session" in the web UI. This is easy to reproduce on opus[1m] --resume: observed real-world case where four rapid "resume" clicks produced four orphan children that all reported their webhooks ~60 minutes later, creating four ghost sessions on the dashboard. Three-part fix: 1. Make the webhook timeout configurable via HAPI_RUNNER_WEBHOOK_TIMEOUT_MS (default unchanged at 15_000). Users on slow models / large resumes can raise the ceiling so the timeout never fires in the first place. 2. On timeout, also delete the pidToTrackedSession entry and SIGTERM the child, so a late webhook cannot promote the orphan. 3. Defence in depth: if onHappySessionWebhook() receives a webhook from a PID that is not tracked but whose payload claims startedBy: 'runner', ignore it and SIGTERM the child. Genuine terminal-launched children correctly report startedBy: 'terminal', so this branch cannot false-positive on them. --- cli/src/runner/run.ts | 68 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts index c4e907a1b..d22dd4614 100644 --- a/cli/src/runner/run.ts +++ b/cli/src/runner/run.ts @@ -127,6 +127,18 @@ export async function startRunner(): Promise { // Setup state - key by PID const pidToTrackedSession = new Map(); + // Webhook timeout tolerance. Opus 1M + --resume can legitimately take + // longer than the default 15s to reach the "Session started" webhook + // (observed real-world durations of 30s – 60min under rate-limit / + // heavy session restore). Allow advanced users to raise this ceiling + // so that slow starts no longer leave orphaned child processes which + // later report back as ghost sessions. + const envWebhookTimeout = Number(process.env.HAPI_RUNNER_WEBHOOK_TIMEOUT_MS); + const webhookTimeoutMs = + Number.isFinite(envWebhookTimeout) && envWebhookTimeout > 0 + ? envWebhookTimeout + : 15_000; + // Session spawning awaiter system const pidToAwaiter = new Map void>(); const pidToErrorAwaiter = new Map void>(); @@ -178,7 +190,32 @@ export async function startRunner(): Promise { logger.debug(`[RUNNER RUN] Resolved session awaiter for PID ${pid}`); } } else if (!existingSession) { - // New session started externally + // No tracked session for this PID. Two possibilities: + // 1. The child was spawned externally from a terminal (legitimate). + // 2. The child was runner-spawned but already had its tracking + // entry removed because its webhook arrived after the timeout + // (orphaned / ghost-session case). + // + // Differentiate via the webhook's own `startedBy` field: genuine + // terminal-launched children report `startedBy: 'terminal'`, so + // anything claiming `'runner'` here must be the second case and + // should be ignored + terminated instead of silently promoted. + if (sessionMetadata.startedBy === 'runner') { + logger.debug( + `[RUNNER RUN] Ignoring late webhook from orphaned runner-spawned PID ${pid} (session ${sessionId}). Terminating child.` + ); + try { + process.kill(pid, 'SIGTERM'); + } catch (err) { + logger.debug( + `[RUNNER RUN] Failed to SIGTERM orphaned runner child PID ${pid}:`, + err + ); + } + return; + } + + // New session started externally (terminal) const trackedSession: TrackedSession = { startedBy: 'hapi directly - likely by user from terminal', happySessionId: sessionId, @@ -508,19 +545,40 @@ export async function startRunner(): Promise { logger.debug(`[RUNNER RUN] Waiting for session webhook for PID ${pid}`); const spawnResult = await new Promise((resolve) => { - // Set timeout for webhook + // Set timeout for webhook. Default is 15s but can be raised via + // HAPI_RUNNER_WEBHOOK_TIMEOUT_MS for users on slow models + // (e.g. opus[1m] --resume). const timeout = setTimeout(() => { pidToAwaiter.delete(pid); pidToErrorAwaiter.delete(pid); + + // Remove the tracked session entry so a late-arriving webhook + // from this orphaned PID cannot be silently promoted into a + // ghost session by onHappySessionWebhook(). + pidToTrackedSession.delete(pid); + + // Best-effort terminate the orphan child. Without this, children + // spawned with `detached: true` keep running past the webhook + // deadline, eventually report back, and are turned into ghost + // sessions in the web UI. A single SIGTERM gives them a chance + // to clean up; even if the signal is ignored we already removed + // their tracking entry above, so they cannot be promoted. + try { + happyProcess?.kill('SIGTERM'); + } catch (err) { + logger.debug( + `[RUNNER RUN] Failed to SIGTERM orphaned PID ${pid} after webhook timeout:`, + err + ); + } + logger.debug(`[RUNNER RUN] Session webhook timeout for PID ${pid}`); logStderrTail(); resolve({ type: 'error', errorMessage: buildWebhookFailureMessage('timeout') }); - // 15 second timeout - I have seen timeouts on 10 seconds - // even though session was still created successfully in ~2 more seconds - }, 15_000); + }, webhookTimeoutMs); // Register awaiter pidToAwaiter.set(pid, (completedSession) => { From 4b83ff09d4f0aba4d8070dad8b525d6878634360 Mon Sep 17 00:00:00 2001 From: fengtian Date: Thu, 16 Apr 2026 07:55:54 +0800 Subject: [PATCH 2/2] fix(runner): use tree-kill on timeout and clean up worktree for orphans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback on the kill path and worktree cleanup: 1. Timeout handler: replace bare `happyProcess.kill('SIGTERM')` with `killProcessByChildProcess(happyProcess)` so the entire process tree (wrapper + detached agent grandchildren) is reaped, matching the existing `stopSession()` behaviour. 2. Orphan webhook handler: replace bare `process.kill(pid, 'SIGTERM')` with `killProcess(pid)` for proper SIGTERM → SIGKILL escalation. A ChildProcess reference is unavailable here (tracking entry already removed), so tree-kill is not possible — but the timeout handler should have already tree-killed the group; this is defence-in-depth. 3. Worktree leak: when a worktree session times out, register a one-shot `exit` listener on the child process to run `cleanupWorktree()` after the child actually exits. Previously `maybeCleanupWorktree('spawn-error')` would skip cleanup because the child was still alive at that point, and `onChildExited()` had no worktree awareness after the tracking entry was deleted — so the worktree leaked permanently. --- cli/src/runner/run.ts | 46 +++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/cli/src/runner/run.ts b/cli/src/runner/run.ts index d22dd4614..4b86dc4e2 100644 --- a/cli/src/runner/run.ts +++ b/cli/src/runner/run.ts @@ -204,14 +204,14 @@ export async function startRunner(): Promise { logger.debug( `[RUNNER RUN] Ignoring late webhook from orphaned runner-spawned PID ${pid} (session ${sessionId}). Terminating child.` ); - try { - process.kill(pid, 'SIGTERM'); - } catch (err) { - logger.debug( - `[RUNNER RUN] Failed to SIGTERM orphaned runner child PID ${pid}:`, - err - ); - } + // Use killProcess (SIGTERM → SIGKILL escalation) rather than a + // bare process.kill() so the orphan is reliably reaped even if + // it ignores SIGTERM. We don't have a ChildProcess reference + // here (tracking entry was already removed by the timeout + // handler), so tree-kill via killProcessByChildProcess is not + // available — but the timeout handler should have already + // tree-killed the process group; this is defence-in-depth. + void killProcess(pid); return; } @@ -557,19 +557,23 @@ export async function startRunner(): Promise { // ghost session by onHappySessionWebhook(). pidToTrackedSession.delete(pid); - // Best-effort terminate the orphan child. Without this, children - // spawned with `detached: true` keep running past the webhook - // deadline, eventually report back, and are turned into ghost - // sessions in the web UI. A single SIGTERM gives them a chance - // to clean up; even if the signal is ignored we already removed - // their tracking entry above, so they cannot be promoted. - try { - happyProcess?.kill('SIGTERM'); - } catch (err) { - logger.debug( - `[RUNNER RUN] Failed to SIGTERM orphaned PID ${pid} after webhook timeout:`, - err - ); + // Terminate the entire process tree (wrapper + agent + // grandchildren). Using killProcessByChildProcess instead of + // a bare SIGTERM ensures that detached grandchild processes + // (the actual claude/codex agent) are also reaped, and that + // SIGTERM → SIGKILL escalation kicks in if needed. + if (happyProcess) { + void killProcessByChildProcess(happyProcess); + } + + // If this was a worktree session, the worktree can only be + // safely removed after the child has actually exited (the + // child may still be writing to it). Register a one-shot + // exit listener so cleanup happens once the tree-kill lands. + if (worktreeInfo && happyProcess) { + happyProcess.once('exit', () => { + void cleanupWorktree(); + }); } logger.debug(`[RUNNER RUN] Session webhook timeout for PID ${pid}`);