diff --git a/test/cli/daemon-kubo-restart-race.test.ts b/test/cli/daemon-kubo-restart-race.test.ts index 2e88160..8aee4b1 100644 --- a/test/cli/daemon-kubo-restart-race.test.ts +++ b/test/cli/daemon-kubo-restart-race.test.ts @@ -247,6 +247,11 @@ describe("daemon shutdown with a late signal-exit registrant (issue #70)", () => const shutdownRes = await fetch(`${SIGEXIT_KUBO_API_URL}/shutdown`, { method: "POST" }); expect(shutdownRes.status).toBe(200); + // This is a setup precondition, not the assertion under test: after /shutdown the + // daemon must auto-restart kubo before we can verify it dies on SIGTERM. The detect + // window must absorb the injected PKC_CLI_TEST_IPFS_READY_DELAY_MS (5s) plus restart + // time on a heavily-loaded CI runner — locally this lands in ~13s, but ubuntu CI has + // been observed taking >30s, so widen to 60s (well within the test's 120s cap, issue #77). const kuboRestarted = await waitForCondition( async () => { try { @@ -256,7 +261,7 @@ describe("daemon shutdown with a late signal-exit registrant (issue #70)", () => return false; } }, - 30000, + 60000, 500 ); expect(kuboRestarted).toBe(true); diff --git a/test/cli/daemon.test.ts b/test/cli/daemon.test.ts index 8d295b3..4d578d7 100644 --- a/test/cli/daemon.test.ts +++ b/test/cli/daemon.test.ts @@ -389,7 +389,7 @@ describe("bitsocial daemon kubo restart cleanup", async () => { // without running exit hooks (asyncExitHook/process.on("exit")), so the daemon has no // opportunity to clean up kubo. On Unix, SIGTERM is caught by the exit hook which runs // killKuboProcess(). The normal user path (Ctrl+C/SIGINT) works on all platforms. - it.skipIf(process.platform === "win32")("stops kubo when daemon exits during a restart cycle", { timeout: 60000 }, async () => { + it.skipIf(process.platform === "win32")("stops kubo when daemon exits during a restart cycle", { timeout: 120000 }, async () => { const previousDelay = process.env["PKC_CLI_TEST_IPFS_READY_DELAY_MS"]; process.env["PKC_CLI_TEST_IPFS_READY_DELAY_MS"] = "5000"; @@ -404,9 +404,33 @@ describe("bitsocial daemon kubo restart cleanup", async () => { ); expect(typeof daemonProcess.pid).toBe("number"); + // Diagnostics for the flaky-on-CI failures (issue #70/#77): the daemon's DEBUG output is + // redirected to its log files (not stderr), so dump those — they show which kubo pids were + // spawned, restarted and killed, and crucially the timing. Without this the CI log only + // contains the bare assertion. Fired on BOTH precondition failures below. + const dumpDiagnostics = async (reason: string) => { + const proc = daemonProcess!; + const tail = (text: string | undefined, lines: number) => (text ?? "").split("\n").slice(-lines).join("\n"); + console.log(`[restart-cleanup diagnostics] ${reason}`); + console.log(`[restart-cleanup diagnostics] daemon exitCode=${proc.exitCode} signalCode=${proc.signalCode}`); + console.log(`[restart-cleanup diagnostics] daemon stdout tail:\n${tail(proc.capturedStdout, 40)}`); + console.log(`[restart-cleanup diagnostics] daemon stderr tail:\n${tail(proc.capturedStderr, 60)}`); + for (const logFile of await fsPromise.readdir(cleanupLogDir).catch(() => [] as string[])) { + const content = await fsPromise.readFile(path.join(cleanupLogDir, logFile), "utf-8").catch(() => ""); + console.log(`[restart-cleanup diagnostics] log file ${logFile} tail:\n${tail(content, 250)}`); + } + }; + const shutdownRes = await fetch(`${cleanupKuboApiUrl}/shutdown`, { method: "POST" }); expect(shutdownRes.status).toBe(200); + // Setup precondition (not the assertion under test): after /shutdown the daemon must + // auto-restart kubo before we can verify it dies on SIGTERM. The detect window must absorb + // the injected PKC_CLI_TEST_IPFS_READY_DELAY_MS (5s) plus restart under CI contention — + // test files run in parallel forks (fileParallelism) and a 2-vCPU ubuntu runner spawns many + // kubo nodes concurrently, so restart that lands in ~13s locally was observed taking >20s on + // CI (the old window). Widened to 45s; the test timeout was raised to 120s to match. If it + // still fails, dumpDiagnostics() prints the daemon's kubo restart timeline (issue #77). const kuboRestarted = await waitForCondition(async () => { try { const res = await fetch(`${cleanupKuboApiUrl}/bitswap/stat`, { method: "POST" }); @@ -414,7 +438,8 @@ describe("bitsocial daemon kubo restart cleanup", async () => { } catch { return false; } - }, 20000, 500); + }, 45000, 500); + if (!kuboRestarted) await dumpDiagnostics(`kubo did not come back up on ${cleanupKuboApiUrl} within the restart window`); expect(kuboRestarted).toBe(true); const killed = daemonProcess.kill(); @@ -433,21 +458,7 @@ describe("bitsocial daemon kubo restart cleanup", async () => { return true; } }, 10000, 500); - if (!kuboStoppedAfterKill) { - // Diagnostics for the flaky-on-CI failure (issue #70): the daemon's DEBUG output is - // redirected to its log files (not stderr), so dump those — they show which kubo - // pids were spawned, restarted and killed. Without this the CI log only contains - // the bare assertion. - const tail = (text: string | undefined, lines: number) => (text ?? "").split("\n").slice(-lines).join("\n"); - console.log(`[restart-cleanup diagnostics] kubo still responding on ${cleanupKuboApiUrl} after daemon exit`); - console.log(`[restart-cleanup diagnostics] daemon exitCode=${daemonProcess.exitCode} signalCode=${daemonProcess.signalCode}`); - console.log(`[restart-cleanup diagnostics] daemon stdout tail:\n${tail(daemonProcess.capturedStdout, 40)}`); - console.log(`[restart-cleanup diagnostics] daemon stderr tail:\n${tail(daemonProcess.capturedStderr, 60)}`); - for (const logFile of await fsPromise.readdir(cleanupLogDir).catch(() => [] as string[])) { - const content = await fsPromise.readFile(path.join(cleanupLogDir, logFile), "utf-8").catch(() => ""); - console.log(`[restart-cleanup diagnostics] log file ${logFile} tail:\n${tail(content, 250)}`); - } - } + if (!kuboStoppedAfterKill) await dumpDiagnostics(`kubo still responding on ${cleanupKuboApiUrl} after daemon exit`); expect(kuboStoppedAfterKill).toBe(true); } finally { if (daemonProcess) await stopPkcDaemon(daemonProcess); diff --git a/test/cli/logs.test.ts b/test/cli/logs.test.ts index a88b277..ae45a45 100644 --- a/test/cli/logs.test.ts +++ b/test/cli/logs.test.ts @@ -55,6 +55,71 @@ const runBitsocialLogs = (args: string[], logDir: string): Promise<{ stdout: str const buildLogLine = (date: Date, message: string, stream?: "stdout" | "stderr") => stream ? `[${date.toISOString()}] [${stream}] ${message}` : `[${date.toISOString()}] ${message}`; +// Drive a `bitsocial logs -f ...` process by observing its output. `onChunk` runs on every stdout +// chunk (use it to trigger file writes once a marker appears). The process is SIGINT-killed as soon +// as `doneWhen(stdout, stderr)` returns true — plus an optional `graceMs` afterwards so any late or +// erroneously-unfiltered output can still surface — or after `maxMs` as a safety cap. Anchoring the +// kill to observed output rather than a fixed wall-clock window from spawn keeps these tests robust +// on slow/loaded CI runners, where CLI cold-start alone can consume several seconds and a fixed +// window would kill the process before the awaited line is ever emitted (issue #77). +const runFollowUntil = ( + args: string[], + logDir: string, + opts: { + onChunk?: (stdout: string) => void; + doneWhen: (stdout: string, stderr: string) => boolean; + graceMs?: number; + maxMs?: number; + } +): Promise<{ stdout: string; stderr: string; timedOut: boolean }> => { + const { onChunk, doneWhen, graceMs = 0, maxMs = 25000 } = opts; + return new Promise((resolve, reject) => { + const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, ...args], { + stdio: ["pipe", "pipe", "pipe"] + }); + + let stdout = ""; + let stderr = ""; + let killed = false; + let finishing = false; + let timedOut = false; + const kill = () => { + if (killed) return; + killed = true; + proc.kill("SIGINT"); + }; + const safety = setTimeout(() => { + timedOut = true; + kill(); + }, maxMs); + const maybeFinish = () => { + if (finishing || killed) return; + if (doneWhen(stdout, stderr)) { + finishing = true; + if (graceMs > 0) setTimeout(kill, graceMs); + else kill(); + } + }; + proc.stdout.on("data", (data: Buffer) => { + stdout += data.toString(); + onChunk?.(stdout); + maybeFinish(); + }); + proc.stderr.on("data", (data: Buffer) => { + stderr += data.toString(); + maybeFinish(); + }); + proc.on("close", () => { + clearTimeout(safety); + resolve({ stdout, stderr, timedOut }); + }); + proc.on("error", (err) => { + clearTimeout(safety); + reject(err); + }); + }); +}; + describe("bitsocial logs (synthetic log file tests)", () => { let logDir: string; let logFile: string; @@ -338,38 +403,17 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { const file1 = path.join(logDir, "bitsocial_cli_daemon_2026-03-01T00-00-00.000Z.log"); await fsPromise.writeFile(file1, buildLogLine(new Date("2026-03-01T00:00:00.000Z"), "INITIAL_MARKER") + "\n"); - const result = await new Promise<{ stdout: string; stderr: string }>((resolve, reject) => { - const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, "-f"], { - stdio: ["pipe", "pipe", "pipe"] - }); - - let stdout = ""; - let stderr = ""; - let createdNewFile = false; - proc.stdout.on("data", (data: Buffer) => { - stdout += data.toString(); - // Wait for initial output before creating the new file + let createdNewFile = false; + const result = await runFollowUntil(["-f"], logDir, { + // Wait for initial output before creating the new file + onChunk: (stdout) => { if (!createdNewFile && stdout.includes("INITIAL_MARKER")) { createdNewFile = true; const file2 = path.join(logDir, "bitsocial_cli_daemon_2026-03-01T01-00-00.000Z.log"); fsPromise.writeFile(file2, buildLogLine(new Date("2026-03-01T01:00:00.000Z"), "NEW_FILE_MARKER") + "\n"); } - }); - proc.stderr.on("data", (data: Buffer) => { stderr += data.toString(); }); - - // Wait long enough for the 3-second new-file check to fire, then kill - const timer = setTimeout(() => { - proc.kill("SIGINT"); - }, 8000); - - proc.on("close", () => { - clearTimeout(timer); - resolve({ stdout, stderr }); - }); - proc.on("error", (err) => { - clearTimeout(timer); - reject(err); - }); + }, + doneWhen: (stdout, stderr) => stdout.includes("NEW_FILE_MARKER") && stderr.includes("switched to new log file") }); expect(result.stdout).toContain("INITIAL_MARKER"); @@ -382,16 +426,9 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { const file1 = path.join(logDir, "bitsocial_cli_daemon_2026-04-01T00-00-00.000Z.log"); await fsPromise.writeFile(file1, buildLogLine(new Date("2026-04-01T00:00:00.000Z"), "initial stdout", "stdout") + "\n"); - const result = await new Promise<{ stdout: string; stderr: string }>((resolve, reject) => { - const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, "--stdout", "-f"], { - stdio: ["pipe", "pipe", "pipe"] - }); - - let stdout = ""; - let stderr = ""; - let createdNewFile = false; - proc.stdout.on("data", (data: Buffer) => { - stdout += data.toString(); + let createdNewFile = false; + const result = await runFollowUntil(["--stdout", "-f"], logDir, { + onChunk: (stdout) => { if (!createdNewFile && stdout.includes("initial stdout")) { createdNewFile = true; const file2 = path.join(logDir, "bitsocial_cli_daemon_2026-04-01T01-00-00.000Z.log"); @@ -401,21 +438,11 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { ].join("\n") + "\n"; fsPromise.writeFile(file2, content); } - }); - proc.stderr.on("data", (data: Buffer) => { stderr += data.toString(); }); - - const timer = setTimeout(() => { - proc.kill("SIGINT"); - }, 8000); - - proc.on("close", () => { - clearTimeout(timer); - resolve({ stdout, stderr }); - }); - proc.on("error", (err) => { - clearTimeout(timer); - reject(err); - }); + }, + doneWhen: (stdout) => stdout.includes("new stdout msg"), + // Both lines are written together; grace period lets an erroneously-unfiltered + // stderr line surface before we kill, so the negative assertion stays meaningful. + graceMs: 1000 }); expect(result.stdout).toContain("initial stdout"); @@ -428,36 +455,37 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { const file1 = path.join(logDir, "bitsocial_cli_daemon_2026-05-01T00-00-00.000Z.log"); await fsPromise.writeFile(file1, buildLogLine(new Date("2026-05-01T00:00:00.000Z"), "initial line") + "\n"); - const result = await new Promise<{ stdout: string }>((resolve, reject) => { - const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, "-f"], { - stdio: ["pipe", "pipe", "pipe"] - }); - - let stdout = ""; - let appended = false; - proc.stdout.on("data", (data: Buffer) => { - stdout += data.toString(); - // Only append after initial line has been observed, so CLI startup time doesn't matter + let appended = false; + let appendError: unknown; + const result = await runFollowUntil(["-f"], logDir, { + // Only append after initial line has been observed, so CLI startup time doesn't matter + onChunk: (stdout) => { if (!appended && stdout.includes("initial line")) { appended = true; - fsPromise.appendFile(file1, buildLogLine(new Date("2026-05-01T00:01:00.000Z"), "APPENDED_LINE") + "\n"); + fsPromise + .appendFile(file1, buildLogLine(new Date("2026-05-01T00:01:00.000Z"), "APPENDED_LINE") + "\n") + .catch((err) => { + appendError = err; + }); } - }); - - const timer = setTimeout(() => { - proc.kill("SIGINT"); - }, 8000); - - proc.on("close", () => { - clearTimeout(timer); - resolve({ stdout }); - }); - proc.on("error", (err) => { - clearTimeout(timer); - reject(err); - }); + }, + doneWhen: (stdout) => stdout.includes("APPENDED_LINE") }); + if (!result.stdout.includes("APPENDED_LINE")) { + // Diagnostics for the flaky-on-CI failure (issue #77): the test waited the full safety cap + // without ever seeing APPENDED_LINE. Distinguish the three possible causes so the next CI + // run is conclusive instead of a guess: + // - append never reached disk -> appendTriggered/appendError + on-disk content show it + // - logs -f never detected it -> on-disk has APPENDED_LINE but captured stdout does not + // - output truncated on exit -> timedOut true and the line is on disk but not captured + const onDisk = await fsPromise.readFile(file1, "utf-8").catch((err) => ``); + console.log(`[follow-append diagnostics] timedOut=${result.timedOut} appendTriggered=${appended} appendError=${String(appendError)}`); + console.log(`[follow-append diagnostics] file1 on disk (${file1}):\n${onDisk}`); + console.log(`[follow-append diagnostics] captured stdout (${result.stdout.length} bytes):\n${JSON.stringify(result.stdout)}`); + console.log(`[follow-append diagnostics] captured stderr:\n${JSON.stringify(result.stderr)}`); + } + expect(result.stdout).toContain("initial line"); expect(result.stdout).toContain("APPENDED_LINE"); });