From 89371c65d46b6285aff676a23849f3b3859d4218 Mon Sep 17 00:00:00 2001 From: Rinse Date: Mon, 8 Jun 2026 11:41:32 +0000 Subject: [PATCH 1/3] test: deflake logs -f follow tests and kubo restart-race detect window The logs -f rotation tests killed the CLI on a fixed 8s timer anchored to process spawn, so the budget for the awaited output had to share that window with CLI cold-start (oclif/pkc-js import). On a slow CI runner startup ate the budget and the process was killed before the second marker was emitted (macos failure: stdout had the initial line but not APPENDED_LINE). Replace the fixed window with a shared runFollowUntil() helper that kills as soon as the awaited output is observed (with an optional grace period and a generous 25s safety cap), decoupling correctness from startup latency. Tests now finish in ~1-5s locally instead of always 8s. Also widen the restart-race setup precondition (restarted-kubo detect) from 30s to 60s: it must absorb the injected 5s ready-delay plus restart under runner load. Ran 43.8s before failing on ubuntu CI; ~13s locally. Still well within the test's 120s cap. Refs #77 --- test/cli/daemon-kubo-restart-race.test.ts | 7 +- test/cli/logs.test.ts | 157 +++++++++++----------- 2 files changed, 87 insertions(+), 77 deletions(-) diff --git a/test/cli/daemon-kubo-restart-race.test.ts b/test/cli/daemon-kubo-restart-race.test.ts index 2e88160..8aee4b1 100644 --- a/test/cli/daemon-kubo-restart-race.test.ts +++ b/test/cli/daemon-kubo-restart-race.test.ts @@ -247,6 +247,11 @@ describe("daemon shutdown with a late signal-exit registrant (issue #70)", () => const shutdownRes = await fetch(`${SIGEXIT_KUBO_API_URL}/shutdown`, { method: "POST" }); expect(shutdownRes.status).toBe(200); + // This is a setup precondition, not the assertion under test: after /shutdown the + // daemon must auto-restart kubo before we can verify it dies on SIGTERM. The detect + // window must absorb the injected PKC_CLI_TEST_IPFS_READY_DELAY_MS (5s) plus restart + // time on a heavily-loaded CI runner — locally this lands in ~13s, but ubuntu CI has + // been observed taking >30s, so widen to 60s (well within the test's 120s cap, issue #77). const kuboRestarted = await waitForCondition( async () => { try { @@ -256,7 +261,7 @@ describe("daemon shutdown with a late signal-exit registrant (issue #70)", () => return false; } }, - 30000, + 60000, 500 ); expect(kuboRestarted).toBe(true); diff --git a/test/cli/logs.test.ts b/test/cli/logs.test.ts index a88b277..13c74a2 100644 --- a/test/cli/logs.test.ts +++ b/test/cli/logs.test.ts @@ -55,6 +55,67 @@ const runBitsocialLogs = (args: string[], logDir: string): Promise<{ stdout: str const buildLogLine = (date: Date, message: string, stream?: "stdout" | "stderr") => stream ? `[${date.toISOString()}] [${stream}] ${message}` : `[${date.toISOString()}] ${message}`; +// Drive a `bitsocial logs -f ...` process by observing its output. `onChunk` runs on every stdout +// chunk (use it to trigger file writes once a marker appears). The process is SIGINT-killed as soon +// as `doneWhen(stdout, stderr)` returns true — plus an optional `graceMs` afterwards so any late or +// erroneously-unfiltered output can still surface — or after `maxMs` as a safety cap. Anchoring the +// kill to observed output rather than a fixed wall-clock window from spawn keeps these tests robust +// on slow/loaded CI runners, where CLI cold-start alone can consume several seconds and a fixed +// window would kill the process before the awaited line is ever emitted (issue #77). +const runFollowUntil = ( + args: string[], + logDir: string, + opts: { + onChunk?: (stdout: string) => void; + doneWhen: (stdout: string, stderr: string) => boolean; + graceMs?: number; + maxMs?: number; + } +): Promise<{ stdout: string; stderr: string }> => { + const { onChunk, doneWhen, graceMs = 0, maxMs = 25000 } = opts; + return new Promise((resolve, reject) => { + const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, ...args], { + stdio: ["pipe", "pipe", "pipe"] + }); + + let stdout = ""; + let stderr = ""; + let killed = false; + let finishing = false; + const kill = () => { + if (killed) return; + killed = true; + proc.kill("SIGINT"); + }; + const safety = setTimeout(kill, maxMs); + const maybeFinish = () => { + if (finishing || killed) return; + if (doneWhen(stdout, stderr)) { + finishing = true; + if (graceMs > 0) setTimeout(kill, graceMs); + else kill(); + } + }; + proc.stdout.on("data", (data: Buffer) => { + stdout += data.toString(); + onChunk?.(stdout); + maybeFinish(); + }); + proc.stderr.on("data", (data: Buffer) => { + stderr += data.toString(); + maybeFinish(); + }); + proc.on("close", () => { + clearTimeout(safety); + resolve({ stdout, stderr }); + }); + proc.on("error", (err) => { + clearTimeout(safety); + reject(err); + }); + }); +}; + describe("bitsocial logs (synthetic log file tests)", () => { let logDir: string; let logFile: string; @@ -338,38 +399,17 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { const file1 = path.join(logDir, "bitsocial_cli_daemon_2026-03-01T00-00-00.000Z.log"); await fsPromise.writeFile(file1, buildLogLine(new Date("2026-03-01T00:00:00.000Z"), "INITIAL_MARKER") + "\n"); - const result = await new Promise<{ stdout: string; stderr: string }>((resolve, reject) => { - const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, "-f"], { - stdio: ["pipe", "pipe", "pipe"] - }); - - let stdout = ""; - let stderr = ""; - let createdNewFile = false; - proc.stdout.on("data", (data: Buffer) => { - stdout += data.toString(); - // Wait for initial output before creating the new file + let createdNewFile = false; + const result = await runFollowUntil(["-f"], logDir, { + // Wait for initial output before creating the new file + onChunk: (stdout) => { if (!createdNewFile && stdout.includes("INITIAL_MARKER")) { createdNewFile = true; const file2 = path.join(logDir, "bitsocial_cli_daemon_2026-03-01T01-00-00.000Z.log"); fsPromise.writeFile(file2, buildLogLine(new Date("2026-03-01T01:00:00.000Z"), "NEW_FILE_MARKER") + "\n"); } - }); - proc.stderr.on("data", (data: Buffer) => { stderr += data.toString(); }); - - // Wait long enough for the 3-second new-file check to fire, then kill - const timer = setTimeout(() => { - proc.kill("SIGINT"); - }, 8000); - - proc.on("close", () => { - clearTimeout(timer); - resolve({ stdout, stderr }); - }); - proc.on("error", (err) => { - clearTimeout(timer); - reject(err); - }); + }, + doneWhen: (stdout, stderr) => stdout.includes("NEW_FILE_MARKER") && stderr.includes("switched to new log file") }); expect(result.stdout).toContain("INITIAL_MARKER"); @@ -382,16 +422,9 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { const file1 = path.join(logDir, "bitsocial_cli_daemon_2026-04-01T00-00-00.000Z.log"); await fsPromise.writeFile(file1, buildLogLine(new Date("2026-04-01T00:00:00.000Z"), "initial stdout", "stdout") + "\n"); - const result = await new Promise<{ stdout: string; stderr: string }>((resolve, reject) => { - const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, "--stdout", "-f"], { - stdio: ["pipe", "pipe", "pipe"] - }); - - let stdout = ""; - let stderr = ""; - let createdNewFile = false; - proc.stdout.on("data", (data: Buffer) => { - stdout += data.toString(); + let createdNewFile = false; + const result = await runFollowUntil(["--stdout", "-f"], logDir, { + onChunk: (stdout) => { if (!createdNewFile && stdout.includes("initial stdout")) { createdNewFile = true; const file2 = path.join(logDir, "bitsocial_cli_daemon_2026-04-01T01-00-00.000Z.log"); @@ -401,21 +434,11 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { ].join("\n") + "\n"; fsPromise.writeFile(file2, content); } - }); - proc.stderr.on("data", (data: Buffer) => { stderr += data.toString(); }); - - const timer = setTimeout(() => { - proc.kill("SIGINT"); - }, 8000); - - proc.on("close", () => { - clearTimeout(timer); - resolve({ stdout, stderr }); - }); - proc.on("error", (err) => { - clearTimeout(timer); - reject(err); - }); + }, + doneWhen: (stdout) => stdout.includes("new stdout msg"), + // Both lines are written together; grace period lets an erroneously-unfiltered + // stderr line surface before we kill, so the negative assertion stays meaningful. + graceMs: 1000 }); expect(result.stdout).toContain("initial stdout"); @@ -428,34 +451,16 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { const file1 = path.join(logDir, "bitsocial_cli_daemon_2026-05-01T00-00-00.000Z.log"); await fsPromise.writeFile(file1, buildLogLine(new Date("2026-05-01T00:00:00.000Z"), "initial line") + "\n"); - const result = await new Promise<{ stdout: string }>((resolve, reject) => { - const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, "-f"], { - stdio: ["pipe", "pipe", "pipe"] - }); - - let stdout = ""; - let appended = false; - proc.stdout.on("data", (data: Buffer) => { - stdout += data.toString(); - // Only append after initial line has been observed, so CLI startup time doesn't matter + let appended = false; + const result = await runFollowUntil(["-f"], logDir, { + // Only append after initial line has been observed, so CLI startup time doesn't matter + onChunk: (stdout) => { if (!appended && stdout.includes("initial line")) { appended = true; fsPromise.appendFile(file1, buildLogLine(new Date("2026-05-01T00:01:00.000Z"), "APPENDED_LINE") + "\n"); } - }); - - const timer = setTimeout(() => { - proc.kill("SIGINT"); - }, 8000); - - proc.on("close", () => { - clearTimeout(timer); - resolve({ stdout }); - }); - proc.on("error", (err) => { - clearTimeout(timer); - reject(err); - }); + }, + doneWhen: (stdout) => stdout.includes("APPENDED_LINE") }); expect(result.stdout).toContain("initial line"); From 4ae1b1ed2fe86357feb9bf8342e7e5a4ad333eda Mon Sep 17 00:00:00 2001 From: Rinse Date: Mon, 8 Jun 2026 11:59:37 +0000 Subject: [PATCH 2/3] test: fix diagnostics gap + widen restart window in kubo cleanup test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'stops kubo when daemon exits during a restart cycle' test failed on ubuntu CI at line 418 (expect(kuboRestarted).toBe(true)) — the same restart-detect precondition class as the sibling test in daemon-kubo-restart-race.test.ts. Two problems found by reading the CI job log: 1. Diagnostics gap: the daemon DEBUG-log dump only fired when the LATER assertion (kuboStoppedAfterKill, line 451) failed, not the restart-detect at line 418 that actually fails. So the CI log contained only the bare assertion with no daemon timeline — couldn't see what kubo did. Extracted the dump into dumpDiagnostics() and call it on BOTH precondition failures. 2. Window too tight under CI contention: vitest runs test files in parallel forks (fileParallelism) and a 2-vCPU ubuntu runner spawns many kubo nodes concurrently (this test ran alongside daemon-kubo-restart-race.test.ts's 4 kubo nodes). Restart lands in ~13s locally but was observed taking >20s on CI. Widened the detect window 20s -> 45s and raised the test timeout 60s -> 120s to match the sibling test. If it still fails, dumpDiagnostics() now prints the daemon's kubo restart/kill timeline so we can distinguish contention from a real bug. Refs #77 --- test/cli/daemon.test.ts | 45 +++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/test/cli/daemon.test.ts b/test/cli/daemon.test.ts index 8d295b3..4d578d7 100644 --- a/test/cli/daemon.test.ts +++ b/test/cli/daemon.test.ts @@ -389,7 +389,7 @@ describe("bitsocial daemon kubo restart cleanup", async () => { // without running exit hooks (asyncExitHook/process.on("exit")), so the daemon has no // opportunity to clean up kubo. On Unix, SIGTERM is caught by the exit hook which runs // killKuboProcess(). The normal user path (Ctrl+C/SIGINT) works on all platforms. - it.skipIf(process.platform === "win32")("stops kubo when daemon exits during a restart cycle", { timeout: 60000 }, async () => { + it.skipIf(process.platform === "win32")("stops kubo when daemon exits during a restart cycle", { timeout: 120000 }, async () => { const previousDelay = process.env["PKC_CLI_TEST_IPFS_READY_DELAY_MS"]; process.env["PKC_CLI_TEST_IPFS_READY_DELAY_MS"] = "5000"; @@ -404,9 +404,33 @@ describe("bitsocial daemon kubo restart cleanup", async () => { ); expect(typeof daemonProcess.pid).toBe("number"); + // Diagnostics for the flaky-on-CI failures (issue #70/#77): the daemon's DEBUG output is + // redirected to its log files (not stderr), so dump those — they show which kubo pids were + // spawned, restarted and killed, and crucially the timing. Without this the CI log only + // contains the bare assertion. Fired on BOTH precondition failures below. + const dumpDiagnostics = async (reason: string) => { + const proc = daemonProcess!; + const tail = (text: string | undefined, lines: number) => (text ?? "").split("\n").slice(-lines).join("\n"); + console.log(`[restart-cleanup diagnostics] ${reason}`); + console.log(`[restart-cleanup diagnostics] daemon exitCode=${proc.exitCode} signalCode=${proc.signalCode}`); + console.log(`[restart-cleanup diagnostics] daemon stdout tail:\n${tail(proc.capturedStdout, 40)}`); + console.log(`[restart-cleanup diagnostics] daemon stderr tail:\n${tail(proc.capturedStderr, 60)}`); + for (const logFile of await fsPromise.readdir(cleanupLogDir).catch(() => [] as string[])) { + const content = await fsPromise.readFile(path.join(cleanupLogDir, logFile), "utf-8").catch(() => ""); + console.log(`[restart-cleanup diagnostics] log file ${logFile} tail:\n${tail(content, 250)}`); + } + }; + const shutdownRes = await fetch(`${cleanupKuboApiUrl}/shutdown`, { method: "POST" }); expect(shutdownRes.status).toBe(200); + // Setup precondition (not the assertion under test): after /shutdown the daemon must + // auto-restart kubo before we can verify it dies on SIGTERM. The detect window must absorb + // the injected PKC_CLI_TEST_IPFS_READY_DELAY_MS (5s) plus restart under CI contention — + // test files run in parallel forks (fileParallelism) and a 2-vCPU ubuntu runner spawns many + // kubo nodes concurrently, so restart that lands in ~13s locally was observed taking >20s on + // CI (the old window). Widened to 45s; the test timeout was raised to 120s to match. If it + // still fails, dumpDiagnostics() prints the daemon's kubo restart timeline (issue #77). const kuboRestarted = await waitForCondition(async () => { try { const res = await fetch(`${cleanupKuboApiUrl}/bitswap/stat`, { method: "POST" }); @@ -414,7 +438,8 @@ describe("bitsocial daemon kubo restart cleanup", async () => { } catch { return false; } - }, 20000, 500); + }, 45000, 500); + if (!kuboRestarted) await dumpDiagnostics(`kubo did not come back up on ${cleanupKuboApiUrl} within the restart window`); expect(kuboRestarted).toBe(true); const killed = daemonProcess.kill(); @@ -433,21 +458,7 @@ describe("bitsocial daemon kubo restart cleanup", async () => { return true; } }, 10000, 500); - if (!kuboStoppedAfterKill) { - // Diagnostics for the flaky-on-CI failure (issue #70): the daemon's DEBUG output is - // redirected to its log files (not stderr), so dump those — they show which kubo - // pids were spawned, restarted and killed. Without this the CI log only contains - // the bare assertion. - const tail = (text: string | undefined, lines: number) => (text ?? "").split("\n").slice(-lines).join("\n"); - console.log(`[restart-cleanup diagnostics] kubo still responding on ${cleanupKuboApiUrl} after daemon exit`); - console.log(`[restart-cleanup diagnostics] daemon exitCode=${daemonProcess.exitCode} signalCode=${daemonProcess.signalCode}`); - console.log(`[restart-cleanup diagnostics] daemon stdout tail:\n${tail(daemonProcess.capturedStdout, 40)}`); - console.log(`[restart-cleanup diagnostics] daemon stderr tail:\n${tail(daemonProcess.capturedStderr, 60)}`); - for (const logFile of await fsPromise.readdir(cleanupLogDir).catch(() => [] as string[])) { - const content = await fsPromise.readFile(path.join(cleanupLogDir, logFile), "utf-8").catch(() => ""); - console.log(`[restart-cleanup diagnostics] log file ${logFile} tail:\n${tail(content, 250)}`); - } - } + if (!kuboStoppedAfterKill) await dumpDiagnostics(`kubo still responding on ${cleanupKuboApiUrl} after daemon exit`); expect(kuboStoppedAfterKill).toBe(true); } finally { if (daemonProcess) await stopPkcDaemon(daemonProcess); From c26e21f709b230c2b9f4dfa6824ffe4ebdaf8be9 Mon Sep 17 00:00:00 2001 From: Rinse Date: Mon, 8 Jun 2026 12:12:57 +0000 Subject: [PATCH 3/3] test: add diagnostics to logs -f append-follow test (issue #77) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'continues watching old file' test failed again on ubuntu CI, this time waiting the full 25s safety cap with APPENDED_LINE never appearing — so it is NOT a timing-window flake. The two sibling follow tests (which detect a NEW file via the 3s full-reread poll) passed fast; only this one, which appends to the file already being followed (surfaced by the incremental byte-offset poll), fails. Cannot reproduce locally even under CPU saturation (32 cores). Add targeted diagnostics that fire when APPENDED_LINE is missing, to pin down which of three causes it is on the next CI run: - append never reached disk (appendTriggered / appendError + on-disk content) - logs -f never detected it (on-disk has the line, captured stdout does not) - output truncated on SIGINT exit (timedOut + line on disk but not captured) Also surface appendFile rejections (previously swallowed) and have the helper report whether it hit the safety cap. Refs #77 --- test/cli/logs.test.ts | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/test/cli/logs.test.ts b/test/cli/logs.test.ts index 13c74a2..ae45a45 100644 --- a/test/cli/logs.test.ts +++ b/test/cli/logs.test.ts @@ -71,7 +71,7 @@ const runFollowUntil = ( graceMs?: number; maxMs?: number; } -): Promise<{ stdout: string; stderr: string }> => { +): Promise<{ stdout: string; stderr: string; timedOut: boolean }> => { const { onChunk, doneWhen, graceMs = 0, maxMs = 25000 } = opts; return new Promise((resolve, reject) => { const proc = spawn("node", ["./bin/run", "logs", "--logPath", logDir, ...args], { @@ -82,12 +82,16 @@ const runFollowUntil = ( let stderr = ""; let killed = false; let finishing = false; + let timedOut = false; const kill = () => { if (killed) return; killed = true; proc.kill("SIGINT"); }; - const safety = setTimeout(kill, maxMs); + const safety = setTimeout(() => { + timedOut = true; + kill(); + }, maxMs); const maybeFinish = () => { if (finishing || killed) return; if (doneWhen(stdout, stderr)) { @@ -107,7 +111,7 @@ const runFollowUntil = ( }); proc.on("close", () => { clearTimeout(safety); - resolve({ stdout, stderr }); + resolve({ stdout, stderr, timedOut }); }); proc.on("error", (err) => { clearTimeout(safety); @@ -452,17 +456,36 @@ describe("bitsocial logs -f log file rotation (synthetic)", () => { await fsPromise.writeFile(file1, buildLogLine(new Date("2026-05-01T00:00:00.000Z"), "initial line") + "\n"); let appended = false; + let appendError: unknown; const result = await runFollowUntil(["-f"], logDir, { // Only append after initial line has been observed, so CLI startup time doesn't matter onChunk: (stdout) => { if (!appended && stdout.includes("initial line")) { appended = true; - fsPromise.appendFile(file1, buildLogLine(new Date("2026-05-01T00:01:00.000Z"), "APPENDED_LINE") + "\n"); + fsPromise + .appendFile(file1, buildLogLine(new Date("2026-05-01T00:01:00.000Z"), "APPENDED_LINE") + "\n") + .catch((err) => { + appendError = err; + }); } }, doneWhen: (stdout) => stdout.includes("APPENDED_LINE") }); + if (!result.stdout.includes("APPENDED_LINE")) { + // Diagnostics for the flaky-on-CI failure (issue #77): the test waited the full safety cap + // without ever seeing APPENDED_LINE. Distinguish the three possible causes so the next CI + // run is conclusive instead of a guess: + // - append never reached disk -> appendTriggered/appendError + on-disk content show it + // - logs -f never detected it -> on-disk has APPENDED_LINE but captured stdout does not + // - output truncated on exit -> timedOut true and the line is on disk but not captured + const onDisk = await fsPromise.readFile(file1, "utf-8").catch((err) => ``); + console.log(`[follow-append diagnostics] timedOut=${result.timedOut} appendTriggered=${appended} appendError=${String(appendError)}`); + console.log(`[follow-append diagnostics] file1 on disk (${file1}):\n${onDisk}`); + console.log(`[follow-append diagnostics] captured stdout (${result.stdout.length} bytes):\n${JSON.stringify(result.stdout)}`); + console.log(`[follow-append diagnostics] captured stderr:\n${JSON.stringify(result.stderr)}`); + } + expect(result.stdout).toContain("initial line"); expect(result.stdout).toContain("APPENDED_LINE"); });