From 1141fa1a6e29c9489bd00ed6e55e0c7f72a0b0af Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 2 May 2026 21:27:10 +0900 Subject: [PATCH 01/55] Enhance Studio development experience with HMR and early-stop features - Integrated Rolldown for hot module replacement (HMR) in `arkor dev`, allowing real-time updates to the training interface without page refresh. - Implemented `requestEarlyStop` and `replaceCallbacks` methods in the Trainer API to facilitate graceful stopping of training jobs and dynamic callback updates during execution. - Updated documentation to reflect new features and usage patterns for improved developer guidance. - Adjusted cleanup logic for better resource management during development sessions. --- AGENTS.md | 14 +- docs/concepts/studio.mdx | 2 +- docs/ja/concepts/studio.mdx | 2 +- docs/ja/sdk/trainer-control.mdx | 22 +- docs/sdk/trainer-control.mdx | 22 +- packages/arkor/package.json | 2 +- packages/arkor/src/cli/commands/build.ts | 40 ++- packages/arkor/src/cli/commands/dev.ts | 32 +- packages/arkor/src/cli/commands/start.test.ts | 2 +- packages/arkor/src/core/arkor.test.ts | 1 + packages/arkor/src/core/runner.test.ts | 87 +++++- packages/arkor/src/core/runner.ts | 60 +++- packages/arkor/src/core/trainer.test.ts | 249 ++++++++++++++++ packages/arkor/src/core/trainer.ts | 62 ++++ packages/arkor/src/core/types.ts | 11 + packages/arkor/src/studio/hmr.test.ts | 141 +++++++++ packages/arkor/src/studio/hmr.ts | 148 ++++++++++ packages/arkor/src/studio/server.test.ts | 126 ++++++++ packages/arkor/src/studio/server.ts | 111 ++++++- .../studio-app/src/components/RunTraining.tsx | 79 ++++- packages/studio-app/src/lib/api.ts | 20 ++ pnpm-lock.yaml | 274 +----------------- 22 files changed, 1206 insertions(+), 301 deletions(-) create mode 100644 packages/arkor/src/studio/hmr.test.ts create mode 100644 packages/arkor/src/studio/hmr.ts diff --git a/AGENTS.md b/AGENTS.md index 5d4d8455..60960f68 100644 --- 
a/AGENTS.md +++ b/AGENTS.md @@ -63,7 +63,7 @@ cd my-arkor-app && pnpm dev # Studio at http://127.0. `arkor dev` generates a 32-byte base64url token per launch ([packages/arkor/src/cli/commands/dev.ts](packages/arkor/src/cli/commands/dev.ts)) and: -1. Passes it to `buildStudioApp({ studioToken })`. The Hono server validates every `/api/*` request via `X-Arkor-Studio-Token` header (or `?studioToken=` query for `EventSource`, which can't set headers). Comparison uses `timingSafeEqual`. +1. Passes it to `buildStudioApp({ studioToken })`. The Hono server validates every `/api/*` request via `X-Arkor-Studio-Token` header (or `?studioToken=` query for `EventSource`, which can't set headers). Comparison uses `timingSafeEqual`. The query-token allow-list lives in `eventStreamPathPattern` in [packages/arkor/src/studio/server.ts](packages/arkor/src/studio/server.ts) — currently `/api/jobs/:id/events` and `/api/dev/events`. **Adding to that regex is CSRF-sensitive: each entry must be a GET stream-only route, never a mutation endpoint.** 2. Persists it to `~/.arkor/studio-token` (mode 0600) so the SPA dev workflow (`pnpm --filter @arkor/studio-app dev`) can read it via the `arkor-studio-token` Vite plugin in [packages/studio-app/vite.config.ts](packages/studio-app/vite.config.ts), which injects a `<meta>` tag carrying the token into `index.html` on each request. Persistence failure must NOT block server start (read-only `$HOME` on Docker, etc.) — just warn. 3. Cleans up on `exit`/SIGINT/SIGTERM/SIGHUP via `unlinkSync`. @@ -73,11 +73,19 @@ The whole point: prevents another browser tab on the same machine from POSTing ` When touching the Studio server or SPA fetch layer, preserve: token via header for `fetch`, query param for `EventSource`, host-header guard, no CORS, timing-safe compare. The Vite plugin is dev-only (`apply: "serve"`) — running it during `vite build` would bake a stale per-launch token into the production `index.html` and shadow the runtime tag, causing every `/api/*` call to 403.
+### HMR + graceful early-stop + +`arkor dev` keeps a [Rolldown](https://rolldown.rs) watcher over `src/arkor/` ([packages/arkor/src/studio/hmr.ts](packages/arkor/src/studio/hmr.ts)) and pushes rebuild events over `/api/dev/events` (SSE). The SPA re-fetches `/api/manifest` on each event so the Run Training button stays in sync without a browser refresh. + +When a rebuild lands while a `/api/train`-spawned subprocess is in flight, the server SIGTERM's the child. The child's signal handler in `runTrainer` calls `Trainer.requestEarlyStop()`, which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA then auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. + +Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. The hard kill timer in `requestEarlyStopOnActive` exists only as a stuck-process fallback. + ### Project entry-point discovery The CLI/Studio look at `src/arkor/index.ts` in user projects. Discovery in [packages/arkor/src/core/runner.ts](packages/arkor/src/core/runner.ts) accepts (in order): a named `arkor` export from `createArkor({...})`, a bare `trainer` export, a default export holding either an Arkor manifest or a Trainer, or a `default.trainer` nested shape. `createArkor` returns a frozen, opaque manifest tagged with `_kind: "arkor"`; treat it as a value to hand to tooling, not a programmable client. -`arkor build` ([packages/arkor/src/cli/commands/build.ts](packages/arkor/src/cli/commands/build.ts)) bundles to `.arkor/build/index.mjs` with esbuild; bare specifiers (e.g. `arkor`, anything in `node_modules`) stay external so the artifact resolves the runtime SDK from the project's installed copy. 
+`arkor build` ([packages/arkor/src/cli/commands/build.ts](packages/arkor/src/cli/commands/build.ts)) bundles to `.arkor/build/index.mjs` with [Rolldown](https://rolldown.rs); bare specifiers (e.g. `arkor`, anything in `node_modules`) stay external so the artifact resolves the runtime SDK from the project's installed copy. The `transform.target` is derived from `process.versions.node` at build time so the bundle targets the same Node binary that will execute it. ### E2E suite specifics @@ -101,4 +109,4 @@ Don't split these into "docs in a follow-up PR" or "tests later" — land them i - **Don't call a HuggingFace model name "non-existent"** based on training-data alone. Templates reference real models (e.g. `unsloth/gemma-4-E4B-it`) that may post-date Claude's knowledge cutoff. Verify (e.g. `WebFetch`) before flagging in issues or PR comments. If unverifiable, hedge ("could not confirm") rather than asserting absence. - **Generated files** copied into package dirs are gitignored: `packages/*/CONTRIBUTING.md` (from root), `packages/arkor/docs/` (from root `docs/`). Edit the source under repo root, not the copies. - **Node version**: published packages declare `engines.node >=22.6`. Use Node 24 (latest preferred) for development per [CONTRIBUTING.md](CONTRIBUTING.md). -- **pnpm policy** ([pnpm-workspace.yaml](pnpm-workspace.yaml)): `minimumReleaseAge: 1440` (24 h) and `trustPolicy: no-downgrade` are intentional supply-chain guards. `allowBuilds` is the explicit allow-list for postinstall scripts (rolldown, unrs-resolver, esbuild). +- **pnpm policy** ([pnpm-workspace.yaml](pnpm-workspace.yaml)): `minimumReleaseAge: 1440` (24 h) and `trustPolicy: no-downgrade` are intentional supply-chain guards. `allowBuilds` is the explicit allow-list for postinstall scripts (rolldown for `arkor build`, esbuild for Vite's dependency optimization, unrs-resolver). 
diff --git a/docs/concepts/studio.mdx b/docs/concepts/studio.mdx index 5724079e..3ef52cf8 100644 --- a/docs/concepts/studio.mdx +++ b/docs/concepts/studio.mdx @@ -13,7 +13,7 @@ Three jobs: 2. **See training happen.** A jobs list with live status, a loss chart that updates as the run streams in, and a tail of training events. You can leave it open in a tab while you work on other things. 3. **Try a finished model.** A Playground page lets you pick the base model or the final adapter from any completed job and chat with it. The Playground does not load intermediate checkpoints; for mid-run inference, use [`onCheckpoint`](/concepts/lifecycle) callbacks in your trainer. -A note on the dev loop: Studio's `/api/manifest` endpoint rebuilds and re-imports your trainer on every request (with a cache-bust query, see `packages/arkor/src/studio/manifest.ts`), but the UI only fetches it when the Run training page mounts. So if you edit `src/arkor/` and stay on the same Run training page, the next click reuses the existing `.arkor/build/index.mjs` and runs your old code. Refresh the page (or run `pnpm exec arkor build` from the terminal) between edits and clicks to pick up the new code reliably. +A note on the dev loop: Studio runs a [Rolldown](https://rolldown.rs) watcher over `src/arkor/` and pushes rebuild notifications to the SPA over a Server-Sent Events stream (`/api/dev/events`). Edit a file, save, and the Run training button updates with the new trainer name without a refresh. If a training run is in flight, the Studio asks it to early-stop at the next checkpoint (`Trainer.requestEarlyStop()`) so the work isn't wasted, then re-spawns the run with the rebuilt artifact. The Cloud-side job for the previous run reaches `cancelled` after the checkpoint is uploaded. 
## Where Studio runs diff --git a/docs/ja/concepts/studio.mdx b/docs/ja/concepts/studio.mdx index 1e4adedb..b8ae40ff 100644 --- a/docs/ja/concepts/studio.mdx +++ b/docs/ja/concepts/studio.mdx @@ -13,7 +13,7 @@ Studio は `arkor dev` 実行時に得られるローカル Web UI です。サ 2. **学習を眺める。** ライブステータス付きのジョブ一覧、ストリーム到着とともに更新される Loss チャート、学習イベントのテール。タブで開きっぱなしにして他の作業ができます。 3. **完成モデルを試す。** Playground ページでベースモデルや任意の完了ジョブの最終アダプタを選んでチャットできます。中間チェックポイントは Playground からはロードしません。学習中の推論には [`onCheckpoint`](/ja/concepts/lifecycle) コールバックをトレーナーで使ってください。 -dev ループのメモ: Studio の `/api/manifest` エンドポイントはリクエストごとにトレーナーをリビルド・再 import しますが(キャッシュバストクエリ付き、`packages/arkor/src/studio/manifest.ts` を参照)、UI が fetch するのは Run training ページがマウントされたときだけです。`src/arkor/` を編集して同じ Run training ページに留まり続けると、次のクリックは既存の `.arkor/build/index.mjs` を再利用して古いコードで走ります。確実に新しいコードを取り込むには、編集とクリックの間にページをリロード(あるいはターミナルから `pnpm exec arkor build`)してください。 +dev ループのメモ: Studio は [Rolldown](https://rolldown.rs) のウォッチャを `src/arkor/` 上で常駐させ、再ビルド通知を Server-Sent Events ストリーム (`/api/dev/events`) で SPA に push します。ファイルを編集して保存すれば、Run training ボタンのトレーナー名表示はリロード無しで更新されます。学習が走っている最中であれば、Studio はそのジョブに次のチェックポイントで Early Stopping を要求し(`Trainer.requestEarlyStop()`、ここまでの学習成果は保全)、再ビルドした成果物で自動的に再投入します。Cloud 側の以前のジョブはチェックポイントのアップロード完了後に `cancelled` 状態に遷移します。 ## Studio が動く場所 diff --git a/docs/ja/sdk/trainer-control.mdx b/docs/ja/sdk/trainer-control.mdx index 6a6bd74f..8683d3d5 100644 --- a/docs/ja/sdk/trainer-control.mdx +++ b/docs/ja/sdk/trainer-control.mdx @@ -5,7 +5,7 @@ description: "start、wait、cancel、abortSignal、再接続の仕組み。" # トレーナー制御 -`createTrainer` は次の 3 メソッドを持つ `Trainer` オブジェクトを返します: +`createTrainer` は次の 4 メソッドを持つ `Trainer` オブジェクトを返します: ```ts interface Trainer { @@ -13,6 +13,7 @@ interface Trainer { start(): Promise<{ jobId: string }>; wait(): Promise<TrainingResult>; cancel(): Promise<void>; + requestEarlyStop(opts?: { timeoutMs?: number }): Promise<void>; } interface TrainingResult { @@ -54,6 +55,25 @@ await trainer.cancel(); - そうでなければバックエンドにキャンセルリクエストを送ります。 - **ベストエフォート。** SDK
は終端ステータスでショートサーキットしません。学習が既に completed / failed / cancelled なら、バックエンドは non-2xx を返すことがあり `cancel()` は reject します。投機的に呼ぶなら `try / catch` で囲んでください。 +## `requestEarlyStop()` + +```ts +await trainer.requestEarlyStop(); +// もしくはチェックポイント待ちのデッドラインを指定: +await trainer.requestEarlyStop({ timeoutMs: 60_000 }); +``` + +「直近のチェックポイントを保全する」 `cancel()` の兄弟版です。 + +- ラッチを armed にします。トレーナーは **次の** `checkpoint.saved` イベントが来るまで実行を続けます。チェックポイントが永続化された時点で SDK が代わりに `cancel()` を呼び、戻り値の Promise を resolve します。 +- `timeoutMs`(デフォルト: 5 分)以内にチェックポイントが来なかった場合は即時 `cancel()` にフォールバックします。`saveSteps` の間隔がそれより長い場合はこの値を調整してください。 +- 冪等: 連続して呼んでも同じ in-flight Promise を共有し、`cancel()` は 1 度しか発火しません。 +- `start()` 前、もしくはジョブが既に終端ステータスに達している場合は何もしません。 + +これは `arkor dev` の HMR パイプラインが内部で使っている API です。実行中にソースファイルを保存すると Studio が spawn 済みの `arkor start` プロセスに `SIGTERM` を送り、シグナルハンドラが `requestEarlyStop()` を呼んでチェックポイントのアップロード完了後にクリーンに終了します。Cloud 側のジョブは `cancelled` ステータスで完了します。 + +自前のコード(プログラム的な two-process パターンなど)から `requestEarlyStop()` を直接呼ぶこともできます。Cookbook の [Early Stopping](/ja/cookbook/early-stopping) レシピが `onCheckpoint` + `abortSignal` + `cancel()` で組み立てているのと同じ「実行中ステップを捨てずに止める」セマンティクスを、ワンショットで提供します。レシピ版の方が柔軟(メトリクス次第で abort のタイミングを決めるなど)ですが、こちらは「次のチェックポイントで止める」という典型ケースの便利フックです。 + ## `abortSignal` ```ts diff --git a/docs/sdk/trainer-control.mdx b/docs/sdk/trainer-control.mdx index ef40cf46..c43a404b 100644 --- a/docs/sdk/trainer-control.mdx +++ b/docs/sdk/trainer-control.mdx @@ -5,7 +5,7 @@ description: "start, wait, cancel, abortSignal, and how reconnects work." 
# Trainer control -`createTrainer` returns a `Trainer` object with three methods: +`createTrainer` returns a `Trainer` object with four methods: ```ts interface Trainer { @@ -13,6 +13,7 @@ interface Trainer { start(): Promise<{ jobId: string }>; wait(): Promise<TrainingResult>; cancel(): Promise<void>; + requestEarlyStop(opts?: { timeoutMs?: number }): Promise<void>; } interface TrainingResult { @@ -54,6 +55,25 @@ await trainer.cancel(); - Otherwise it sends a cancel request to the backend. - **Best-effort.** The SDK does not short-circuit on terminal status; if the run already completed, failed, or was cancelled, the backend may return a non-2xx and `cancel()` rejects. Wrap in `try / catch` if you call it speculatively. +## `requestEarlyStop()` + +```ts +await trainer.requestEarlyStop(); +// or with a custom checkpoint deadline: +await trainer.requestEarlyStop({ timeoutMs: 60_000 }); +``` + +The "preserve the latest checkpoint" sibling of `cancel()`: + +- Arms a latch. The trainer keeps running until the **next** `checkpoint.saved` event lands. Once the checkpoint is durable, the SDK calls `cancel()` for you and resolves the returned promise. +- If no checkpoint arrives within `timeoutMs` (default: 5 minutes), falls back to `cancel()` immediately. Tune this if your `saveSteps` cadence is longer than 5 min. +- Idempotent — repeat calls share the same in-flight promise and only fire `cancel()` once. +- A no-op when called before `start()` or after the job has already reached a terminal status. + +This is what `arkor dev`'s HMR pipeline uses internally: when you save a source file mid-run, Studio sends `SIGTERM` to the spawned `arkor start` process; that process catches the signal, calls `requestEarlyStop()`, and exits cleanly once the checkpoint is uploaded. The Cloud-side job ends in the `cancelled` status. + +You can use `requestEarlyStop()` directly from your own code (e.g.
in a programmatic two-process pattern) if you want the same "stop, but don't throw away the in-flight step" semantics that the cookbook's [Early stopping](/cookbook/early-stopping) recipe builds out of `onCheckpoint` + `abortSignal` + `cancel()`. The recipe is more flexible (you decide when to abort based on metrics); this method is the convenience hook for the common "stop after the next checkpoint" case. + ## `abortSignal` ```ts diff --git a/packages/arkor/package.json b/packages/arkor/package.json index d74cda08..58112ef4 100644 --- a/packages/arkor/package.json +++ b/packages/arkor/package.json @@ -55,10 +55,10 @@ "@clack/prompts": "^0.8.0", "@hono/node-server": "^1.14.0", "commander": "^13.0.0", - "esbuild": "^0.28.0", "hono": "^4.7.0", "open": "^10.0.0", "posthog-node": "^5.30.6", + "rolldown": "^1.0.0-rc.17", "zod": "^4.3.6" }, "devDependencies": { diff --git a/packages/arkor/src/cli/commands/build.ts b/packages/arkor/src/cli/commands/build.ts index af761428..3d555c59 100644 --- a/packages/arkor/src/cli/commands/build.ts +++ b/packages/arkor/src/cli/commands/build.ts @@ -1,7 +1,7 @@ import { existsSync } from "node:fs"; import { mkdir } from "node:fs/promises"; import { isAbsolute, relative, resolve } from "node:path"; -import { build as esbuild } from "esbuild"; +import { rolldown } from "rolldown"; import { ui } from "../prompts"; export interface BuildOptions { @@ -25,6 +25,16 @@ export interface BuildResult { const DEFAULT_ENTRY = "src/arkor/index.ts"; const DEFAULT_OUT_DIR = ".arkor/build"; +/** + * `node<major>.<minor>` derived from the running Node binary. Build host and run + * host are effectively the same process: Studio spawns `arkor start` with + * `process.execPath`, so the bundle can target precisely what will execute it.
+ */ +function resolveNodeTarget(): string { + const [major = "22", minor = "6"] = process.versions.node.split("."); + return `node${major}.${minor}`; +} + /** * Bundle the user's `src/arkor/index.ts` into a single ESM artifact at * `.arkor/build/index.mjs`. @@ -48,16 +58,28 @@ export async function runBuild(opts: BuildOptions = {}): Promise { await mkdir(outDir, { recursive: true }); const outFile = resolve(outDir, "index.mjs"); - await esbuild({ - entryPoints: [entry], - bundle: true, + const bundle = await rolldown({ + input: entry, + cwd, platform: "node", - format: "esm", - target: "node22.6", - outfile: outFile, - packages: "external", - logLevel: "error", + logLevel: "warn", + transform: { target: resolveNodeTarget() }, + // Mirror esbuild's `packages: "external"`: any specifier that isn't a + // relative or absolute path stays external. `node:`-prefixed builtins are + // already handled by `platform: "node"` but we keep the explicit allow as + // a safety net in case the builtin set drifts. 
+ external: (id, _importer, isResolved) => { + if (isResolved) return false; + if (id.startsWith(".")) return false; + if (isAbsolute(id)) return false; + return true; + }, }); + try { + await bundle.write({ file: outFile, format: "esm" }); + } finally { + await bundle.close(); + } if (!opts.quiet) { ui.log.success( diff --git a/packages/arkor/src/cli/commands/dev.ts b/packages/arkor/src/cli/commands/dev.ts index e2bf01cf..2ba1c9b3 100644 --- a/packages/arkor/src/cli/commands/dev.ts +++ b/packages/arkor/src/cli/commands/dev.ts @@ -16,6 +16,7 @@ import { type AnonymousCredentials, } from "../../core/credentials"; import { buildStudioApp } from "../../studio/server"; +import { createHmrCoordinator } from "../../studio/hmr"; import { ANON_PERSISTENCE_NUDGE } from "../anonymous"; import { ui } from "../prompts"; @@ -190,6 +191,25 @@ function scheduleStudioTokenCleanup(path: string): void { } } +function scheduleHmrCleanup(hmr: { dispose: () => Promise }): void { + let disposed = false; + const dispose = () => { + if (disposed) return; + disposed = true; + hmr.dispose().catch(() => { + // best-effort: shutdown is racing other cleanup paths + }); + }; + // Mirror `scheduleStudioTokenCleanup` exit hooks. Note that those handlers + // already call `process.exit(0)` for the same signals; this listener fires + // first because Node invokes signal handlers in registration order, so the + // dispose call lands before exit. + process.on("exit", dispose); + for (const sig of ["SIGINT", "SIGTERM", "SIGHUP"] as const) { + process.on(sig, dispose); + } +} + export async function runDev(options: DevOptions = {}): Promise { await ensureCredentialsForStudio(); @@ -199,6 +219,15 @@ export async function runDev(options: DevOptions = {}): Promise { // hitting `arkor start` (and therefore RCE via dynamic import). const studioToken = randomBytes(32).toString("base64url"); + // HMR coordinator: a long-lived rolldown watcher over the user's + // `src/arkor` graph. 
Lazy-started on first `/api/dev/events` connection so + // an `arkor dev` launched in an unbuilt project doesn't immediately fail. + // Registered before the studio-token cleanup so the latter remains the + // most-recently-attached signal listener (existing tests rely on this + // ordering to find the token-removal handler). + const hmr = createHmrCoordinator({ cwd: process.cwd() }); + scheduleHmrCleanup(hmr); + // Persisting the token to disk is *only* needed for the Vite SPA dev // workflow. The bundled `:port` flow injects the meta tag at request time // via `buildStudioApp`, so a failure here (read-only $HOME on Docker / @@ -217,7 +246,7 @@ export async function runDev(options: DevOptions = {}): Promise { // `autoAnonymous: true` (the default) lets the Hono server retry the // anonymous bootstrap on first `/api/credentials` hit if the up-front // attempt above failed (e.g. cloud-api was unreachable at launch). - const app = buildStudioApp({ studioToken }); + const app = buildStudioApp({ studioToken, hmr }); // Bind to 127.0.0.1 (not "localhost") so the listener can't end up on `::1` // only — `@hono/node-server` passes hostname to `net.Server.listen`, which // calls `dns.lookup`. 
On hosts where `/etc/hosts` orders `::1 localhost` @@ -229,6 +258,7 @@ export async function runDev(options: DevOptions = {}): Promise { const url = `http://localhost:${port}`; serve({ fetch: app.fetch, port, hostname: "127.0.0.1" }); process.stdout.write(`Arkor Studio running on ${url}\n`); + process.stdout.write(`HMR enabled (watching src/arkor)\n`); if (options.open) { try { await open(url); diff --git a/packages/arkor/src/cli/commands/start.test.ts b/packages/arkor/src/cli/commands/start.test.ts index 8209818b..a08d70f4 100644 --- a/packages/arkor/src/cli/commands/start.test.ts +++ b/packages/arkor/src/cli/commands/start.test.ts @@ -78,7 +78,7 @@ describe("runStart", () => { it("skips the build step when the artifact already exists and no entry override is given", async () => { // Branch coverage for `Boolean(opts.entry) || !existsSync(outFile)` — // the path where both halves are false. Pre-build the artifact, then - // confirm runStart imports it without triggering esbuild again. + // confirm runStart imports it without triggering rolldown again. mkdirSync(join(cwd, "src/arkor"), { recursive: true }); writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); // First call builds normally. 
diff --git a/packages/arkor/src/core/arkor.test.ts b/packages/arkor/src/core/arkor.test.ts index 64e5e82e..d3dc41a7 100644 --- a/packages/arkor/src/core/arkor.test.ts +++ b/packages/arkor/src/core/arkor.test.ts @@ -23,6 +23,7 @@ function fakeTrainer(name = "run"): Trainer { }; }, async cancel() {}, + async requestEarlyStop() {}, }; } diff --git a/packages/arkor/src/core/runner.test.ts b/packages/arkor/src/core/runner.test.ts index b2560c8d..ee1667be 100644 --- a/packages/arkor/src/core/runner.test.ts +++ b/packages/arkor/src/core/runner.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, afterEach, beforeEach } from "vitest"; +import { describe, it, expect, afterEach, beforeEach, vi } from "vitest"; import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; @@ -33,6 +33,7 @@ function fakeTrainer(onStart?: () => void, onWait?: () => void): Trainer { }; }, async cancel() {}, + async requestEarlyStop() {}, }; } @@ -207,3 +208,87 @@ describe("runTrainer — entry extraction", () => { expect(typeof t.wait).toBe("function"); }); }); + +describe("runTrainer — shutdown signal handling", () => { + it("first SIGTERM calls trainer.requestEarlyStop and exits 0; second SIGTERM exits 143", async () => { + // Fake trainer whose `wait()` hangs until the test manually resolves it + // (via a global helper). This lets us hold the run in flight long + // enough to assert both signal-handling branches without racing the + // `finally` block that removes the listeners. 
+ const trainerSrc = ` + let earlyStopCalls = 0; + let resolveWait; + const waitPromise = new Promise((r) => { resolveWait = r; }); + globalThis.__test_signalProbe = { + get earlyStopCalls() { return earlyStopCalls; }, + finishWait: () => resolveWait({ + job: { + id: "j1", orgId: "o", projectId: "p", name: "n", + status: "completed", + config: { model: "m", datasetSource: { type: "huggingface", name: "x" } }, + createdAt: "2026", + }, + artifacts: [], + }), + }; + export const trainer = { + name: "n", + start: async () => ({ jobId: "j1" }), + wait: () => waitPromise, + cancel: async () => {}, + requestEarlyStop: async () => { earlyStopCalls++; }, + }; + `; + const entry = join(cwd, "src/arkor/index.mjs"); + mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(entry, trainerSrc); + + const exitCalls: number[] = []; + const exitSpy = vi + .spyOn(process, "exit") + .mockImplementation(((code?: number) => { + exitCalls.push(code ?? 0); + return undefined as never; + }) as typeof process.exit); + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + try { + const runPromise = runTrainer("src/arkor/index.mjs"); + // Wait for import + start() to settle so the handler is registered + // before we synthesise SIGTERM. The fake's `wait()` hangs forever, so + // the run remains in flight throughout the assertions. + await new Promise((r) => setTimeout(r, 25)); + + const probe = (globalThis as unknown as { + __test_signalProbe: { + earlyStopCalls: number; + finishWait: () => void; + }; + }).__test_signalProbe; + + // 1st SIGTERM → requestEarlyStop is called, exit(0) scheduled in the + // promise's `.finally`. + process.emit("SIGTERM", "SIGTERM"); + await new Promise((r) => setTimeout(r, 25)); + expect(probe.earlyStopCalls).toBe(1); + expect(exitCalls).toContain(0); + + // 2nd SIGTERM (still in-flight, listeners not yet removed) → + // exit(143) immediately, no second requestEarlyStop call. 
+ process.emit("SIGTERM", "SIGTERM"); + await new Promise((r) => setTimeout(r, 25)); + expect(probe.earlyStopCalls).toBe(1); + expect(exitCalls).toContain(143); + + // Release the hung wait() so runPromise can complete and the + // shutdown handlers detach via the finally block. + probe.finishWait(); + await runPromise; + } finally { + exitSpy.mockRestore(); + stdoutSpy.mockRestore(); + delete (globalThis as Record).__test_signalProbe; + } + }); +}); diff --git a/packages/arkor/src/core/runner.ts b/packages/arkor/src/core/runner.ts index e674b70e..1fc5b0b2 100644 --- a/packages/arkor/src/core/runner.ts +++ b/packages/arkor/src/core/runner.ts @@ -42,6 +42,51 @@ function extractTrainer(mod: Record): Trainer { ); } +const SHUTDOWN_SIGNALS = ["SIGTERM", "SIGINT", "SIGHUP"] as const; + +/** + * Two-stage signal handling so HMR rebuilds (Studio sends SIGTERM) preserve + * the in-flight checkpoint work: + * + * - 1st signal → `trainer.requestEarlyStop()`. The trainer keeps running, + * lets the next `checkpoint.saved` event land, then issues `cancel()`. + * - 2nd signal → immediate `process.exit(143)`. Escape hatch for an + * impatient operator or a hung early-stop. + * + * The handlers are removed in `finally` so a normal `wait()` completion + * doesn't leave stale listeners behind — important because `runTrainer` can + * be called multiple times in tests within a single Node process. + */ +function installShutdownHandlers(trainer: Trainer): () => void { + let signalCount = 0; + const handler = (signal: NodeJS.Signals): void => { + signalCount += 1; + if (signalCount > 1) { + process.stdout.write( + `Received second ${signal}; exiting without waiting for checkpoint.\n`, + ); + process.exit(143); + // Explicit return so test mocks of process.exit (which don't actually + // terminate the worker) don't fall through into the early-stop path. 
+ return; + } + process.stdout.write( + `Received ${signal}; early-stopping at next checkpoint…\n`, + ); + trainer + .requestEarlyStop() + .catch((err: unknown) => { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`requestEarlyStop failed: ${msg}\n`); + }) + .finally(() => process.exit(0)); + }; + for (const sig of SHUTDOWN_SIGNALS) process.on(sig, handler); + return () => { + for (const sig of SHUTDOWN_SIGNALS) process.off(sig, handler); + }; +} + export async function runTrainer(file?: string): Promise { const relative = file ?? DEFAULT_ENTRY; const abs = isAbsolute(relative) ? relative : resolve(process.cwd(), relative); @@ -53,8 +98,15 @@ export async function runTrainer(file?: string): Promise { const mod = (await import(pathToFileURL(abs).href)) as Record; const trainer = extractTrainer(mod); - const { jobId } = await trainer.start(); - process.stdout.write(`Started job ${jobId}\n`); - const result = await trainer.wait(); - process.stdout.write(`Job ${result.job.id} finished with status=${result.job.status}\n`); + const removeShutdownHandlers = installShutdownHandlers(trainer); + try { + const { jobId } = await trainer.start(); + process.stdout.write(`Started job ${jobId}\n`); + const result = await trainer.wait(); + process.stdout.write( + `Job ${result.job.id} finished with status=${result.job.status}\n`, + ); + } finally { + removeShutdownHandlers(); + } } diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 3e5633d2..13e8145c 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1304,3 +1304,252 @@ describe("createTrainer (reconnect backoff + max attempts)", () => { } }); }); + +describe("createTrainer (early stop)", () => { + const minimalJobRow = { + id: "j-stop", + orgId: "o1", + projectId: "p1", + name: "run", + status: "queued", + config: { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + }, + 
createdAt: "2026-01-01T00:00:00Z", + startedAt: null, + completedAt: null, + }; + + it("calls cancel after the next checkpoint when early-stop is requested mid-run", async () => { + await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + // SSE stream: training.started → training.log → checkpoint.saved. + // The checkpoint event is the trigger for the early-stop branch in + // dispatch(); after that, the loop should treat the run as terminal + // (we asserted this by ending the wait() promise without sending + // training.completed). + const sse = [ + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + `id: 2\nevent: training.log\ndata: ${JSON.stringify({ + type: "training.log", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:02Z", + step: 1, + loss: 0.5, + })}\n\n`, + `id: 3\nevent: checkpoint.saved\ndata: ${JSON.stringify({ + type: "checkpoint.saved", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:03Z", + step: 10, + })}\n\n`, + ]; + + let cancelCalls = 0; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? 
"GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(sseStream(sse), { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + cancelCalls += 1; + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + callbacks: { + // Arm the early-stop latch from inside the on-log callback so it + // fires before the checkpoint dispatch — mirrors the real CLI + // path where SIGTERM arrives mid-run. Fire-and-forget so the + // dispatch loop isn't blocked waiting for the latch's own + // checkpoint trigger to arrive. + onLog: () => { + void trainer.requestEarlyStop({ timeoutMs: 60_000 }); + }, + }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + await trainer.wait(); + } finally { + globalThis.fetch = original; + } + expect(cancelCalls).toBe(1); + }); + + it("falls back to immediate cancel when no checkpoint arrives within timeoutMs", async () => { + await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + // No checkpoint in the stream — only training.completed, which would + // normally finish the run. We hand-roll a stream that never ends so + // the timeout fallback is what actually triggers cancel. 
+ let streamController: ReadableStreamDefaultController | null = + null; + const stallingStream = new ReadableStream({ + start(controller) { + streamController = controller; + const enc = new TextEncoder(); + controller.enqueue( + enc.encode( + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + ), + ); + }, + }); + + let cancelCalls = 0; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(stallingStream, { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + cancelCalls += 1; + // Closing the stream now mimics cloud-api's response to a cancel: + // the SSE channel ends and wait() exits its loop. + streamController?.close(); + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + await trainer.start(); + // Tiny timeout so the test doesn't actually wait 5 minutes. 
+ const stopPromise = trainer.requestEarlyStop({ timeoutMs: 5 }); + await stopPromise; + expect(cancelCalls).toBe(1); + } finally { + globalThis.fetch = original; + } + }); + + it("is a no-op before start() and resolves immediately", async () => { + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + // Should resolve without contacting cloud-api at all. + await trainer.requestEarlyStop({ timeoutMs: 1 }); + }); + + it("is idempotent — repeated calls share the same in-flight promise", async () => { + await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + let cancelCalls = 0; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + cancelCalls += 1; + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + await trainer.start(); + const a = trainer.requestEarlyStop({ timeoutMs: 5 }); + const b = trainer.requestEarlyStop({ timeoutMs: 5 }); + await Promise.all([a, b]); + // The fallback timer fires once, so cancel is called once even though + // 
requestEarlyStop was called twice. + expect(cancelCalls).toBe(1); + } finally { + globalThis.fetch = original; + } + }); +}); diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index 874382f0..f5e88ca5 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -139,6 +139,18 @@ export function createTrainer( let scope: { orgSlug: string; projectSlug: string } | null = null; let clientPromise: Promise | null = null; + // Early-stop state. `requestEarlyStop()` arms the latch; the next + // `checkpoint.saved` dispatch (or the timeout, whichever fires first) + // calls cancel() and resolves the deferred. Idempotent across repeat + // calls — they share the same deferred. + const DEFAULT_EARLY_STOP_TIMEOUT_MS = 5 * 60 * 1000; + let earlyStopDeferred: { + promise: Promise; + resolve: () => void; + timer: NodeJS.Timeout | null; + } | null = null; + let earlyStopRequested = false; + async function getClient(): Promise { if (!clientPromise) { clientPromise = (async () => { @@ -244,6 +256,15 @@ export function createTrainer( artifacts: event.artifacts, }; await callbacks.onCheckpoint?.(ctx); + // Early-stop latch: a checkpoint just landed, so the in-flight work + // is durable. Cancel the cloud job and end `wait()` cleanly. + if (earlyStopRequested && earlyStopDeferred) { + await trainer.cancel(); + if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); + earlyStopDeferred.resolve(); + earlyStopDeferred = null; + return { terminal: true, artifacts: terminalResult?.artifacts ?? [] }; + } return { terminal: false, artifacts: terminalResult?.artifacts ?? [] }; } case "training.completed": { @@ -390,6 +411,47 @@ export function createTrainer( const client = await getClient(); await client.cancelJob(startedJob.id, scope); }, + + async requestEarlyStop(opts: { timeoutMs?: number } = {}): Promise { + // Nothing in flight: cleanup any prior latch and resolve. 
+ if (!startedJob || !scope || TERMINAL_STATUSES.has(startedJob.status)) { + if (earlyStopDeferred) { + if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); + earlyStopDeferred.resolve(); + earlyStopDeferred = null; + } + earlyStopRequested = false; + return; + } + // Idempotent: a second call piggybacks on the first. + if (earlyStopDeferred) return earlyStopDeferred.promise; + + earlyStopRequested = true; + let resolveFn!: () => void; + const promise = new Promise((resolve) => { + resolveFn = resolve; + }); + const timeoutMs = opts.timeoutMs ?? DEFAULT_EARLY_STOP_TIMEOUT_MS; + const timer = setTimeout(() => { + // Timed out waiting for a checkpoint — fall back to immediate cancel. + // Capture the active deferred reference: by the time the cancel POST + // resolves, the checkpoint branch may have nulled out the shared + // slot, but this fallback path still owns the deferred it created. + const active = earlyStopDeferred; + trainer + .cancel() + .catch(() => {}) + .finally(() => { + if (active) active.resolve(); + if (earlyStopDeferred === active) earlyStopDeferred = null; + }); + }, timeoutMs); + // `Timer.unref` keeps the early-stop timer from blocking process exit + // when the host runtime finishes for unrelated reasons. + timer.unref?.(); + earlyStopDeferred = { promise, resolve: resolveFn, timer }; + return promise; + }, }; return trainer; diff --git a/packages/arkor/src/core/types.ts b/packages/arkor/src/core/types.ts index e5fe1f26..c0ec4d31 100644 --- a/packages/arkor/src/core/types.ts +++ b/packages/arkor/src/core/types.ts @@ -200,6 +200,17 @@ export interface Trainer { wait(): Promise; /** Best-effort cancel; resolves once the cloud API accepts the request. */ cancel(): Promise; + /** + * Stop after the next saved checkpoint. The trainer keeps running, lets the + * in-flight step finish + checkpoint upload complete, then issues `cancel()`. + * Resolves once the cancel POST has been accepted. 
Falls back to immediate + * cancel if no checkpoint arrives within `timeoutMs` (default: 5 min). + * + * Idempotent: repeat calls return the same in-flight promise. If the job + * has not been `start()`ed or has already reached a terminal status, this + * resolves immediately without contacting the cloud API. + */ + requestEarlyStop(opts?: { timeoutMs?: number }): Promise; } /** diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts new file mode 100644 index 00000000..2ea6392c --- /dev/null +++ b/packages/arkor/src/studio/hmr.test.ts @@ -0,0 +1,141 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import { + mkdirSync, + mkdtempSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createHmrCoordinator, type HmrEvent } from "./hmr"; + +const FAKE_MANIFEST = `export const arkor = Object.freeze({ + _kind: "arkor", + trainer: { name: "alpha" }, +}); +`; + +let cwd: string; + +beforeEach(() => { + cwd = mkdtempSync(join(tmpdir(), "arkor-hmr-test-")); +}); + +afterEach(() => { + rmSync(cwd, { recursive: true, force: true }); +}); + +function nextEvent( + events: HmrEvent[], + predicate: (e: HmrEvent) => boolean, + timeoutMs = 10_000, +): Promise { + return new Promise((resolve, reject) => { + const start = Date.now(); + const tick = () => { + const found = events.find(predicate); + if (found) return resolve(found); + if (Date.now() - start > timeoutMs) { + return reject( + new Error( + `Timed out waiting for matching HMR event after ${timeoutMs}ms`, + ), + ); + } + setTimeout(tick, 25); + }; + tick(); + }); +} + +describe("createHmrCoordinator", () => { + it("emits a `ready` event after the first successful build", async () => { + mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + 
hmr.subscribe((e) => events.push(e)); + try { + const ready = await nextEvent(events, (e) => e.type === "ready"); + expect(ready.outFile).toMatch(/\.arkor[\\/]+build[\\/]+index\.mjs$/); + expect(typeof ready.hash).toBe("string"); + } finally { + await hmr.dispose(); + } + }); + + it("emits a `rebuild` event after a source edit", async () => { + mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => events.push(e)); + try { + const ready = await nextEvent(events, (e) => e.type === "ready"); + // Touch the entry with new content so the watcher detects a change. + writeFileSync( + join(cwd, "src/arkor/index.ts"), + FAKE_MANIFEST.replace(`"alpha"`, `"beta"`), + ); + const rebuild = await nextEvent(events, (e) => e.type === "rebuild"); + expect(rebuild.outFile).toBe(ready.outFile); + expect(rebuild.hash).not.toBe(ready.hash); + } finally { + await hmr.dispose(); + } + }); + + it("emits an `error` event when the entry is missing on subscribe", async () => { + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => events.push(e)); + try { + const err = await nextEvent(events, (e) => e.type === "error", 1000); + expect(err.message).toMatch(/Build entry not found/); + } finally { + await hmr.dispose(); + } + }); + + it("replays the latest event to late subscribers", async () => { + mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const firstEvents: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => firstEvents.push(e)); + try { + await nextEvent(firstEvents, (e) => e.type === "ready"); + // A new subscriber should receive the cached state synchronously + // before any new build is triggered. 
+ const lateEvents: HmrEvent[] = []; + hmr.subscribe((e) => lateEvents.push(e)); + expect(lateEvents.length).toBeGreaterThanOrEqual(1); + expect(lateEvents[0]?.type).toBe("ready"); + } finally { + await hmr.dispose(); + } + }); + + it("stops broadcasting after dispose()", async () => { + mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => events.push(e)); + await nextEvent(events, (e) => e.type === "ready"); + await hmr.dispose(); + const countAfterDispose = events.length; + + // Edit after dispose must not produce any further events. + writeFileSync( + join(cwd, "src/arkor/index.ts"), + FAKE_MANIFEST.replace(`"alpha"`, `"gamma"`), + ); + await new Promise((r) => setTimeout(r, 250)); + expect(events.length).toBe(countAfterDispose); + }); +}); diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts new file mode 100644 index 00000000..42a03078 --- /dev/null +++ b/packages/arkor/src/studio/hmr.ts @@ -0,0 +1,148 @@ +import { existsSync, statSync } from "node:fs"; +import { isAbsolute, resolve } from "node:path"; +import { watch, type RolldownWatcher } from "rolldown"; + +export type HmrEventType = "ready" | "rebuild" | "error"; + +export interface HmrEvent { + type: HmrEventType; + outFile?: string; + /** Short content fingerprint (mtime + size) so subscribers can dedupe. */ + hash?: string; + /** Human-readable error message; only present on `type === "error"`. */ + message?: string; +} + +export interface HmrCoordinator { + /** + * Receive the current cached state immediately, then every subsequent event. + * Returns an unsubscribe function. + */ + subscribe(fn: (event: HmrEvent) => void): () => void; + dispose(): Promise; +} + +export interface HmrOptions { + cwd: string; + /** Defaults to `src/arkor/index.ts`. */ + entry?: string; + /** Defaults to `.arkor/build`. 
*/ + outDir?: string; +} + +const DEFAULT_ENTRY = "src/arkor/index.ts"; +const DEFAULT_OUT_DIR = ".arkor/build"; + +function resolveNodeTarget(): string { + const [major = "22", minor = "6"] = process.versions.node.split("."); + return `node${major}.${minor}`; +} + +function fingerprint(outFile: string): string { + try { + const s = statSync(outFile); + return `${s.mtimeMs.toFixed(0)}-${s.size}`; + } catch { + return Date.now().toString(36); + } +} + +/** + * Spin up a rolldown watcher over the user's `src/arkor` entry, broadcasting + * `ready` / `rebuild` / `error` to subscribers. Used by `arkor dev` to push + * `/api/dev/events` SSE notifications to the SPA. + * + * Lazy: the watcher only starts on the first `subscribe` call so a Studio + * launch in a project without `src/arkor/index.ts` doesn't immediately fail + * — the watcher kicks in once the user creates the file and the SPA opens an + * EventSource. After every successful build the watcher caches the latest + * state and replays it to new subscribers so a late-mounting component still + * sees the trainer. + */ +export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { + const cwd = opts.cwd; + const entryRel = opts.entry ?? DEFAULT_ENTRY; + const entry = isAbsolute(entryRel) ? entryRel : resolve(cwd, entryRel); + const outDirRel = opts.outDir ?? DEFAULT_OUT_DIR; + const outDir = isAbsolute(outDirRel) ? outDirRel : resolve(cwd, outDirRel); + const outFile = resolve(outDir, "index.mjs"); + + const subscribers = new Set<(event: HmrEvent) => void>(); + let lastEvent: HmrEvent | null = null; + let watcher: RolldownWatcher | null = null; + let disposed = false; + + function broadcast(event: HmrEvent): void { + lastEvent = event; + for (const fn of subscribers) { + try { + fn(event); + } catch { + // Subscribers are SSE controllers — a thrown error usually means the + // connection closed mid-flight. Drop it so one bad subscriber can't + // poison the broadcast for the rest. 
+ } + } + } + + function startWatcher(): void { + if (watcher || disposed) return; + if (!existsSync(entry)) { + broadcast({ + type: "error", + message: `Build entry not found: ${entry}. Create ${DEFAULT_ENTRY} or pass an explicit entry argument.`, + }); + return; + } + watcher = watch({ + input: entry, + cwd, + platform: "node", + logLevel: "warn", + transform: { target: resolveNodeTarget() }, + external: (id, _importer, isResolved) => { + if (isResolved) return false; + if (id.startsWith(".")) return false; + if (isAbsolute(id)) return false; + return true; + }, + output: { file: outFile, format: "esm" }, + }); + let firstBuild = true; + watcher.on("event", (event) => { + if (event.code === "BUNDLE_END") { + // rolldown requires the per-build result to be closed to avoid leaks. + event.result.close().catch(() => {}); + const type: HmrEventType = firstBuild ? "ready" : "rebuild"; + firstBuild = false; + broadcast({ type, outFile, hash: fingerprint(outFile) }); + } else if (event.code === "ERROR") { + event.result.close().catch(() => {}); + broadcast({ + type: "error", + message: event.error instanceof Error ? 
event.error.message : String(event.error), + }); + } + }); + } + + return { + subscribe(fn) { + subscribers.add(fn); + if (lastEvent) fn(lastEvent); + startWatcher(); + return () => { + subscribers.delete(fn); + }; + }, + async dispose() { + disposed = true; + subscribers.clear(); + if (watcher) { + const w = watcher; + watcher = null; + await w.close().catch(() => {}); + } + }, + }; +} diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index 48c1651e..21cbf94d 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -11,6 +11,7 @@ import { import { tmpdir } from "node:os"; import { join, resolve } from "node:path"; import { buildStudioApp } from "./server"; +import type { HmrCoordinator, HmrEvent } from "./hmr"; import { writeCredentials } from "../core/credentials"; import { writeState } from "../core/state"; import { @@ -1337,4 +1338,129 @@ process.exit(0); expect(body.error).toMatch(/credentials|login/i); }); }); + + describe("/api/dev/events (HMR)", () => { + function fakeHmr() { + // Mirror the real HmrCoordinator surface but stay synchronous so the + // test doesn't depend on rolldown.watch starting up. `emit` is a test + // hook for pushing events into the SSE stream from the test body. 
+ const subs = new Set<(e: HmrEvent) => void>(); + const coordinator: HmrCoordinator = { + subscribe(fn) { + subs.add(fn); + return () => { + subs.delete(fn); + }; + }, + async dispose() { + subs.clear(); + }, + }; + return { + coordinator, + emit(event: HmrEvent) { + for (const fn of subs) fn(event); + }, + get subscriberCount() { + return subs.size; + }, + }; + } + + it("is unregistered when no hmr coordinator is supplied", async () => { + const app = build(); + const res = await app.request("/api/dev/events", { + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + }, + }); + expect(res.status).toBe(404); + }); + + it("rejects /api/dev/events without a token", async () => { + const fake = fakeHmr(); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fake.coordinator, + }); + const res = await app.request("/api/dev/events", { + headers: { host: "127.0.0.1:4000" }, + }); + expect(res.status).toBe(403); + }); + + it("accepts the studio token via ?studioToken= for the dev event stream", async () => { + const fake = fakeHmr(); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fake.coordinator, + }); + const res = await app.request( + `/api/dev/events?studioToken=${encodeURIComponent(STUDIO_TOKEN)}`, + { headers: { host: "127.0.0.1:4000" } }, + ); + expect(res.status).toBe(200); + expect(res.headers.get("content-type")).toBe("text/event-stream"); + // Cancelling the body's reader should release the subscriber. 
+ const reader = res.body!.getReader(); + await reader.cancel(); + expect(fake.subscriberCount).toBe(0); + }); + + it("rejects /api/dev/events when host header is non-loopback", async () => { + const fake = fakeHmr(); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fake.coordinator, + }); + const res = await app.request( + `/api/dev/events?studioToken=${encodeURIComponent(STUDIO_TOKEN)}`, + { headers: { host: "evil.example.com" } }, + ); + expect(res.status).toBe(403); + }); + + it("forwards rebuild events as SSE frames", async () => { + const fake = fakeHmr(); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fake.coordinator, + }); + const res = await app.request( + `/api/dev/events?studioToken=${encodeURIComponent(STUDIO_TOKEN)}`, + { headers: { host: "127.0.0.1:4000" } }, + ); + const reader = res.body!.getReader(); + const decoder = new TextDecoder(); + + fake.emit({ type: "ready", outFile: "/tmp/x", hash: "abc" }); + // Read chunks until we have at least one full SSE frame. 
+ let received = ""; + while (!received.includes("\n\n")) { + const { value, done } = await reader.read(); + if (done) break; + received += decoder.decode(value, { stream: true }); + } + expect(received).toContain("event: ready"); + expect(received).toContain('"outFile":"/tmp/x"'); + await reader.cancel(); + }); + }); }); diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index cd8d3209..d7f744ee 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -1,4 +1,4 @@ -import { spawn } from "node:child_process"; +import { spawn, type ChildProcess } from "node:child_process"; import { readFile, realpath } from "node:fs/promises"; import { timingSafeEqual } from "node:crypto"; import { Hono } from "hono"; @@ -16,6 +16,7 @@ import { SDK_VERSION } from "../core/version"; import { ensureProjectState } from "../core/projectState"; import { readState } from "../core/state"; import { readManifestSummary } from "./manifest"; +import type { HmrCoordinator, HmrEvent } from "./hmr"; const DEPRECATION_HEADERS = ["Deprecation", "Sunset", "Warning"] as const; function copyDeprecationHeaders(from: Headers, to: Headers): void { @@ -59,6 +60,15 @@ export interface StudioServerOptions { * here points at the bin itself). Override in tests. */ binPath?: string; + /** + * Optional HMR coordinator. When provided, the server registers + * `/api/dev/events` as an SSE stream that pushes rebuild / error events to + * the SPA, and rebuilds also signal SIGTERM to active `/api/train` + * subprocesses so they early-stop at the next checkpoint and the SPA can + * restart them with the new bundle. Wired in by `arkor dev`; left + * undefined for any non-dev consumer of `buildStudioApp`. 
+ */ + hmr?: HmrCoordinator; } function tokensMatch(provided: string, expected: string): boolean { @@ -111,7 +121,12 @@ export function buildStudioApp(options: StudioServerOptions) { const app = new Hono(); const loopbackHostPattern = /^(127\.0\.0\.1|localhost)(:\d+)?$/; - const jobEventsPathPattern = /^\/api\/jobs\/[^/]+\/events$/; + // Routes where `?studioToken=` is accepted instead of the + // `X-Arkor-Studio-Token` header. Used only for `EventSource` streams, + // which cannot send custom headers. Adding to this list is CSRF-sensitive: + // it must always be a GET stream-only route, never a mutation endpoint. + const eventStreamPathPattern = + /^\/api\/jobs\/[^/]+\/events$|^\/api\/dev\/events$/; // Host-header guard for every route, including static HTML that carries the // per-launch Studio token. This is the DNS-rebinding boundary: a victim @@ -138,7 +153,7 @@ export function buildStudioApp(options: StudioServerOptions) { // require the header so a leaked token in a URL is not enough to POST. app.use("/api/*", async (c, next) => { const queryTokenAllowed = - c.req.method === "GET" && jobEventsPathPattern.test(c.req.path); + c.req.method === "GET" && eventStreamPathPattern.test(c.req.path); const provided = c.req.header("x-arkor-studio-token") ?? (queryTokenAllowed ? c.req.query("studioToken") : undefined) ?? @@ -279,6 +294,32 @@ export function buildStudioApp(options: StudioServerOptions) { return new Response(upstream.body, { status: upstream.status, headers }); }); + // Active `/api/train` subprocesses. HMR rebuilds iterate this map and + // SIGTERM each entry so its in-process signal handler (see + // `runTrainer`) can call `trainer.requestEarlyStop()`. Keyed by pid so + // tests can introspect. 
+ interface ActiveTrain { + child: ChildProcess; + trainFile?: string; + } + const activeTrains = new Map(); + + function requestEarlyStopOnActive(): Array<{ + pid: number; + trainFile?: string; + }> { + const targets: Array<{ pid: number; trainFile?: string }> = []; + for (const [pid, entry] of activeTrains) { + try { + entry.child.kill("SIGTERM"); + } catch { + // child may have already exited between the iterator and the kill + } + targets.push({ pid, trainFile: entry.trainFile }); + } + return targets; + } + app.post("/api/train", async (c) => { const body = (await c.req.json().catch(() => ({}))) as { file?: string }; let trainFile: string | undefined; @@ -312,17 +353,22 @@ export function buildStudioApp(options: StudioServerOptions) { stdio: "pipe", cwd: trainCwd, }); + if (typeof child.pid === "number") { + activeTrains.set(child.pid, { child, trainFile }); + } const stream = new ReadableStream({ start(controller) { const enc = new TextEncoder(); child.stdout.on("data", (d) => controller.enqueue(enc.encode(d))); child.stderr.on("data", (d) => controller.enqueue(enc.encode(d))); child.on("close", (code) => { + if (typeof child.pid === "number") activeTrains.delete(child.pid); controller.enqueue(enc.encode(`\n---\nexit=${code}\n`)); controller.close(); }); }, cancel() { + if (typeof child.pid === "number") activeTrains.delete(child.pid); child.kill(); }, }); @@ -332,6 +378,65 @@ export function buildStudioApp(options: StudioServerOptions) { }); }); + // `/api/dev/events` — SSE stream of HMR rebuild / error notifications. + // Only active when `arkor dev` passed an HMR coordinator. The CSRF model + // accepts `?studioToken=` here (whitelisted in `eventStreamPathPattern`) + // because `EventSource` cannot send headers. When HMR is not configured + // the route still has an explicit 404 so the request doesn't fall through + // to the SPA index.html (which would mislead the SPA into thinking the + // EventSource connected successfully). 
+ if (!options.hmr) { + app.get("/api/dev/events", (c) => + c.json({ error: "HMR not enabled" }, 404), + ); + } + if (options.hmr) { + const hmr = options.hmr; + app.get("/api/dev/events", (c) => { + let unsubscribe: (() => void) | null = null; + const stream = new ReadableStream({ + start(controller) { + const enc = new TextEncoder(); + const send = ( + event: HmrEvent & { + restart?: boolean; + restartTargets?: Array<{ pid: number; trainFile?: string }>; + }, + ) => { + const payload = JSON.stringify(event); + try { + controller.enqueue( + enc.encode(`event: ${event.type}\ndata: ${payload}\n\n`), + ); + } catch { + // controller closed mid-write; the unsubscribe path below + // takes care of the rest. + } + }; + unsubscribe = hmr.subscribe((event) => { + if (event.type === "rebuild" && activeTrains.size > 0) { + const restartTargets = requestEarlyStopOnActive(); + send({ ...event, restart: true, restartTargets }); + } else { + send(event); + } + }); + }, + cancel() { + unsubscribe?.(); + unsubscribe = null; + }, + }); + return new Response(stream, { + status: 200, + headers: { + "content-type": "text/event-stream", + "cache-control": "no-cache, no-transform", + }, + }); + }); + } + // Playground hits this so mid-training inference from Studio has the same // auth path as the rest of /api/*. 
State is auto-bootstrapped (anon only) // so the Playground's base-model mode works on a fresh launch with no diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index efa8b6bd..b5f46ab5 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -1,7 +1,9 @@ import { useEffect, useRef, useState } from "react"; import { fetchManifest, + openDevEvents, streamTraining, + type DevEvent, type ManifestResult, } from "../lib/api"; @@ -9,7 +11,13 @@ export function RunTraining() { const [running, setRunning] = useState(false); const [log, setLog] = useState(""); const [manifest, setManifest] = useState(null); + const [hmrStatus, setHmrStatus] = useState< + "idle" | "rebuilding" | "early-stopping" | "restarting" + >("idle"); const boxRef = useRef(null); + const lastTrainFileRef = useRef(undefined); + const restartPendingRef = useRef(false); + const runningRef = useRef(false); useEffect(() => { let cancelled = false; @@ -29,7 +37,54 @@ export function RunTraining() { }; }, []); - async function run() { + // HMR: listen for rebuild notifications from `arkor dev` and refresh the + // manifest. When a rebuild also early-stopped a running training run, the + // server flags `restart: true`; defer the actual re-invocation until the + // current `streamTraining` resolves so we don't run two cloud jobs at once. + useEffect(() => { + const es = openDevEvents(); + const onMessage = (raw: MessageEvent) => { + let payload: DevEvent; + try { + payload = JSON.parse(raw.data) as DevEvent; + } catch { + return; + } + if (payload.type === "error") { + setManifest({ error: payload.message ?? "Build failed" }); + setHmrStatus("idle"); + return; + } + // Always refresh the manifest on ready/rebuild. + void fetchManifest() + .then(setManifest) + .catch((err: unknown) => { + setManifest({ + error: err instanceof Error ? 
err.message : String(err), + }); + }); + if (payload.restart) { + // Training run is early-stopping; the active stream will resolve + // once the next checkpoint lands and the subprocess exits cleanly. + // The `finally` block of `run()` picks up the pending flag and + // re-spawns with the same args. + restartPendingRef.current = true; + setHmrStatus(runningRef.current ? "early-stopping" : "idle"); + } else { + setHmrStatus("idle"); + } + }; + es.addEventListener("ready", onMessage); + es.addEventListener("rebuild", onMessage); + es.addEventListener("error", onMessage); + return () => { + es.close(); + }; + }, []); + + async function run(file?: string): Promise { + runningRef.current = true; + lastTrainFileRef.current = file; setRunning(true); setLog(""); try { @@ -41,11 +96,23 @@ export function RunTraining() { }); return next; }); - }); + }, file); } catch (err) { setLog((prev) => prev + `\n[error] ${err instanceof Error ? err.message : String(err)}\n`); } finally { + runningRef.current = false; setRunning(false); + if (restartPendingRef.current) { + restartPendingRef.current = false; + setHmrStatus("restarting"); + // Re-spawn with the same args after a microtask so React commits the + // `running=false` state first (otherwise the re-entry overlaps). + queueMicrotask(() => { + void run(lastTrainFileRef.current); + }); + } else { + setHmrStatus("idle"); + } } } @@ -75,9 +142,15 @@ export function RunTraining() { createArkor.

)} - + {hmrStatus === "early-stopping" && ( + Stopping at next checkpoint… + )} + {hmrStatus === "restarting" && ( + Restarting with updated code… + )}
         {log || "Output will appear here."}
       
diff --git a/packages/studio-app/src/lib/api.ts b/packages/studio-app/src/lib/api.ts index ffc54055..3cf6ab6d 100644 --- a/packages/studio-app/src/lib/api.ts +++ b/packages/studio-app/src/lib/api.ts @@ -106,6 +106,26 @@ export function openJobEvents(jobId: string): EventSource { ); } +/** + * HMR rebuild notifications from `arkor dev`. Server pushes a `ready` + * event on first bundle, `rebuild` on each subsequent change, and `error` + * when the bundle fails to compile. `restart: true` indicates a training + * subprocess was signalled to early-stop and the SPA should re-spawn it + * after the current `streamTraining` resolves. + */ +export interface DevEvent { + type: "ready" | "rebuild" | "error"; + outFile?: string; + hash?: string; + message?: string; + restart?: boolean; + restartTargets?: Array<{ pid: number; trainFile?: string }>; +} + +export function openDevEvents(): EventSource { + return new EventSource(withStudioToken("/api/dev/events")); +} + export interface ChatRequestBody { messages: Array<{ role: "system" | "user" | "assistant"; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 48d95dbc..ff53bc4e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -47,9 +47,6 @@ importers: commander: specifier: ^13.0.0 version: 13.1.0 - esbuild: - specifier: ^0.28.0 - version: 0.28.0 hono: specifier: ^4.7.0 version: 4.12.14 @@ -59,6 +56,9 @@ importers: posthog-node: specifier: ^5.30.6 version: 5.30.6(rxjs@7.8.2) + rolldown: + specifier: ^1.0.0-rc.17 + version: 1.0.0-rc.17 zod: specifier: ^4.3.6 version: 4.3.6 @@ -316,312 +316,156 @@ packages: cpu: [ppc64] os: [aix] - '@esbuild/aix-ppc64@0.28.0': - resolution: {integrity: sha512-lhRUCeuOyJQURhTxl4WkpFTjIsbDayJHih5kZC1giwE+MhIzAb7mEsQMqMf18rHLsrb5qI1tafG20mLxEWcWlA==} - engines: {node: '>=18'} - cpu: [ppc64] - os: [aix] - '@esbuild/android-arm64@0.25.12': resolution: {integrity: sha512-6AAmLG7zwD1Z159jCKPvAxZd4y/VTO0VkprYy+3N2FtJ8+BQWFXU+OxARIwA46c5tdD9SsKGZ/1ocqBS/gAKHg==} engines: {node: '>=18'} cpu: [arm64] os: 
[android] - '@esbuild/android-arm64@0.28.0': - resolution: {integrity: sha512-+WzIXQOSaGs33tLEgYPYe/yQHf0WTU0X42Jca3y8NWMbUVhp7rUnw+vAsRC/QiDrdD31IszMrZy+qwPOPjd+rw==} - engines: {node: '>=18'} - cpu: [arm64] - os: [android] - '@esbuild/android-arm@0.25.12': resolution: {integrity: sha512-VJ+sKvNA/GE7Ccacc9Cha7bpS8nyzVv0jdVgwNDaR4gDMC/2TTRc33Ip8qrNYUcpkOHUT5OZ0bUcNNVZQ9RLlg==} engines: {node: '>=18'} cpu: [arm] os: [android] - '@esbuild/android-arm@0.28.0': - resolution: {integrity: sha512-wqh0ByljabXLKHeWXYLqoJ5jKC4XBaw6Hk08OfMrCRd2nP2ZQ5eleDZC41XHyCNgktBGYMbqnrJKq/K/lzPMSQ==} - engines: {node: '>=18'} - cpu: [arm] - os: [android] - '@esbuild/android-x64@0.25.12': resolution: {integrity: sha512-5jbb+2hhDHx5phYR2By8GTWEzn6I9UqR11Kwf22iKbNpYrsmRB18aX/9ivc5cabcUiAT/wM+YIZ6SG9QO6a8kg==} engines: {node: '>=18'} cpu: [x64] os: [android] - '@esbuild/android-x64@0.28.0': - resolution: {integrity: sha512-+VJggoaKhk2VNNqVL7f6S189UzShHC/mR9EE8rDdSkdpN0KflSwWY/gWjDrNxxisg8Fp1ZCD9jLMo4m0OUfeUA==} - engines: {node: '>=18'} - cpu: [x64] - os: [android] - '@esbuild/darwin-arm64@0.25.12': resolution: {integrity: sha512-N3zl+lxHCifgIlcMUP5016ESkeQjLj/959RxxNYIthIg+CQHInujFuXeWbWMgnTo4cp5XVHqFPmpyu9J65C1Yg==} engines: {node: '>=18'} cpu: [arm64] os: [darwin] - '@esbuild/darwin-arm64@0.28.0': - resolution: {integrity: sha512-0T+A9WZm+bZ84nZBtk1ckYsOvyA3x7e2Acj1KdVfV4/2tdG4fzUp91YHx+GArWLtwqp77pBXVCPn2We7Letr0Q==} - engines: {node: '>=18'} - cpu: [arm64] - os: [darwin] - '@esbuild/darwin-x64@0.25.12': resolution: {integrity: sha512-HQ9ka4Kx21qHXwtlTUVbKJOAnmG1ipXhdWTmNXiPzPfWKpXqASVcWdnf2bnL73wgjNrFXAa3yYvBSd9pzfEIpA==} engines: {node: '>=18'} cpu: [x64] os: [darwin] - '@esbuild/darwin-x64@0.28.0': - resolution: {integrity: sha512-fyzLm/DLDl/84OCfp2f/XQ4flmORsjU7VKt8HLjvIXChJoFFOIL6pLJPH4Yhd1n1gGFF9mPwtlN5Wf82DZs+LQ==} - engines: {node: '>=18'} - cpu: [x64] - os: [darwin] - '@esbuild/freebsd-arm64@0.25.12': resolution: {integrity: 
sha512-gA0Bx759+7Jve03K1S0vkOu5Lg/85dou3EseOGUes8flVOGxbhDDh/iZaoek11Y8mtyKPGF3vP8XhnkDEAmzeg==} engines: {node: '>=18'} cpu: [arm64] os: [freebsd] - '@esbuild/freebsd-arm64@0.28.0': - resolution: {integrity: sha512-l9GeW5UZBT9k9brBYI+0WDffcRxgHQD8ShN2Ur4xWq/NFzUKm3k5lsH4PdaRgb2w7mI9u61nr2gI2mLI27Nh3Q==} - engines: {node: '>=18'} - cpu: [arm64] - os: [freebsd] - '@esbuild/freebsd-x64@0.25.12': resolution: {integrity: sha512-TGbO26Yw2xsHzxtbVFGEXBFH0FRAP7gtcPE7P5yP7wGy7cXK2oO7RyOhL5NLiqTlBh47XhmIUXuGciXEqYFfBQ==} engines: {node: '>=18'} cpu: [x64] os: [freebsd] - '@esbuild/freebsd-x64@0.28.0': - resolution: {integrity: sha512-BXoQai/A0wPO6Es3yFJ7APCiKGc1tdAEOgeTNy3SsB491S3aHn4S4r3e976eUnPdU+NbdtmBuLncYir2tMU9Nw==} - engines: {node: '>=18'} - cpu: [x64] - os: [freebsd] - '@esbuild/linux-arm64@0.25.12': resolution: {integrity: sha512-8bwX7a8FghIgrupcxb4aUmYDLp8pX06rGh5HqDT7bB+8Rdells6mHvrFHHW2JAOPZUbnjUpKTLg6ECyzvas2AQ==} engines: {node: '>=18'} cpu: [arm64] os: [linux] - '@esbuild/linux-arm64@0.28.0': - resolution: {integrity: sha512-RVyzfb3FWsGA55n6WY0MEIEPURL1FcbhFE6BffZEMEekfCzCIMtB5yyDcFnVbTnwk+CLAgTujmV/Lgvih56W+A==} - engines: {node: '>=18'} - cpu: [arm64] - os: [linux] - '@esbuild/linux-arm@0.25.12': resolution: {integrity: sha512-lPDGyC1JPDou8kGcywY0YILzWlhhnRjdof3UlcoqYmS9El818LLfJJc3PXXgZHrHCAKs/Z2SeZtDJr5MrkxtOw==} engines: {node: '>=18'} cpu: [arm] os: [linux] - '@esbuild/linux-arm@0.28.0': - resolution: {integrity: sha512-CjaaREJagqJp7iTaNQjjidaNbCKYcd4IDkzbwwxtSvjI7NZm79qiHc8HqciMddQ6CKvJT6aBd8lO9kN/ZudLlw==} - engines: {node: '>=18'} - cpu: [arm] - os: [linux] - '@esbuild/linux-ia32@0.25.12': resolution: {integrity: sha512-0y9KrdVnbMM2/vG8KfU0byhUN+EFCny9+8g202gYqSSVMonbsCfLjUO+rCci7pM0WBEtz+oK/PIwHkzxkyharA==} engines: {node: '>=18'} cpu: [ia32] os: [linux] - '@esbuild/linux-ia32@0.28.0': - resolution: {integrity: sha512-KBnSTt1kxl9x70q+ydterVdl+Cn0H18ngRMRCEQfrbqdUuntQQ0LoMZv47uB97NljZFzY6HcfqEZ2SAyIUTQBQ==} - engines: {node: '>=18'} - cpu: [ia32] - 
os: [linux] - '@esbuild/linux-loong64@0.25.12': resolution: {integrity: sha512-h///Lr5a9rib/v1GGqXVGzjL4TMvVTv+s1DPoxQdz7l/AYv6LDSxdIwzxkrPW438oUXiDtwM10o9PmwS/6Z0Ng==} engines: {node: '>=18'} cpu: [loong64] os: [linux] - '@esbuild/linux-loong64@0.28.0': - resolution: {integrity: sha512-zpSlUce1mnxzgBADvxKXX5sl8aYQHo2ezvMNI8I0lbblJtp8V4odlm3Yzlj7gPyt3T8ReksE6bK+pT3WD+aJRg==} - engines: {node: '>=18'} - cpu: [loong64] - os: [linux] - '@esbuild/linux-mips64el@0.25.12': resolution: {integrity: sha512-iyRrM1Pzy9GFMDLsXn1iHUm18nhKnNMWscjmp4+hpafcZjrr2WbT//d20xaGljXDBYHqRcl8HnxbX6uaA/eGVw==} engines: {node: '>=18'} cpu: [mips64el] os: [linux] - '@esbuild/linux-mips64el@0.28.0': - resolution: {integrity: sha512-2jIfP6mmjkdmeTlsX/9vmdmhBmKADrWqN7zcdtHIeNSCH1SqIoNI63cYsjQR8J+wGa4Y5izRcSHSm8K3QWmk3w==} - engines: {node: '>=18'} - cpu: [mips64el] - os: [linux] - '@esbuild/linux-ppc64@0.25.12': resolution: {integrity: sha512-9meM/lRXxMi5PSUqEXRCtVjEZBGwB7P/D4yT8UG/mwIdze2aV4Vo6U5gD3+RsoHXKkHCfSxZKzmDssVlRj1QQA==} engines: {node: '>=18'} cpu: [ppc64] os: [linux] - '@esbuild/linux-ppc64@0.28.0': - resolution: {integrity: sha512-bc0FE9wWeC0WBm49IQMPSPILRocGTQt3j5KPCA8os6VprfuJ7KD+5PzESSrJ6GmPIPJK965ZJHTUlSA6GNYEhg==} - engines: {node: '>=18'} - cpu: [ppc64] - os: [linux] - '@esbuild/linux-riscv64@0.25.12': resolution: {integrity: sha512-Zr7KR4hgKUpWAwb1f3o5ygT04MzqVrGEGXGLnj15YQDJErYu/BGg+wmFlIDOdJp0PmB0lLvxFIOXZgFRrdjR0w==} engines: {node: '>=18'} cpu: [riscv64] os: [linux] - '@esbuild/linux-riscv64@0.28.0': - resolution: {integrity: sha512-SQPZOwoTTT/HXFXQJG/vBX8sOFagGqvZyXcgLA3NhIqcBv1BJU1d46c0rGcrij2B56Z2rNiSLaZOYW5cUk7yLQ==} - engines: {node: '>=18'} - cpu: [riscv64] - os: [linux] - '@esbuild/linux-s390x@0.25.12': resolution: {integrity: sha512-MsKncOcgTNvdtiISc/jZs/Zf8d0cl/t3gYWX8J9ubBnVOwlk65UIEEvgBORTiljloIWnBzLs4qhzPkJcitIzIg==} engines: {node: '>=18'} cpu: [s390x] os: [linux] - '@esbuild/linux-s390x@0.28.0': - resolution: {integrity: 
sha512-SCfR0HN8CEEjnYnySJTd2cw0k9OHB/YFzt5zgJEwa+wL/T/raGWYMBqwDNAC6dqFKmJYZoQBRfHjgwLHGSrn3Q==} - engines: {node: '>=18'} - cpu: [s390x] - os: [linux] - '@esbuild/linux-x64@0.25.12': resolution: {integrity: sha512-uqZMTLr/zR/ed4jIGnwSLkaHmPjOjJvnm6TVVitAa08SLS9Z0VM8wIRx7gWbJB5/J54YuIMInDquWyYvQLZkgw==} engines: {node: '>=18'} cpu: [x64] os: [linux] - '@esbuild/linux-x64@0.28.0': - resolution: {integrity: sha512-us0dSb9iFxIi8srnpl931Nvs65it/Jd2a2K3qs7fz2WfGPHqzfzZTfec7oxZJRNPXPnNYZtanmRc4AL/JwVzHQ==} - engines: {node: '>=18'} - cpu: [x64] - os: [linux] - '@esbuild/netbsd-arm64@0.25.12': resolution: {integrity: sha512-xXwcTq4GhRM7J9A8Gv5boanHhRa/Q9KLVmcyXHCTaM4wKfIpWkdXiMog/KsnxzJ0A1+nD+zoecuzqPmCRyBGjg==} engines: {node: '>=18'} cpu: [arm64] os: [netbsd] - '@esbuild/netbsd-arm64@0.28.0': - resolution: {integrity: sha512-CR/RYotgtCKwtftMwJlUU7xCVNg3lMYZ0RzTmAHSfLCXw3NtZtNpswLEj/Kkf6kEL3Gw+BpOekRX0BYCtklhUw==} - engines: {node: '>=18'} - cpu: [arm64] - os: [netbsd] - '@esbuild/netbsd-x64@0.25.12': resolution: {integrity: sha512-Ld5pTlzPy3YwGec4OuHh1aCVCRvOXdH8DgRjfDy/oumVovmuSzWfnSJg+VtakB9Cm0gxNO9BzWkj6mtO1FMXkQ==} engines: {node: '>=18'} cpu: [x64] os: [netbsd] - '@esbuild/netbsd-x64@0.28.0': - resolution: {integrity: sha512-nU1yhmYutL+fQ71Kxnhg8uEOdC0pwEW9entHykTgEbna2pw2dkbFSMeqjjyHZoCmt8SBkOSvV+yNmm94aUrrqw==} - engines: {node: '>=18'} - cpu: [x64] - os: [netbsd] - '@esbuild/openbsd-arm64@0.25.12': resolution: {integrity: sha512-fF96T6KsBo/pkQI950FARU9apGNTSlZGsv1jZBAlcLL1MLjLNIWPBkj5NlSz8aAzYKg+eNqknrUJ24QBybeR5A==} engines: {node: '>=18'} cpu: [arm64] os: [openbsd] - '@esbuild/openbsd-arm64@0.28.0': - resolution: {integrity: sha512-cXb5vApOsRsxsEl4mcZ1XY3D4DzcoMxR/nnc4IyqYs0rTI8ZKmW6kyyg+11Z8yvgMfAEldKzP7AdP64HnSC/6g==} - engines: {node: '>=18'} - cpu: [arm64] - os: [openbsd] - '@esbuild/openbsd-x64@0.25.12': resolution: {integrity: sha512-MZyXUkZHjQxUvzK7rN8DJ3SRmrVrke8ZyRusHlP+kuwqTcfWLyqMOE3sScPPyeIXN/mDJIfGXvcMqCgYKekoQw==} engines: {node: '>=18'} cpu: 
[x64] os: [openbsd] - '@esbuild/openbsd-x64@0.28.0': - resolution: {integrity: sha512-8wZM2qqtv9UP3mzy7HiGYNH/zjTA355mpeuA+859TyR+e+Tc08IHYpLJuMsfpDJwoLo1ikIJI8jC3GFjnRClzA==} - engines: {node: '>=18'} - cpu: [x64] - os: [openbsd] - '@esbuild/openharmony-arm64@0.25.12': resolution: {integrity: sha512-rm0YWsqUSRrjncSXGA7Zv78Nbnw4XL6/dzr20cyrQf7ZmRcsovpcRBdhD43Nuk3y7XIoW2OxMVvwuRvk9XdASg==} engines: {node: '>=18'} cpu: [arm64] os: [openharmony] - '@esbuild/openharmony-arm64@0.28.0': - resolution: {integrity: sha512-FLGfyizszcef5C3YtoyQDACyg95+dndv79i2EekILBofh5wpCa1KuBqOWKrEHZg3zrL3t5ouE5jgr94vA+Wb2w==} - engines: {node: '>=18'} - cpu: [arm64] - os: [openharmony] - '@esbuild/sunos-x64@0.25.12': resolution: {integrity: sha512-3wGSCDyuTHQUzt0nV7bocDy72r2lI33QL3gkDNGkod22EsYl04sMf0qLb8luNKTOmgF/eDEDP5BFNwoBKH441w==} engines: {node: '>=18'} cpu: [x64] os: [sunos] - '@esbuild/sunos-x64@0.28.0': - resolution: {integrity: sha512-1ZgjUoEdHZZl/YlV76TSCz9Hqj9h9YmMGAgAPYd+q4SicWNX3G5GCyx9uhQWSLcbvPW8Ni7lj4gDa1T40akdlw==} - engines: {node: '>=18'} - cpu: [x64] - os: [sunos] - '@esbuild/win32-arm64@0.25.12': resolution: {integrity: sha512-rMmLrur64A7+DKlnSuwqUdRKyd3UE7oPJZmnljqEptesKM8wx9J8gx5u0+9Pq0fQQW8vqeKebwNXdfOyP+8Bsg==} engines: {node: '>=18'} cpu: [arm64] os: [win32] - '@esbuild/win32-arm64@0.28.0': - resolution: {integrity: sha512-Q9StnDmQ/enxnpxCCLSg0oo4+34B9TdXpuyPeTedN/6+iXBJ4J+zwfQI28u/Jl40nOYAxGoNi7mFP40RUtkmUA==} - engines: {node: '>=18'} - cpu: [arm64] - os: [win32] - '@esbuild/win32-ia32@0.25.12': resolution: {integrity: sha512-HkqnmmBoCbCwxUKKNPBixiWDGCpQGVsrQfJoVGYLPT41XWF8lHuE5N6WhVia2n4o5QK5M4tYr21827fNhi4byQ==} engines: {node: '>=18'} cpu: [ia32] os: [win32] - '@esbuild/win32-ia32@0.28.0': - resolution: {integrity: sha512-zF3ag/gfiCe6U2iczcRzSYJKH1DCI+ByzSENHlM2FcDbEeo5Zd2C86Aq0tKUYAJJ1obRP84ymxIAksZUcdztHA==} - engines: {node: '>=18'} - cpu: [ia32] - os: [win32] - '@esbuild/win32-x64@0.25.12': resolution: {integrity: 
sha512-alJC0uCZpTFrSL0CCDjcgleBXPnCrEAhTBILpeAp7M/OFgoqtAetfBzX0xM00MUsVVPpVjlPuMbREqnZCXaTnA==} engines: {node: '>=18'} cpu: [x64] os: [win32] - '@esbuild/win32-x64@0.28.0': - resolution: {integrity: sha512-pEl1bO9mfAmIC+tW5btTmrKaujg3zGtUmWNdCw/xs70FBjwAL3o9OEKNHvNmnyylD6ubxUERiEhdsL0xBQ9efw==} - engines: {node: '>=18'} - cpu: [x64] - os: [win32] - '@floating-ui/core@1.7.5': resolution: {integrity: sha512-1Ih4WTWyw0+lKyFMcBHGbb5U5FtuHJuujoyyr5zTaWS5EYMeT6Jb2AuDeftsCsEuchO+mM2ij5+q9crhydzLhQ==} @@ -2600,11 +2444,6 @@ packages: engines: {node: '>=18'} hasBin: true - esbuild@0.28.0: - resolution: {integrity: sha512-sNR9MHpXSUV/XB4zmsFKN+QgVG82Cc7+/aaxJ8Adi8hyOac+EXptIp45QBPaVyX3N70664wRbTcLTOemCAnyqw==} - engines: {node: '>=18'} - hasBin: true - escalade@3.2.0: resolution: {integrity: sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==} engines: {node: '>=6'} @@ -5425,159 +5264,81 @@ snapshots: '@esbuild/aix-ppc64@0.25.12': optional: true - '@esbuild/aix-ppc64@0.28.0': - optional: true - '@esbuild/android-arm64@0.25.12': optional: true - '@esbuild/android-arm64@0.28.0': - optional: true - '@esbuild/android-arm@0.25.12': optional: true - '@esbuild/android-arm@0.28.0': - optional: true - '@esbuild/android-x64@0.25.12': optional: true - '@esbuild/android-x64@0.28.0': - optional: true - '@esbuild/darwin-arm64@0.25.12': optional: true - '@esbuild/darwin-arm64@0.28.0': - optional: true - '@esbuild/darwin-x64@0.25.12': optional: true - '@esbuild/darwin-x64@0.28.0': - optional: true - '@esbuild/freebsd-arm64@0.25.12': optional: true - '@esbuild/freebsd-arm64@0.28.0': - optional: true - '@esbuild/freebsd-x64@0.25.12': optional: true - '@esbuild/freebsd-x64@0.28.0': - optional: true - '@esbuild/linux-arm64@0.25.12': optional: true - '@esbuild/linux-arm64@0.28.0': - optional: true - '@esbuild/linux-arm@0.25.12': optional: true - '@esbuild/linux-arm@0.28.0': - optional: true - '@esbuild/linux-ia32@0.25.12': optional: true - 
'@esbuild/linux-ia32@0.28.0': - optional: true - '@esbuild/linux-loong64@0.25.12': optional: true - '@esbuild/linux-loong64@0.28.0': - optional: true - '@esbuild/linux-mips64el@0.25.12': optional: true - '@esbuild/linux-mips64el@0.28.0': - optional: true - '@esbuild/linux-ppc64@0.25.12': optional: true - '@esbuild/linux-ppc64@0.28.0': - optional: true - '@esbuild/linux-riscv64@0.25.12': optional: true - '@esbuild/linux-riscv64@0.28.0': - optional: true - '@esbuild/linux-s390x@0.25.12': optional: true - '@esbuild/linux-s390x@0.28.0': - optional: true - '@esbuild/linux-x64@0.25.12': optional: true - '@esbuild/linux-x64@0.28.0': - optional: true - '@esbuild/netbsd-arm64@0.25.12': optional: true - '@esbuild/netbsd-arm64@0.28.0': - optional: true - '@esbuild/netbsd-x64@0.25.12': optional: true - '@esbuild/netbsd-x64@0.28.0': - optional: true - '@esbuild/openbsd-arm64@0.25.12': optional: true - '@esbuild/openbsd-arm64@0.28.0': - optional: true - '@esbuild/openbsd-x64@0.25.12': optional: true - '@esbuild/openbsd-x64@0.28.0': - optional: true - '@esbuild/openharmony-arm64@0.25.12': optional: true - '@esbuild/openharmony-arm64@0.28.0': - optional: true - '@esbuild/sunos-x64@0.25.12': optional: true - '@esbuild/sunos-x64@0.28.0': - optional: true - '@esbuild/win32-arm64@0.25.12': optional: true - '@esbuild/win32-arm64@0.28.0': - optional: true - '@esbuild/win32-ia32@0.25.12': optional: true - '@esbuild/win32-ia32@0.28.0': - optional: true - '@esbuild/win32-x64@0.25.12': optional: true - '@esbuild/win32-x64@0.28.0': - optional: true - '@floating-ui/core@1.7.5': dependencies: '@floating-ui/utils': 0.2.11 @@ -7923,35 +7684,6 @@ snapshots: '@esbuild/win32-ia32': 0.25.12 '@esbuild/win32-x64': 0.25.12 - esbuild@0.28.0: - optionalDependencies: - '@esbuild/aix-ppc64': 0.28.0 - '@esbuild/android-arm': 0.28.0 - '@esbuild/android-arm64': 0.28.0 - '@esbuild/android-x64': 0.28.0 - '@esbuild/darwin-arm64': 0.28.0 - '@esbuild/darwin-x64': 0.28.0 - '@esbuild/freebsd-arm64': 0.28.0 - 
'@esbuild/freebsd-x64': 0.28.0 - '@esbuild/linux-arm': 0.28.0 - '@esbuild/linux-arm64': 0.28.0 - '@esbuild/linux-ia32': 0.28.0 - '@esbuild/linux-loong64': 0.28.0 - '@esbuild/linux-mips64el': 0.28.0 - '@esbuild/linux-ppc64': 0.28.0 - '@esbuild/linux-riscv64': 0.28.0 - '@esbuild/linux-s390x': 0.28.0 - '@esbuild/linux-x64': 0.28.0 - '@esbuild/netbsd-arm64': 0.28.0 - '@esbuild/netbsd-x64': 0.28.0 - '@esbuild/openbsd-arm64': 0.28.0 - '@esbuild/openbsd-x64': 0.28.0 - '@esbuild/openharmony-arm64': 0.28.0 - '@esbuild/sunos-x64': 0.28.0 - '@esbuild/win32-arm64': 0.28.0 - '@esbuild/win32-ia32': 0.28.0 - '@esbuild/win32-x64': 0.28.0 - escalade@3.2.0: {} escape-html@1.0.3: {} From f705338f37d1f1621ac0213a3b241fc951493433 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 2 May 2026 21:27:35 +0900 Subject: [PATCH 02/55] Implement callback hot-swapping and enhance cleanup logic for graceful shutdown - Introduced `replaceCallbacks` method in the Trainer API to allow dynamic updates of lifecycle callbacks during training runs. - Enhanced signal handling for graceful early stopping, ensuring in-flight checkpoints are preserved during HMR rebuilds. - Added `registerCleanupHook` for streamlined resource management on process exit, improving cleanup logic across development commands. - Updated documentation to reflect new features and usage patterns for better developer guidance. 
--- AGENTS.md | 11 +- docs/concepts/studio.mdx | 5 +- docs/ja/concepts/studio.mdx | 5 +- docs/ja/sdk/trainer-control.mdx | 15 +- docs/sdk/trainer-control.mdx | 15 +- packages/arkor/src/cli/cleanupHooks.ts | 62 ++++++ packages/arkor/src/cli/commands/build.ts | 55 ++--- packages/arkor/src/cli/commands/dev.ts | 51 ++--- packages/arkor/src/core/arkor.test.ts | 1 + packages/arkor/src/core/configHash.test.ts | 51 +++++ packages/arkor/src/core/configHash.ts | 36 ++++ packages/arkor/src/core/rolldownConfig.ts | 84 ++++++++ packages/arkor/src/core/runner.test.ts | 1 + packages/arkor/src/core/runner.ts | 55 +---- packages/arkor/src/core/runnerSignals.test.ts | 193 ++++++++++++++++++ packages/arkor/src/core/runnerSignals.ts | 124 +++++++++++ packages/arkor/src/core/trainer.test.ts | 83 ++++++++ packages/arkor/src/core/trainer.ts | 28 ++- packages/arkor/src/core/trainerInspection.ts | 75 +++++++ packages/arkor/src/core/types.ts | 12 ++ packages/arkor/src/studio/hmr.ts | 130 +++++++----- packages/arkor/src/studio/manifest.ts | 55 +++-- packages/arkor/src/studio/server.ts | 82 ++++---- .../arkor/src/studio/trainRegistry.test.ts | 118 +++++++++++ packages/arkor/src/studio/trainRegistry.ts | 117 +++++++++++ .../studio-app/src/components/RunTraining.tsx | 13 +- packages/studio-app/src/lib/api.ts | 11 + 27 files changed, 1259 insertions(+), 229 deletions(-) create mode 100644 packages/arkor/src/cli/cleanupHooks.ts create mode 100644 packages/arkor/src/core/configHash.test.ts create mode 100644 packages/arkor/src/core/configHash.ts create mode 100644 packages/arkor/src/core/rolldownConfig.ts create mode 100644 packages/arkor/src/core/runnerSignals.test.ts create mode 100644 packages/arkor/src/core/runnerSignals.ts create mode 100644 packages/arkor/src/core/trainerInspection.ts create mode 100644 packages/arkor/src/studio/trainRegistry.test.ts create mode 100644 packages/arkor/src/studio/trainRegistry.ts diff --git a/AGENTS.md b/AGENTS.md index 60960f68..dce272be 100644 --- 
a/AGENTS.md +++ b/AGENTS.md @@ -73,13 +73,16 @@ The whole point: prevents another browser tab on the same machine from POSTing ` When touching the Studio server or SPA fetch layer, preserve: token via header for `fetch`, query param for `EventSource`, host-header guard, no CORS, timing-safe compare. The Vite plugin is dev-only (`apply: "serve"`) — running it during `vite build` would bake a stale per-launch token into the production `index.html` and shadow the runtime tag, causing every `/api/*` call to 403. -### HMR + graceful early-stop +### HMR + graceful early-stop + callback hot-swap -`arkor dev` keeps a [Rolldown](https://rolldown.rs) watcher over `src/arkor/` ([packages/arkor/src/studio/hmr.ts](packages/arkor/src/studio/hmr.ts)) and pushes rebuild events over `/api/dev/events` (SSE). The SPA re-fetches `/api/manifest` on each event so the Run Training button stays in sync without a browser refresh. +`arkor dev` keeps a [Rolldown](https://rolldown.rs) watcher over `src/arkor/` ([packages/arkor/src/studio/hmr.ts](packages/arkor/src/studio/hmr.ts)) and pushes rebuild events over `/api/dev/events` (SSE). On each successful build the watcher dynamic-imports the artifact, pulls a `TrainerInspection` snapshot off the discovered trainer (via the cross-realm `Symbol.for("arkor.trainer.inspect")` brand attached in [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts)), and computes a stable `configHash` from the cloud-side `JobConfig`. The SPA re-fetches `/api/manifest` on each event so the Run Training button stays in sync without a browser refresh. -When a rebuild lands while a `/api/train`-spawned subprocess is in flight, the server SIGTERM's the child. The child's signal handler in `runTrainer` calls `Trainer.requestEarlyStop()`, which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. 
The SPA then auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. +When a rebuild lands while a `/api/train`-spawned subprocess is in flight, the server makes a per-child decision in [packages/arkor/src/studio/trainRegistry.ts](packages/arkor/src/studio/trainRegistry.ts): -Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. The hard kill timer in `requestEarlyStopOnActive` exists only as a stuck-process fallback. +- **`configHash` matches the spawn-time hash** → SIGUSR2. The child's `installCallbackReloadHandler` re-imports the artifact and calls `Trainer.replaceCallbacks` in place. The cloud-side run is untouched. Use this whenever a code change is contained inside the `callbacks: { ... }` object. +- **`configHash` differs (or is null because the new bundle didn't inspect)** → SIGTERM. `installShutdownHandlers` calls `Trainer.requestEarlyStop()`, which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. + +Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. Don't widen the SIGUSR2 path to "always hot-swap, server-side": the `configHash` check is what guarantees a hot-swap can't silently leave a child running with a stale `JobConfig`. 
### Project entry-point discovery diff --git a/docs/concepts/studio.mdx b/docs/concepts/studio.mdx index 3ef52cf8..6d186b73 100644 --- a/docs/concepts/studio.mdx +++ b/docs/concepts/studio.mdx @@ -13,7 +13,10 @@ Three jobs: 2. **See training happen.** A jobs list with live status, a loss chart that updates as the run streams in, and a tail of training events. You can leave it open in a tab while you work on other things. 3. **Try a finished model.** A Playground page lets you pick the base model or the final adapter from any completed job and chat with it. The Playground does not load intermediate checkpoints; for mid-run inference, use [`onCheckpoint`](/concepts/lifecycle) callbacks in your trainer. -A note on the dev loop: Studio runs a [Rolldown](https://rolldown.rs) watcher over `src/arkor/` and pushes rebuild notifications to the SPA over a Server-Sent Events stream (`/api/dev/events`). Edit a file, save, and the Run training button updates with the new trainer name without a refresh. If a training run is in flight, the Studio asks it to early-stop at the next checkpoint (`Trainer.requestEarlyStop()`) so the work isn't wasted, then re-spawns the run with the rebuilt artifact. The Cloud-side job for the previous run reaches `cancelled` after the checkpoint is uploaded. +A note on the dev loop: Studio runs a [Rolldown](https://rolldown.rs) watcher over `src/arkor/` and pushes rebuild notifications to the SPA over a Server-Sent Events stream (`/api/dev/events`). Edit a file, save, and the Run training button updates with the new trainer name without a refresh. If a training run is in flight, the Studio compares the new bundle's cloud-side `JobConfig` hash to the one captured when the run was spawned: + +- **Same hash (only callbacks changed).** The runner is signalled with SIGUSR2; it re-imports the rebuilt artifact and calls `Trainer.replaceCallbacks` in place. 
The cloud-side training run is untouched, no GPU time is wasted, and the SPA shows a brief "Callbacks hot-swapped" indicator. +- **Different hash (model / dataset / hyperparameters changed).** The runner is signalled with SIGTERM; `Trainer.requestEarlyStop` lets the next checkpoint upload finish before issuing `cancel()`, then the SPA re-spawns the run with the rebuilt artifact. The previous Cloud-side job reaches `cancelled` after the checkpoint is uploaded, so the partial work is preserved as an artifact. ## Where Studio runs diff --git a/docs/ja/concepts/studio.mdx b/docs/ja/concepts/studio.mdx index b8ae40ff..a60df962 100644 --- a/docs/ja/concepts/studio.mdx +++ b/docs/ja/concepts/studio.mdx @@ -13,7 +13,10 @@ Studio は `arkor dev` 実行時に得られるローカル Web UI です。サ 2. **学習を眺める。** ライブステータス付きのジョブ一覧、ストリーム到着とともに更新される Loss チャート、学習イベントのテール。タブで開きっぱなしにして他の作業ができます。 3. **完成モデルを試す。** Playground ページでベースモデルや任意の完了ジョブの最終アダプタを選んでチャットできます。中間チェックポイントは Playground からはロードしません。学習中の推論には [`onCheckpoint`](/ja/concepts/lifecycle) コールバックをトレーナーで使ってください。 -dev ループのメモ: Studio は [Rolldown](https://rolldown.rs) のウォッチャを `src/arkor/` 上で常駐させ、再ビルド通知を Server-Sent Events ストリーム (`/api/dev/events`) で SPA に push します。ファイルを編集して保存すれば、Run training ボタンのトレーナー名表示はリロード無しで更新されます。学習が走っている最中であれば、Studio はそのジョブに次のチェックポイントで Early Stopping を要求し(`Trainer.requestEarlyStop()`、ここまでの学習成果は保全)、再ビルドした成果物で自動的に再投入します。Cloud 側の以前のジョブはチェックポイントのアップロード完了後に `cancelled` 状態に遷移します。 +dev ループのメモ: Studio は [Rolldown](https://rolldown.rs) のウォッチャを `src/arkor/` 上で常駐させ、再ビルド通知を Server-Sent Events ストリーム (`/api/dev/events`) で SPA に push します。ファイルを編集して保存すれば、Run training ボタンのトレーナー名表示はリロード無しで更新されます。学習が走っている最中であれば、Studio は再ビルドしたバンドルの Cloud 側 `JobConfig` ハッシュを、spawn 時に保存したハッシュと比較します。 + +- **ハッシュ一致(コールバックのみ変更)。** ランナーへ SIGUSR2 を送ります。ランナーは再ビルドされた成果物を再 import し、その場で `Trainer.replaceCallbacks` を呼びます。Cloud 側の学習はそのまま継続し、GPU 時間を無駄にせず、SPA には "Callbacks hot-swapped" と短く表示されます。 +- **ハッシュ不一致(モデル / データセット / ハイパーパラメータが変わった)。** ランナーへ SIGTERM を送ります。`Trainer.requestEarlyStop` 
が次のチェックポイントのアップロードを待ってから `cancel()` を発火し、SPA が再ビルドした成果物で再投入します。Cloud 側の以前のジョブはチェックポイントのアップロード完了後に `cancelled` 状態に遷移するので、ここまでの学習成果は artifact として保全されます。 ## Studio が動く場所 diff --git a/docs/ja/sdk/trainer-control.mdx b/docs/ja/sdk/trainer-control.mdx index 8683d3d5..2c02d59c 100644 --- a/docs/ja/sdk/trainer-control.mdx +++ b/docs/ja/sdk/trainer-control.mdx @@ -5,7 +5,7 @@ description: "start、wait、cancel、abortSignal、再接続の仕組み。" # トレーナー制御 -`createTrainer` は次の 4 メソッドを持つ `Trainer` オブジェクトを返します: +`createTrainer` は次の 5 メソッドを持つ `Trainer` オブジェクトを返します: ```ts interface Trainer { @@ -14,6 +14,7 @@ interface Trainer { wait(): Promise<TrainingResult>; cancel(): Promise<void>; requestEarlyStop(opts?: { timeoutMs?: number }): Promise<void>; + replaceCallbacks(callbacks: Partial<TrainerCallbacks>): void; } interface TrainingResult { @@ -74,6 +75,18 @@ await trainer.requestEarlyStop({ timeoutMs: 60_000 }); 自前のコード(プログラム的な two-process パターンなど)から `requestEarlyStop()` を直接呼ぶこともできます。Cookbook の [Early Stopping](/ja/cookbook/early-stopping) レシピが `onCheckpoint` + `abortSignal` + `cancel()` で組み立てているのと同じ「実行中ステップを捨てずに止める」セマンティクスを、ワンショットで提供します。レシピ版の方が柔軟(メトリクス次第で abort のタイミングを決めるなど)ですが、こちらは「次のチェックポイントで止める」という典型ケースの便利フックです。 +## `replaceCallbacks()` + +```ts +trainer.replaceCallbacks({ + onLog: ({ step, loss }) => myMetrics.record(step, loss), +}); +``` + +実行中の run のままライフサイクルコールバックを atomic に差し替えます。次にディスパッチされるイベント(`onLog` / `onCheckpoint` …)は新しいオブジェクトから読みます。すでに `await` 中のハンドラは resolve するまで古い参照を保持します。Cloud 側の config(モデル、データセット、ハイパーパラメータ)は `start()` 時点で確定しており、このメソッド経由では **変更できません** — それらを変えたい場合は `requestEarlyStop()` を呼んで再投入してください。 + +これは `arkor dev` の「コールバックのみ HMR」パスで使われている SDK プリミティブです。実行中にソースを保存すると Studio は再ビルドした `JobConfig` のハッシュを spawn 時に保存したハッシュと比較します。一致 → SIGUSR2 → ランナーが再 import して `replaceCallbacks()` を呼ぶ(Cloud 側の学習は無傷)。不一致 → SIGTERM → 既存の `requestEarlyStop()` 経路に切り替わって、新しいバンドルで SPA が再投入します。 ## `abortSignal` ```ts diff --git a/docs/sdk/trainer-control.mdx b/docs/sdk/trainer-control.mdx index c43a404b..3f664cd2 100644 ---
a/docs/sdk/trainer-control.mdx +++ b/docs/sdk/trainer-control.mdx @@ -5,7 +5,7 @@ description: "start, wait, cancel, abortSignal, and how reconnects work." # Trainer control -`createTrainer` returns a `Trainer` object with four methods: +`createTrainer` returns a `Trainer` object with five methods: ```ts interface Trainer { @@ -14,6 +14,7 @@ interface Trainer { wait(): Promise<TrainingResult>; cancel(): Promise<void>; requestEarlyStop(opts?: { timeoutMs?: number }): Promise<void>; + replaceCallbacks(callbacks: Partial<TrainerCallbacks>): void; } interface TrainingResult { @@ -74,6 +75,18 @@ This is what `arkor dev`'s HMR pipeline uses internally: when you save a source You can use `requestEarlyStop()` directly from your own code (e.g. in a programmatic two-process pattern) if you want the same "stop, but don't throw away the in-flight step" semantics that the cookbook's [Early stopping](/cookbook/early-stopping) recipe builds out of `onCheckpoint` + `abortSignal` + `cancel()`. The recipe is more flexible (you decide when to abort based on metrics); this method is the convenience hook for the common "stop after the next checkpoint" case. +## `replaceCallbacks()` + +```ts +trainer.replaceCallbacks({ + onLog: ({ step, loss }) => myMetrics.record(step, loss), +}); +``` + +Atomically swap the lifecycle callbacks while a run is in flight. The next dispatched event (`onLog`, `onCheckpoint`, ...) reads from the new object; events already mid-`await` keep their old reference until they resolve. Cloud-side config (model, dataset, hyperparameters) is fixed at `start()` time and **cannot** be changed via this method — for those, use `requestEarlyStop()` and re-spawn. + +This is the SDK primitive `arkor dev` uses for the "callback-only HMR" path: when you save a source file mid-run, Studio diffs the rebuilt `JobConfig` against the spawn-time hash. Equal hashes → SIGUSR2, the runner re-imports and calls `replaceCallbacks()`, the cloud-side training run is untouched.
Different hashes → SIGTERM, the existing `requestEarlyStop()` flow takes over and the SPA re-spawns with the new bundle. + ## `abortSignal` ```ts diff --git a/packages/arkor/src/cli/cleanupHooks.ts b/packages/arkor/src/cli/cleanupHooks.ts new file mode 100644 index 00000000..473bcbc7 --- /dev/null +++ b/packages/arkor/src/cli/cleanupHooks.ts @@ -0,0 +1,62 @@ +const TERMINATING_SIGNALS = ["SIGINT", "SIGTERM", "SIGHUP"] as const; + +export interface CleanupHookOptions { + /** + * Idempotent cleanup body. Wrapped with a `done` guard so a noisy + * shutdown (signal arriving while `process.exit` is already running an + * `exit` listener) doesn't trigger a double-cleanup. + */ + cleanup: () => void | Promise; + /** + * Whether the signal-handler arm of the registration should also call + * `process.exit(0)` after cleanup. Use `true` for the outermost + * cleanup responsible for terminating the process; `false` for inner + * cleanups that should pass control through to a sibling exit + * handler. Default: `false`. + */ + exitOnSignal?: boolean; +} + +/** + * Register a cleanup hook that fires on `process.exit` and on + * SIGINT / SIGTERM / SIGHUP. Used by `runDev` to dispose long-lived + * resources (the studio-token file, the HMR coordinator) without each + * call site re-implementing the same idempotent-guard + per-signal + * registration boilerplate. + * + * Registration order matters: Node fires listeners in the order they + * were attached, so the *first* `registerCleanupHook` call gets to run + * before subsequent ones. The Studio dev launcher relies on this to + * guarantee that "tear down HMR" lands before "remove studio-token". + */ +export function registerCleanupHook(options: CleanupHookOptions): void { + let done = false; + // Synchronous wrapper so signal handlers preserve "cleanup landed + // before this function returns" — important for sync cleanups (e.g. + // `unlinkSync`) and for tests that assert the side effect right after + // invoking the handler. 
Async cleanups are fire-and-forget with a + // catch so a hung dispose doesn't block exit. + const run = (): void => { + if (done) return; + done = true; + try { + const result = options.cleanup(); + if (result && typeof (result as Promise).catch === "function") { + (result as Promise).catch(() => { + // best-effort: shutdown is racing other cleanup paths + }); + } + } catch { + // best-effort + } + }; + + process.on("exit", run); + + for (const sig of TERMINATING_SIGNALS) { + process.on(sig, () => { + run(); + if (options.exitOnSignal) process.exit(0); + }); + } +} diff --git a/packages/arkor/src/cli/commands/build.ts b/packages/arkor/src/cli/commands/build.ts index 3d555c59..c4609039 100644 --- a/packages/arkor/src/cli/commands/build.ts +++ b/packages/arkor/src/cli/commands/build.ts @@ -1,7 +1,12 @@ import { existsSync } from "node:fs"; import { mkdir } from "node:fs/promises"; -import { isAbsolute, relative, resolve } from "node:path"; +import { relative } from "node:path"; import { rolldown } from "rolldown"; +import { + BUILD_DEFAULTS, + resolveBuildEntry, + rolldownInputOptions, +} from "../../core/rolldownConfig"; import { ui } from "../prompts"; export interface BuildOptions { @@ -22,59 +27,25 @@ export interface BuildResult { outFile: string; } -const DEFAULT_ENTRY = "src/arkor/index.ts"; -const DEFAULT_OUT_DIR = ".arkor/build"; - -/** - * `node.` derived from the running Node binary. Build host and run - * host are effectively the same process: Studio spawns `arkor start` with - * `process.execPath`, so the bundle can target precisely what will execute it. - */ -function resolveNodeTarget(): string { - const [major = "22", minor = "6"] = process.versions.node.split("."); - return `node${major}.${minor}`; -} - /** * Bundle the user's `src/arkor/index.ts` into a single ESM artifact at * `.arkor/build/index.mjs`. 
* - * Bare specifiers (`arkor`, anything from `node_modules`) are kept external so - * the artifact resolves the runtime SDK from the project's installed copy. - * Relative imports are bundled inline. + * Bare specifiers (`arkor`, anything from `node_modules`) are kept external + * so the artifact resolves the runtime SDK from the project's installed + * copy. Relative imports are bundled inline. The transform target is + * derived from the running Node binary (see `resolveNodeTarget`). */ export async function runBuild(opts: BuildOptions = {}): Promise { - const cwd = opts.cwd ?? process.cwd(); - const entryRel = opts.entry ?? DEFAULT_ENTRY; - const entry = isAbsolute(entryRel) ? entryRel : resolve(cwd, entryRel); + const { cwd, entry, outDir, outFile } = resolveBuildEntry(opts); if (!existsSync(entry)) { throw new Error( - `Build entry not found: ${entry}. Create ${DEFAULT_ENTRY} or pass an explicit entry argument.`, + `Build entry not found: ${entry}. Create ${BUILD_DEFAULTS.entry} or pass an explicit entry argument.`, ); } - - const outDirRel = opts.outDir ?? DEFAULT_OUT_DIR; - const outDir = isAbsolute(outDirRel) ? outDirRel : resolve(cwd, outDirRel); await mkdir(outDir, { recursive: true }); - const outFile = resolve(outDir, "index.mjs"); - const bundle = await rolldown({ - input: entry, - cwd, - platform: "node", - logLevel: "warn", - transform: { target: resolveNodeTarget() }, - // Mirror esbuild's `packages: "external"`: any specifier that isn't a - // relative or absolute path stays external. `node:`-prefixed builtins are - // already handled by `platform: "node"` but we keep the explicit allow as - // a safety net in case the builtin set drifts. 
- external: (id, _importer, isResolved) => { - if (isResolved) return false; - if (id.startsWith(".")) return false; - if (isAbsolute(id)) return false; - return true; - }, - }); + const bundle = await rolldown(rolldownInputOptions({ cwd, entry })); try { await bundle.write({ file: outFile, format: "esm" }); } finally { diff --git a/packages/arkor/src/cli/commands/dev.ts b/packages/arkor/src/cli/commands/dev.ts index 2ba1c9b3..89dd7f5e 100644 --- a/packages/arkor/src/cli/commands/dev.ts +++ b/packages/arkor/src/cli/commands/dev.ts @@ -18,6 +18,7 @@ import { import { buildStudioApp } from "../../studio/server"; import { createHmrCoordinator } from "../../studio/hmr"; import { ANON_PERSISTENCE_NUDGE } from "../anonymous"; +import { registerCleanupHook } from "../cleanupHooks"; import { ui } from "../prompts"; export interface DevOptions { @@ -172,42 +173,26 @@ async function persistStudioToken(token: string): Promise { } function scheduleStudioTokenCleanup(path: string): void { - let cleaned = false; - const cleanup = () => { - if (cleaned) return; - cleaned = true; - try { - unlinkSync(path); - } catch { - // best-effort - } - }; - process.on("exit", cleanup); - for (const sig of ["SIGINT", "SIGTERM", "SIGHUP"] as const) { - process.on(sig, () => { - cleanup(); - process.exit(0); - }); - } + registerCleanupHook({ + cleanup: () => { + try { + unlinkSync(path); + } catch { + // best-effort + } + }, + // Outermost cleanup: responsible for terminating the process after + // all earlier-registered hooks (e.g. HMR dispose) have run. + exitOnSignal: true, + }); } function scheduleHmrCleanup(hmr: { dispose: () => Promise }): void { - let disposed = false; - const dispose = () => { - if (disposed) return; - disposed = true; - hmr.dispose().catch(() => { - // best-effort: shutdown is racing other cleanup paths - }); - }; - // Mirror `scheduleStudioTokenCleanup` exit hooks. 
Note that those handlers - // already call `process.exit(0)` for the same signals; this listener fires - // first because Node invokes signal handlers in registration order, so the - // dispose call lands before exit. - process.on("exit", dispose); - for (const sig of ["SIGINT", "SIGTERM", "SIGHUP"] as const) { - process.on(sig, dispose); - } + // Registered before the studio-token cleanup so it runs first on + // shutdown — Node fires signal handlers in registration order, and we + // want the watcher to release file handles before the outermost + // process.exit. + registerCleanupHook({ cleanup: () => hmr.dispose() }); } export async function runDev(options: DevOptions = {}): Promise { diff --git a/packages/arkor/src/core/arkor.test.ts b/packages/arkor/src/core/arkor.test.ts index d3dc41a7..0b353786 100644 --- a/packages/arkor/src/core/arkor.test.ts +++ b/packages/arkor/src/core/arkor.test.ts @@ -24,6 +24,7 @@ function fakeTrainer(name = "run"): Trainer { }, async cancel() {}, async requestEarlyStop() {}, + replaceCallbacks() {}, }; } diff --git a/packages/arkor/src/core/configHash.test.ts b/packages/arkor/src/core/configHash.test.ts new file mode 100644 index 00000000..def058ed --- /dev/null +++ b/packages/arkor/src/core/configHash.test.ts @@ -0,0 +1,51 @@ +import { describe, it, expect } from "vitest"; +import { hashJobConfig } from "./configHash"; +import type { JobConfig } from "./types"; + +describe("hashJobConfig", () => { + it("returns the same hash for key-order-equivalent configs", () => { + const a: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + maxSteps: 10, + learningRate: 1e-4, + }; + const b: JobConfig = { + learningRate: 1e-4, + maxSteps: 10, + datasetSource: { name: "x", type: "huggingface" }, + model: "m", + } as JobConfig; + expect(hashJobConfig(a)).toBe(hashJobConfig(b)); + }); + + it("returns different hashes for materially different configs", () => { + const base: JobConfig = { + model: "m", + 
datasetSource: { type: "huggingface", name: "x" }, + }; + expect(hashJobConfig(base)).not.toBe( + hashJobConfig({ ...base, model: "m2" }), + ); + expect(hashJobConfig(base)).not.toBe( + hashJobConfig({ + ...base, + datasetSource: { type: "huggingface", name: "y" }, + }), + ); + }); + + it("is order-stable for nested arrays (dataset format / split)", () => { + const a: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + datasetFormat: ["a", "b", "c"], + }; + const b: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + datasetFormat: ["a", "b", "c"], + }; + expect(hashJobConfig(a)).toBe(hashJobConfig(b)); + }); +}); diff --git a/packages/arkor/src/core/configHash.ts b/packages/arkor/src/core/configHash.ts new file mode 100644 index 00000000..eb8cda22 --- /dev/null +++ b/packages/arkor/src/core/configHash.ts @@ -0,0 +1,36 @@ +import { createHash } from "node:crypto"; +import type { JobConfig } from "./types"; + +/** + * Deterministic JSON serialiser: keys sorted at every nesting level so + * `{a:1, b:2}` and `{b:2, a:1}` produce the same string. Necessary because + * `JSON.stringify` follows insertion order, which isn't stable across + * `buildJobConfig` revisions or user-side spread-merge tricks. + */ +function stableStringify(value: unknown): string { + if (value === null || typeof value !== "object") return JSON.stringify(value); + if (Array.isArray(value)) { + return `[${value.map(stableStringify).join(",")}]`; + } + const keys = Object.keys(value as Record).sort(); + const parts = keys.map( + (k) => + `${JSON.stringify(k)}:${stableStringify( + (value as Record)[k], + )}`, + ); + return `{${parts.join(",")}}`; +} + +/** + * Stable fingerprint of a `JobConfig`. Used by HMR to decide whether a + * rebuild changed only the in-process callbacks (configHash unchanged → + * hot-swap) or the cloud-side training config (configHash changed → + * full restart with `requestEarlyStop`). 
+ */ +export function hashJobConfig(config: JobConfig): string { + return createHash("sha256") + .update(stableStringify(config)) + .digest("hex") + .slice(0, 16); +} diff --git a/packages/arkor/src/core/rolldownConfig.ts b/packages/arkor/src/core/rolldownConfig.ts new file mode 100644 index 00000000..3e687d13 --- /dev/null +++ b/packages/arkor/src/core/rolldownConfig.ts @@ -0,0 +1,84 @@ +import { isAbsolute, resolve } from "node:path"; +import type { InputOptions } from "rolldown"; + +const DEFAULT_ENTRY = "src/arkor/index.ts"; +const DEFAULT_OUT_DIR = ".arkor/build"; + +export interface BuildEntryOptions { + /** Source entry path; defaults to `src/arkor/index.ts`. */ + entry?: string; + /** Output directory; defaults to `.arkor/build`. */ + outDir?: string; + /** Project root; defaults to `process.cwd()`. */ + cwd?: string; +} + +export interface ResolvedBuildEntry { + /** Project root (absolute). */ + cwd: string; + /** Entry source file (absolute). */ + entry: string; + /** Output directory (absolute). */ + outDir: string; + /** Output bundle (absolute, always `<outDir>/index.mjs`). */ + outFile: string; +} + +/** Resolve `cwd` / `entry` / `outDir` to absolute paths with the standard defaults. */ +export function resolveBuildEntry(opts: BuildEntryOptions): ResolvedBuildEntry { + const cwd = opts.cwd ?? process.cwd(); + const entryRel = opts.entry ?? DEFAULT_ENTRY; + const entry = isAbsolute(entryRel) ? entryRel : resolve(cwd, entryRel); + const outDirRel = opts.outDir ?? DEFAULT_OUT_DIR; + const outDir = isAbsolute(outDirRel) ? outDirRel : resolve(cwd, outDirRel); + const outFile = resolve(outDir, "index.mjs"); + return { cwd, entry, outDir, outFile }; +} + +/** + * `node<major>.<minor>` derived from the running Node binary. Build host and + * run host are effectively the same process (Studio spawns `arkor start` with + * `process.execPath`), so the bundle can target precisely what will execute it.
+ */ +export function resolveNodeTarget(): string { + const [major = "22", minor = "6"] = process.versions.node.split("."); + return `node${major}.${minor}`; +} + +/** + * Build the shared rolldown options object used by both `runBuild` (one-shot) + * and the HMR coordinator (`watch()`). Centralising the configuration here + * keeps the two pipelines aligned: anything that affects the bundle shape — + * external resolution, transform target, platform — is set in one place so + * the artifact a watcher writes is byte-equivalent to a one-shot rebuild. + */ +export function rolldownInputOptions( + resolved: Pick, +): InputOptions { + return { + input: resolved.entry, + cwd: resolved.cwd, + platform: "node", + logLevel: "warn", + transform: { target: resolveNodeTarget() }, + // Mirror esbuild's `packages: "external"`: any specifier that isn't a + // relative or absolute path stays external. `node:`-prefixed builtins + // are already handled by `platform: "node"`; the explicit allow below + // is a safety net in case the builtin set drifts. + external: (id, _importer, isResolved) => { + if (isResolved) return false; + if (id.startsWith(".")) return false; + if (isAbsolute(id)) return false; + return true; + }, + }; +} + +/** + * Re-exported defaults so consumers (like error messages) can name the same + * paths we resolve internally. 
+ */ +export const BUILD_DEFAULTS = { + entry: DEFAULT_ENTRY, + outDir: DEFAULT_OUT_DIR, +} as const; diff --git a/packages/arkor/src/core/runner.test.ts b/packages/arkor/src/core/runner.test.ts index ee1667be..cde91c56 100644 --- a/packages/arkor/src/core/runner.test.ts +++ b/packages/arkor/src/core/runner.test.ts @@ -34,6 +34,7 @@ function fakeTrainer(onStart?: () => void, onWait?: () => void): Trainer { }, async cancel() {}, async requestEarlyStop() {}, + replaceCallbacks() {}, }; } diff --git a/packages/arkor/src/core/runner.ts b/packages/arkor/src/core/runner.ts index 1fc5b0b2..2d26c53b 100644 --- a/packages/arkor/src/core/runner.ts +++ b/packages/arkor/src/core/runner.ts @@ -2,6 +2,10 @@ import { existsSync } from "node:fs"; import { resolve, isAbsolute } from "node:path"; import { pathToFileURL } from "node:url"; import { isArkor } from "./arkor"; +import { + installCallbackReloadHandler, + installShutdownHandlers, +} from "./runnerSignals"; import type { Trainer } from "./types"; const DEFAULT_ENTRY = "src/arkor/index.ts"; @@ -42,51 +46,6 @@ function extractTrainer(mod: Record): Trainer { ); } -const SHUTDOWN_SIGNALS = ["SIGTERM", "SIGINT", "SIGHUP"] as const; - -/** - * Two-stage signal handling so HMR rebuilds (Studio sends SIGTERM) preserve - * the in-flight checkpoint work: - * - * - 1st signal → `trainer.requestEarlyStop()`. The trainer keeps running, - * lets the next `checkpoint.saved` event land, then issues `cancel()`. - * - 2nd signal → immediate `process.exit(143)`. Escape hatch for an - * impatient operator or a hung early-stop. - * - * The handlers are removed in `finally` so a normal `wait()` completion - * doesn't leave stale listeners behind — important because `runTrainer` can - * be called multiple times in tests within a single Node process. 
- */ -function installShutdownHandlers(trainer: Trainer): () => void { - let signalCount = 0; - const handler = (signal: NodeJS.Signals): void => { - signalCount += 1; - if (signalCount > 1) { - process.stdout.write( - `Received second ${signal}; exiting without waiting for checkpoint.\n`, - ); - process.exit(143); - // Explicit return so test mocks of process.exit (which don't actually - // terminate the worker) don't fall through into the early-stop path. - return; - } - process.stdout.write( - `Received ${signal}; early-stopping at next checkpoint…\n`, - ); - trainer - .requestEarlyStop() - .catch((err: unknown) => { - const msg = err instanceof Error ? err.message : String(err); - process.stderr.write(`requestEarlyStop failed: ${msg}\n`); - }) - .finally(() => process.exit(0)); - }; - for (const sig of SHUTDOWN_SIGNALS) process.on(sig, handler); - return () => { - for (const sig of SHUTDOWN_SIGNALS) process.off(sig, handler); - }; -} - export async function runTrainer(file?: string): Promise { const relative = file ?? DEFAULT_ENTRY; const abs = isAbsolute(relative) ? 
relative : resolve(process.cwd(), relative); @@ -98,7 +57,8 @@ export async function runTrainer(file?: string): Promise { const mod = (await import(pathToFileURL(abs).href)) as Record; const trainer = extractTrainer(mod); - const removeShutdownHandlers = installShutdownHandlers(trainer); + const removeShutdown = installShutdownHandlers(trainer); + const removeCallbackReload = installCallbackReloadHandler(trainer, abs); try { const { jobId } = await trainer.start(); process.stdout.write(`Started job ${jobId}\n`); @@ -107,6 +67,7 @@ export async function runTrainer(file?: string): Promise { `Job ${result.job.id} finished with status=${result.job.status}\n`, ); } finally { - removeShutdownHandlers(); + removeShutdown(); + removeCallbackReload(); } } diff --git a/packages/arkor/src/core/runnerSignals.test.ts b/packages/arkor/src/core/runnerSignals.test.ts new file mode 100644 index 00000000..45712248 --- /dev/null +++ b/packages/arkor/src/core/runnerSignals.test.ts @@ -0,0 +1,193 @@ +import { describe, it, expect, beforeEach, afterEach, vi } from "vitest"; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + installCallbackReloadHandler, + installShutdownHandlers, +} from "./runnerSignals"; +import type { Trainer, TrainerCallbacks } from "./types"; +import { attachTrainerInspection } from "./trainerInspection"; + +let cwd: string; + +beforeEach(() => { + cwd = mkdtempSync(join(tmpdir(), "arkor-signals-test-")); +}); + +afterEach(() => { + rmSync(cwd, { recursive: true, force: true }); +}); + +function makeTrainer(): Trainer & { + __earlyStop: { calls: number }; + __replace: { lastCallbacks: Partial | null }; +} { + const earlyStop = { calls: 0 }; + const replace = { lastCallbacks: null as Partial | null }; + const trainer: Trainer = { + name: "n", + async start() { + return { jobId: "j" }; + }, + async wait() { + throw new Error("not used"); + }, + async cancel() {}, 
+ async requestEarlyStop() { + earlyStop.calls += 1; + }, + replaceCallbacks(callbacks) { + replace.lastCallbacks = callbacks; + }, + }; + return Object.assign(trainer, { + __earlyStop: earlyStop, + __replace: replace, + }); +} + +describe("installShutdownHandlers", () => { + it("calls trainer.requestEarlyStop on the first SIGTERM and exit(0)", async () => { + const trainer = makeTrainer(); + const exitSpy = vi + .spyOn(process, "exit") + .mockImplementation((() => undefined as never) as typeof process.exit); + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + const dispose = installShutdownHandlers(trainer); + try { + process.emit("SIGTERM", "SIGTERM"); + await new Promise((r) => setTimeout(r, 10)); + expect(trainer.__earlyStop.calls).toBe(1); + expect(exitSpy).toHaveBeenCalledWith(0); + } finally { + dispose(); + exitSpy.mockRestore(); + stdoutSpy.mockRestore(); + } + }); + + it("second SIGTERM exits 143 without re-invoking requestEarlyStop", async () => { + const trainer = makeTrainer(); + const exitCodes: number[] = []; + const exitSpy = vi + .spyOn(process, "exit") + .mockImplementation(((code?: number) => { + exitCodes.push(code ?? 
0); + return undefined as never; + }) as typeof process.exit); + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + const dispose = installShutdownHandlers(trainer); + try { + process.emit("SIGTERM", "SIGTERM"); + await new Promise((r) => setTimeout(r, 10)); + process.emit("SIGTERM", "SIGTERM"); + await new Promise((r) => setTimeout(r, 10)); + expect(trainer.__earlyStop.calls).toBe(1); + expect(exitCodes).toContain(0); + expect(exitCodes).toContain(143); + } finally { + dispose(); + exitSpy.mockRestore(); + stdoutSpy.mockRestore(); + } + }); +}); + +describe("installCallbackReloadHandler", () => { + function writeUserBundle(label: string): string { + const file = join(cwd, "entry.mjs"); + // Inline a fake trainer that wears the inspection brand. The + // SIGUSR2 handler dynamic-imports this file and pulls the + // callbacks reference off via `getTrainerInspection`. + const src = ` + const KEY = Symbol.for("arkor.trainer.inspect"); + const callbacks = { onLog: (ctx) => globalThis.__arkor_callbackProbe?.(${JSON.stringify(label)}, ctx) }; + const trainer = { + name: "t", + start: async () => ({ jobId: "j" }), + wait: async () => ({ job: {}, artifacts: [] }), + cancel: async () => {}, + requestEarlyStop: async () => {}, + replaceCallbacks: () => {}, + }; + Object.defineProperty(trainer, KEY, { + value: () => ({ name: "t", config: { model: "m", datasetSource: { type: "huggingface", name: "x" } }, callbacks }), + enumerable: false, + }); + export const arkor = Object.freeze({ _kind: "arkor", trainer }); + `; + writeFileSync(file, src); + return file; + } + + it("re-imports the bundle and forwards the new callbacks via replaceCallbacks", async () => { + const trainer = makeTrainer(); + // Brand the trainer too so the import path-side has a reference shape. 
+ attachTrainerInspection(trainer, () => ({ + name: "n", + config: { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + }, + callbacks: {}, + })); + + const file = writeUserBundle("v1"); + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + const stderrSpy = vi + .spyOn(process.stderr, "write") + .mockImplementation((() => true) as typeof process.stderr.write); + const dispose = installCallbackReloadHandler(trainer, file); + mkdirSync(join(cwd, "src"), { recursive: true }); + try { + // Rewrite the entry to "v2" callbacks before signalling. + writeUserBundle("v2"); + process.emit("SIGUSR2", "SIGUSR2"); + // Wait for the dynamic import + replaceCallbacks to settle. + for (let i = 0; i < 50 && trainer.__replace.lastCallbacks === null; i++) { + await new Promise((r) => setTimeout(r, 10)); + } + expect(trainer.__replace.lastCallbacks).not.toBeNull(); + expect(typeof trainer.__replace.lastCallbacks?.onLog).toBe("function"); + } finally { + dispose(); + stdoutSpy.mockRestore(); + stderrSpy.mockRestore(); + } + }); + + it("logs a skip warning when the bundle has no inspectable trainer", async () => { + const trainer = makeTrainer(); + const file = join(cwd, "no-trainer.mjs"); + writeFileSync(file, "export const nothing = true;\n"); + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + const stderrChunks: string[] = []; + const stderrSpy = vi + .spyOn(process.stderr, "write") + .mockImplementation(((chunk: unknown) => { + stderrChunks.push(String(chunk)); + return true; + }) as typeof process.stderr.write); + const dispose = installCallbackReloadHandler(trainer, file); + try { + process.emit("SIGUSR2", "SIGUSR2"); + // Give the dynamic import a few ticks. 
+ await new Promise((r) => setTimeout(r, 50)); + expect(stderrChunks.join("")).toMatch(/no inspectable trainer/i); + expect(trainer.__replace.lastCallbacks).toBeNull(); + } finally { + dispose(); + stdoutSpy.mockRestore(); + stderrSpy.mockRestore(); + } + }); +}); diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts new file mode 100644 index 00000000..c715ee4f --- /dev/null +++ b/packages/arkor/src/core/runnerSignals.ts @@ -0,0 +1,124 @@ +import { pathToFileURL } from "node:url"; +import { isArkor } from "./arkor"; +import { getTrainerInspection } from "./trainerInspection"; +import type { Trainer, TrainerCallbacks } from "./types"; + +const SHUTDOWN_SIGNALS = ["SIGTERM", "SIGINT", "SIGHUP"] as const; +const CALLBACK_RELOAD_SIGNAL = "SIGUSR2" as const; + +/** + * Two-stage shutdown handling so HMR rebuilds (Studio sends SIGTERM) + * preserve the in-flight checkpoint work: + * + * - 1st signal → `trainer.requestEarlyStop()`. The trainer keeps + * running, lets the next `checkpoint.saved` event land, then issues + * `cancel()`. + * - 2nd signal → immediate `process.exit(143)`. Escape hatch for an + * impatient operator or a hung early-stop. + * + * The returned dispose function removes the handlers so a normal + * `wait()` completion doesn't leave stale listeners behind — important + * because `runTrainer` can be called multiple times in tests within a + * single Node process. + */ +export function installShutdownHandlers(trainer: Trainer): () => void { + let signalCount = 0; + const handler = (signal: NodeJS.Signals): void => { + signalCount += 1; + if (signalCount > 1) { + process.stdout.write( + `Received second ${signal}; exiting without waiting for checkpoint.\n`, + ); + process.exit(143); + // Explicit return so test mocks of process.exit (which don't + // actually terminate the worker) don't fall through into the + // early-stop path. 
+ return; + } + process.stdout.write( + `Received ${signal}; early-stopping at next checkpoint…\n`, + ); + trainer + .requestEarlyStop() + .catch((err: unknown) => { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`requestEarlyStop failed: ${msg}\n`); + }) + .finally(() => process.exit(0)); + }; + for (const sig of SHUTDOWN_SIGNALS) process.on(sig, handler); + return () => { + for (const sig of SHUTDOWN_SIGNALS) process.off(sig, handler); + }; +} + +/** + * SIGUSR2 handler: re-import the freshly-rebuilt artefact and call + * `Trainer.replaceCallbacks` with the new callbacks. The cloud-side + * training run is untouched — only the in-process callbacks rotate. + * + * Studio sends SIGUSR2 from the `/api/dev/events` HMR pipeline when + * (and only when) the rebuilt bundle's `JobConfig` hash matches the + * one captured at spawn time. A mismatch produces SIGTERM instead, which + * goes through `installShutdownHandlers` above. + */ +export function installCallbackReloadHandler( + trainer: Trainer, + entryPath: string, +): () => void { + const handler = (): void => { + const url = `${pathToFileURL(entryPath).href}?t=${Date.now()}`; + void (async () => { + try { + const mod = (await import(url)) as Record; + const callbacks = extractCallbacks(mod); + if (!callbacks) { + process.stderr.write( + "Callback reload skipped: rebuilt bundle has no inspectable trainer.\n", + ); + return; + } + trainer.replaceCallbacks(callbacks); + process.stdout.write( + "Callbacks hot-reloaded; training run continues.\n", + ); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`Callback reload failed: ${msg}\n`); + } + })(); + }; + process.on(CALLBACK_RELOAD_SIGNAL, handler); + return () => { + process.off(CALLBACK_RELOAD_SIGNAL, handler); + }; +} + +/** + * Extract the user-supplied callbacks reference from a re-imported + * bundle. 
Mirrors `runner.ts`'s entry-extraction precedence (named + * `arkor` export → bare `trainer` → default-export shapes) but pulls + * callbacks via `getTrainerInspection` so we get the current cell of + * `currentCallbacks` at re-import time. Returns `null` when the new + * bundle has no inspectable trainer. + */ +function extractCallbacks( + mod: Record, +): Partial | null { + const candidates: unknown[] = []; + if (isArkor(mod.arkor) && mod.arkor.trainer) candidates.push(mod.arkor.trainer); + if (mod.trainer) candidates.push(mod.trainer); + if (isArkor(mod.default) && mod.default.trainer) candidates.push(mod.default.trainer); + if ( + mod.default && + typeof mod.default === "object" && + "trainer" in (mod.default as Record) + ) { + candidates.push((mod.default as Record).trainer); + } + for (const c of candidates) { + const inspection = getTrainerInspection(c); + if (inspection) return inspection.callbacks; + } + return null; +} diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 13e8145c..6934729b 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1502,6 +1502,89 @@ describe("createTrainer (early stop)", () => { await trainer.requestEarlyStop({ timeoutMs: 1 }); }); + it("replaceCallbacks swaps the dispatched callbacks on the next event", async () => { + await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + const sse = [ + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + `id: 2\nevent: training.log\ndata: ${JSON.stringify({ + type: "training.log", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:02Z", + step: 1, + loss: 1, + })}\n\n`, + `id: 3\nevent: training.log\ndata: ${JSON.stringify({ + type: "training.log", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:03Z", + step: 2, + loss: 0.5, + })}\n\n`, + `id: 
4\nevent: training.completed\ndata: ${JSON.stringify({ + type: "training.completed", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:04Z", + })}\n\n`, + ]; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(sseStream(sse), { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + const calls: string[] = []; + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + callbacks: { + onLog: ({ step }) => { + calls.push(`v1:onLog(${step})`); + // After the first onLog call, swap to v2 callbacks. The next + // event must dispatch via the new callbacks object. 
+ if (step === 1) { + trainer.replaceCallbacks({ + onLog: ({ step: s }) => void calls.push(`v2:onLog(${s})`), + }); + } + }, + }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + await trainer.wait(); + } finally { + globalThis.fetch = original; + } + expect(calls).toEqual(["v1:onLog(1)", "v2:onLog(2)"]); + }); + it("is idempotent — repeated calls share the same in-flight promise", async () => { await writeState( { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index f5e88ca5..d3d598c7 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -6,11 +6,13 @@ import { type Credentials, } from "./credentials"; import { ensureProjectState } from "./projectState"; +import { attachTrainerInspection } from "./trainerInspection"; import type { CheckpointContext, InferArgs, JobConfig, Trainer, + TrainerCallbacks, TrainerInput, TrainingJob, TrainingLogContext, @@ -139,6 +141,13 @@ export function createTrainer( let scope: { orgSlug: string; projectSlug: string } | null = null; let clientPromise: Promise | null = null; + // Mutable callbacks slot. Each `dispatch()` invocation reads this fresh, + // so `replaceCallbacks(...)` takes effect on the next event. Events + // already mid-await keep their old reference until they resolve, which + // matches the "replace, don't interrupt" contract documented on + // `Trainer.replaceCallbacks`. + let currentCallbacks: Partial = input.callbacks ?? {}; + // Early-stop state. `requestEarlyStop()` arms the latch; the next // `checkpoint.saved` dispatch (or the timeout, whichever fires first) // calls cancel() and resolves the deferred. 
Idempotent across repeat @@ -208,7 +217,10 @@ export function createTrainer( throw new Error("Trainer is in an inconsistent state"); } const client = await getClient(); - const callbacks = input.callbacks ?? {}; + // Read once per dispatch so a `replaceCallbacks` between events takes + // effect on the next dispatch, but doesn't change identity inside a + // single in-flight handler. + const callbacks = currentCallbacks; switch (event.type) { case "training.started": { @@ -412,6 +424,10 @@ export function createTrainer( await client.cancelJob(startedJob.id, scope); }, + replaceCallbacks(callbacks: Partial): void { + currentCallbacks = callbacks ?? {}; + }, + async requestEarlyStop(opts: { timeoutMs?: number } = {}): Promise { // Nothing in flight: cleanup any prior latch and resolve. if (!startedJob || !scope || TERMINAL_STATUSES.has(startedJob.status)) { @@ -454,5 +470,15 @@ export function createTrainer( }, }; + // Brand the trainer with an inspection accessor so the Studio server can + // (a) hash the cloud-side config to decide HMR strategy and (b) read the + // current callbacks reference when hot-swapping. See `trainerInspection.ts` + // for why this uses `Symbol.for` instead of a module-local WeakMap. + attachTrainerInspection(trainer, () => ({ + name: input.name, + config, + callbacks: currentCallbacks, + })); + return trainer; } diff --git a/packages/arkor/src/core/trainerInspection.ts b/packages/arkor/src/core/trainerInspection.ts new file mode 100644 index 00000000..3c384df1 --- /dev/null +++ b/packages/arkor/src/core/trainerInspection.ts @@ -0,0 +1,75 @@ +import type { JobConfig, TrainerCallbacks } from "./types"; + +/** + * Snapshot of a trainer's identity and cloud-side config that the Studio + * server reads in order to (a) compute a stable hash for HMR's + * "callbacks-only vs full restart" decision and (b) extract the new + * callbacks reference when hot-swapping. 
+ */ +export interface TrainerInspection { + /** Run name (mirror of `Trainer.name`, copied for forward compatibility). */ + name: string; + /** The cloud-side `JobConfig` this trainer would submit on `start()`. */ + config: JobConfig; + /** Whatever the user passed in `input.callbacks`. May be empty. */ + callbacks: Partial; +} + +/** + * The CLI runtime (`dist/bin.mjs`) and the user's compiled bundle + * (`.arkor/build/index.mjs`, which keeps `arkor` external) end up loading + * two separate copies of this SDK as distinct ESM module records — so a + * module-local `WeakMap` would split into two halves that + * can't see each other. + * + * `Symbol.for(key)` is the cross-realm equivalent: the same key string + * resolves to the same symbol in any module instance, so the trainer + * created in the user's bundle exposes its inspection through the same + * property the Studio process reads. + */ +const TRAINER_INSPECTION_KEY = Symbol.for("arkor.trainer.inspect"); + +/** + * Stamp the inspection snapshot onto a freshly-built `Trainer` instance. + * Called once from `createTrainer`. Stored as a thunk so callers can + * read a fresh copy each time (defensive: the trainer's callbacks cell + * is mutable across the lifetime of a hot-swap). + */ +export function attachTrainerInspection( + trainer: object, + read: () => TrainerInspection, +): void { + Object.defineProperty(trainer, TRAINER_INSPECTION_KEY, { + value: read, + configurable: true, + enumerable: false, + writable: false, + }); +} + +/** + * Pull the snapshot off a Trainer-like value. Returns `null` for plain + * objects that don't carry the brand — used by the Studio server to + * gracefully ignore third-party wrappers or pre-SDK shapes. 
+ */ +export function getTrainerInspection( + trainer: unknown, +): TrainerInspection | null { + if (!trainer || typeof trainer !== "object") return null; + const fn = (trainer as Record)[TRAINER_INSPECTION_KEY]; + if (typeof fn !== "function") return null; + try { + const result = (fn as () => unknown).call(trainer); + if ( + result && + typeof result === "object" && + "config" in result && + "name" in result + ) { + return result as TrainerInspection; + } + } catch { + // Inspection is best-effort; a thrown user callback shouldn't crash HMR. + } + return null; +} diff --git a/packages/arkor/src/core/types.ts b/packages/arkor/src/core/types.ts index c0ec4d31..c42be926 100644 --- a/packages/arkor/src/core/types.ts +++ b/packages/arkor/src/core/types.ts @@ -211,6 +211,18 @@ export interface Trainer { * resolves immediately without contacting the cloud API. */ requestEarlyStop(opts?: { timeoutMs?: number }): Promise; + /** + * Atomically swap the lifecycle callbacks while the run is in flight. The + * next dispatched event (`onLog`, `onCheckpoint`, ...) reads from the new + * object; events already mid-await keep their old reference until they + * resolve. Used by `arkor dev`'s HMR pipeline to hot-swap callback code + * without restarting the cloud-side training. + * + * Cloud-side config (model, dataset, hyperparameters) is fixed at + * `start()` time and **cannot** be changed via this method — for those + * use `requestEarlyStop()` and let HMR re-spawn the run. 
+ */ + replaceCallbacks(callbacks: Partial): void; } /** diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 42a03078..9433bbdc 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -1,42 +1,50 @@ import { existsSync, statSync } from "node:fs"; -import { isAbsolute, resolve } from "node:path"; +import { pathToFileURL } from "node:url"; import { watch, type RolldownWatcher } from "rolldown"; +import { isArkor } from "../core/arkor"; +import { hashJobConfig } from "../core/configHash"; +import { + BUILD_DEFAULTS, + resolveBuildEntry, + rolldownInputOptions, + type BuildEntryOptions, +} from "../core/rolldownConfig"; +import { getTrainerInspection } from "../core/trainerInspection"; export type HmrEventType = "ready" | "rebuild" | "error"; export interface HmrEvent { type: HmrEventType; outFile?: string; - /** Short content fingerprint (mtime + size) so subscribers can dedupe. */ + /** + * Short fingerprint of the bundle artefact (mtime + size). Subscribers + * use this to dedupe replays of the same successful build. + */ hash?: string; + /** + * Stable hash of the trainer's cloud-side `JobConfig`. When this is + * unchanged across a rebuild, only the in-process callbacks moved and + * the Studio server can hot-swap them without restarting the run. + * `null` when the bundle has no discoverable trainer (e.g. the user's + * source has a syntax error or the Arkor manifest is missing). + */ + configHash?: string | null; + /** Run name pulled from the rebuilt manifest. */ + trainerName?: string | null; /** Human-readable error message; only present on `type === "error"`. */ message?: string; } export interface HmrCoordinator { /** - * Receive the current cached state immediately, then every subsequent event. - * Returns an unsubscribe function. + * Receive the current cached state immediately, then every subsequent + * event. Returns an unsubscribe function. 
*/ subscribe(fn: (event: HmrEvent) => void): () => void; dispose(): Promise; } -export interface HmrOptions { - cwd: string; - /** Defaults to `src/arkor/index.ts`. */ - entry?: string; - /** Defaults to `.arkor/build`. */ - outDir?: string; -} - -const DEFAULT_ENTRY = "src/arkor/index.ts"; -const DEFAULT_OUT_DIR = ".arkor/build"; - -function resolveNodeTarget(): string { - const [major = "22", minor = "6"] = process.versions.node.split("."); - return `node${major}.${minor}`; -} +export type HmrOptions = BuildEntryOptions; function fingerprint(outFile: string): string { try { @@ -47,6 +55,32 @@ function fingerprint(outFile: string): string { } } +/** + * Dynamic-import the freshly-built bundle and pull a `TrainerInspection` + * snapshot off the discovered trainer. Cache-bust the URL so Node's ESM + * loader returns the new module text rather than a stale evaluation. Best- + * effort: a missing/malformed manifest or a thrown user constructor returns + * `null` and the caller treats the rebuild as "config-unknown". + */ +async function inspectBundle( + outFile: string, +): Promise<{ configHash: string; trainerName: string } | null> { + try { + const url = `${pathToFileURL(outFile).href}?t=${Date.now()}`; + const mod = (await import(url)) as Record; + const candidate = mod.arkor ?? mod.default; + if (!isArkor(candidate)) return null; + const inspection = getTrainerInspection(candidate.trainer); + if (!inspection) return null; + return { + configHash: hashJobConfig(inspection.config), + trainerName: inspection.name, + }; + } catch { + return null; + } +} + /** * Spin up a rolldown watcher over the user's `src/arkor` entry, broadcasting * `ready` / `rebuild` / `error` to subscribers. 
Used by `arkor dev` to push @@ -54,18 +88,13 @@ function fingerprint(outFile: string): string { * * Lazy: the watcher only starts on the first `subscribe` call so a Studio * launch in a project without `src/arkor/index.ts` doesn't immediately fail - * — the watcher kicks in once the user creates the file and the SPA opens an - * EventSource. After every successful build the watcher caches the latest - * state and replays it to new subscribers so a late-mounting component still - * sees the trainer. + * — the watcher kicks in once the user creates the file and the SPA opens + * an EventSource. After every successful build the watcher caches the + * latest state and replays it to new subscribers so a late-mounting + * component still sees the trainer. */ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { - const cwd = opts.cwd; - const entryRel = opts.entry ?? DEFAULT_ENTRY; - const entry = isAbsolute(entryRel) ? entryRel : resolve(cwd, entryRel); - const outDirRel = opts.outDir ?? DEFAULT_OUT_DIR; - const outDir = isAbsolute(outDirRel) ? outDirRel : resolve(cwd, outDirRel); - const outFile = resolve(outDir, "index.mjs"); + const resolved = resolveBuildEntry(opts); const subscribers = new Set<(event: HmrEvent) => void>(); let lastEvent: HmrEvent | null = null; @@ -78,35 +107,37 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { try { fn(event); } catch { - // Subscribers are SSE controllers — a thrown error usually means the - // connection closed mid-flight. Drop it so one bad subscriber can't - // poison the broadcast for the rest. + // Subscribers are SSE controllers — a thrown error usually means + // the connection closed mid-flight. Drop it so one bad subscriber + // can't poison the broadcast for the rest. 
} } } + async function emitBuildSucceeded(eventType: HmrEventType): Promise { + if (disposed) return; + const inspection = await inspectBundle(resolved.outFile); + broadcast({ + type: eventType, + outFile: resolved.outFile, + hash: fingerprint(resolved.outFile), + configHash: inspection?.configHash ?? null, + trainerName: inspection?.trainerName ?? null, + }); + } + function startWatcher(): void { if (watcher || disposed) return; - if (!existsSync(entry)) { + if (!existsSync(resolved.entry)) { broadcast({ type: "error", - message: `Build entry not found: ${entry}. Create ${DEFAULT_ENTRY} or pass an explicit entry argument.`, + message: `Build entry not found: ${resolved.entry}. Create ${BUILD_DEFAULTS.entry} or pass an explicit entry argument.`, }); return; } watcher = watch({ - input: entry, - cwd, - platform: "node", - logLevel: "warn", - transform: { target: resolveNodeTarget() }, - external: (id, _importer, isResolved) => { - if (isResolved) return false; - if (id.startsWith(".")) return false; - if (isAbsolute(id)) return false; - return true; - }, - output: { file: outFile, format: "esm" }, + ...rolldownInputOptions(resolved), + output: { file: resolved.outFile, format: "esm" }, }); let firstBuild = true; watcher.on("event", (event) => { @@ -115,12 +146,15 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { event.result.close().catch(() => {}); const type: HmrEventType = firstBuild ? "ready" : "rebuild"; firstBuild = false; - broadcast({ type, outFile, hash: fingerprint(outFile) }); + void emitBuildSucceeded(type); } else if (event.code === "ERROR") { event.result.close().catch(() => {}); broadcast({ type: "error", - message: event.error instanceof Error ? event.error.message : String(event.error), + message: + event.error instanceof Error + ? 
event.error.message + : String(event.error), }); } }); diff --git a/packages/arkor/src/studio/manifest.ts b/packages/arkor/src/studio/manifest.ts index 72452da8..ef024e15 100644 --- a/packages/arkor/src/studio/manifest.ts +++ b/packages/arkor/src/studio/manifest.ts @@ -1,6 +1,8 @@ import { pathToFileURL } from "node:url"; import { runBuild } from "../cli/commands/build"; import { isArkor } from "../core/arkor"; +import { hashJobConfig } from "../core/configHash"; +import { getTrainerInspection } from "../core/trainerInspection"; /** * Wire-friendly snapshot of the user's `createArkor({...})` manifest. Mirrors @@ -9,28 +11,57 @@ import { isArkor } from "../core/arkor"; */ export interface ManifestSummary { trainer: { name: string } | null; + /** + * Stable hash of the trainer's cloud-side `JobConfig`. Used by HMR to + * decide whether a rebuild only changed in-process callbacks (hash + * unchanged → hot-swap) or also touched cloud-side training config + * (hash changed → restart with `requestEarlyStop`). `null` when no + * inspectable trainer is present. + */ + configHash: string | null; // future: deploy: { name: string } | null; // future: eval: { name: string } | null; } -const EMPTY: ManifestSummary = { trainer: null }; +const EMPTY: ManifestSummary = { trainer: null, configHash: null }; /** - * Build the user's `src/arkor/index.ts` and import the artifact to extract a - * serialisable summary of its manifest. The Studio UI hits this on home-page - * load to show *what* the project contains (just the trainer name today; - * deploy / eval slots when those primitives land). + * Dynamic-import an already-built artefact and pull a serialisable + * summary off its trainer. Cache-bust the URL so Node's ESM loader + * returns the fresh module text rather than a stale evaluation. * - * Each call rebuilds and re-imports so edits to the user's source surface - * without restarting Studio. 
The import URL carries a cache-bust query so - * Node's ESM cache doesn't return a stale module. + * Split out of `readManifestSummary` so callers that already triggered a + * build (the HMR coordinator hands the SPA a `outFile` after each + * `BUNDLE_END`) can inspect the artefact without paying for a redundant + * `runBuild()`. */ -export async function readManifestSummary(cwd: string): Promise { - const { outFile } = await runBuild({ cwd, quiet: true }); +export async function summariseBuiltManifest( + outFile: string, +): Promise { const url = `${pathToFileURL(outFile).href}?t=${Date.now()}`; const mod = (await import(url)) as Record; const candidate = mod.arkor ?? mod.default; if (!isArkor(candidate)) return EMPTY; - const trainer = candidate.trainer ? { name: candidate.trainer.name } : null; - return { trainer }; + const trainer = candidate.trainer + ? { name: candidate.trainer.name } + : null; + const inspection = getTrainerInspection(candidate.trainer); + const configHash = inspection ? hashJobConfig(inspection.config) : null; + return { trainer, configHash }; +} + +/** + * Build the user's `src/arkor/index.ts` and import the artifact to + * extract a serialisable summary of its manifest. The Studio UI hits + * this on home-page load to show *what* the project contains (just the + * trainer name today; deploy / eval slots when those primitives land). + * + * Each call rebuilds and re-imports so edits to the user's source + * surface without restarting Studio. 
+ */ +export async function readManifestSummary( + cwd: string, +): Promise { + const { outFile } = await runBuild({ cwd, quiet: true }); + return summariseBuiltManifest(outFile); } diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index d7f744ee..27579232 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -1,4 +1,4 @@ -import { spawn, type ChildProcess } from "node:child_process"; +import { spawn } from "node:child_process"; import { readFile, realpath } from "node:fs/promises"; import { timingSafeEqual } from "node:crypto"; import { Hono } from "hono"; @@ -15,8 +15,9 @@ import { recordDeprecation, tapDeprecation } from "../core/deprecation"; import { SDK_VERSION } from "../core/version"; import { ensureProjectState } from "../core/projectState"; import { readState } from "../core/state"; -import { readManifestSummary } from "./manifest"; +import { readManifestSummary, summariseBuiltManifest } from "./manifest"; import type { HmrCoordinator, HmrEvent } from "./hmr"; +import { TrainRegistry, type RestartTarget } from "./trainRegistry"; const DEPRECATION_HEADERS = ["Deprecation", "Sunset", "Warning"] as const; function copyDeprecationHeaders(from: Headers, to: Headers): void { @@ -294,31 +295,9 @@ export function buildStudioApp(options: StudioServerOptions) { return new Response(upstream.body, { status: upstream.status, headers }); }); - // Active `/api/train` subprocesses. HMR rebuilds iterate this map and - // SIGTERM each entry so its in-process signal handler (see - // `runTrainer`) can call `trainer.requestEarlyStop()`. Keyed by pid so - // tests can introspect. 
- interface ActiveTrain { - child: ChildProcess; - trainFile?: string; - } - const activeTrains = new Map(); - - function requestEarlyStopOnActive(): Array<{ - pid: number; - trainFile?: string; - }> { - const targets: Array<{ pid: number; trainFile?: string }> = []; - for (const [pid, entry] of activeTrains) { - try { - entry.child.kill("SIGTERM"); - } catch { - // child may have already exited between the iterator and the kill - } - targets.push({ pid, trainFile: entry.trainFile }); - } - return targets; - } + // Active `/api/train` subprocesses. The registry encapsulates the + // signal-dispatch policy — see `studio/trainRegistry.ts`. + const activeTrains = new TrainRegistry(); app.post("/api/train", async (c) => { const body = (await c.req.json().catch(() => ({}))) as { file?: string }; @@ -347,28 +326,40 @@ export function buildStudioApp(options: StudioServerOptions) { } trainFile = abs; } + // Read the current manifest before spawn so the configHash is on + // hand for HMR's "hot-swap vs restart" decision later. Building the + // manifest also pre-warms `.arkor/build/index.mjs` for the + // subprocess's `runStart`. A failure here is non-fatal — the spawn + // proceeds with `configHash: null`, which forces SIGTERM (full + // restart) on the next rebuild. + let configHash: string | null = null; + try { + const manifest = await readManifestSummary(trainCwd); + configHash = manifest.configHash; + } catch { + // ignore — `arkor start` will surface its own build error to the + // SPA via stderr; we only needed configHash for HMR routing. 
+ } const args = [trainBinPath, "start"]; if (trainFile) args.push(trainFile); const child = spawn(process.execPath, args, { stdio: "pipe", cwd: trainCwd, }); - if (typeof child.pid === "number") { - activeTrains.set(child.pid, { child, trainFile }); - } + activeTrains.register(child, { trainFile, configHash }); const stream = new ReadableStream({ start(controller) { const enc = new TextEncoder(); child.stdout.on("data", (d) => controller.enqueue(enc.encode(d))); child.stderr.on("data", (d) => controller.enqueue(enc.encode(d))); child.on("close", (code) => { - if (typeof child.pid === "number") activeTrains.delete(child.pid); + activeTrains.unregister(child.pid); controller.enqueue(enc.encode(`\n---\nexit=${code}\n`)); controller.close(); }); }, cancel() { - if (typeof child.pid === "number") activeTrains.delete(child.pid); + activeTrains.unregister(child.pid); child.kill(); }, }); @@ -400,7 +391,9 @@ export function buildStudioApp(options: StudioServerOptions) { const send = ( event: HmrEvent & { restart?: boolean; - restartTargets?: Array<{ pid: number; trainFile?: string }>; + hotSwap?: boolean; + restartTargets?: RestartTarget[]; + hotSwapTargets?: RestartTarget[]; }, ) => { const payload = JSON.stringify(event); @@ -414,12 +407,27 @@ export function buildStudioApp(options: StudioServerOptions) { } }; unsubscribe = hmr.subscribe((event) => { - if (event.type === "rebuild" && activeTrains.size > 0) { - const restartTargets = requestEarlyStopOnActive(); - send({ ...event, restart: true, restartTargets }); - } else { + if (event.type !== "rebuild" || activeTrains.size === 0) { send(event); + return; } + // Per-child decision: if the rebuilt bundle's `configHash` + // matches the child's spawn-time hash, the cloud-side run + // is unaffected — SIGUSR2 lets the runner re-import and + // call `Trainer.replaceCallbacks`. Otherwise SIGTERM + // triggers `Trainer.requestEarlyStop` so the next + // checkpoint lands before the SPA re-spawns. 
+ const nextHash = event.configHash ?? null; + const hotSwapTargets = activeTrains.notifyCallbackReload(nextHash); + const restartTargets = + activeTrains.requestEarlyStopOnMismatch(nextHash); + send({ + ...event, + hotSwap: hotSwapTargets.length > 0, + hotSwapTargets, + restart: restartTargets.length > 0, + restartTargets, + }); }); }, cancel() { diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts new file mode 100644 index 00000000..a1f7c2a1 --- /dev/null +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -0,0 +1,118 @@ +import { describe, it, expect, vi } from "vitest"; +import type { ChildProcess } from "node:child_process"; +import { TrainRegistry } from "./trainRegistry"; + +interface FakeChild { + pid: number; + kill: ReturnType; +} + +function fakeChild(pid: number): FakeChild { + return { pid, kill: vi.fn(() => true) }; +} + +describe("TrainRegistry", () => { + it("ignores children without a pid (already-exited spawns)", () => { + const reg = new TrainRegistry(); + reg.register({ pid: undefined } as unknown as ChildProcess, { + configHash: "h1", + }); + expect(reg.size).toBe(0); + }); + + it("notifyCallbackReload SIGUSR2s only matching configHashes", () => { + const reg = new TrainRegistry(); + const a = fakeChild(101); + const b = fakeChild(102); + const c = fakeChild(103); + reg.register(a as unknown as ChildProcess, { configHash: "match" }); + reg.register(b as unknown as ChildProcess, { + configHash: "different", + trainFile: "/tmp/b.ts", + }); + reg.register(c as unknown as ChildProcess, { configHash: "match" }); + + const signalled = reg.notifyCallbackReload("match"); + expect(signalled).toEqual([ + { pid: 101, trainFile: undefined }, + { pid: 103, trainFile: undefined }, + ]); + expect(a.kill).toHaveBeenCalledWith("SIGUSR2"); + expect(c.kill).toHaveBeenCalledWith("SIGUSR2"); + expect(b.kill).not.toHaveBeenCalled(); + }); + + it("notifyCallbackReload is a no-op when nextConfigHash is 
null", () => { + const reg = new TrainRegistry(); + const a = fakeChild(201); + reg.register(a as unknown as ChildProcess, { configHash: null }); + expect(reg.notifyCallbackReload(null)).toEqual([]); + expect(a.kill).not.toHaveBeenCalled(); + }); + + it("requestEarlyStopOnMismatch SIGTERMs only mismatched children", () => { + const reg = new TrainRegistry(); + const same = fakeChild(301); + const diff = fakeChild(302); + reg.register(same as unknown as ChildProcess, { configHash: "h" }); + reg.register(diff as unknown as ChildProcess, { + configHash: "x", + trainFile: "/tmp/diff.ts", + }); + + const targets = reg.requestEarlyStopOnMismatch("h"); + expect(targets).toEqual([{ pid: 302, trainFile: "/tmp/diff.ts" }]); + expect(same.kill).not.toHaveBeenCalled(); + expect(diff.kill).toHaveBeenCalledWith("SIGTERM"); + }); + + it("requestEarlyStopOnMismatch SIGTERMs everything when nextConfigHash is null", () => { + const reg = new TrainRegistry(); + const a = fakeChild(401); + const b = fakeChild(402); + reg.register(a as unknown as ChildProcess, { configHash: "h" }); + reg.register(b as unknown as ChildProcess, { configHash: null }); + + // null nextHash means "we couldn't inspect the new bundle" — be + // conservative and SIGTERM every active child. + const targets = reg.requestEarlyStopOnMismatch(null); + expect(targets).toHaveLength(2); + expect(a.kill).toHaveBeenCalledWith("SIGTERM"); + expect(b.kill).toHaveBeenCalledWith("SIGTERM"); + }); + + it("requestEarlyStopOnMismatch SIGTERMs children whose stored hash is null", () => { + // A spawn that raced an in-flight build can land with `configHash: + // null`. It must not be hot-swapped — even if the new bundle's hash + // is known, we have no proof the spawned subprocess is running the + // same config. 
+ const reg = new TrainRegistry(); + const a = fakeChild(501); + reg.register(a as unknown as ChildProcess, { configHash: null }); + const targets = reg.requestEarlyStopOnMismatch("h"); + expect(targets).toHaveLength(1); + expect(a.kill).toHaveBeenCalledWith("SIGTERM"); + }); + + it("unregister removes the child from the policy decisions", () => { + const reg = new TrainRegistry(); + const a = fakeChild(601); + reg.register(a as unknown as ChildProcess, { configHash: "h" }); + reg.unregister(601); + expect(reg.size).toBe(0); + expect(reg.notifyCallbackReload("h")).toEqual([]); + }); + + it("survives kill() throwing (child exited mid-iteration)", () => { + const reg = new TrainRegistry(); + const a = fakeChild(701); + a.kill.mockImplementation(() => { + throw new Error("ESRCH"); + }); + reg.register(a as unknown as ChildProcess, { configHash: "h" }); + // Both code paths should swallow the throw and continue with their + // bookkeeping so a single dead child can't break HMR for siblings. + expect(() => reg.notifyCallbackReload("h")).not.toThrow(); + expect(() => reg.requestEarlyStopOnMismatch("x")).not.toThrow(); + }); +}); diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts new file mode 100644 index 00000000..af4e4329 --- /dev/null +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -0,0 +1,117 @@ +import type { ChildProcess } from "node:child_process"; + +/** + * Per-active-train state tracked alongside the spawned `arkor start` + * subprocess. The Studio server records this at spawn time so HMR + * rebuilds can decide, per child, between: + * + * - **SIGUSR2** (callback hot-swap) when the new bundle's `configHash` + * matches the one captured at spawn time — the cloud-side run is + * unaffected, only in-process callbacks need to update. 
+ * - **SIGTERM** (graceful early-stop + restart) when the configs + * diverge — `Trainer.requestEarlyStop` lets the next checkpoint + * finish, the subprocess exits, and the SPA re-spawns with the + * rebuilt artefact. + */ +export interface ActiveTrain { + child: ChildProcess; + trainFile?: string; + /** Cloud-side config hash captured at spawn time (may be null if the + * manifest wasn't inspectable yet — e.g. spawn raced an in-flight + * build). A null entry forces SIGTERM on the next rebuild because we + * can't prove the configs match. */ + configHash: string | null; +} + +export interface RestartTarget { + pid: number; + trainFile?: string; +} + +/** + * Encapsulates the set of `/api/train`-spawned subprocesses and the + * signal-dispatch decision rule for HMR rebuilds. Pulled out of + * `buildStudioApp` so the policy is testable in isolation and so future + * additions (e.g. a `cancel-all` admin endpoint) have a clear seam. + */ +export class TrainRegistry { + private readonly entries = new Map(); + + register(child: ChildProcess, init: Omit): void { + if (typeof child.pid !== "number") return; + this.entries.set(child.pid, { child, ...init }); + } + + unregister(pid: number | undefined): void { + if (typeof pid === "number") this.entries.delete(pid); + } + + get size(): number { + return this.entries.size; + } + + /** Read-only snapshot, mostly for tests / observability. */ + list(): ReadonlyArray { + return [...this.entries.values()]; + } + + /** + * Send a callback hot-swap signal (SIGUSR2) to every child whose + * stored `configHash` matches `nextConfigHash`. The child's runner + * (`installCallbackReloadHandler`) re-imports the rebuilt bundle and + * calls `Trainer.replaceCallbacks`. Returns the list of children + * actually signalled, so the SSE event payload can include them for + * SPA-side telemetry. 
+ */ + notifyCallbackReload( + nextConfigHash: string | null, + ): Array<{ pid: number; trainFile?: string }> { + if (nextConfigHash === null) return []; + const signalled: Array<{ pid: number; trainFile?: string }> = []; + for (const [pid, entry] of this.entries) { + if (entry.configHash !== null && entry.configHash === nextConfigHash) { + try { + entry.child.kill("SIGUSR2"); + signalled.push({ pid, trainFile: entry.trainFile }); + } catch { + // child may have just exited; the close handler will clean + // up the entry on its own. + } + } + } + return signalled; + } + + /** + * Send a graceful early-stop signal (SIGTERM) to every child whose + * stored `configHash` differs from `nextConfigHash`. The child's + * runner (`installShutdownHandlers`) calls `Trainer.requestEarlyStop` + * which preserves the in-flight checkpoint before exiting. Returns + * the list of children signalled so the SPA can re-spawn them with + * the new bundle. + * + * If `nextConfigHash` is null (the new bundle has no inspectable + * trainer), every active child is SIGTERM'd defensively — we can't + * prove their configs are unaffected. + */ + requestEarlyStopOnMismatch( + nextConfigHash: string | null, + ): RestartTarget[] { + const targets: RestartTarget[] = []; + for (const [pid, entry] of this.entries) { + if ( + nextConfigHash === null || + entry.configHash === null || + entry.configHash !== nextConfigHash + ) { + try { + entry.child.kill("SIGTERM"); + } catch { + // child already exited; close handler will clean up. 
+ } + targets.push({ pid, trainFile: entry.trainFile }); + } + } + return targets; + } +} diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index b5f46ab5..63c82e73 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -12,7 +12,7 @@ export function RunTraining() { const [log, setLog] = useState(""); const [manifest, setManifest] = useState(null); const [hmrStatus, setHmrStatus] = useState< - "idle" | "rebuilding" | "early-stopping" | "restarting" + "idle" | "rebuilding" | "early-stopping" | "restarting" | "hot-swapped" >("idle"); const boxRef = useRef(null); const lastTrainFileRef = useRef(undefined); @@ -70,6 +70,14 @@ export function RunTraining() { // re-spawns with the same args. restartPendingRef.current = true; setHmrStatus(runningRef.current ? "early-stopping" : "idle"); + } else if (payload.hotSwap) { + // Callbacks were swapped in place — the cloud-side run is + // unaffected. Flash a brief "hot-swapped" indicator so users + // know the new code is live. + setHmrStatus("hot-swapped"); + window.setTimeout(() => { + setHmrStatus((s) => (s === "hot-swapped" ? "idle" : s)); + }, 1500); } else { setHmrStatus("idle"); } @@ -151,6 +159,9 @@ export function RunTraining() { {hmrStatus === "restarting" && ( Restarting with updated code… )} + {hmrStatus === "hot-swapped" && ( + Callbacks hot-swapped — run continues. + )}
         {log || "Output will appear here."}
       
diff --git a/packages/studio-app/src/lib/api.ts b/packages/studio-app/src/lib/api.ts index 3cf6ab6d..230504a1 100644 --- a/packages/studio-app/src/lib/api.ts +++ b/packages/studio-app/src/lib/api.ts @@ -32,6 +32,8 @@ export interface Job { */ export interface ManifestSummary { trainer: { name: string } | null; + /** Present when an inspectable trainer is loaded; otherwise null. */ + configHash?: string | null; } export interface ManifestError { @@ -117,9 +119,18 @@ export interface DevEvent { type: "ready" | "rebuild" | "error"; outFile?: string; hash?: string; + /** Cloud-side `JobConfig` hash; null when the bundle has no inspectable trainer. */ + configHash?: string | null; + /** Run name pulled from the rebuilt manifest. */ + trainerName?: string | null; message?: string; + /** True when the rebuild changed cloud-side config and a child was SIGTERM'd. */ restart?: boolean; restartTargets?: Array<{ pid: number; trainFile?: string }>; + /** True when the rebuild only changed callbacks and one or more children + * were SIGUSR2'd to hot-swap their callback closures in place. */ + hotSwap?: boolean; + hotSwapTargets?: Array<{ pid: number; trainFile?: string }>; } export function openDevEvents(): EventSource { From a2e92eec7613eb4f516e20a3e3d7d011d6f833cb Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 2 May 2026 21:40:18 +0900 Subject: [PATCH 03/55] Refactor Trainer API to remove replaceCallbacks method and implement internal callback swapping - Removed the public `replaceCallbacks` method from the Trainer interface to prevent exposure of the hot-swapping functionality. - Introduced an internal mechanism for callback swapping using a `Symbol.for`-keyed brand, allowing for dynamic updates during training runs without affecting the public API. - Updated signal handling to ensure seamless integration with the new callback swapping logic, enhancing the hot module replacement (HMR) experience. 
- Revised documentation to reflect changes in the Trainer API and clarify the internal callback management process. --- AGENTS.md | 2 +- docs/concepts/studio.mdx | 2 +- docs/ja/concepts/studio.mdx | 2 +- docs/ja/sdk/trainer-control.mdx | 15 +----- docs/sdk/trainer-control.mdx | 15 +----- packages/arkor/src/core/arkor.test.ts | 1 - packages/arkor/src/core/runner.test.ts | 24 +++++---- packages/arkor/src/core/runnerSignals.test.ts | 15 ++++-- packages/arkor/src/core/runnerSignals.ts | 20 +++++-- packages/arkor/src/core/trainer.test.ts | 12 +++-- packages/arkor/src/core/trainer.ts | 22 ++++---- packages/arkor/src/core/trainerInspection.ts | 52 +++++++++++++++++++ packages/arkor/src/core/types.ts | 12 ----- 13 files changed, 116 insertions(+), 78 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index dce272be..a4740767 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -79,7 +79,7 @@ When touching the Studio server or SPA fetch layer, preserve: token via header f When a rebuild lands while a `/api/train`-spawned subprocess is in flight, the server makes a per-child decision in [packages/arkor/src/studio/trainRegistry.ts](packages/arkor/src/studio/trainRegistry.ts): -- **`configHash` matches the spawn-time hash** → SIGUSR2. The child's `installCallbackReloadHandler` re-imports the artifact and calls `Trainer.replaceCallbacks` in place. The cloud-side run is untouched. Use this whenever a code change is contained inside the `callbacks: { ... }` object. +- **`configHash` matches the spawn-time hash** → SIGUSR2. The child's `installCallbackReloadHandler` re-imports the artifact and rotates the trainer's callback cell via the internal `Symbol.for("arkor.trainer.replaceCallbacks")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts). The cloud-side run is untouched. Use this whenever a code change is contained inside the `callbacks: { ... }` object. 
Don't add a `replaceCallbacks()` method to the public `Trainer` interface — keeping the mutator behind a `Symbol.for` brand is what stops the dev-only HMR primitive from leaking into the SDK's published surface. - **`configHash` differs (or is null because the new bundle didn't inspect)** → SIGTERM. `installShutdownHandlers` calls `Trainer.requestEarlyStop()`, which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. Don't widen the SIGUSR2 path to "always hot-swap, server-side": the `configHash` check is what guarantees a hot-swap can't silently leave a child running with a stale `JobConfig`. diff --git a/docs/concepts/studio.mdx b/docs/concepts/studio.mdx index 6d186b73..900a02d3 100644 --- a/docs/concepts/studio.mdx +++ b/docs/concepts/studio.mdx @@ -15,7 +15,7 @@ Three jobs: A note on the dev loop: Studio runs a [Rolldown](https://rolldown.rs) watcher over `src/arkor/` and pushes rebuild notifications to the SPA over a Server-Sent Events stream (`/api/dev/events`). Edit a file, save, and the Run training button updates with the new trainer name without a refresh. If a training run is in flight, the Studio compares the new bundle's cloud-side `JobConfig` hash to the one captured when the run was spawned: -- **Same hash (only callbacks changed).** The runner is signalled with SIGUSR2; it re-imports the rebuilt artifact and calls `Trainer.replaceCallbacks` in place. The cloud-side training run is untouched, no GPU time is wasted, and the SPA shows a brief "Callbacks hot-swapped" indicator. 
+- **Same hash (only callbacks changed).** The runner is signalled with SIGUSR2; it re-imports the rebuilt artifact and rotates the trainer's callback cell in place via an internal HMR brand. The cloud-side training run is untouched, no GPU time is wasted, and the SPA shows a brief "Callbacks hot-swapped" indicator. - **Different hash (model / dataset / hyperparameters changed).** The runner is signalled with SIGTERM; `Trainer.requestEarlyStop` lets the next checkpoint upload finish before issuing `cancel()`, then the SPA re-spawns the run with the rebuilt artifact. The previous Cloud-side job reaches `cancelled` after the checkpoint is uploaded, so the partial work is preserved as an artifact. ## Where Studio runs diff --git a/docs/ja/concepts/studio.mdx b/docs/ja/concepts/studio.mdx index a60df962..ed58779f 100644 --- a/docs/ja/concepts/studio.mdx +++ b/docs/ja/concepts/studio.mdx @@ -15,7 +15,7 @@ Studio は `arkor dev` 実行時に得られるローカル Web UI です。サ dev ループのメモ: Studio は [Rolldown](https://rolldown.rs) のウォッチャを `src/arkor/` 上で常駐させ、再ビルド通知を Server-Sent Events ストリーム (`/api/dev/events`) で SPA に push します。ファイルを編集して保存すれば、Run training ボタンのトレーナー名表示はリロード無しで更新されます。学習が走っている最中であれば、Studio は再ビルドしたバンドルの Cloud 側 `JobConfig` ハッシュを、spawn 時に保存したハッシュと比較します。 -- **ハッシュ一致(コールバックのみ変更)。** ランナーへ SIGUSR2 を送ります。ランナーは再ビルドされた成果物を再 import し、その場で `Trainer.replaceCallbacks` を呼びます。Cloud 側の学習はそのまま継続し、GPU 時間を無駄にせず、SPA には "Callbacks hot-swapped" と短く表示されます。 +- **ハッシュ一致(コールバックのみ変更)。** ランナーへ SIGUSR2 を送ります。ランナーは再ビルドされた成果物を再 import し、内部 HMR ブランド経由でトレーナーのコールバック cell をその場で差し替えます。Cloud 側の学習はそのまま継続し、GPU 時間を無駄にせず、SPA には "Callbacks hot-swapped" と短く表示されます。 - **ハッシュ不一致(モデル / データセット / ハイパーパラメータが変わった)。** ランナーへ SIGTERM を送ります。`Trainer.requestEarlyStop` が次のチェックポイントのアップロードを待ってから `cancel()` を発火し、SPA が再ビルドした成果物で再投入します。Cloud 側の以前のジョブはチェックポイントのアップロード完了後に `cancelled` 状態に遷移するので、ここまでの学習成果は artifact として保全されます。 ## Studio が動く場所 diff --git a/docs/ja/sdk/trainer-control.mdx b/docs/ja/sdk/trainer-control.mdx index 2c02d59c..8683d3d5 100644 
--- a/docs/ja/sdk/trainer-control.mdx +++ b/docs/ja/sdk/trainer-control.mdx @@ -5,7 +5,7 @@ description: "start、wait、cancel、abortSignal、再接続の仕組み。" # トレーナー制御 -`createTrainer` は次の 5 メソッドを持つ `Trainer` オブジェクトを返します: +`createTrainer` は次の 4 メソッドを持つ `Trainer` オブジェクトを返します: ```ts interface Trainer { @@ -14,7 +14,6 @@ interface Trainer { wait(): Promise; cancel(): Promise; requestEarlyStop(opts?: { timeoutMs?: number }): Promise; - replaceCallbacks(callbacks: Partial): void; } interface TrainingResult { @@ -75,18 +74,6 @@ await trainer.requestEarlyStop({ timeoutMs: 60_000 }); 自前のコード(プログラム的な two-process パターンなど)から `requestEarlyStop()` を直接呼ぶこともできます。Cookbook の [Early Stopping](/ja/cookbook/early-stopping) レシピが `onCheckpoint` + `abortSignal` + `cancel()` で組み立てているのと同じ「実行中ステップを捨てずに止める」セマンティクスを、ワンショットで提供します。レシピ版の方が柔軟(メトリクス次第で abort のタイミングを決めるなど)ですが、こちらは「次のチェックポイントで止める」という典型ケースの便利フックです。 -## `replaceCallbacks()` - -```ts -trainer.replaceCallbacks({ - onLog: ({ step, loss }) => myMetrics.record(step, loss), -}); -``` - -実行中の run のままライフサイクルコールバックを atomic に差し替えます。次にディスパッチされるイベント(`onLog` / `onCheckpoint` …)は新しいオブジェクトから読みます。すでに `await` 中のハンドラは resolve するまで古い参照を保持します。Cloud 側の config(モデル、データセット、ハイパーパラメータ)は `start()` 時点で確定しており、このメソッド経由では **変更できません** — それらを変えたい場合は `requestEarlyStop()` を呼んで再投入してください。 - -これは `arkor dev` の「コールバックのみ HMR」パスで使われている SDK プリミティブです。実行中にソースを保存すると Studio は再ビルドした `JobConfig` のハッシュを spawn 時に保存したハッシュと比較します。一致 → SIGUSR2 → ランナーが再 import して `replaceCallbacks()` を呼ぶ(Cloud 側の学習は無傷)。不一致 → SIGTERM → 既存の `requestEarlyStop()` 経路に切り替わって、新しいバンドルで SPA が再投入します。 - ## `abortSignal` ```ts diff --git a/docs/sdk/trainer-control.mdx b/docs/sdk/trainer-control.mdx index 3f664cd2..c43a404b 100644 --- a/docs/sdk/trainer-control.mdx +++ b/docs/sdk/trainer-control.mdx @@ -5,7 +5,7 @@ description: "start, wait, cancel, abortSignal, and how reconnects work." 
# Trainer control -`createTrainer` returns a `Trainer` object with five methods: +`createTrainer` returns a `Trainer` object with four methods: ```ts interface Trainer { @@ -14,7 +14,6 @@ interface Trainer { wait(): Promise; cancel(): Promise; requestEarlyStop(opts?: { timeoutMs?: number }): Promise; - replaceCallbacks(callbacks: Partial): void; } interface TrainingResult { @@ -75,18 +74,6 @@ This is what `arkor dev`'s HMR pipeline uses internally: when you save a source You can use `requestEarlyStop()` directly from your own code (e.g. in a programmatic two-process pattern) if you want the same "stop, but don't throw away the in-flight step" semantics that the cookbook's [Early stopping](/cookbook/early-stopping) recipe builds out of `onCheckpoint` + `abortSignal` + `cancel()`. The recipe is more flexible (you decide when to abort based on metrics); this method is the convenience hook for the common "stop after the next checkpoint" case. -## `replaceCallbacks()` - -```ts -trainer.replaceCallbacks({ - onLog: ({ step, loss }) => myMetrics.record(step, loss), -}); -``` - -Atomically swap the lifecycle callbacks while a run is in flight. The next dispatched event (`onLog`, `onCheckpoint`, ...) reads from the new object; events already mid-`await` keep their old reference until they resolve. Cloud-side config (model, dataset, hyperparameters) is fixed at `start()` time and **cannot** be changed via this method — for those, use `requestEarlyStop()` and re-spawn. - -This is the SDK primitive `arkor dev` uses for the "callback-only HMR" path: when you save a source file mid-run, Studio diffs the rebuilt `JobConfig` against the spawn-time hash. Equal hashes → SIGUSR2, the runner re-imports and calls `replaceCallbacks()`, the cloud-side training run is untouched. Different hashes → SIGTERM, the existing `requestEarlyStop()` flow takes over and the SPA re-spawns with the new bundle. 
- ## `abortSignal` ```ts diff --git a/packages/arkor/src/core/arkor.test.ts b/packages/arkor/src/core/arkor.test.ts index 0b353786..d3dc41a7 100644 --- a/packages/arkor/src/core/arkor.test.ts +++ b/packages/arkor/src/core/arkor.test.ts @@ -24,7 +24,6 @@ function fakeTrainer(name = "run"): Trainer { }, async cancel() {}, async requestEarlyStop() {}, - replaceCallbacks() {}, }; } diff --git a/packages/arkor/src/core/runner.test.ts b/packages/arkor/src/core/runner.test.ts index cde91c56..c6f0a087 100644 --- a/packages/arkor/src/core/runner.test.ts +++ b/packages/arkor/src/core/runner.test.ts @@ -34,7 +34,6 @@ function fakeTrainer(onStart?: () => void, onWait?: () => void): Trainer { }, async cancel() {}, async requestEarlyStop() {}, - replaceCallbacks() {}, }; } @@ -257,16 +256,19 @@ describe("runTrainer — shutdown signal handling", () => { try { const runPromise = runTrainer("src/arkor/index.mjs"); // Wait for import + start() to settle so the handler is registered - // before we synthesise SIGTERM. The fake's `wait()` hangs forever, so - // the run remains in flight throughout the assertions. - await new Promise((r) => setTimeout(r, 25)); - - const probe = (globalThis as unknown as { - __test_signalProbe: { - earlyStopCalls: number; - finishWait: () => void; - }; - }).__test_signalProbe; + // before we synthesise SIGTERM. Poll for the probe rather than + // relying on a fixed timer — under load (e.g. running alongside + // sibling test files in turbo) the dynamic import + top-level + // body can take longer than a hardcoded 25 ms window. 
+ type Probe = { earlyStopCalls: number; finishWait: () => void }; + let probe: Probe | undefined; + for (let i = 0; i < 40; i++) { + probe = (globalThis as unknown as { __test_signalProbe?: Probe }) + .__test_signalProbe; + if (probe) break; + await new Promise((r) => setTimeout(r, 25)); + } + if (!probe) throw new Error("Probe not installed by user bundle"); // 1st SIGTERM → requestEarlyStop is called, exit(0) scheduled in the // promise's `.finally`. diff --git a/packages/arkor/src/core/runnerSignals.test.ts b/packages/arkor/src/core/runnerSignals.test.ts index 45712248..847c8900 100644 --- a/packages/arkor/src/core/runnerSignals.test.ts +++ b/packages/arkor/src/core/runnerSignals.test.ts @@ -7,7 +7,10 @@ import { installShutdownHandlers, } from "./runnerSignals"; import type { Trainer, TrainerCallbacks } from "./types"; -import { attachTrainerInspection } from "./trainerInspection"; +import { + attachTrainerCallbackReplacer, + attachTrainerInspection, +} from "./trainerInspection"; let cwd: string; @@ -37,10 +40,13 @@ function makeTrainer(): Trainer & { async requestEarlyStop() { earlyStop.calls += 1; }, - replaceCallbacks(callbacks) { - replace.lastCallbacks = callbacks; - }, }; + // Wire the internal callback-replacer brand the same way `createTrainer` + // does. The SIGUSR2 path looks the brand up via `replaceTrainerCallbacks` + // — there's no public method on `Trainer` for this any more. 
+ attachTrainerCallbackReplacer(trainer, (cbs) => { + replace.lastCallbacks = cbs; + }); return Object.assign(trainer, { __earlyStop: earlyStop, __replace: replace, @@ -113,7 +119,6 @@ describe("installCallbackReloadHandler", () => { wait: async () => ({ job: {}, artifacts: [] }), cancel: async () => {}, requestEarlyStop: async () => {}, - replaceCallbacks: () => {}, }; Object.defineProperty(trainer, KEY, { value: () => ({ name: "t", config: { model: "m", datasetSource: { type: "huggingface", name: "x" } }, callbacks }), diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index c715ee4f..92f53f44 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -1,6 +1,9 @@ import { pathToFileURL } from "node:url"; import { isArkor } from "./arkor"; -import { getTrainerInspection } from "./trainerInspection"; +import { + getTrainerInspection, + replaceTrainerCallbacks, +} from "./trainerInspection"; import type { Trainer, TrainerCallbacks } from "./types"; const SHUTDOWN_SIGNALS = ["SIGTERM", "SIGINT", "SIGHUP"] as const; @@ -53,9 +56,10 @@ export function installShutdownHandlers(trainer: Trainer): () => void { } /** - * SIGUSR2 handler: re-import the freshly-rebuilt artefact and call - * `Trainer.replaceCallbacks` with the new callbacks. The cloud-side - * training run is untouched — only the in-process callbacks rotate. + * SIGUSR2 handler: re-import the freshly-rebuilt artefact and rotate + * the trainer's callback cell via the internal + * `Symbol.for("arkor.trainer.replaceCallbacks")` brand. The cloud-side + * training run is untouched — only the in-process callbacks change. 
* * Studio sends SIGUSR2 from the `/api/dev/events` HMR pipeline when * (and only when) the rebuilt bundle's `JobConfig` hash matches the @@ -78,7 +82,13 @@ export function installCallbackReloadHandler( ); return; } - trainer.replaceCallbacks(callbacks); + const swapped = replaceTrainerCallbacks(trainer, callbacks); + if (!swapped) { + process.stderr.write( + "Callback reload skipped: running trainer doesn't carry the callback-replacer brand.\n", + ); + return; + } process.stdout.write( "Callbacks hot-reloaded; training run continues.\n", ); diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 6934729b..ae0e5d2b 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -3,6 +3,7 @@ import { mkdtempSync, rmSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { createTrainer } from "./trainer"; +import { replaceTrainerCallbacks } from "./trainerInspection"; import { writeState } from "./state"; import type { AnonymousCredentials } from "./credentials"; @@ -1502,7 +1503,7 @@ describe("createTrainer (early stop)", () => { await trainer.requestEarlyStop({ timeoutMs: 1 }); }); - it("replaceCallbacks swaps the dispatched callbacks on the next event", async () => { + it("replaceTrainerCallbacks (internal HMR brand) swaps the dispatched callbacks on the next event", async () => { await writeState( { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, cwd, @@ -1563,12 +1564,15 @@ describe("createTrainer (early stop)", () => { callbacks: { onLog: ({ step }) => { calls.push(`v1:onLog(${step})`); - // After the first onLog call, swap to v2 callbacks. The next - // event must dispatch via the new callbacks object. + // After the first onLog call, swap to v2 callbacks via the + // internal `Symbol.for("arkor.trainer.replaceCallbacks")` + // brand — the same brand `arkor dev`'s SIGUSR2 handler + // uses. 
The next event must dispatch via the new object. if (step === 1) { - trainer.replaceCallbacks({ + const swapped = replaceTrainerCallbacks(trainer, { onLog: ({ step: s }) => void calls.push(`v2:onLog(${s})`), }); + expect(swapped).toBe(true); } }, }, diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index d3d598c7..8cde4346 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -6,7 +6,10 @@ import { type Credentials, } from "./credentials"; import { ensureProjectState } from "./projectState"; -import { attachTrainerInspection } from "./trainerInspection"; +import { + attachTrainerCallbackReplacer, + attachTrainerInspection, +} from "./trainerInspection"; import type { CheckpointContext, InferArgs, @@ -424,10 +427,6 @@ export function createTrainer( await client.cancelJob(startedJob.id, scope); }, - replaceCallbacks(callbacks: Partial): void { - currentCallbacks = callbacks ?? {}; - }, - async requestEarlyStop(opts: { timeoutMs?: number } = {}): Promise { // Nothing in flight: cleanup any prior latch and resolve. if (!startedJob || !scope || TERMINAL_STATUSES.has(startedJob.status)) { @@ -470,15 +469,20 @@ export function createTrainer( }, }; - // Brand the trainer with an inspection accessor so the Studio server can - // (a) hash the cloud-side config to decide HMR strategy and (b) read the - // current callbacks reference when hot-swapping. See `trainerInspection.ts` - // for why this uses `Symbol.for` instead of a module-local WeakMap. + // Brand the trainer with the HMR control surface so the Studio server + // can (a) hash the cloud-side config to decide between hot-swap and + // restart and (b) atomically swap the callbacks cell from the runner + // subprocess. Both brands live behind `Symbol.for` keys so they don't + // appear on the public `Trainer` interface — see + // `trainerInspection.ts` for the rationale. 
attachTrainerInspection(trainer, () => ({ name: input.name, config, callbacks: currentCallbacks, })); + attachTrainerCallbackReplacer(trainer, (callbacks) => { + currentCallbacks = callbacks ?? {}; + }); return trainer; } diff --git a/packages/arkor/src/core/trainerInspection.ts b/packages/arkor/src/core/trainerInspection.ts index 3c384df1..343ae491 100644 --- a/packages/arkor/src/core/trainerInspection.ts +++ b/packages/arkor/src/core/trainerInspection.ts @@ -5,6 +5,14 @@ import type { JobConfig, TrainerCallbacks } from "./types"; * server reads in order to (a) compute a stable hash for HMR's * "callbacks-only vs full restart" decision and (b) extract the new * callbacks reference when hot-swapping. + * + * **Internal API — not part of the user-facing SDK surface.** Both this + * snapshot and the companion `replaceTrainerCallbacks` mutator are + * exposed only via `Symbol.for(...)`-keyed properties on the trainer + * object so they don't appear on the public `Trainer` type. They exist + * to let `arkor dev`'s HMR pipeline hot-swap callbacks without + * restarting cloud-side training; user code shouldn't call them + * directly. */ export interface TrainerInspection { /** Run name (mirror of `Trainer.name`, copied for forward compatibility). */ @@ -28,6 +36,9 @@ export interface TrainerInspection { * property the Studio process reads. */ const TRAINER_INSPECTION_KEY = Symbol.for("arkor.trainer.inspect"); +const TRAINER_REPLACE_CALLBACKS_KEY = Symbol.for( + "arkor.trainer.replaceCallbacks", +); /** * Stamp the inspection snapshot onto a freshly-built `Trainer` instance. @@ -73,3 +84,44 @@ export function getTrainerInspection( } return null; } + +/** + * Wire the trainer's mutable callbacks slot to a `Symbol.for`-keyed + * brand so the runner subprocess can hot-swap callbacks without us + * exposing the operation on the public `Trainer` interface. Called once + * from `createTrainer`. 
+ */ +export function attachTrainerCallbackReplacer( + trainer: object, + replace: (callbacks: Partial) => void, +): void { + Object.defineProperty(trainer, TRAINER_REPLACE_CALLBACKS_KEY, { + value: replace, + configurable: true, + enumerable: false, + writable: false, + }); +} + +/** + * Replace the trainer's lifecycle callbacks atomically. Returns `true` + * when the call landed (the trainer carried the brand), `false` + * otherwise — callers (the SIGUSR2 hot-swap path in `runnerSignals`) + * use the return value to decide whether to log a skip warning. + */ +export function replaceTrainerCallbacks( + trainer: unknown, + callbacks: Partial, +): boolean { + if (!trainer || typeof trainer !== "object") return false; + const fn = (trainer as Record)[ + TRAINER_REPLACE_CALLBACKS_KEY + ]; + if (typeof fn !== "function") return false; + try { + (fn as (cbs: Partial) => void).call(trainer, callbacks); + return true; + } catch { + return false; + } +} diff --git a/packages/arkor/src/core/types.ts b/packages/arkor/src/core/types.ts index c42be926..c0ec4d31 100644 --- a/packages/arkor/src/core/types.ts +++ b/packages/arkor/src/core/types.ts @@ -211,18 +211,6 @@ export interface Trainer { * resolves immediately without contacting the cloud API. */ requestEarlyStop(opts?: { timeoutMs?: number }): Promise; - /** - * Atomically swap the lifecycle callbacks while the run is in flight. The - * next dispatched event (`onLog`, `onCheckpoint`, ...) reads from the new - * object; events already mid-await keep their old reference until they - * resolve. Used by `arkor dev`'s HMR pipeline to hot-swap callback code - * without restarting the cloud-side training. - * - * Cloud-side config (model, dataset, hyperparameters) is fixed at - * `start()` time and **cannot** be changed via this method — for those - * use `requestEarlyStop()` and let HMR re-spawn the run. 
- */ - replaceCallbacks(callbacks: Partial): void; } /** From a617a5609659ebfb51afdbf3a6446b7dc519c910 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 2 May 2026 21:54:06 +0900 Subject: [PATCH 04/55] Refactor Trainer API to remove public requestEarlyStop method and implement internal early-stop handling - Removed the public `requestEarlyStop` method from the Trainer interface to prevent exposure of the early-stop functionality. - Introduced an internal mechanism for early stopping using a `Symbol.for`-keyed brand, allowing for graceful stopping after the next checkpoint without affecting the public API. - Updated signal handling to ensure seamless integration with the new early-stop logic, enhancing the hot module replacement (HMR) experience. - Revised documentation to reflect changes in the Trainer API and clarify the internal early-stop management process. --- AGENTS.md | 4 +- docs/concepts/studio.mdx | 4 +- docs/ja/concepts/studio.mdx | 4 +- docs/ja/sdk/trainer-control.mdx | 22 +--- docs/sdk/trainer-control.mdx | 22 +--- packages/arkor/src/core/arkor.test.ts | 1 - packages/arkor/src/core/runner.test.ts | 14 ++- packages/arkor/src/core/runnerSignals.test.ts | 15 +-- packages/arkor/src/core/runnerSignals.ts | 15 ++- packages/arkor/src/core/trainer.test.ts | 24 +++-- packages/arkor/src/core/trainer.ts | 100 ++++++++++-------- packages/arkor/src/core/trainerInspection.ts | 57 ++++++++++ packages/arkor/src/core/types.ts | 11 -- 13 files changed, 171 insertions(+), 122 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index a4740767..b98fb196 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -80,9 +80,9 @@ When touching the Studio server or SPA fetch layer, preserve: token via header f When a rebuild lands while a `/api/train`-spawned subprocess is in flight, the server makes a per-child decision in [packages/arkor/src/studio/trainRegistry.ts](packages/arkor/src/studio/trainRegistry.ts): - **`configHash` matches the spawn-time 
hash** → SIGUSR2. The child's `installCallbackReloadHandler` re-imports the artifact and rotates the trainer's callback cell via the internal `Symbol.for("arkor.trainer.replaceCallbacks")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts). The cloud-side run is untouched. Use this whenever a code change is contained inside the `callbacks: { ... }` object. Don't add a `replaceCallbacks()` method to the public `Trainer` interface — keeping the mutator behind a `Symbol.for` brand is what stops the dev-only HMR primitive from leaking into the SDK's published surface. -- **`configHash` differs (or is null because the new bundle didn't inspect)** → SIGTERM. `installShutdownHandlers` calls `Trainer.requestEarlyStop()`, which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. +- **`configHash` differs (or is null because the new bundle didn't inspect)** → SIGTERM. `installShutdownHandlers` drives the trainer's internal early-stop entry point via the `Symbol.for("arkor.trainer.requestEarlyStop")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts), which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. A third-party trainer that doesn't carry the brand falls back to a plain `cancel()`. 
-Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. Don't widen the SIGUSR2 path to "always hot-swap, server-side": the `configHash` check is what guarantees a hot-swap can't silently leave a child running with a stale `JobConfig`. +Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. Don't widen the SIGUSR2 path to "always hot-swap, server-side": the `configHash` check is what guarantees a hot-swap can't silently leave a child running with a stale `JobConfig`. Don't surface `requestEarlyStop()` (or `replaceCallbacks()`) as a method on the public `Trainer` interface — both are dev-only HMR primitives, and keeping them behind `Symbol.for` brands is what stops them from leaking into the published SDK shape; user code that wants similar semantics should compose `abortSignal` + `cancel()` per the cookbook. ### Project entry-point discovery diff --git a/docs/concepts/studio.mdx b/docs/concepts/studio.mdx index 900a02d3..8b687e9f 100644 --- a/docs/concepts/studio.mdx +++ b/docs/concepts/studio.mdx @@ -16,7 +16,9 @@ Three jobs: A note on the dev loop: Studio runs a [Rolldown](https://rolldown.rs) watcher over `src/arkor/` and pushes rebuild notifications to the SPA over a Server-Sent Events stream (`/api/dev/events`). Edit a file, save, and the Run training button updates with the new trainer name without a refresh. If a training run is in flight, the Studio compares the new bundle's cloud-side `JobConfig` hash to the one captured when the run was spawned: - **Same hash (only callbacks changed).** The runner is signalled with SIGUSR2; it re-imports the rebuilt artifact and rotates the trainer's callback cell in place via an internal HMR brand. 
The cloud-side training run is untouched, no GPU time is wasted, and the SPA shows a brief "Callbacks hot-swapped" indicator. -- **Different hash (model / dataset / hyperparameters changed).** The runner is signalled with SIGTERM; `Trainer.requestEarlyStop` lets the next checkpoint upload finish before issuing `cancel()`, then the SPA re-spawns the run with the rebuilt artifact. The previous Cloud-side job reaches `cancelled` after the checkpoint is uploaded, so the partial work is preserved as an artifact. +- **Different hash (model / dataset / hyperparameters changed).** The runner is signalled with SIGTERM; the trainer's internal early-stop entry point lets the next checkpoint upload finish before issuing `cancel()`, then the SPA re-spawns the run with the rebuilt artifact. The previous Cloud-side job reaches `cancelled` after the checkpoint is uploaded, so the partial work is preserved as an artifact. + +If you want this "stop after the next checkpoint" behaviour from your own code (rather than from the dev loop), build it on top of the public [`abortSignal` + `cancel()`](/sdk/trainer-control#abortsignal) pair — the [Early stopping recipe](/cookbook/early-stopping) walks through it. The HMR pipeline uses an internal SDK early-stop primitive for this, but it isn't part of the public `Trainer` surface.
## Where Studio runs diff --git a/docs/ja/concepts/studio.mdx b/docs/ja/concepts/studio.mdx index ed58779f..8c862571 100644 --- a/docs/ja/concepts/studio.mdx +++ b/docs/ja/concepts/studio.mdx @@ -16,7 +16,9 @@ Studio は `arkor dev` 実行時に得られるローカル Web UI です。サ dev ループのメモ: Studio は [Rolldown](https://rolldown.rs) のウォッチャを `src/arkor/` 上で常駐させ、再ビルド通知を Server-Sent Events ストリーム (`/api/dev/events`) で SPA に push します。ファイルを編集して保存すれば、Run training ボタンのトレーナー名表示はリロード無しで更新されます。学習が走っている最中であれば、Studio は再ビルドしたバンドルの Cloud 側 `JobConfig` ハッシュを、spawn 時に保存したハッシュと比較します。 - **ハッシュ一致(コールバックのみ変更)。** ランナーへ SIGUSR2 を送ります。ランナーは再ビルドされた成果物を再 import し、内部 HMR ブランド経由でトレーナーのコールバック cell をその場で差し替えます。Cloud 側の学習はそのまま継続し、GPU 時間を無駄にせず、SPA には "Callbacks hot-swapped" と短く表示されます。 -- **ハッシュ不一致(モデル / データセット / ハイパーパラメータが変わった)。** ランナーへ SIGTERM を送ります。`Trainer.requestEarlyStop` が次のチェックポイントのアップロードを待ってから `cancel()` を発火し、SPA が再ビルドした成果物で再投入します。Cloud 側の以前のジョブはチェックポイントのアップロード完了後に `cancelled` 状態に遷移するので、ここまでの学習成果は artifact として保全されます。 +- **ハッシュ不一致(モデル / データセット / ハイパーパラメータが変わった)。** ランナーへ SIGTERM を送ります。トレーナー内部の early-stop エントリが次のチェックポイントのアップロードを待ってから `cancel()` を発火し、SPA が再ビルドした成果物で再投入します。Cloud 側の以前のジョブはチェックポイントのアップロード完了後に `cancelled` 状態に遷移するので、ここまでの学習成果は artifact として保全されます。 + +自前のコードから(dev ループではなく)この「次のチェックポイントで止める」挙動が欲しい場合は、公開 API の [`abortSignal` + `cancel()`](/ja/sdk/trainer-control#abortsignal) を組み合わせて書いてください。具体的な手順は [Early Stopping レシピ](/ja/cookbook/early-stopping) にあります。HMR パイプラインは同名の SDK 内部プリミティブを使っていますが、公開の `Trainer` インターフェイスには含まれていません。 ## Studio が動く場所 diff --git a/docs/ja/sdk/trainer-control.mdx b/docs/ja/sdk/trainer-control.mdx index 8683d3d5..6a6bd74f 100644 --- a/docs/ja/sdk/trainer-control.mdx +++ b/docs/ja/sdk/trainer-control.mdx @@ -5,7 +5,7 @@ description: "start、wait、cancel、abortSignal、再接続の仕組み。" # トレーナー制御 -`createTrainer` は次の 4 メソッドを持つ `Trainer` オブジェクトを返します: +`createTrainer` は次の 3 メソッドを持つ `Trainer` オブジェクトを返します: ```ts interface Trainer { @@ -13,7 +13,6 @@ interface Trainer { start(): Promise<{ jobId: string }>; 
wait(): Promise; cancel(): Promise; - requestEarlyStop(opts?: { timeoutMs?: number }): Promise; } interface TrainingResult { @@ -55,25 +54,6 @@ await trainer.cancel(); - そうでなければバックエンドにキャンセルリクエストを送ります。 - **ベストエフォート。** SDK は終端ステータスでショートサーキットしません。学習が既に completed / failed / cancelled なら、バックエンドは non-2xx を返すことがあり `cancel()` は reject します。投機的に呼ぶなら `try / catch` で囲んでください。 -## `requestEarlyStop()` - -```ts -await trainer.requestEarlyStop(); -// もしくはチェックポイント待ちのデッドラインを指定: -await trainer.requestEarlyStop({ timeoutMs: 60_000 }); -``` - -「直近のチェックポイントを保全する」 `cancel()` の兄弟版です。 - -- ラッチを armed にします。トレーナーは **次の** `checkpoint.saved` イベントが来るまで実行を続けます。チェックポイントが永続化された時点で SDK が代わりに `cancel()` を呼び、戻り値の Promise を resolve します。 -- `timeoutMs`(デフォルト: 5 分)以内にチェックポイントが来なかった場合は即時 `cancel()` にフォールバックします。`saveSteps` の間隔がそれより長い場合はこの値を調整してください。 -- 冪等: 連続して呼んでも同じ in-flight Promise を共有し、`cancel()` は 1 度しか発火しません。 -- `start()` 前、もしくはジョブが既に終端ステータスに達している場合は何もしません。 - -これは `arkor dev` の HMR パイプラインが内部で使っている API です。実行中にソースファイルを保存すると Studio が spawn 済みの `arkor start` プロセスに `SIGTERM` を送り、シグナルハンドラが `requestEarlyStop()` を呼んでチェックポイントのアップロード完了後にクリーンに終了します。Cloud 側のジョブは `cancelled` ステータスで完了します。 - -自前のコード(プログラム的な two-process パターンなど)から `requestEarlyStop()` を直接呼ぶこともできます。Cookbook の [Early Stopping](/ja/cookbook/early-stopping) レシピが `onCheckpoint` + `abortSignal` + `cancel()` で組み立てているのと同じ「実行中ステップを捨てずに止める」セマンティクスを、ワンショットで提供します。レシピ版の方が柔軟(メトリクス次第で abort のタイミングを決めるなど)ですが、こちらは「次のチェックポイントで止める」という典型ケースの便利フックです。 - ## `abortSignal` ```ts diff --git a/docs/sdk/trainer-control.mdx b/docs/sdk/trainer-control.mdx index c43a404b..ef40cf46 100644 --- a/docs/sdk/trainer-control.mdx +++ b/docs/sdk/trainer-control.mdx @@ -5,7 +5,7 @@ description: "start, wait, cancel, abortSignal, and how reconnects work." 
# Trainer control -`createTrainer` returns a `Trainer` object with four methods: +`createTrainer` returns a `Trainer` object with three methods: ```ts interface Trainer { @@ -13,7 +13,6 @@ interface Trainer { start(): Promise<{ jobId: string }>; wait(): Promise; cancel(): Promise; - requestEarlyStop(opts?: { timeoutMs?: number }): Promise; } interface TrainingResult { @@ -55,25 +54,6 @@ await trainer.cancel(); - Otherwise it sends a cancel request to the backend. - **Best-effort.** The SDK does not short-circuit on terminal status; if the run already completed, failed, or was cancelled, the backend may return a non-2xx and `cancel()` rejects. Wrap in `try / catch` if you call it speculatively. -## `requestEarlyStop()` - -```ts -await trainer.requestEarlyStop(); -// or with a custom checkpoint deadline: -await trainer.requestEarlyStop({ timeoutMs: 60_000 }); -``` - -The "preserve the latest checkpoint" sibling of `cancel()`: - -- Arms a latch. The trainer keeps running until the **next** `checkpoint.saved` event lands. Once the checkpoint is durable, the SDK calls `cancel()` for you and resolves the returned promise. -- If no checkpoint arrives within `timeoutMs` (default: 5 minutes), falls back to `cancel()` immediately. Tune this if your `saveSteps` cadence is longer than 5 min. -- Idempotent — repeat calls share the same in-flight promise and only fire `cancel()` once. -- A no-op when called before `start()` or after the job has already reached a terminal status. - -This is what `arkor dev`'s HMR pipeline uses internally: when you save a source file mid-run, Studio sends `SIGTERM` to the spawned `arkor start` process; that process catches the signal, calls `requestEarlyStop()`, and exits cleanly once the checkpoint is uploaded. The Cloud-side job ends in the `cancelled` status. - -You can use `requestEarlyStop()` directly from your own code (e.g. 
in a programmatic two-process pattern) if you want the same "stop, but don't throw away the in-flight step" semantics that the cookbook's [Early stopping](/cookbook/early-stopping) recipe builds out of `onCheckpoint` + `abortSignal` + `cancel()`. The recipe is more flexible (you decide when to abort based on metrics); this method is the convenience hook for the common "stop after the next checkpoint" case. - ## `abortSignal` ```ts diff --git a/packages/arkor/src/core/arkor.test.ts b/packages/arkor/src/core/arkor.test.ts index d3dc41a7..64e5e82e 100644 --- a/packages/arkor/src/core/arkor.test.ts +++ b/packages/arkor/src/core/arkor.test.ts @@ -23,7 +23,6 @@ function fakeTrainer(name = "run"): Trainer { }; }, async cancel() {}, - async requestEarlyStop() {}, }; } diff --git a/packages/arkor/src/core/runner.test.ts b/packages/arkor/src/core/runner.test.ts index c6f0a087..bfa63525 100644 --- a/packages/arkor/src/core/runner.test.ts +++ b/packages/arkor/src/core/runner.test.ts @@ -33,7 +33,6 @@ function fakeTrainer(onStart?: () => void, onWait?: () => void): Trainer { }; }, async cancel() {}, - async requestEarlyStop() {}, }; } @@ -215,7 +214,12 @@ describe("runTrainer — shutdown signal handling", () => { // (via a global helper). This lets us hold the run in flight long // enough to assert both signal-handling branches without racing the // `finally` block that removes the listeners. + // The fake trainer wears the early-stop brand + // (`Symbol.for("arkor.trainer.requestEarlyStop")`) so the runner's + // SIGTERM handler invokes it the same way the SDK-provided trainer + // does. No public `requestEarlyStop` method exists any more. 
const trainerSrc = ` + const KEY = Symbol.for("arkor.trainer.requestEarlyStop"); let earlyStopCalls = 0; let resolveWait; const waitPromise = new Promise((r) => { resolveWait = r; }); @@ -231,13 +235,17 @@ describe("runTrainer — shutdown signal handling", () => { artifacts: [], }), }; - export const trainer = { + const trainer = { name: "n", start: async () => ({ jobId: "j1" }), wait: () => waitPromise, cancel: async () => {}, - requestEarlyStop: async () => { earlyStopCalls++; }, }; + Object.defineProperty(trainer, KEY, { + value: async () => { earlyStopCalls++; }, + enumerable: false, + }); + export { trainer }; `; const entry = join(cwd, "src/arkor/index.mjs"); mkdirSync(join(cwd, "src/arkor"), { recursive: true }); diff --git a/packages/arkor/src/core/runnerSignals.test.ts b/packages/arkor/src/core/runnerSignals.test.ts index 847c8900..aeac9ab4 100644 --- a/packages/arkor/src/core/runnerSignals.test.ts +++ b/packages/arkor/src/core/runnerSignals.test.ts @@ -9,6 +9,7 @@ import { import type { Trainer, TrainerCallbacks } from "./types"; import { attachTrainerCallbackReplacer, + attachTrainerEarlyStopper, attachTrainerInspection, } from "./trainerInspection"; @@ -37,16 +38,17 @@ function makeTrainer(): Trainer & { throw new Error("not used"); }, async cancel() {}, - async requestEarlyStop() { - earlyStop.calls += 1; - }, }; - // Wire the internal callback-replacer brand the same way `createTrainer` - // does. The SIGUSR2 path looks the brand up via `replaceTrainerCallbacks` - // — there's no public method on `Trainer` for this any more. + // Wire the internal callback-replacer + early-stop brands the same + // way `createTrainer` does. SIGUSR2 looks them up via + // `replaceTrainerCallbacks` and SIGTERM via `requestTrainerEarlyStop` + // — there are no public methods on `Trainer` for either any more. 
attachTrainerCallbackReplacer(trainer, (cbs) => { replace.lastCallbacks = cbs; }); + attachTrainerEarlyStopper(trainer, async () => { + earlyStop.calls += 1; + }); return Object.assign(trainer, { __earlyStop: earlyStop, __replace: replace, @@ -118,7 +120,6 @@ describe("installCallbackReloadHandler", () => { start: async () => ({ jobId: "j" }), wait: async () => ({ job: {}, artifacts: [] }), cancel: async () => {}, - requestEarlyStop: async () => {}, }; Object.defineProperty(trainer, KEY, { value: () => ({ name: "t", config: { model: "m", datasetSource: { type: "huggingface", name: "x" } }, callbacks }), diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index 92f53f44..fcaaed3b 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -3,6 +3,7 @@ import { isArkor } from "./arkor"; import { getTrainerInspection, replaceTrainerCallbacks, + requestTrainerEarlyStop, } from "./trainerInspection"; import type { Trainer, TrainerCallbacks } from "./types"; @@ -41,8 +42,18 @@ export function installShutdownHandlers(trainer: Trainer): () => void { process.stdout.write( `Received ${signal}; early-stopping at next checkpoint…\n`, ); - trainer - .requestEarlyStop() + // Drive the trainer's internal early-stop entry point via the + // `Symbol.for("arkor.trainer.requestEarlyStop")` brand. A trainer + // that doesn't carry the brand (third-party shape, pre-SDK trainer) + // returns `null`; fall back to `cancel()` directly so we still + // close out the cloud-side job before exiting. + const stop = + requestTrainerEarlyStop(trainer) ?? + trainer.cancel().catch((err: unknown) => { + const msg = err instanceof Error ? err.message : String(err); + process.stderr.write(`cancel failed: ${msg}\n`); + }); + Promise.resolve(stop) .catch((err: unknown) => { const msg = err instanceof Error ? 
err.message : String(err); process.stderr.write(`requestEarlyStop failed: ${msg}\n`); diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index ae0e5d2b..9e1c3726 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -3,7 +3,10 @@ import { mkdtempSync, rmSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { createTrainer } from "./trainer"; -import { replaceTrainerCallbacks } from "./trainerInspection"; +import { + replaceTrainerCallbacks, + requestTrainerEarlyStop, +} from "./trainerInspection"; import { writeState } from "./state"; import type { AnonymousCredentials } from "./credentials"; @@ -1394,7 +1397,7 @@ describe("createTrainer (early stop)", () => { // dispatch loop isn't blocked waiting for the latch's own // checkpoint trigger to arrive. onLog: () => { - void trainer.requestEarlyStop({ timeoutMs: 60_000 }); + void requestTrainerEarlyStop(trainer, { timeoutMs: 60_000 }); }, }, }, @@ -1482,8 +1485,9 @@ describe("createTrainer (early stop)", () => { try { await trainer.start(); // Tiny timeout so the test doesn't actually wait 5 minutes. - const stopPromise = trainer.requestEarlyStop({ timeoutMs: 5 }); - await stopPromise; + const stopPromise = requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); + expect(stopPromise).not.toBeNull(); + await stopPromise!; expect(cancelCalls).toBe(1); } finally { globalThis.fetch = original; @@ -1500,7 +1504,7 @@ describe("createTrainer (early stop)", () => { { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, ); // Should resolve without contacting cloud-api at all. 
- await trainer.requestEarlyStop({ timeoutMs: 1 }); + await requestTrainerEarlyStop(trainer, { timeoutMs: 1 }); }); it("replaceTrainerCallbacks (internal HMR brand) swaps the dispatched callbacks on the next event", async () => { @@ -1629,11 +1633,13 @@ describe("createTrainer (early stop)", () => { globalThis.fetch = fetcher; try { await trainer.start(); - const a = trainer.requestEarlyStop({ timeoutMs: 5 }); - const b = trainer.requestEarlyStop({ timeoutMs: 5 }); - await Promise.all([a, b]); + const a = requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); + const b = requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); + expect(a).not.toBeNull(); + expect(b).not.toBeNull(); + await Promise.all([a!, b!]); // The fallback timer fires once, so cancel is called once even though - // requestEarlyStop was called twice. + // the early-stop brand was invoked twice. expect(cancelCalls).toBe(1); } finally { globalThis.fetch = original; diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index 8cde4346..ae657610 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -8,7 +8,9 @@ import { import { ensureProjectState } from "./projectState"; import { attachTrainerCallbackReplacer, + attachTrainerEarlyStopper, attachTrainerInspection, + type RequestEarlyStopOptions, } from "./trainerInspection"; import type { CheckpointContext, @@ -426,55 +428,66 @@ export function createTrainer( const client = await getClient(); await client.cancelJob(startedJob.id, scope); }, + }; - async requestEarlyStop(opts: { timeoutMs?: number } = {}): Promise { - // Nothing in flight: cleanup any prior latch and resolve. 
- if (!startedJob || !scope || TERMINAL_STATUSES.has(startedJob.status)) { - if (earlyStopDeferred) { - if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); - earlyStopDeferred.resolve(); - earlyStopDeferred = null; - } - earlyStopRequested = false; - return; + /** + * Internal "stop after next checkpoint" entry point. Hidden behind a + * `Symbol.for` brand so the runner subprocess's SIGTERM handler (in + * `runnerSignals.ts`) can drive a graceful early-stop without us + * exposing the operation on the public `Trainer` interface. User code + * that wants the same semantics should compose `abortSignal` + + * `cancel()` per `docs/cookbook/early-stopping.mdx`. + */ + async function requestEarlyStop( + opts: RequestEarlyStopOptions = {}, + ): Promise { + // Nothing in flight: cleanup any prior latch and resolve. + if (!startedJob || !scope || TERMINAL_STATUSES.has(startedJob.status)) { + if (earlyStopDeferred) { + if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); + earlyStopDeferred.resolve(); + earlyStopDeferred = null; } - // Idempotent: a second call piggybacks on the first. - if (earlyStopDeferred) return earlyStopDeferred.promise; + earlyStopRequested = false; + return; + } + // Idempotent: a second call piggybacks on the first. + if (earlyStopDeferred) return earlyStopDeferred.promise; - earlyStopRequested = true; - let resolveFn!: () => void; - const promise = new Promise((resolve) => { - resolveFn = resolve; - }); - const timeoutMs = opts.timeoutMs ?? DEFAULT_EARLY_STOP_TIMEOUT_MS; - const timer = setTimeout(() => { - // Timed out waiting for a checkpoint — fall back to immediate cancel. - // Capture the active deferred reference: by the time the cancel POST - // resolves, the checkpoint branch may have nulled out the shared - // slot, but this fallback path still owns the deferred it created. 
- const active = earlyStopDeferred; - trainer - .cancel() - .catch(() => {}) - .finally(() => { - if (active) active.resolve(); - if (earlyStopDeferred === active) earlyStopDeferred = null; - }); - }, timeoutMs); - // `Timer.unref` keeps the early-stop timer from blocking process exit - // when the host runtime finishes for unrelated reasons. - timer.unref?.(); - earlyStopDeferred = { promise, resolve: resolveFn, timer }; - return promise; - }, - }; + earlyStopRequested = true; + let resolveFn!: () => void; + const promise = new Promise((resolve) => { + resolveFn = resolve; + }); + const timeoutMs = opts.timeoutMs ?? DEFAULT_EARLY_STOP_TIMEOUT_MS; + const timer = setTimeout(() => { + // Timed out waiting for a checkpoint — fall back to immediate cancel. + // Capture the active deferred reference: by the time the cancel POST + // resolves, the checkpoint branch may have nulled out the shared + // slot, but this fallback path still owns the deferred it created. + const active = earlyStopDeferred; + trainer + .cancel() + .catch(() => {}) + .finally(() => { + if (active) active.resolve(); + if (earlyStopDeferred === active) earlyStopDeferred = null; + }); + }, timeoutMs); + // `Timer.unref` keeps the early-stop timer from blocking process exit + // when the host runtime finishes for unrelated reasons. + timer.unref?.(); + earlyStopDeferred = { promise, resolve: resolveFn, timer }; + return promise; + } // Brand the trainer with the HMR control surface so the Studio server // can (a) hash the cloud-side config to decide between hot-swap and - // restart and (b) atomically swap the callbacks cell from the runner - // subprocess. Both brands live behind `Symbol.for` keys so they don't - // appear on the public `Trainer` interface — see - // `trainerInspection.ts` for the rationale. + // restart, (b) atomically swap the callbacks cell from the runner + // subprocess on SIGUSR2, and (c) drive a graceful "stop after the + // next checkpoint" on SIGTERM. 
All three brands live behind + // `Symbol.for` keys so they don't appear on the public `Trainer` + // interface — see `trainerInspection.ts` for the rationale. attachTrainerInspection(trainer, () => ({ name: input.name, config, @@ -483,6 +496,7 @@ export function createTrainer( attachTrainerCallbackReplacer(trainer, (callbacks) => { currentCallbacks = callbacks ?? {}; }); + attachTrainerEarlyStopper(trainer, requestEarlyStop); return trainer; } diff --git a/packages/arkor/src/core/trainerInspection.ts b/packages/arkor/src/core/trainerInspection.ts index 343ae491..9181f8c5 100644 --- a/packages/arkor/src/core/trainerInspection.ts +++ b/packages/arkor/src/core/trainerInspection.ts @@ -39,6 +39,14 @@ const TRAINER_INSPECTION_KEY = Symbol.for("arkor.trainer.inspect"); const TRAINER_REPLACE_CALLBACKS_KEY = Symbol.for( "arkor.trainer.replaceCallbacks", ); +const TRAINER_REQUEST_EARLY_STOP_KEY = Symbol.for( + "arkor.trainer.requestEarlyStop", +); + +export interface RequestEarlyStopOptions { + /** Default: 5 min. Falls back to immediate cancel if no checkpoint arrives. */ + timeoutMs?: number; +} /** * Stamp the inspection snapshot onto a freshly-built `Trainer` instance. @@ -125,3 +133,52 @@ export function replaceTrainerCallbacks( return false; } } + +/** + * Wire an early-stop entry point onto a `Trainer` so the SIGTERM handler + * in the runner subprocess can request a graceful "stop after the next + * checkpoint" without us exposing the operation on the public `Trainer` + * interface. User code that wants the same semantics should compose + * the cookbook's `abortSignal` + `cancel()` recipe instead — see + * `docs/cookbook/early-stopping.mdx`. 
+ */ +export function attachTrainerEarlyStopper( + trainer: object, + requestStop: (opts?: RequestEarlyStopOptions) => Promise, +): void { + Object.defineProperty(trainer, TRAINER_REQUEST_EARLY_STOP_KEY, { + value: requestStop, + configurable: true, + enumerable: false, + writable: false, + }); +} + +/** + * Request that the trainer stop after the next saved checkpoint. + * Returns the same promise the underlying implementation hands out — + * resolves once `cancel()` has been accepted by the cloud API, or + * after `timeoutMs` if no checkpoint arrived in time. + * + * Returns `null` when the trainer doesn't carry the early-stop brand + * (third-party wrapper / pre-SDK shape) so callers can decide whether + * to fall back to a hard kill. + */ +export function requestTrainerEarlyStop( + trainer: unknown, + opts?: RequestEarlyStopOptions, +): Promise | null { + if (!trainer || typeof trainer !== "object") return null; + const fn = (trainer as Record)[ + TRAINER_REQUEST_EARLY_STOP_KEY + ]; + if (typeof fn !== "function") return null; + try { + const result = ( + fn as (opts?: RequestEarlyStopOptions) => Promise + ).call(trainer, opts); + return Promise.resolve(result); + } catch (err) { + return Promise.reject(err); + } +} diff --git a/packages/arkor/src/core/types.ts b/packages/arkor/src/core/types.ts index c0ec4d31..e5fe1f26 100644 --- a/packages/arkor/src/core/types.ts +++ b/packages/arkor/src/core/types.ts @@ -200,17 +200,6 @@ export interface Trainer { wait(): Promise; /** Best-effort cancel; resolves once the cloud API accepts the request. */ cancel(): Promise; - /** - * Stop after the next saved checkpoint. The trainer keeps running, lets the - * in-flight step finish + checkpoint upload complete, then issues `cancel()`. - * Resolves once the cancel POST has been accepted. Falls back to immediate - * cancel if no checkpoint arrives within `timeoutMs` (default: 5 min). - * - * Idempotent: repeat calls return the same in-flight promise. 
If the job - * has not been `start()`ed or has already reached a terminal status, this - * resolves immediately without contacting the cloud API. - */ - requestEarlyStop(opts?: { timeoutMs?: number }): Promise; } /** From ef6757f3e0cdc013fab6ce1dd2ecf108d43dd1ae Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 2 May 2026 22:02:01 +0900 Subject: [PATCH 05/55] Refactor Trainer API to ensure internal handling of early-stop and callback replacement - Updated the Trainer API to remove public exposure of `requestEarlyStop` and `replaceCallbacks` methods, enhancing encapsulation. - Implemented internal mechanisms for early stopping and callback swapping using `Symbol.for`-keyed brands, ensuring seamless integration during training runs. - Revised signal handling to improve the hot module replacement (HMR) experience and maintain clean resource management. - Updated documentation to reflect these changes and clarify the internal management processes for developers. --- AGENTS.md | 2 +- docs/concepts/studio.mdx | 2 +- docs/ja/concepts/studio.mdx | 2 +- packages/arkor/src/core/runnerSignals.ts | 23 ++------ packages/arkor/src/core/trainer.test.ts | 11 ++-- packages/arkor/src/core/trainerInspection.ts | 56 +++++++------------- 6 files changed, 31 insertions(+), 65 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index b98fb196..346423ce 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -80,7 +80,7 @@ When touching the Studio server or SPA fetch layer, preserve: token via header f When a rebuild lands while a `/api/train`-spawned subprocess is in flight, the server makes a per-child decision in [packages/arkor/src/studio/trainRegistry.ts](packages/arkor/src/studio/trainRegistry.ts): - **`configHash` matches the spawn-time hash** → SIGUSR2. 
The child's `installCallbackReloadHandler` re-imports the artifact and rotates the trainer's callback cell via the internal `Symbol.for("arkor.trainer.replaceCallbacks")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts). The cloud-side run is untouched. Use this whenever a code change is contained inside the `callbacks: { ... }` object. Don't add a `replaceCallbacks()` method to the public `Trainer` interface — keeping the mutator behind a `Symbol.for` brand is what stops the dev-only HMR primitive from leaking into the SDK's published surface. -- **`configHash` differs (or is null because the new bundle didn't inspect)** → SIGTERM. `installShutdownHandlers` drives the trainer's internal early-stop entry point via the `Symbol.for("arkor.trainer.requestEarlyStop")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts), which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. A third-party trainer that doesn't carry the brand falls back to a plain `cancel()`. +- **`configHash` differs (or is null because the new bundle didn't inspect)** → SIGTERM. `installShutdownHandlers` drives the trainer's internal early-stop entry point via the `Symbol.for("arkor.trainer.requestEarlyStop")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts), which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. 
A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. Don't widen the SIGUSR2 path to "always hot-swap, server-side": the `configHash` check is what guarantees a hot-swap can't silently leave a child running with a stale `JobConfig`. Don't surface `requestEarlyStop()` (or `replaceCallbacks()`) as a method on the public `Trainer` interface — both are dev-only HMR primitives, and keeping them behind `Symbol.for` brands is what stops them from leaking into the published SDK shape; user code that wants similar semantics should compose `abortSignal` + `cancel()` per the cookbook. diff --git a/docs/concepts/studio.mdx b/docs/concepts/studio.mdx index 8b687e9f..b91b831c 100644 --- a/docs/concepts/studio.mdx +++ b/docs/concepts/studio.mdx @@ -18,7 +18,7 @@ A note on the dev loop: Studio runs a [Rolldown](https://rolldown.rs) watcher ov - **Same hash (only callbacks changed).** The runner is signalled with SIGUSR2; it re-imports the rebuilt artifact and rotates the trainer's callback cell in place via an internal HMR brand. The cloud-side training run is untouched, no GPU time is wasted, and the SPA shows a brief "Callbacks hot-swapped" indicator. - **Different hash (model / dataset / hyperparameters changed).** The runner is signalled with SIGTERM; the trainer's internal early-stop entry point lets the next checkpoint upload finish before issuing `cancel()`, then the SPA re-spawns the run with the rebuilt artifact. The previous Cloud-side job reaches `cancelled` after the checkpoint is uploaded, so the partial work is preserved as an artifact. 
-If you want this "stop after the next checkpoint" behaviour from your own code (rather than from the dev loop), build it on top of the public [`abortSignal` + `cancel()`](/sdk/trainer-control#abortsignal) pair — the [Early stopping recipe](/cookbook/early-stopping) walks through it. The HMR pipeline uses an internal SDK primitive of the same name, but it isn't part of the public `Trainer` surface. +If you want this "stop after the next checkpoint" behaviour from your own code (rather than from the dev loop), build it on top of the public [`abortSignal` + `cancel()`](/sdk/trainer-control#abortsignal) pair — the [Early stopping recipe](/cookbook/early-stopping) walks through it. ## Where Studio runs diff --git a/docs/ja/concepts/studio.mdx b/docs/ja/concepts/studio.mdx index 8c862571..0f8534f6 100644 --- a/docs/ja/concepts/studio.mdx +++ b/docs/ja/concepts/studio.mdx @@ -18,7 +18,7 @@ dev ループのメモ: Studio は [Rolldown](https://rolldown.rs) のウォッ - **ハッシュ一致(コールバックのみ変更)。** ランナーへ SIGUSR2 を送ります。ランナーは再ビルドされた成果物を再 import し、内部 HMR ブランド経由でトレーナーのコールバック cell をその場で差し替えます。Cloud 側の学習はそのまま継続し、GPU 時間を無駄にせず、SPA には "Callbacks hot-swapped" と短く表示されます。 - **ハッシュ不一致(モデル / データセット / ハイパーパラメータが変わった)。** ランナーへ SIGTERM を送ります。トレーナー内部の early-stop エントリが次のチェックポイントのアップロードを待ってから `cancel()` を発火し、SPA が再ビルドした成果物で再投入します。Cloud 側の以前のジョブはチェックポイントのアップロード完了後に `cancelled` 状態に遷移するので、ここまでの学習成果は artifact として保全されます。 -自前のコードから(dev ループではなく)この「次のチェックポイントで止める」挙動が欲しい場合は、公開 API の [`abortSignal` + `cancel()`](/ja/sdk/trainer-control#abortsignal) を組み合わせて書いてください。具体的な手順は [Early Stopping レシピ](/ja/cookbook/early-stopping) にあります。HMR パイプラインは同名の SDK 内部プリミティブを使っていますが、公開の `Trainer` インターフェイスには含まれていません。 +自前のコードから(dev ループではなく)この「次のチェックポイントで止める」挙動が欲しい場合は、公開 API の [`abortSignal` + `cancel()`](/ja/sdk/trainer-control#abortsignal) を組み合わせて書いてください。具体的な手順は [Early Stopping レシピ](/ja/cookbook/early-stopping) にあります。 ## Studio が動く場所 diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index 
fcaaed3b..d30f429f 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -43,17 +43,10 @@ export function installShutdownHandlers(trainer: Trainer): () => void { `Received ${signal}; early-stopping at next checkpoint…\n`, ); // Drive the trainer's internal early-stop entry point via the - // `Symbol.for("arkor.trainer.requestEarlyStop")` brand. A trainer - // that doesn't carry the brand (third-party shape, pre-SDK trainer) - // returns `null`; fall back to `cancel()` directly so we still - // close out the cloud-side job before exiting. - const stop = - requestTrainerEarlyStop(trainer) ?? - trainer.cancel().catch((err: unknown) => { - const msg = err instanceof Error ? err.message : String(err); - process.stderr.write(`cancel failed: ${msg}\n`); - }); - Promise.resolve(stop) + // `Symbol.for("arkor.trainer.requestEarlyStop")` brand attached by + // `createTrainer`. The runner only reaches this handler with a + // discovered SDK trainer, so the brand is guaranteed to be present. + requestTrainerEarlyStop(trainer) .catch((err: unknown) => { const msg = err instanceof Error ? 
err.message : String(err); process.stderr.write(`requestEarlyStop failed: ${msg}\n`); @@ -93,13 +86,7 @@ export function installCallbackReloadHandler( ); return; } - const swapped = replaceTrainerCallbacks(trainer, callbacks); - if (!swapped) { - process.stderr.write( - "Callback reload skipped: running trainer doesn't carry the callback-replacer brand.\n", - ); - return; - } + replaceTrainerCallbacks(trainer, callbacks); process.stdout.write( "Callbacks hot-reloaded; training run continues.\n", ); diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 9e1c3726..b56d60d0 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1485,9 +1485,7 @@ describe("createTrainer (early stop)", () => { try { await trainer.start(); // Tiny timeout so the test doesn't actually wait 5 minutes. - const stopPromise = requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); - expect(stopPromise).not.toBeNull(); - await stopPromise!; + await requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); expect(cancelCalls).toBe(1); } finally { globalThis.fetch = original; @@ -1573,10 +1571,9 @@ describe("createTrainer (early stop)", () => { // brand — the same brand `arkor dev`'s SIGUSR2 handler // uses. The next event must dispatch via the new object. if (step === 1) { - const swapped = replaceTrainerCallbacks(trainer, { + replaceTrainerCallbacks(trainer, { onLog: ({ step: s }) => void calls.push(`v2:onLog(${s})`), }); - expect(swapped).toBe(true); } }, }, @@ -1635,9 +1632,7 @@ describe("createTrainer (early stop)", () => { await trainer.start(); const a = requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); const b = requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); - expect(a).not.toBeNull(); - expect(b).not.toBeNull(); - await Promise.all([a!, b!]); + await Promise.all([a, b]); // The fallback timer fires once, so cancel is called once even though // the early-stop brand was invoked twice. 
expect(cancelCalls).toBe(1); diff --git a/packages/arkor/src/core/trainerInspection.ts b/packages/arkor/src/core/trainerInspection.ts index 9181f8c5..1966aa9e 100644 --- a/packages/arkor/src/core/trainerInspection.ts +++ b/packages/arkor/src/core/trainerInspection.ts @@ -1,4 +1,4 @@ -import type { JobConfig, TrainerCallbacks } from "./types"; +import type { JobConfig, Trainer, TrainerCallbacks } from "./types"; /** * Snapshot of a trainer's identity and cloud-side config that the Studio @@ -112,26 +112,20 @@ export function attachTrainerCallbackReplacer( } /** - * Replace the trainer's lifecycle callbacks atomically. Returns `true` - * when the call landed (the trainer carried the brand), `false` - * otherwise — callers (the SIGUSR2 hot-swap path in `runnerSignals`) - * use the return value to decide whether to log a skip warning. + * Replace the trainer's lifecycle callbacks atomically. The brand is + * unconditionally attached by `createTrainer` in this same SDK package, + * so this can assume the brand is present — there's no documented + * public path that produces a brand-less trainer, and the helper itself + * is never called on user-controlled values. */ export function replaceTrainerCallbacks( - trainer: unknown, + trainer: Trainer, callbacks: Partial, -): boolean { - if (!trainer || typeof trainer !== "object") return false; - const fn = (trainer as Record)[ +): void { + const fn = (trainer as unknown as Record)[ TRAINER_REPLACE_CALLBACKS_KEY - ]; - if (typeof fn !== "function") return false; - try { - (fn as (cbs: Partial) => void).call(trainer, callbacks); - return true; - } catch { - return false; - } + ] as (cbs: Partial) => void; + fn.call(trainer, callbacks); } /** @@ -156,29 +150,19 @@ export function attachTrainerEarlyStopper( /** * Request that the trainer stop after the next saved checkpoint. 
- * Returns the same promise the underlying implementation hands out — - * resolves once `cancel()` has been accepted by the cloud API, or + * Resolves once `cancel()` has been accepted by the cloud API, or * after `timeoutMs` if no checkpoint arrived in time. * - * Returns `null` when the trainer doesn't carry the early-stop brand - * (third-party wrapper / pre-SDK shape) so callers can decide whether - * to fall back to a hard kill. + * The brand is unconditionally attached by `createTrainer` and the + * runner only ever calls this on a discovered SDK trainer — there's no + * branch for "brand missing". */ export function requestTrainerEarlyStop( - trainer: unknown, + trainer: Trainer, opts?: RequestEarlyStopOptions, -): Promise | null { - if (!trainer || typeof trainer !== "object") return null; - const fn = (trainer as Record)[ +): Promise { + const fn = (trainer as unknown as Record)[ TRAINER_REQUEST_EARLY_STOP_KEY - ]; - if (typeof fn !== "function") return null; - try { - const result = ( - fn as (opts?: RequestEarlyStopOptions) => Promise - ).call(trainer, opts); - return Promise.resolve(result); - } catch (err) { - return Promise.reject(err); - } + ] as (opts?: RequestEarlyStopOptions) => Promise; + return fn.call(trainer, opts); } From 31f510869c3c01b0dfd9d2920d329aebc587df26 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sun, 3 May 2026 18:56:19 +0900 Subject: [PATCH 06/55] fix(hmr.test): update late subscriber event assertion to handle spurious BUNDLE_END on macOS - Modified the test to assert that the late subscriber receives the same event as the prior subscriber, addressing issues with spurious BUNDLE_END events in rolldown@1.0.0-rc.17 on macOS. - Added comments to clarify the reasoning behind the change and noted a future task to revisit the assertion after rolldown stabilizes. 
--- packages/arkor/src/studio/hmr.test.ts | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index 2ea6392c..29052420 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -110,10 +110,21 @@ describe("createHmrCoordinator", () => { await nextEvent(firstEvents, (e) => e.type === "ready"); // A new subscriber should receive the cached state synchronously // before any new build is triggered. + // + // We assert "the late subscriber sees the same event the prior one + // saw last" rather than literally "ready" because rolldown@1.0.0-rc.17 + // on macOS occasionally fires a spurious second BUNDLE_END (FSEvents + // coalescing inside the watcher) — there, `firstEvents` already + // contains the spurious `rebuild` by the time we late-subscribe, and + // the contract under test (replay of the cached state) holds either + // way. + // TODO(rolldown 1.0): re-check after rolldown leaves RC. If the + // spurious BUNDLE_END is gone on macOS, tighten this back to + // expect(lateEvents[0]?.type).toBe("ready"); const lateEvents: HmrEvent[] = []; hmr.subscribe((e) => lateEvents.push(e)); expect(lateEvents.length).toBeGreaterThanOrEqual(1); - expect(lateEvents[0]?.type).toBe("ready"); + expect(lateEvents[0]).toEqual(firstEvents[firstEvents.length - 1]); } finally { await hmr.dispose(); } From 55b4a2e8f6e64d4a358aa504ff45b82f40aabffd Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sun, 3 May 2026 21:18:39 +0900 Subject: [PATCH 07/55] fix(dev): ensure SIGINT handler remains active during token persistence failures - Added a test to verify that the SIGINT exit handler is armed even if the `persistStudioToken` function fails, preventing the dev server from idling indefinitely. 
- Updated the `runDev` function to register the studio-token cleanup hook unconditionally, ensuring it executes on process termination signals regardless of token persistence success. - Enhanced cleanup logic to maintain proper signal handling during development sessions, improving overall stability and user experience. --- packages/arkor/src/cli/commands/dev.test.ts | 41 +++++++ packages/arkor/src/cli/commands/dev.ts | 18 +++- packages/arkor/src/core/trainer.test.ts | 13 ++- packages/arkor/src/core/trainer.ts | 12 +++ packages/arkor/src/studio/hmr.test.ts | 26 +++++ packages/arkor/src/studio/hmr.ts | 39 +++++++ packages/arkor/src/studio/server.test.ts | 63 ++++++++++- packages/arkor/src/studio/server.ts | 100 +++++++++++------- .../arkor/src/studio/trainRegistry.test.ts | 58 ++++++++++ packages/arkor/src/studio/trainRegistry.ts | 52 +++++++-- .../studio-app/src/components/RunTraining.tsx | 23 ++-- 11 files changed, 383 insertions(+), 62 deletions(-) diff --git a/packages/arkor/src/cli/commands/dev.test.ts b/packages/arkor/src/cli/commands/dev.test.ts index 1104489c..eb630cb9 100644 --- a/packages/arkor/src/cli/commands/dev.test.ts +++ b/packages/arkor/src/cli/commands/dev.test.ts @@ -704,6 +704,47 @@ describe("runDev", () => { } }); + it("keeps the SIGINT exit handler armed even when persisting the studio token fails", async () => { + // Regression: if `persistStudioToken` threw, the previous code + // skipped `scheduleStudioTokenCleanup` — and that was the *only* + // hook that called `process.exit(0)` on SIGINT. The leftover HMR + // hook overrides Node's default "exit on SIGINT" behaviour, so the + // dev server would idle in the foreground forever. The fix + // registers the token cleanup unconditionally; here we make + // persist throw and verify SIGINT still terminates. + if (typeof process.getuid === "function" && process.getuid() === 0) { + // Root bypasses chmod permission checks — skip on root containers. 
+ return; + } + chmodSync(join(fakeHome, ".arkor"), 0o555); + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + try { + await runDev({ port: 4206 }); + } finally { + stdoutSpy.mockRestore(); + chmodSync(join(fakeHome, ".arkor"), 0o755); + } + + const exitSpy = vi + .spyOn(process, "exit") + .mockImplementation(((_code?: number) => { + return undefined as never; + }) as typeof process.exit); + try { + const sigintListeners = process.listeners("SIGINT"); + const handler = sigintListeners[sigintListeners.length - 1] as () => void; + handler(); + // Even though the token file was never written, the cleanup hook + // ran (best-effort `unlinkSync` swallows ENOENT) and the + // exit-on-signal arm fired. + expect(exitSpy).toHaveBeenCalledWith(0); + } finally { + exitSpy.mockRestore(); + } + }); + it("registers a cleanup listener that removes the studio-token file on exit", async () => { const stdoutSpy = vi .spyOn(process.stdout, "write") diff --git a/packages/arkor/src/cli/commands/dev.ts b/packages/arkor/src/cli/commands/dev.ts index 89dd7f5e..077fe8f7 100644 --- a/packages/arkor/src/cli/commands/dev.ts +++ b/packages/arkor/src/cli/commands/dev.ts @@ -213,16 +213,28 @@ export async function runDev(options: DevOptions = {}): Promise { const hmr = createHmrCoordinator({ cwd: process.cwd() }); scheduleHmrCleanup(hmr); + // Register the studio-token cleanup *unconditionally* up-front. The hook + // is the only one that calls `process.exit(0)` on SIGINT/SIGTERM/SIGHUP + // (the HMR hook above only disposes), and `registerCleanupHook` overrides + // Node's default "exit on signal" behaviour for any signal it listens + // on. If we were to gate this hook behind a successful `persistStudioToken` + // and the persist threw, Ctrl-C would run the HMR dispose and then leave + // the server idle in the foreground — no exit ever fires. 
Registering + // first means the hook is in place even if persist fails; the cleanup + // body is best-effort (`unlinkSync` in a try/catch) so calling it on a + // file that was never written is a silent no-op. + const tokenPath = studioTokenPath(); + scheduleStudioTokenCleanup(tokenPath); + // Persisting the token to disk is *only* needed for the Vite SPA dev // workflow. The bundled `:port` flow injects the meta tag at request time // via `buildStudioApp`, so a failure here (read-only $HOME on Docker / // locked-down CI / restrictive umask) must not block the server. try { - const tokenPath = await persistStudioToken(studioToken); - scheduleStudioTokenCleanup(tokenPath); + await persistStudioToken(studioToken); } catch (err) { ui.log.warn( - `Could not write ${studioTokenPath()} (${ + `Could not write ${tokenPath} (${ err instanceof Error ? err.message : String(err) }). The Studio at http://localhost:${port} is unaffected, but the Vite SPA dev workflow will see 403s on /api/*.`, ); diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index b56d60d0..577596b0 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1405,12 +1405,23 @@ describe("createTrainer (early stop)", () => { ); const original = globalThis.fetch; globalThis.fetch = fetcher; + let result: Awaited>; try { - await trainer.wait(); + result = await trainer.wait(); } finally { globalThis.fetch = original; } expect(cancelCalls).toBe(1); + // Regression: the early-stop checkpoint branch returns + // `{ terminal: true }` to break out of `wait()`'s loop without + // waiting for a cloud-side terminal event. The `TrainingResult` + // it resolves with must therefore reflect a terminal status + // locally — otherwise `wait()` violates its documented contract + // ("Resolve when the job reaches a terminal status") and a + // subsequent `requestEarlyStop` wouldn't see the + // `TERMINAL_STATUSES` short-circuit. 
+ expect(result.job.status).toBe("cancelled"); + expect(result.job.completedAt).toBe("2026-01-01T00:00:03Z"); }); it("falls back to immediate cancel when no checkpoint arrives within timeoutMs", async () => { diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index ae657610..4c95cf06 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -277,9 +277,21 @@ export function createTrainer( // is durable. Cancel the cloud job and end `wait()` cleanly. if (earlyStopRequested && earlyStopDeferred) { await trainer.cancel(); + // Reflect the cancellation locally so `wait()`'s resolved + // `TrainingResult.job.status` is a terminal status (per the + // documented contract). Without this update the result would + // surface as `status: "running"`, and a subsequent + // `requestEarlyStop` would not see the + // `TERMINAL_STATUSES.has(...)` short-circuit it relies on. + startedJob = { + ...startedJob, + status: "cancelled", + completedAt: event.timestamp, + }; if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); earlyStopDeferred.resolve(); earlyStopDeferred = null; + earlyStopRequested = false; return { terminal: true, artifacts: terminalResult?.artifacts ?? [] }; } return { terminal: false, artifacts: terminalResult?.artifacts ?? [] }; diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index 29052420..76c7e1b3 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -99,6 +99,32 @@ describe("createHmrCoordinator", () => { } }); + it("transitions from `error` to `ready` once the entry appears, without re-subscribing", async () => { + // Regression: previously `startWatcher` bailed out and never + // retried, so an SPA already connected to `/api/dev/events` against + // a fresh scaffold would be stuck on the initial `error` event + // forever — EventSource doesn't reconnect on application-level + // errors. 
The coordinator now polls for the entry file in the + // background and starts the watcher the moment it appears. + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => events.push(e)); + try { + await nextEvent(events, (e) => e.type === "error", 1000); + // Same subscriber — no reconnect, no second `subscribe` call. + mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + const ready = await nextEvent( + events, + (e) => e.type === "ready", + 4000, + ); + expect(ready.outFile).toMatch(/index\.mjs$/); + } finally { + await hmr.dispose(); + } + }); + it("replays the latest event to late subscribers", async () => { mkdirSync(join(cwd, "src/arkor"), { recursive: true }); writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 9433bbdc..ca0dc8e8 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -100,6 +100,15 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { let lastEvent: HmrEvent | null = null; let watcher: RolldownWatcher | null = null; let disposed = false; + /** + * When `startWatcher` runs against a project that doesn't have an + * entry file yet, a poll timer takes over and waits for the file to + * appear. Without this, an SPA that opened `/api/dev/events` against + * a fresh scaffold would hang on the initial `error` event forever + * — `startWatcher` is only re-entered on `subscribe()`, but EventSource + * doesn't reconnect on application-level errors. + */ + let entryWaitTimer: ReturnType | null = null; function broadcast(event: HmrEvent): void { lastEvent = event; @@ -133,8 +142,34 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { type: "error", message: `Build entry not found: ${resolved.entry}. 
Create ${BUILD_DEFAULTS.entry} or pass an explicit entry argument.`, }); + // Hand off to a low-frequency poll so an SPA already connected to + // `/api/dev/events` transitions from "error" to "ready" the moment + // the user creates the entry file — no manual reconnect required. + // The poll is `unref()`'d so it never blocks process exit, and + // `dispose()` clears it. + if (!entryWaitTimer) { + entryWaitTimer = setInterval(() => { + if (disposed || watcher) { + if (entryWaitTimer) clearInterval(entryWaitTimer); + entryWaitTimer = null; + return; + } + if (existsSync(resolved.entry)) { + if (entryWaitTimer) clearInterval(entryWaitTimer); + entryWaitTimer = null; + startWatcher(); + } + }, 1000); + entryWaitTimer.unref?.(); + } return; } + // The entry exists now — clear any leftover poll timer from a prior + // failed startWatcher invocation. + if (entryWaitTimer) { + clearInterval(entryWaitTimer); + entryWaitTimer = null; + } watcher = watch({ ...rolldownInputOptions(resolved), output: { file: resolved.outFile, format: "esm" }, @@ -172,6 +207,10 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { async dispose() { disposed = true; subscribers.clear(); + if (entryWaitTimer) { + clearInterval(entryWaitTimer); + entryWaitTimer = null; + } if (watcher) { const w = watcher; watcher = null; diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index 21cbf94d..ee180eeb 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -1404,16 +1404,35 @@ process.exit(0); cwd: trainCwd, hmr: fake.coordinator, }); + // The server subscribes to the HMR coordinator exactly once at + // build time (so multiple SSE clients don't fan signal dispatch + // out to the same child N times). Per-client cleanup happens on + // the SSE listener set, not against the coordinator — so + // `fake.subscriberCount` stays at 1 across the connection + // lifecycle. 
We assert that here rather than expect the + // pre-refactor "0 after cancel" behaviour. + expect(fake.subscriberCount).toBe(1); const res = await app.request( `/api/dev/events?studioToken=${encodeURIComponent(STUDIO_TOKEN)}`, { headers: { host: "127.0.0.1:4000" } }, ); expect(res.status).toBe(200); expect(res.headers.get("content-type")).toBe("text/event-stream"); - // Cancelling the body's reader should release the subscriber. const reader = res.body!.getReader(); await reader.cancel(); - expect(fake.subscriberCount).toBe(0); + // Cancel doesn't unsubscribe the server-level listener; emitting + // an event after cancel must still be safe (the SSE listener that + // was registered for this connection is removed, so the + // controller-closed try/catch in `send` is never exercised). + expect(() => + fake.emit({ + type: "rebuild", + outFile: "/tmp/x", + hash: "h", + configHash: null, + trainerName: null, + }), + ).not.toThrow(); }); it("rejects /api/dev/events when host header is non-loopback", async () => { @@ -1433,6 +1452,46 @@ process.exit(0); expect(res.status).toBe(403); }); + it("dispatches HMR signals exactly once per rebuild regardless of connected SSE client count", async () => { + // Regression: previously each `/api/dev/events` connection + // attached its own `hmr.subscribe(...)` callback, so a rebuild + // with N open Studio tabs fanned out into N × SIGUSR2 / N × + // SIGTERM per child. The runner's shutdown handler interprets a + // *second* SIGTERM as the emergency `exit(143)` fast-path, which + // would defeat checkpoint preservation. The server now subscribes + // to the coordinator exactly once and broadcasts the augmented + // payload to every SSE client; we assert that subscriber count + // doesn't grow when extra connections are opened. 
+ const fake = fakeHmr(); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fake.coordinator, + }); + expect(fake.subscriberCount).toBe(1); + const r1 = await app.request( + `/api/dev/events?studioToken=${encodeURIComponent(STUDIO_TOKEN)}`, + { headers: { host: "127.0.0.1:4000" } }, + ); + const r2 = await app.request( + `/api/dev/events?studioToken=${encodeURIComponent(STUDIO_TOKEN)}`, + { headers: { host: "127.0.0.1:4000" } }, + ); + // Pump the streams so their `start()` runs, registering the + // per-client SSE listeners on the server side. + const reader1 = r1.body!.getReader(); + const reader2 = r2.body!.getReader(); + // Even with two concurrent SSE clients the HMR coordinator still + // sees exactly the one server-level subscriber. + expect(fake.subscriberCount).toBe(1); + await reader1.cancel(); + await reader2.cancel(); + expect(fake.subscriberCount).toBe(1); + }); + it("forwards rebuild events as SSE frames", async () => { const fake = fakeHmr(); const app = buildStudioApp({ diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 27579232..7380420b 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -15,7 +15,7 @@ import { recordDeprecation, tapDeprecation } from "../core/deprecation"; import { SDK_VERSION } from "../core/version"; import { ensureProjectState } from "../core/projectState"; import { readState } from "../core/state"; -import { readManifestSummary, summariseBuiltManifest } from "./manifest"; +import { readManifestSummary } from "./manifest"; import type { HmrCoordinator, HmrEvent } from "./hmr"; import { TrainRegistry, type RestartTarget } from "./trainRegistry"; @@ -383,56 +383,78 @@ export function buildStudioApp(options: StudioServerOptions) { } if (options.hmr) { const hmr = options.hmr; - app.get("/api/dev/events", (c) => { - let unsubscribe: (() => void) | null 
= null; + /** Augmented event = raw HMR event + the per-child signal results we + * computed for it. We compute these once per rebuild (not once per + * connected SSE client) so opening multiple Studio tabs doesn't fan + * out into N × SIGTERM / N × SIGUSR2 to each child. */ + type AugmentedEvent = HmrEvent & { + restart?: boolean; + hotSwap?: boolean; + restartTargets?: RestartTarget[]; + hotSwapTargets?: RestartTarget[]; + }; + const sseListeners = new Set<(event: AugmentedEvent) => void>(); + let lastAugmented: AugmentedEvent | null = null; + + // Single subscription against the HMR coordinator: this handler does + // signal dispatch + augmentation exactly once per rebuild, then fans + // the augmented payload out to every connected SSE client. Late- + // mounting clients receive `lastAugmented` instead of triggering a + // fresh signal pass against the same rebuild. + hmr.subscribe((event) => { + let augmented: AugmentedEvent = event; + if (event.type === "rebuild" && activeTrains.size > 0) { + // Per-child decision: if the rebuilt bundle's `configHash` + // matches the child's spawn-time hash, the cloud-side run is + // unaffected — SIGUSR2 lets the runner re-import and rotate the + // callbacks cell. Otherwise SIGTERM triggers the trainer's + // internal early-stop so the next checkpoint lands before the + // SPA re-spawns. + const nextHash = event.configHash ?? null; + const hotSwapTargets = activeTrains.notifyCallbackReload(nextHash); + const restartTargets = + activeTrains.requestEarlyStopOnMismatch(nextHash); + augmented = { + ...event, + hotSwap: hotSwapTargets.length > 0, + hotSwapTargets, + restart: restartTargets.length > 0, + restartTargets, + }; + } + lastAugmented = augmented; + for (const fn of sseListeners) { + try { + fn(augmented); + } catch { + // listener controller closed mid-write — the cancel hook + // below takes care of removing it from the set. 
+ } + } + }); + + app.get("/api/dev/events", () => { + const enc = new TextEncoder(); + let listener: ((event: AugmentedEvent) => void) | null = null; const stream = new ReadableStream({ start(controller) { - const enc = new TextEncoder(); - const send = ( - event: HmrEvent & { - restart?: boolean; - hotSwap?: boolean; - restartTargets?: RestartTarget[]; - hotSwapTargets?: RestartTarget[]; - }, - ) => { + const send = (event: AugmentedEvent): void => { const payload = JSON.stringify(event); try { controller.enqueue( enc.encode(`event: ${event.type}\ndata: ${payload}\n\n`), ); } catch { - // controller closed mid-write; the unsubscribe path below - // takes care of the rest. + // controller closed mid-write; cancel() removes us. } }; - unsubscribe = hmr.subscribe((event) => { - if (event.type !== "rebuild" || activeTrains.size === 0) { - send(event); - return; - } - // Per-child decision: if the rebuilt bundle's `configHash` - // matches the child's spawn-time hash, the cloud-side run - // is unaffected — SIGUSR2 lets the runner re-import and - // call `Trainer.replaceCallbacks`. Otherwise SIGTERM - // triggers `Trainer.requestEarlyStop` so the next - // checkpoint lands before the SPA re-spawns. - const nextHash = event.configHash ?? 
null; - const hotSwapTargets = activeTrains.notifyCallbackReload(nextHash); - const restartTargets = - activeTrains.requestEarlyStopOnMismatch(nextHash); - send({ - ...event, - hotSwap: hotSwapTargets.length > 0, - hotSwapTargets, - restart: restartTargets.length > 0, - restartTargets, - }); - }); + if (lastAugmented) send(lastAugmented); + listener = send; + sseListeners.add(send); }, cancel() { - unsubscribe?.(); - unsubscribe = null; + if (listener) sseListeners.delete(listener); + listener = null; }, }); return new Response(stream, { diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index a1f7c2a1..db6acae1 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -115,4 +115,62 @@ describe("TrainRegistry", () => { expect(() => reg.notifyCallbackReload("h")).not.toThrow(); expect(() => reg.requestEarlyStopOnMismatch("x")).not.toThrow(); }); + + it("requestEarlyStopOnMismatch omits dead-on-kill children from the restart targets", () => { + // Regression: previously the implementation always pushed onto + // `targets` even when `kill()` threw, so a child that had already + // exited would still be reported back to the SPA as a restart + // target — the SPA would then wait forever for the (already- + // delivered) `exit=...` line and never re-spawn. + const reg = new TrainRegistry(); + const dead = fakeChild(801); + dead.kill.mockImplementation(() => { + throw new Error("ESRCH"); + }); + reg.register(dead as unknown as ChildProcess, { + configHash: "stale", + trainFile: "/tmp/d.ts", + }); + expect(reg.requestEarlyStopOnMismatch("fresh")).toEqual([]); + }); + + it("requestEarlyStopOnMismatch sends SIGTERM at most once per child across rebuilds", () => { + // Regression: under rapid edits the dev loop can fire multiple + // rebuilds before the child reaches its next checkpoint. 
The + // runner's shutdown handler treats a *second* SIGTERM as the + // emergency `exit(143)` fast-path, which would defeat the whole + // point of preserving the in-flight checkpoint. The registry now + // tracks per-child early-stop state and skips children it has + // already signalled. + const reg = new TrainRegistry(); + const a = fakeChild(901); + reg.register(a as unknown as ChildProcess, { + configHash: "h1", + trainFile: "/tmp/a.ts", + }); + + const first = reg.requestEarlyStopOnMismatch("h2"); + expect(first).toEqual([{ pid: 901, trainFile: "/tmp/a.ts" }]); + expect(a.kill).toHaveBeenCalledTimes(1); + + // Second mismatching rebuild before the child has exited: must NOT + // re-send SIGTERM and must NOT re-list the child as a restart + // target (the SPA already has a pending re-spawn for it). + const second = reg.requestEarlyStopOnMismatch("h3"); + expect(second).toEqual([]); + expect(a.kill).toHaveBeenCalledTimes(1); + + // After the child exits and is unregistered, a fresh spawn in its + // place starts from a clean slate. + reg.unregister(901); + const respawn = fakeChild(902); + reg.register(respawn as unknown as ChildProcess, { + configHash: "h3", + trainFile: "/tmp/a.ts", + }); + expect(reg.requestEarlyStopOnMismatch("h4")).toEqual([ + { pid: 902, trainFile: "/tmp/a.ts" }, + ]); + expect(respawn.kill).toHaveBeenCalledTimes(1); + }); }); diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index af4e4329..484d01d4 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -21,6 +21,16 @@ export interface ActiveTrain { * build). A null entry forces SIGTERM on the next rebuild because we * can't prove the configs match. */ configHash: string | null; + /** + * `true` once we've already SIGTERM'd this child for an HMR-driven + * early-stop. 
Subsequent rebuilds (which can land before the child + * has reached its next checkpoint) must NOT re-send SIGTERM — + * the runner's shutdown handler treats a second SIGTERM as the + * emergency `process.exit(143)` escape hatch, which would defeat + * the whole point of preserving the in-flight checkpoint. Kept + * internal to the registry; consumers shouldn't manage it. + */ + earlyStopRequested?: boolean; } export interface RestartTarget { @@ -37,9 +47,16 @@ export interface RestartTarget { export class TrainRegistry { private readonly entries = new Map(); - register(child: ChildProcess, init: Omit): void { + register( + child: ChildProcess, + init: Omit, + ): void { if (typeof child.pid !== "number") return; - this.entries.set(child.pid, { child, ...init }); + this.entries.set(child.pid, { + child, + ...init, + earlyStopRequested: false, + }); } unregister(pid: number | undefined): void { @@ -84,21 +101,31 @@ export class TrainRegistry { /** * Send a graceful early-stop signal (SIGTERM) to every child whose - * stored `configHash` differs from `nextConfigHash`. The child's - * runner (`installShutdownHandlers`) calls `Trainer.requestEarlyStop` - * which preserves the in-flight checkpoint before exiting. Returns - * the list of children signalled so the SPA can re-spawn them with - * the new bundle. + * stored `configHash` differs from `nextConfigHash` AND that hasn't + * already been signalled. The child's runner + * (`installShutdownHandlers`) calls the trainer's internal + * early-stop entry point which preserves the in-flight checkpoint + * before exiting. Returns the list of children we actually + * signalled this call so the SPA can re-spawn them with the new + * bundle. * * If `nextConfigHash` is null (the new bundle has no inspectable - * trainer), every active child is SIGTERM'd defensively — we can't - * prove their configs are unaffected. 
+ * trainer), every active not-yet-signalled child is SIGTERM'd + * defensively — we can't prove their configs are unaffected. + * + * Re-signal protection: a second SIGTERM hits the runner's + * emergency `exit(143)` fast-path and would defeat checkpoint + * preservation. Children flagged `earlyStopRequested` here are + * skipped on subsequent rebuilds; the entry is removed from the + * registry when the child exits, so the next spawn starts from a + * clean slate. */ requestEarlyStopOnMismatch( nextConfigHash: string | null, ): RestartTarget[] { const targets: RestartTarget[] = []; for (const [pid, entry] of this.entries) { + if (entry.earlyStopRequested) continue; if ( nextConfigHash === null || entry.configHash === null || @@ -106,10 +133,15 @@ export class TrainRegistry { ) { try { entry.child.kill("SIGTERM"); + entry.earlyStopRequested = true; + // Push only after a successful kill; a thrown `kill` means + // the child has already exited and is not a real restart + // target (the SPA would otherwise wait forever for an + // exit message that never comes). + targets.push({ pid, trainFile: entry.trainFile }); } catch { // child already exited; close handler will clean up. 
} - targets.push({ pid, trainFile: entry.trainFile }); } } return targets; diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index 63c82e73..923e793a 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -12,7 +12,7 @@ export function RunTraining() { const [log, setLog] = useState(""); const [manifest, setManifest] = useState(null); const [hmrStatus, setHmrStatus] = useState< - "idle" | "rebuilding" | "early-stopping" | "restarting" | "hot-swapped" + "idle" | "early-stopping" | "restarting" | "hot-swapped" >("idle"); const boxRef = useRef(null); const lastTrainFileRef = useRef(undefined); @@ -64,12 +64,21 @@ export function RunTraining() { }); }); if (payload.restart) { - // Training run is early-stopping; the active stream will resolve - // once the next checkpoint lands and the subprocess exits cleanly. - // The `finally` block of `run()` picks up the pending flag and - // re-spawns with the same args. - restartPendingRef.current = true; - setHmrStatus(runningRef.current ? "early-stopping" : "idle"); + // `/api/dev/events` is a broadcast — every open Studio tab gets + // this event. Only flip the auto-restart latch when *this* tab + // is actually running a stream right now; otherwise a passive + // tab would silently auto-spawn an extra job the next time the + // user clicks Run training, doubling cloud spend. + if (runningRef.current) { + // Training run is early-stopping; the active stream will + // resolve once the next checkpoint lands and the subprocess + // exits cleanly. The `finally` block of `run()` picks up the + // pending flag and re-spawns with the same args. + restartPendingRef.current = true; + setHmrStatus("early-stopping"); + } else { + setHmrStatus("idle"); + } } else if (payload.hotSwap) { // Callbacks were swapped in place — the cloud-side run is // unaffected. 
Flash a brief "hot-swapped" indicator so users From 1eedccf234cbb04c74feed5e7e0b70170be345f0 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 02:39:51 +0900 Subject: [PATCH 08/55] feat: enhance HMR handling and subprocess management - Refactor manifest summary to utilize a new `findTrainerInModule` helper for improved trainer detection. - Introduce `X-Arkor-Train-Pid` response header to scope HMR restart events to the correct subprocess, preventing cross-tab interference. - Optimize server logic to read the spawn-time `configHash` from the HMR coordinator, eliminating unnecessary rebuilds. - Consolidate signal dispatching for HMR rebuilds into a single `dispatchRebuild` method in `TrainRegistry`, improving clarity and efficiency. - Update client-side logic to handle new HMR event structures, ensuring proper response to hot-swaps and restarts based on the active tab's subprocess. - Ensure robust error handling for subprocess termination, preventing crashes on unexpected exit scenarios. 
--- packages/arkor/src/cli/cleanupHooks.test.ts | 104 +++++++++++ packages/arkor/src/cli/cleanupHooks.ts | 96 +++++++--- packages/arkor/src/cli/commands/dev.test.ts | 21 ++- packages/arkor/src/core/runnerSignals.ts | 30 +--- packages/arkor/src/core/trainer.test.ts | 102 +++++++++++ packages/arkor/src/core/trainer.ts | 16 +- .../arkor/src/core/trainerInspection.test.ts | 117 +++++++++++++ packages/arkor/src/core/trainerInspection.ts | 86 ++++++++- packages/arkor/src/studio/hmr.test.ts | 33 ++++ packages/arkor/src/studio/hmr.ts | 38 +++- packages/arkor/src/studio/manifest.ts | 33 ++-- packages/arkor/src/studio/server.test.ts | 112 +++++++++++- packages/arkor/src/studio/server.ts | 74 +++++--- .../arkor/src/studio/trainRegistry.test.ts | 164 +++++++++++------- packages/arkor/src/studio/trainRegistry.ts | 164 +++++++++++------- .../studio-app/src/components/RunTraining.tsx | 112 ++++++++---- packages/studio-app/src/lib/api.ts | 22 ++- 17 files changed, 1067 insertions(+), 257 deletions(-) create mode 100644 packages/arkor/src/cli/cleanupHooks.test.ts create mode 100644 packages/arkor/src/core/trainerInspection.test.ts diff --git a/packages/arkor/src/cli/cleanupHooks.test.ts b/packages/arkor/src/cli/cleanupHooks.test.ts new file mode 100644 index 00000000..9428481f --- /dev/null +++ b/packages/arkor/src/cli/cleanupHooks.test.ts @@ -0,0 +1,104 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; +import { registerCleanupHook } from "./cleanupHooks"; + +// Each test that emits a signal also installs new listeners on +// `process` for the lifetime of this worker. We can't `process.off` +// the listeners (they're closures inside `registerCleanupHook`) but +// we can ensure each test fires its own per-registration handler and +// process.exit is mocked so the worker survives. 
+ +let exitSpy: ReturnType | null = null; +let stdoutSpy: ReturnType | null = null; + +afterEach(() => { + exitSpy?.mockRestore(); + stdoutSpy?.mockRestore(); + exitSpy = null; + stdoutSpy = null; +}); + +function mockExit(): number[] { + const codes: number[] = []; + exitSpy = vi + .spyOn(process, "exit") + .mockImplementation(((code?: number) => { + codes.push(code ?? 0); + return undefined as never; + }) as typeof process.exit); + return codes; +} + +function flushMicrotasks(): Promise { + return new Promise((resolve) => setImmediate(resolve)); +} + +describe("registerCleanupHook", () => { + it("waits for an async sibling cleanup to settle before exitOnSignal fires", async () => { + // Regression: previously the signal handler called + // `process.exit(0)` immediately after kicking off cleanup, so a + // sibling registration's async dispose (`hmr.dispose()`) got cut + // off mid-promise. The fix coordinates via a module-level + // in-flight set so the exit-owning hook awaits every other + // registered cleanup before terminating. + const order: string[] = []; + let resolveSlowDispose!: () => void; + const slowDispose = new Promise((resolve) => { + resolveSlowDispose = resolve; + }); + + registerCleanupHook({ + cleanup: () => + slowDispose.then(() => { + order.push("async-cleanup-finished"); + }), + }); + registerCleanupHook({ + cleanup: () => { + order.push("sync-cleanup"); + }, + exitOnSignal: true, + }); + + const codes = mockExit(); + process.emit("SIGINT", "SIGINT"); + + // Sync cleanup body has already fired; async one is still pending, + // and exit must NOT have been called yet. + expect(order).toEqual(["sync-cleanup"]); + expect(codes).toEqual([]); + + // Resolve the slow dispose; one microtask later the coordinator + // fires process.exit(0). 
+ resolveSlowDispose(); + await flushMicrotasks(); + await flushMicrotasks(); + + expect(order).toEqual(["sync-cleanup", "async-cleanup-finished"]); + expect(codes).toEqual([0]); + }); + + it("is idempotent against repeated signals (done latch + bounded exit)", async () => { + let invocations = 0; + registerCleanupHook({ + cleanup: () => { + invocations += 1; + }, + exitOnSignal: true, + }); + + const codes = mockExit(); + process.emit("SIGINT", "SIGINT"); + process.emit("SIGINT", "SIGINT"); + process.emit("SIGINT", "SIGINT"); + await flushMicrotasks(); + await flushMicrotasks(); + + // Cleanup body runs once even if the signal fires multiple times. + expect(invocations).toBe(1); + // Exit may be called multiple times (once per signal handler + // that armed it), but the mock no-ops so the worker survives — + // verify at least one exit fired. + expect(codes.length).toBeGreaterThanOrEqual(1); + expect(codes[0]).toBe(0); + }); +}); diff --git a/packages/arkor/src/cli/cleanupHooks.ts b/packages/arkor/src/cli/cleanupHooks.ts index 473bcbc7..1e4cd33e 100644 --- a/packages/arkor/src/cli/cleanupHooks.ts +++ b/packages/arkor/src/cli/cleanupHooks.ts @@ -3,20 +3,38 @@ const TERMINATING_SIGNALS = ["SIGINT", "SIGTERM", "SIGHUP"] as const; export interface CleanupHookOptions { /** * Idempotent cleanup body. Wrapped with a `done` guard so a noisy - * shutdown (signal arriving while `process.exit` is already running an - * `exit` listener) doesn't trigger a double-cleanup. + * shutdown (signal arriving while `process.exit` is already running + * an `exit` listener) doesn't trigger a double-cleanup. May be sync + * or return a Promise; async cleanups are awaited (across **all + * registered hooks**) before `exitOnSignal` fires the final + * `process.exit`. */ cleanup: () => void | Promise; /** - * Whether the signal-handler arm of the registration should also call - * `process.exit(0)` after cleanup. 
Use `true` for the outermost - * cleanup responsible for terminating the process; `false` for inner - * cleanups that should pass control through to a sibling exit - * handler. Default: `false`. + * Whether the signal-handler arm of this registration should call + * `process.exit(0)` once every in-flight cleanup (this hook + any + * siblings registered in the same process) has settled. Use `true` + * for the outermost cleanup responsible for terminating the + * process; `false` for inner cleanups that should let a sibling + * own the exit. Default: `false`. */ exitOnSignal?: boolean; } +/** + * Module-scoped tracker of cleanup promises that haven't settled yet. + * The exit-owning hook waits on the union of (its own cleanup) + + * (every other in-flight cleanup) before calling `process.exit(0)`, + * so a fire-and-forget async cleanup in a sibling registration — + * `hmr.dispose()` is the canonical example — isn't cut off by an + * eager exit. + * + * Auto-prunes via the `.finally(() => inFlightCleanups.delete(...))` + * each `run()` attaches, so the set doesn't grow without bound across + * multiple `runDev()` invocations in the same process (tests). + */ +const inFlightCleanups = new Set>(); + /** * Register a cleanup hook that fires on `process.exit` and on * SIGINT / SIGTERM / SIGHUP. Used by `runDev` to dispose long-lived @@ -24,39 +42,63 @@ export interface CleanupHookOptions { * call site re-implementing the same idempotent-guard + per-signal * registration boilerplate. * - * Registration order matters: Node fires listeners in the order they - * were attached, so the *first* `registerCleanupHook` call gets to run - * before subsequent ones. The Studio dev launcher relies on this to - * guarantee that "tear down HMR" lands before "remove studio-token". + * Per-registration signal listeners (rather than a singleton): each + * `runDev()` invocation gets its own listener wired to its own + * `done` latch. 
This matches the old behaviour and keeps test + * isolation simple (vitest's per-test cleanup doesn't have to reach + * into module state). + * + * `process.on("exit", ...)` listeners cannot be async — Node fires + * them right before the process terminates and discards any returned + * promise. We still register so sync cleanups (e.g. `unlinkSync`) run + * on a normal `process.exit(0)` path that never reached a signal + * handler. Async tails on this path are best-effort. The signal- + * handler path *does* await async tails before exiting. */ export function registerCleanupHook(options: CleanupHookOptions): void { let done = false; - // Synchronous wrapper so signal handlers preserve "cleanup landed - // before this function returns" — important for sync cleanups (e.g. - // `unlinkSync`) and for tests that assert the side effect right after - // invoking the handler. Async cleanups are fire-and-forget with a - // catch so a hung dispose doesn't block exit. - const run = (): void => { - if (done) return; + const run = (): Promise => { + if (done) return Promise.resolve(); done = true; + let promise: Promise; try { const result = options.cleanup(); - if (result && typeof (result as Promise).catch === "function") { - (result as Promise).catch(() => { - // best-effort: shutdown is racing other cleanup paths - }); - } + // Wrap so callers can await uniformly even when cleanup was + // synchronous. Catch is attached so a thrown async cleanup + // doesn't leave an unhandled rejection on the floor. 
+ promise = Promise.resolve(result).catch(() => { + // best-effort: shutdown is racing other cleanup paths + }); } catch { - // best-effort + promise = Promise.resolve(); } + inFlightCleanups.add(promise); + void promise.finally(() => inFlightCleanups.delete(promise)); + return promise; }; - process.on("exit", run); + process.on("exit", () => { + void run(); + }); for (const sig of TERMINATING_SIGNALS) { process.on(sig, () => { - run(); - if (options.exitOnSignal) process.exit(0); + // Sync cleanup body fires inside this `run()` call before the + // returned promise resolves; that preserves "side effect is + // observable right after the handler returns" for sync + // cleanups like `unlinkSync` (and the existing tests that + // assert on it). + const my = run(); + if (!options.exitOnSignal) return; + // Wait for THIS hook's tail and every other in-flight cleanup + // (siblings registered in the same process) before exiting. + // Settled promises pass through Promise.allSettled in a single + // microtask, so a process whose hooks are all synchronous + // exits effectively immediately. + void Promise.allSettled([ + my, + ...inFlightCleanups, + ]).then(() => process.exit(0)); }); } } diff --git a/packages/arkor/src/cli/commands/dev.test.ts b/packages/arkor/src/cli/commands/dev.test.ts index eb630cb9..47299303 100644 --- a/packages/arkor/src/cli/commands/dev.test.ts +++ b/packages/arkor/src/cli/commands/dev.test.ts @@ -33,6 +33,19 @@ import { } from "../../core/credentials"; import { ensureCredentialsForStudio, runDev } from "./dev"; +/** + * Yield long enough for the cleanupHooks coordinator to settle its + * `Promise.allSettled(...)` chain and dispatch `process.exit(0)`. Two + * `setImmediate`-equivalent ticks cover the typical case (one for the + * `Promise.resolve(...)` wrapping inside `run()`, one for the + * `.then(() => process.exit(0))`); using `setImmediate` instead of + * `Promise.resolve` ensures the exit microtask actually runs before + * we resume. 
+ */ +function flushMicrotasks(): Promise { + return new Promise((resolve) => setImmediate(resolve)); +} + let fakeHome: string; const ORIG_HOME = process.env.HOME; // `os.homedir()` reads USERPROFILE on Windows; HOME-only redirection leaves @@ -697,7 +710,12 @@ describe("runDev", () => { const sigintListeners = process.listeners("SIGINT"); const handler = sigintListeners[sigintListeners.length - 1] as () => void; handler(); + // Sync side effect (token unlink) lands inside the synchronous + // portion of the handler. expect(existsSync(studioTokenPath())).toBe(false); + // Exit fires after `Promise.allSettled(asyncCleanups)` resolves — + // a few microticks later. Flush to let the queued exit run. + await flushMicrotasks(); expect(exitSpy).toHaveBeenCalledWith(0); } finally { exitSpy.mockRestore(); @@ -738,7 +756,8 @@ describe("runDev", () => { handler(); // Even though the token file was never written, the cleanup hook // ran (best-effort `unlinkSync` swallows ENOENT) and the - // exit-on-signal arm fired. + // exit-on-signal arm fired (after async cleanup tails settle). + await flushMicrotasks(); expect(exitSpy).toHaveBeenCalledWith(0); } finally { exitSpy.mockRestore(); diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index d30f429f..0638ddfa 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -1,7 +1,6 @@ import { pathToFileURL } from "node:url"; -import { isArkor } from "./arkor"; import { - getTrainerInspection, + findInspectableTrainer, replaceTrainerCallbacks, requestTrainerEarlyStop, } from "./trainerInspection"; @@ -104,29 +103,14 @@ export function installCallbackReloadHandler( /** * Extract the user-supplied callbacks reference from a re-imported - * bundle. 
Mirrors `runner.ts`'s entry-extraction precedence (named - * `arkor` export → bare `trainer` → default-export shapes) but pulls - * callbacks via `getTrainerInspection` so we get the current cell of - * `currentCallbacks` at re-import time. Returns `null` when the new - * bundle has no inspectable trainer. + * bundle. Delegates the entry-shape walk to `findInspectableTrainer` + * so SIGUSR2's view of "what counts as a trainer" stays identical to + * the HMR coordinator's `inspectBundle` and `runner.ts`'s + * `extractTrainer`. Returns `null` when no candidate carries the + * inspection brand. */ function extractCallbacks( mod: Record, ): Partial | null { - const candidates: unknown[] = []; - if (isArkor(mod.arkor) && mod.arkor.trainer) candidates.push(mod.arkor.trainer); - if (mod.trainer) candidates.push(mod.trainer); - if (isArkor(mod.default) && mod.default.trainer) candidates.push(mod.default.trainer); - if ( - mod.default && - typeof mod.default === "object" && - "trainer" in (mod.default as Record) - ) { - candidates.push((mod.default as Record).trainer); - } - for (const c of candidates) { - const inspection = getTrainerInspection(c); - if (inspection) return inspection.callbacks; - } - return null; + return findInspectableTrainer(mod)?.callbacks ?? null; } diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 577596b0..0b9fcb24 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1424,6 +1424,108 @@ describe("createTrainer (early stop)", () => { expect(result.job.completedAt).toBe("2026-01-01T00:00:03Z"); }); + it("early-stop checkpoint branch still resolves the deferred when cancel() throws", async () => { + // Regression: previously, an `await trainer.cancel()` that threw + // (network failure / cloud-api 5xx during the cancel POST) would + // propagate out of the dispatch and leave `earlyStopDeferred` + // pending forever. 
The runner's SIGTERM handler awaits that + // promise before exiting, so the subprocess would hang on + // shutdown. The fix swallows the cancel throw best-effort and + // still marks the run terminal locally so the deferred resolves. + await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + const sse = [ + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + `id: 2\nevent: training.log\ndata: ${JSON.stringify({ + type: "training.log", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:02Z", + step: 1, + loss: 0.5, + })}\n\n`, + `id: 3\nevent: checkpoint.saved\ndata: ${JSON.stringify({ + type: "checkpoint.saved", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:03Z", + step: 10, + })}\n\n`, + ]; + let cancelAttempts = 0; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(sseStream(sse), { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + cancelAttempts += 1; + // Simulate the cloud-api being unreachable mid-cancel. 
+ throw new TypeError("fetch failed"); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + callbacks: { + onLog: () => { + void requestTrainerEarlyStop(trainer, { timeoutMs: 60_000 }); + }, + }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + const original = globalThis.fetch; + globalThis.fetch = fetcher; + let stopPromiseResult: "resolved" | "rejected" | "pending" = "pending"; + const stopPromise = new Promise((resolve) => { + // Don't drive the early-stop ourselves — `onLog` arms it. We + // just want to verify that whichever code path ultimately drives + // it sees a resolved deferred even though cancel() throws. + const tick = setInterval(() => { + // Probe the trainer's state via a fresh requestEarlyStop call: + // once the cancel-after-checkpoint branch ran, status is + // "cancelled" and this returns instantly. + void requestTrainerEarlyStop(trainer, { timeoutMs: 1 }).then(() => { + clearInterval(tick); + stopPromiseResult = "resolved"; + resolve(); + }); + }, 25); + }); + try { + await trainer.wait(); + // Wait for the probe to confirm the deferred resolves. + await stopPromise; + } finally { + globalThis.fetch = original; + } + // cancel() was attempted (and threw) but the deferred still resolved. 
+ expect(cancelAttempts).toBe(1); + expect(stopPromiseResult).toBe("resolved"); + }); + it("falls back to immediate cancel when no checkpoint arrives within timeoutMs", async () => { await writeState( { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index 4c95cf06..add5c8f0 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -276,7 +276,21 @@ export function createTrainer( // Early-stop latch: a checkpoint just landed, so the in-flight work // is durable. Cancel the cloud job and end `wait()` cleanly. if (earlyStopRequested && earlyStopDeferred) { - await trainer.cancel(); + // Best-effort `cancel()` — swallow throws so the deferred + // *always* resolves and the SIGTERM handler waiting on + // `requestEarlyStop()` can exit. Letting an error propagate + // here would leave the deferred pending and the runner + // process hung on shutdown; the local `startedJob.status` + // is set to `cancelled` regardless so subsequent + // `requestEarlyStop` calls see the terminal-status + // short-circuit. The cookbook already calls `cancel()` + // best-effort, so users tolerating a transient cloud-api + // failure here matches the documented contract. + try { + await trainer.cancel(); + } catch { + // intentionally ignored — see comment above. + } // Reflect the cancellation locally so `wait()`'s resolved // `TrainingResult.job.status` is a terminal status (per the // documented contract). 
Without this update the result would diff --git a/packages/arkor/src/core/trainerInspection.test.ts b/packages/arkor/src/core/trainerInspection.test.ts new file mode 100644 index 00000000..df557c6f --- /dev/null +++ b/packages/arkor/src/core/trainerInspection.test.ts @@ -0,0 +1,117 @@ +import { describe, it, expect } from "vitest"; +import { createArkor } from "./arkor"; +import { createTrainer } from "./trainer"; +import { + findInspectableTrainer, + findTrainerInModule, + getTrainerInspection, +} from "./trainerInspection"; + +function brandedTrainer(name: string) { + // Real `createTrainer` attaches the inspection brand. We only need + // a no-op trainer for these shape tests — `start`/`wait` etc. are + // never invoked. + return createTrainer({ + name, + model: "m", + dataset: { type: "huggingface", name: "x" }, + }); +} + +function unbrandedTrainer(name: string) { + // Hand-rolled trainer — passes the `start`/`wait`/`cancel` shape + // check `findTrainerInModule` requires but DOESN'T carry the SDK + // inspection brand. Mirrors a user who wraps or re-exports a + // trainer outside the SDK helpers. 
+ return { + name, + start: async () => ({ jobId: "j" }), + wait: async () => ({ job: {}, artifacts: [] }), + cancel: async () => {}, + }; +} + +describe("findTrainerInModule (trainer-shape walk)", () => { + it("finds shape #1: createArkor named export", () => { + const trainer = brandedTrainer("a"); + const found = findTrainerInModule({ arkor: createArkor({ trainer }) }); + expect(found).toBe(trainer); + }); + + it("finds shape #2: bare `trainer` named export", () => { + const trainer = brandedTrainer("b"); + const found = findTrainerInModule({ trainer }); + expect(found).toBe(trainer); + }); + + it("finds shape #3: default-export Arkor manifest", () => { + const trainer = brandedTrainer("c"); + const found = findTrainerInModule({ default: createArkor({ trainer }) }); + expect(found).toBe(trainer); + }); + + it("finds shape #4: default.trainer nested", () => { + const trainer = brandedTrainer("d"); + const found = findTrainerInModule({ default: { trainer } }); + expect(found).toBe(trainer); + }); + + it("works for hand-rolled (unbranded) trainers in any of the four shapes", () => { + const trainer = unbrandedTrainer("manual"); + expect(findTrainerInModule({ trainer })?.name).toBe("manual"); + expect(findTrainerInModule({ default: { trainer } })?.name).toBe("manual"); + }); + + it("returns null when no candidate looks like a trainer", () => { + expect(findTrainerInModule({})).toBeNull(); + expect(findTrainerInModule({ arkor: {} })).toBeNull(); + expect(findTrainerInModule({ trainer: { name: "no-methods" } })).toBeNull(); + expect(findTrainerInModule({ default: 42 })).toBeNull(); + }); +}); + +describe("findInspectableTrainer (brand-required path)", () => { + it("returns the inspection snapshot for a branded trainer in any shape", () => { + // Regression: previously HMR's `inspectBundle` only checked + // `mod.arkor ?? mod.default`, missing shapes #2 and #4. 
As a + // result, projects bare-exporting `trainer` always produced + // `configHash: null` and HMR conservatively SIGTERM-restarted on + // every rebuild — never hot-swapping callbacks. The fix routes + // through `findInspectableTrainer` which walks every supported + // shape via `findTrainerInModule` and pulls inspection off the + // discovered trainer. + const trainerA = brandedTrainer("from-arkor"); + const inspectionA = findInspectableTrainer({ + arkor: createArkor({ trainer: trainerA }), + }); + expect(inspectionA?.name).toBe("from-arkor"); + + const trainerB = brandedTrainer("bare-named"); + const inspectionB = findInspectableTrainer({ trainer: trainerB }); + expect(inspectionB?.name).toBe("bare-named"); + + const trainerC = brandedTrainer("default-arkor"); + const inspectionC = findInspectableTrainer({ + default: createArkor({ trainer: trainerC }), + }); + expect(inspectionC?.name).toBe("default-arkor"); + + const trainerD = brandedTrainer("default-nested"); + const inspectionD = findInspectableTrainer({ + default: { trainer: trainerD }, + }); + expect(inspectionD?.name).toBe("default-nested"); + }); + + it("returns null when only an unbranded trainer is present", () => { + // Hand-rolled trainers don't carry the SDK inspection brand, so + // HMR can't compute their `configHash`. The Studio still shows + // the trainer name (via `findTrainerInModule` in + // `summariseBuiltManifest`), but HMR routing falls back to the + // SIGTERM-restart-everything path — which is the documented + // safe behaviour when configs can't be diffed. 
+ const trainer = unbrandedTrainer("plain"); + expect(findInspectableTrainer({ trainer })).toBeNull(); + expect(getTrainerInspection(trainer)).toBeNull(); + }); +}); diff --git a/packages/arkor/src/core/trainerInspection.ts b/packages/arkor/src/core/trainerInspection.ts index 1966aa9e..9b6fc771 100644 --- a/packages/arkor/src/core/trainerInspection.ts +++ b/packages/arkor/src/core/trainerInspection.ts @@ -1,4 +1,5 @@ -import type { JobConfig, Trainer, TrainerCallbacks } from "./types"; +import { isArkor } from "./arkor"; +import type { Arkor, JobConfig, Trainer, TrainerCallbacks } from "./types"; /** * Snapshot of a trainer's identity and cloud-side config that the Studio @@ -166,3 +167,86 @@ export function requestTrainerEarlyStop( ] as (opts?: RequestEarlyStopOptions) => Promise; return fn.call(trainer, opts); } + +/** + * Trainer-shaped value pulled from a re-imported bundle. We don't + * import the public `Trainer` type here because consumers of this + * helper want to read minimal fields (`name` for display) without + * type-narrowing on the full SDK interface — many tests fabricate + * hand-rolled trainer literals that don't structurally match + * `Trainer` (no `requestEarlyStop` etc.) but are still legitimate + * user shapes the runner accepts. + */ +type TrainerLike = { name?: unknown; [key: string]: unknown }; + +function isTrainerLike(value: unknown): value is TrainerLike { + if (!value || typeof value !== "object") return false; + const v = value as Record; + return ( + typeof v.start === "function" && + typeof v.wait === "function" && + typeof v.cancel === "function" + ); +} + +/** + * Walk a freshly-imported user bundle in the same precedence order + * as `runner.ts`'s `extractTrainer` and return the first + * trainer-shaped value (anything that has `start`/`wait`/`cancel` + * functions). 
Doesn't require the SDK inspection brand — the + * manifest UI displays the trainer's `name` for hand-rolled trainers + * too, even when HMR can't compute a `configHash` for them. + * + * The four supported shapes: + * 1. `export const arkor = createArkor({ trainer })` + * 2. `export const trainer = createTrainer(...)` (bare named export) + * 3. `export default createArkor({ trainer })` + * 4. `export default { trainer: createTrainer(...) }` + */ +export function findTrainerInModule( + mod: Record, +): TrainerLike | null { + const candidates: unknown[] = []; + // 1: createArkor named export + if (isArkor(mod.arkor) && (mod.arkor as Arkor).trainer) { + candidates.push((mod.arkor as Arkor).trainer); + } + // 2: bare `trainer` named export + if (mod.trainer) candidates.push(mod.trainer); + // 3: default-export holding an Arkor manifest + if (isArkor(mod.default) && (mod.default as Arkor).trainer) { + candidates.push((mod.default as Arkor).trainer); + } + // 4: default.trainer nested + if ( + mod.default && + typeof mod.default === "object" && + "trainer" in (mod.default as Record) + ) { + candidates.push((mod.default as Record).trainer); + } + for (const c of candidates) { + if (isTrainerLike(c)) return c; + } + return null; +} + +/** + * Walk a freshly-imported user bundle and return the first inspection + * snapshot we can pull off a discovered trainer. Used by both + * `studio/hmr.ts` (computing the `configHash` for HMR routing) and + * `core/runnerSignals.ts` (extracting new callbacks for SIGUSR2 hot- + * swap) so the two paths stay in sync with the runner about which + * export shapes count as "a trainer is exported here". + * + * Returns `null` when none of the candidates carry the inspection + * brand — typically because the bundle has no SDK-built trainer + * (hand-rolled trainer, fresh scaffold, syntax error, or a + * third-party shape). 
+ */ +export function findInspectableTrainer( + mod: Record, +): TrainerInspection | null { + const trainer = findTrainerInModule(mod); + return trainer ? getTrainerInspection(trainer) : null; +} diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index 76c7e1b3..b91a4762 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -175,4 +175,37 @@ describe("createHmrCoordinator", () => { await new Promise((r) => setTimeout(r, 250)); expect(events.length).toBe(countAfterDispose); }); + + it("getCurrentConfigHash() returns the latest cached event's hash", async () => { + // Regression: `/api/train` previously called `readManifestSummary` + // and ran a redundant rebuild per spawn (racing the watcher). + // The new server flow reads the cached hash via + // `getCurrentConfigHash()`. We can't trigger a real build here + // (the user-bundle entry shape would need a working `arkor` + // resolution at import time), but we can verify the getter + // returns `null` before the watcher has emitted any event and + // tracks the cached event's `configHash` field once one lands. + // The integration of "configHash actually populated for all + // entry shapes" is covered by the unit test against + // `findInspectableTrainer` in `trainerInspection.test.ts`. + mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + // Before any subscriber attaches, no watcher is running and no + // event has been broadcast — getter must return null without + // throwing. 
+ expect(hmr.getCurrentConfigHash()).toBeNull(); + hmr.subscribe((e) => events.push(e)); + try { + const ready = await nextEvent(events, (e) => e.type === "ready"); + // FAKE_MANIFEST is hand-rolled (no SDK brand) so the cached + // hash is null — but the *getter* must still return whatever + // the cached event carries, not throw. + expect(hmr.getCurrentConfigHash()).toBe(ready.configHash ?? null); + } finally { + await hmr.dispose(); + } + }); }); diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index ca0dc8e8..00b10aaa 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -1,7 +1,6 @@ import { existsSync, statSync } from "node:fs"; import { pathToFileURL } from "node:url"; import { watch, type RolldownWatcher } from "rolldown"; -import { isArkor } from "../core/arkor"; import { hashJobConfig } from "../core/configHash"; import { BUILD_DEFAULTS, @@ -9,7 +8,7 @@ import { rolldownInputOptions, type BuildEntryOptions, } from "../core/rolldownConfig"; -import { getTrainerInspection } from "../core/trainerInspection"; +import { findInspectableTrainer } from "../core/trainerInspection"; export type HmrEventType = "ready" | "rebuild" | "error"; @@ -41,6 +40,15 @@ export interface HmrCoordinator { * event. Returns an unsubscribe function. */ subscribe(fn: (event: HmrEvent) => void): () => void; + /** + * Synchronous read of the most recent successful build's + * `configHash`. Used by `/api/train` to capture the hash that's + * about to be spawned so HMR routing on the *next* rebuild knows + * whether the new bundle changed cloud-side config. `null` when the + * watcher hasn't completed a successful build yet (e.g. fresh + * scaffold) or the latest event was an `error`. 
+ */ + getCurrentConfigHash(): string | null; dispose(): Promise; } @@ -58,9 +66,19 @@ function fingerprint(outFile: string): string { /** * Dynamic-import the freshly-built bundle and pull a `TrainerInspection` * snapshot off the discovered trainer. Cache-bust the URL so Node's ESM - * loader returns the new module text rather than a stale evaluation. Best- - * effort: a missing/malformed manifest or a thrown user constructor returns - * `null` and the caller treats the rebuild as "config-unknown". + * loader returns the new module text rather than a stale evaluation. + * + * Walks every entry shape `runner.ts` accepts (named `arkor`, named + * `trainer`, `default` Arkor manifest, `default.trainer`) via the + * shared `findInspectableTrainer` helper — keeping inspection in sync + * with execution. Without this, projects that only `export const + * trainer` (a documented shortcut) would always produce `configHash: + * null` and the SPA would unnecessarily SIGTERM-restart on every + * rebuild. + * + * Best-effort: a missing/malformed manifest or a thrown user + * constructor returns `null` and the caller treats the rebuild as + * "config-unknown". */ async function inspectBundle( outFile: string, @@ -68,9 +86,7 @@ async function inspectBundle( try { const url = `${pathToFileURL(outFile).href}?t=${Date.now()}`; const mod = (await import(url)) as Record; - const candidate = mod.arkor ?? 
mod.default; - if (!isArkor(candidate)) return null; - const inspection = getTrainerInspection(candidate.trainer); + const inspection = findInspectableTrainer(mod); if (!inspection) return null; return { configHash: hashJobConfig(inspection.config), @@ -204,6 +220,12 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { subscribers.delete(fn); }; }, + getCurrentConfigHash() { + // `lastEvent` is `null` until the first BUNDLE_END (or null again + // if the most recent emission was an `error`); both cases are + // legitimate "we don't know the hash yet" signals to the caller. + return lastEvent?.configHash ?? null; + }, async dispose() { disposed = true; subscribers.clear(); diff --git a/packages/arkor/src/studio/manifest.ts b/packages/arkor/src/studio/manifest.ts index ef024e15..3699f8b4 100644 --- a/packages/arkor/src/studio/manifest.ts +++ b/packages/arkor/src/studio/manifest.ts @@ -1,8 +1,10 @@ import { pathToFileURL } from "node:url"; import { runBuild } from "../cli/commands/build"; -import { isArkor } from "../core/arkor"; import { hashJobConfig } from "../core/configHash"; -import { getTrainerInspection } from "../core/trainerInspection"; +import { + findTrainerInModule, + getTrainerInspection, +} from "../core/trainerInspection"; /** * Wire-friendly snapshot of the user's `createArkor({...})` manifest. Mirrors @@ -40,14 +42,25 @@ export async function summariseBuiltManifest( ): Promise { const url = `${pathToFileURL(outFile).href}?t=${Date.now()}`; const mod = (await import(url)) as Record; - const candidate = mod.arkor ?? mod.default; - if (!isArkor(candidate)) return EMPTY; - const trainer = candidate.trainer - ? { name: candidate.trainer.name } - : null; - const inspection = getTrainerInspection(candidate.trainer); - const configHash = inspection ? 
hashJobConfig(inspection.config) : null; - return { trainer, configHash }; + // Walk every trainer export shape `runner.ts` accepts via the + // shared helper (named `arkor`, named `trainer`, default Arkor + // manifest, `default.trainer`) so manifest summary, HMR routing, + // and runtime execution all agree about which exports count as a + // trainer. + const trainer = findTrainerInModule(mod); + if (!trainer) return EMPTY; + // Trainer name renders in the UI even for hand-rolled trainers + // that bypass `createTrainer` and therefore don't carry the SDK + // inspection brand. The brand is required only for the + // `configHash` used by HMR routing — without it, HMR conservatively + // SIGTERM-restarts on every rebuild (correct fallback). + const name = + typeof trainer.name === "string" ? trainer.name : "(unnamed trainer)"; + const inspection = getTrainerInspection(trainer); + return { + trainer: { name }, + configHash: inspection ? hashJobConfig(inspection.config) : null, + }; } /** diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index ee180eeb..000f8764 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -469,6 +469,12 @@ process.exit(0); body: JSON.stringify({}), }); expect(res.status).toBe(200); + // Regression: the spawned subprocess's pid is exposed via the + // `X-Arkor-Train-Pid` response header so the SPA can scope HMR + // restart events to its own child (a multi-tab broadcast can + // contain mixed restart/hot-swap targets across siblings). + const pidHeader = res.headers.get("x-arkor-train-pid"); + expect(pidHeader).toMatch(/^\d+$/); const text = await res.text(); expect(text).toContain("[fake-bin]"); // The bin receives `start` as the first non-flag arg. 
@@ -549,6 +555,94 @@ process.exit(0); expect(text).toContain("exit="); expect(text).not.toContain("exit=0"); }); + + it("captures the spawn-time configHash from the HMR coordinator (no extra rebuild)", async () => { + // Regression: `/api/train` previously called `readManifestSummary` + // which ran a full `runBuild()` per spawn — wasteful and racy + // against the HMR watcher writing the same `.arkor/build/index.mjs`. + // The new server reads the cached hash from + // `coordinator.getCurrentConfigHash()` instead. We assert the + // call happens (so a rebuild is *not* required) by exposing the + // spy count on the fake coordinator. + await writeCredentials(ANON_CREDS); + let getCurrentCalls = 0; + const fakeHmr = { + subscribe: () => () => undefined, + getCurrentConfigHash: () => { + getCurrentCalls += 1; + return "spawn-time-hash"; + }, + async dispose() {}, + }; + const fakeBin = join(trainCwd, "fake-bin.mjs"); + writeFileSync(fakeBin, `process.exit(0);\n`); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: fakeBin, + hmr: fakeHmr, + }); + const res = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(res.status).toBe(200); + // Drain the body so the close handler runs and the test + // doesn't leak the subprocess. + await res.text(); + expect(getCurrentCalls).toBe(1); + }); + + it("/api/train cancel handler doesn't crash when child.kill() throws", async () => { + // Regression: `ReadableStream.cancel()` called `child.kill()` + // without a try/catch. If the child had already exited (ESRCH + // race against the cancel), the throw bubbled up as an + // unhandled exception and crashed the request handler. 
+ await writeCredentials(ANON_CREDS); + const fakeBin = join(trainCwd, "fake-bin.mjs"); + // Bin exits immediately so the child is already dead by the + // time our cancel handler tries to signal it. + writeFileSync(fakeBin, `process.exit(0);\n`); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: fakeBin, + }); + const res = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(res.status).toBe(200); + // Race: read enough of the body to see the close, then cancel. + // The cancel hook must not throw even when the underlying + // child is already gone. + const reader = res.body!.getReader(); + // Wait for `exit=` so we know the child died first. + let buf = ""; + const decoder = new TextDecoder(); + while (!buf.includes("exit=")) { + const { value, done } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + } + await expect(reader.cancel()).resolves.toBeUndefined(); + }); }); describe("auto-anonymous bootstrap", () => { @@ -1340,11 +1434,15 @@ process.exit(0); }); describe("/api/dev/events (HMR)", () => { - function fakeHmr() { - // Mirror the real HmrCoordinator surface but stay synchronous so the - // test doesn't depend on rolldown.watch starting up. `emit` is a test - // hook for pushing events into the SSE stream from the test body. + function fakeHmr(initialConfigHash: string | null = null) { + // Mirror the real HmrCoordinator surface but stay synchronous so + // the test doesn't depend on rolldown.watch starting up. `emit` + // is a test hook for pushing events into the SSE stream from the + // test body; `currentConfigHash` is a settable mock for what + // `/api/train` reads via `getCurrentConfigHash` to capture the + // spawned-config snapshot. 
const subs = new Set<(e: HmrEvent) => void>(); + let currentConfigHash: string | null = initialConfigHash; const coordinator: HmrCoordinator = { subscribe(fn) { subs.add(fn); @@ -1352,6 +1450,9 @@ process.exit(0); subs.delete(fn); }; }, + getCurrentConfigHash() { + return currentConfigHash; + }, async dispose() { subs.clear(); }, @@ -1361,6 +1462,9 @@ process.exit(0); emit(event: HmrEvent) { for (const fn of subs) fn(event); }, + setConfigHash(hash: string | null) { + currentConfigHash = hash; + }, get subscriberCount() { return subs.size; }, diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 7380420b..74ef3112 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -19,6 +19,12 @@ import { readManifestSummary } from "./manifest"; import type { HmrCoordinator, HmrEvent } from "./hmr"; import { TrainRegistry, type RestartTarget } from "./trainRegistry"; +/** Identify the spawned subprocess to the SPA without exposing it as + * a body frame (which would interleave with trainer stdout). The SPA + * reads this off `Response.headers` and uses it to scope HMR + * `restart` events to the run *this* tab actually started. */ +const TRAIN_PID_HEADER = "x-arkor-train-pid"; + const DEPRECATION_HEADERS = ["Deprecation", "Sunset", "Warning"] as const; function copyDeprecationHeaders(from: Headers, to: Headers): void { for (const name of DEPRECATION_HEADERS) { @@ -326,20 +332,21 @@ export function buildStudioApp(options: StudioServerOptions) { } trainFile = abs; } - // Read the current manifest before spawn so the configHash is on - // hand for HMR's "hot-swap vs restart" decision later. Building the - // manifest also pre-warms `.arkor/build/index.mjs` for the - // subprocess's `runStart`. A failure here is non-fatal — the spawn - // proceeds with `configHash: null`, which forces SIGTERM (full - // restart) on the next rebuild. 
- let configHash: string | null = null; - try { - const manifest = await readManifestSummary(trainCwd); - configHash = manifest.configHash; - } catch { - // ignore — `arkor start` will surface its own build error to the - // SPA via stderr; we only needed configHash for HMR routing. - } + // Snapshot the current `configHash` so HMR routing on the *next* + // rebuild can compare against this child's spawn-time config. + // + // When HMR is enabled, read it synchronously from the coordinator + // (which already maintains `lastEvent.configHash` for its watcher). + // Reading from the cache avoids triggering an extra `runBuild()` + // per train request — the previous implementation called + // `readManifestSummary(trainCwd)` here, which both wasted CPU and + // raced the watcher writing the same `.arkor/build/index.mjs`. + // + // When HMR is disabled the field is irrelevant (no rebuilds will + // happen) so we leave it null without paying for a build. + const configHash: string | null = options.hmr + ? options.hmr.getCurrentConfigHash() + : null; const args = [trainBinPath, "start"]; if (trainFile) args.push(trainFile); const child = spawn(process.execPath, args, { @@ -360,12 +367,31 @@ export function buildStudioApp(options: StudioServerOptions) { }, cancel() { activeTrains.unregister(child.pid); - child.kill(); + // `ChildProcess.kill()` can throw (ESRCH if the process has + // already exited between this handler's invocation and the + // signal delivery). A throw here would surface as an unhandled + // exception in the request pipeline and crash the server + // handler — swallow it; the close handler above has already + // taken the entry out of the registry. + try { + child.kill(); + } catch { + // already gone; nothing to clean up. + } }, }); + // Expose the spawned pid via a response header so the SPA can + // tell its own child apart from other tabs' children when + // `/api/dev/events` broadcasts `restartTargets` / `hotSwapTargets`. 
+ // Without this, a passive tab whose run was hot-swapped could + // misread a sibling tab's restart event as its own. + const pidHeader = typeof child.pid === "number" ? String(child.pid) : ""; return new Response(stream, { status: 200, - headers: { "content-type": "text/plain; charset=utf-8" }, + headers: { + "content-type": "text/plain; charset=utf-8", + [TRAIN_PID_HEADER]: pidHeader, + }, }); }); @@ -404,16 +430,14 @@ export function buildStudioApp(options: StudioServerOptions) { hmr.subscribe((event) => { let augmented: AugmentedEvent = event; if (event.type === "rebuild" && activeTrains.size > 0) { - // Per-child decision: if the rebuilt bundle's `configHash` - // matches the child's spawn-time hash, the cloud-side run is - // unaffected — SIGUSR2 lets the runner re-import and rotate the - // callbacks cell. Otherwise SIGTERM triggers the trainer's - // internal early-stop so the next checkpoint lands before the - // SPA re-spawns. + // Single per-child decision pass: hash match → SIGUSR2 (with + // a Windows fallback to SIGTERM since win32 doesn't deliver + // SIGUSR2), hash mismatch → SIGTERM. The registry returns + // both buckets so the SPA can react per-child rather than + // assuming one global outcome. const nextHash = event.configHash ?? 
null; - const hotSwapTargets = activeTrains.notifyCallbackReload(nextHash); - const restartTargets = - activeTrains.requestEarlyStopOnMismatch(nextHash); + const { hotSwapTargets, restartTargets } = + activeTrains.dispatchRebuild(nextHash); augmented = { ...event, hotSwap: hotSwapTargets.length > 0, diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index db6acae1..a08cd51a 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -8,6 +8,8 @@ interface FakeChild { } function fakeChild(pid: number): FakeChild { + // Default: `kill(sig)` returns `true`, mirroring Node's contract for + // a successful signal delivery to a still-running process. return { pid, kill: vi.fn(() => true) }; } @@ -20,7 +22,7 @@ describe("TrainRegistry", () => { expect(reg.size).toBe(0); }); - it("notifyCallbackReload SIGUSR2s only matching configHashes", () => { + it("dispatchRebuild SIGUSR2s only matching configHashes", () => { const reg = new TrainRegistry(); const a = fakeChild(101); const b = fakeChild(102); @@ -32,98 +34,83 @@ describe("TrainRegistry", () => { }); reg.register(c as unknown as ChildProcess, { configHash: "match" }); - const signalled = reg.notifyCallbackReload("match"); - expect(signalled).toEqual([ + const result = reg.dispatchRebuild("match"); + expect(result.hotSwapTargets).toEqual([ { pid: 101, trainFile: undefined }, { pid: 103, trainFile: undefined }, ]); + expect(result.restartTargets).toEqual([ + { pid: 102, trainFile: "/tmp/b.ts" }, + ]); expect(a.kill).toHaveBeenCalledWith("SIGUSR2"); expect(c.kill).toHaveBeenCalledWith("SIGUSR2"); - expect(b.kill).not.toHaveBeenCalled(); + expect(b.kill).toHaveBeenCalledWith("SIGTERM"); }); - it("notifyCallbackReload is a no-op when nextConfigHash is null", () => { + it("dispatchRebuild SIGTERMs everything when nextConfigHash is null", () => { + // null nextHash means "we couldn't inspect the new bundle" — 
be + // conservative and SIGTERM every active child since we can't + // prove their configs are unaffected. const reg = new TrainRegistry(); const a = fakeChild(201); - reg.register(a as unknown as ChildProcess, { configHash: null }); - expect(reg.notifyCallbackReload(null)).toEqual([]); - expect(a.kill).not.toHaveBeenCalled(); - }); - - it("requestEarlyStopOnMismatch SIGTERMs only mismatched children", () => { - const reg = new TrainRegistry(); - const same = fakeChild(301); - const diff = fakeChild(302); - reg.register(same as unknown as ChildProcess, { configHash: "h" }); - reg.register(diff as unknown as ChildProcess, { - configHash: "x", - trainFile: "/tmp/diff.ts", - }); - - const targets = reg.requestEarlyStopOnMismatch("h"); - expect(targets).toEqual([{ pid: 302, trainFile: "/tmp/diff.ts" }]); - expect(same.kill).not.toHaveBeenCalled(); - expect(diff.kill).toHaveBeenCalledWith("SIGTERM"); - }); - - it("requestEarlyStopOnMismatch SIGTERMs everything when nextConfigHash is null", () => { - const reg = new TrainRegistry(); - const a = fakeChild(401); - const b = fakeChild(402); + const b = fakeChild(202); reg.register(a as unknown as ChildProcess, { configHash: "h" }); reg.register(b as unknown as ChildProcess, { configHash: null }); - // null nextHash means "we couldn't inspect the new bundle" — be - // conservative and SIGTERM every active child. - const targets = reg.requestEarlyStopOnMismatch(null); - expect(targets).toHaveLength(2); + const result = reg.dispatchRebuild(null); + expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toHaveLength(2); expect(a.kill).toHaveBeenCalledWith("SIGTERM"); expect(b.kill).toHaveBeenCalledWith("SIGTERM"); }); - it("requestEarlyStopOnMismatch SIGTERMs children whose stored hash is null", () => { + it("dispatchRebuild SIGTERMs children whose stored hash is null", () => { // A spawn that raced an in-flight build can land with `configHash: // null`. 
It must not be hot-swapped — even if the new bundle's hash // is known, we have no proof the spawned subprocess is running the // same config. const reg = new TrainRegistry(); - const a = fakeChild(501); + const a = fakeChild(301); reg.register(a as unknown as ChildProcess, { configHash: null }); - const targets = reg.requestEarlyStopOnMismatch("h"); - expect(targets).toHaveLength(1); + const result = reg.dispatchRebuild("h"); + expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toHaveLength(1); expect(a.kill).toHaveBeenCalledWith("SIGTERM"); }); it("unregister removes the child from the policy decisions", () => { const reg = new TrainRegistry(); - const a = fakeChild(601); + const a = fakeChild(401); reg.register(a as unknown as ChildProcess, { configHash: "h" }); - reg.unregister(601); + reg.unregister(401); expect(reg.size).toBe(0); - expect(reg.notifyCallbackReload("h")).toEqual([]); + const result = reg.dispatchRebuild("h"); + expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toEqual([]); }); it("survives kill() throwing (child exited mid-iteration)", () => { const reg = new TrainRegistry(); - const a = fakeChild(701); + const a = fakeChild(501); a.kill.mockImplementation(() => { throw new Error("ESRCH"); }); reg.register(a as unknown as ChildProcess, { configHash: "h" }); - // Both code paths should swallow the throw and continue with their + // Both the hot-swap branch (matching hash) and the restart branch + // (mismatched hash) must swallow the throw and continue with their // bookkeeping so a single dead child can't break HMR for siblings. 
- expect(() => reg.notifyCallbackReload("h")).not.toThrow(); - expect(() => reg.requestEarlyStopOnMismatch("x")).not.toThrow(); + expect(() => reg.dispatchRebuild("h")).not.toThrow(); + expect(() => reg.dispatchRebuild("x")).not.toThrow(); }); - it("requestEarlyStopOnMismatch omits dead-on-kill children from the restart targets", () => { + it("dispatchRebuild omits dead-on-kill children from the restart targets", () => { // Regression: previously the implementation always pushed onto // `targets` even when `kill()` threw, so a child that had already // exited would still be reported back to the SPA as a restart // target — the SPA would then wait forever for the (already- // delivered) `exit=...` line and never re-spawn. const reg = new TrainRegistry(); - const dead = fakeChild(801); + const dead = fakeChild(601); dead.kill.mockImplementation(() => { throw new Error("ESRCH"); }); @@ -131,10 +118,31 @@ describe("TrainRegistry", () => { configHash: "stale", trainFile: "/tmp/d.ts", }); - expect(reg.requestEarlyStopOnMismatch("fresh")).toEqual([]); + const result = reg.dispatchRebuild("fresh"); + expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toEqual([]); + }); + + it("dispatchRebuild omits dead-on-kill children when kill returns false (no throw)", () => { + // Regression: `ChildProcess.kill()` returns `false` (without + // throwing) when the target process is already gone. The previous + // implementation treated any non-throw as success and reported the + // child as a restart target — the SPA would then wait forever for + // an exit line that already arrived. + const reg = new TrainRegistry(); + const gone = fakeChild(701); + gone.kill.mockReturnValue(false); + reg.register(gone as unknown as ChildProcess, { + configHash: "stale", + trainFile: "/tmp/g.ts", + }); + const result = reg.dispatchRebuild("fresh"); + expect(result.restartTargets).toEqual([]); + // We still attempted the kill — only the bookkeeping is skipped. 
+ expect(gone.kill).toHaveBeenCalledWith("SIGTERM"); }); - it("requestEarlyStopOnMismatch sends SIGTERM at most once per child across rebuilds", () => { + it("dispatchRebuild sends SIGTERM at most once per child across rebuilds", () => { // Regression: under rapid edits the dev loop can fire multiple // rebuilds before the child reaches its next checkpoint. The // runner's shutdown handler treats a *second* SIGTERM as the @@ -143,34 +151,74 @@ describe("TrainRegistry", () => { // tracks per-child early-stop state and skips children it has // already signalled. const reg = new TrainRegistry(); - const a = fakeChild(901); + const a = fakeChild(801); reg.register(a as unknown as ChildProcess, { configHash: "h1", trainFile: "/tmp/a.ts", }); - const first = reg.requestEarlyStopOnMismatch("h2"); - expect(first).toEqual([{ pid: 901, trainFile: "/tmp/a.ts" }]); + const first = reg.dispatchRebuild("h2"); + expect(first.restartTargets).toEqual([ + { pid: 801, trainFile: "/tmp/a.ts" }, + ]); expect(a.kill).toHaveBeenCalledTimes(1); // Second mismatching rebuild before the child has exited: must NOT // re-send SIGTERM and must NOT re-list the child as a restart // target (the SPA already has a pending re-spawn for it). - const second = reg.requestEarlyStopOnMismatch("h3"); - expect(second).toEqual([]); + const second = reg.dispatchRebuild("h3"); + expect(second.restartTargets).toEqual([]); expect(a.kill).toHaveBeenCalledTimes(1); // After the child exits and is unregistered, a fresh spawn in its // place starts from a clean slate. 
- reg.unregister(901); - const respawn = fakeChild(902); + reg.unregister(801); + const respawn = fakeChild(802); reg.register(respawn as unknown as ChildProcess, { configHash: "h3", trainFile: "/tmp/a.ts", }); - expect(reg.requestEarlyStopOnMismatch("h4")).toEqual([ - { pid: 902, trainFile: "/tmp/a.ts" }, + const third = reg.dispatchRebuild("h4"); + expect(third.restartTargets).toEqual([ + { pid: 802, trainFile: "/tmp/a.ts" }, ]); expect(respawn.kill).toHaveBeenCalledTimes(1); }); + + it("dispatchRebuild degrades to SIGTERM-restart when SIGUSR2 is unsupported (Windows)", () => { + // Regression: Node's win32 build doesn't deliver SIGUSR2 (it + // throws "ENOSYS" inside `child.kill('SIGUSR2')`). The previous + // implementation silently swallowed that throw, so on Windows a + // hash-match rebuild produced neither hot-swap nor restart and + // callback edits never landed. Now we degrade to a SIGTERM-driven + // restart so the new code does take effect — at the cost of a + // brief gap rather than an in-place swap. + const reg = new TrainRegistry(); + const a = fakeChild(901); + a.kill.mockImplementation((sig?: string) => { + if (sig === "SIGUSR2") { + const err = new Error( + "kill ENOSYS", + ) as Error & { code?: string }; + err.code = "ENOSYS"; + throw err; + } + return true; // SIGTERM works + }); + reg.register(a as unknown as ChildProcess, { + configHash: "match", + trainFile: "/tmp/win.ts", + }); + const result = reg.dispatchRebuild("match"); + // Must not appear in hot-swap (signal failed) but must appear in + // restart (fallback succeeded) so the SPA re-spawns once the + // exit message arrives. + expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toEqual([ + { pid: 901, trainFile: "/tmp/win.ts" }, + ]); + // Both signals were attempted in order: SIGUSR2 → fallback SIGTERM. 
+ expect(a.kill).toHaveBeenNthCalledWith(1, "SIGUSR2"); + expect(a.kill).toHaveBeenNthCalledWith(2, "SIGTERM"); + }); }); diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index 484d01d4..97e1140a 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -9,9 +9,9 @@ import type { ChildProcess } from "node:child_process"; * matches the one captured at spawn time — the cloud-side run is * unaffected, only in-process callbacks need to update. * - **SIGTERM** (graceful early-stop + restart) when the configs - * diverge — `Trainer.requestEarlyStop` lets the next checkpoint - * finish, the subprocess exits, and the SPA re-spawns with the - * rebuilt artefact. + * diverge — the runner's internal early-stop entry point lets the + * next checkpoint finish, the subprocess exits, and the SPA + * re-spawns with the rebuilt artefact. */ export interface ActiveTrain { child: ChildProcess; @@ -38,6 +38,38 @@ export interface RestartTarget { trainFile?: string; } +export interface DispatchResult { + /** Children whose callbacks were rotated in place via SIGUSR2. */ + hotSwapTargets: RestartTarget[]; + /** + * Children that were SIGTERM'd for graceful early-stop and need to + * be re-spawned by the SPA after the train stream emits its + * `exit=...` line. Includes both config-mismatch cases and + * config-match cases that fell back here because the platform + * doesn't support SIGUSR2 (Windows). + */ + restartTargets: RestartTarget[]; +} + +/** + * Outcome of a single `child.kill(signal)` call. + * + * - `"ok"`: signal was delivered. + * - `"gone"`: process had already exited (`kill` returned `false`); no + * real signal was sent. + * - `"unsupported"`: `kill` threw — e.g. the platform doesn't support + * this signal kind (Windows + `SIGUSR2`).
+ */ +type KillResult = "ok" | "gone" | "unsupported"; + +function safeKill(child: ChildProcess, signal: NodeJS.Signals): KillResult { + try { + return child.kill(signal) ? "ok" : "gone"; + } catch { + return "unsupported"; + } +} + /** * Encapsulates the set of `/api/train`-spawned subprocesses and the * signal-dispatch decision rule for HMR rebuilds. Pulled out of @@ -73,77 +105,77 @@ export class TrainRegistry { } /** - * Send a callback hot-swap signal (SIGUSR2) to every child whose - * stored `configHash` matches `nextConfigHash`. The child's runner - * (`installCallbackReloadHandler`) re-imports the rebuilt bundle and - * calls `Trainer.replaceCallbacks`. Returns the list of children - * actually signalled, so the SSE event payload can include them for - * SPA-side telemetry. - */ - notifyCallbackReload( - nextConfigHash: string | null, - ): Array<{ pid: number; trainFile?: string }> { - if (nextConfigHash === null) return []; - const signalled: Array<{ pid: number; trainFile?: string }> = []; - for (const [pid, entry] of this.entries) { - if (entry.configHash !== null && entry.configHash === nextConfigHash) { - try { - entry.child.kill("SIGUSR2"); - signalled.push({ pid, trainFile: entry.trainFile }); - } catch { - // child may have just exited; the close handler will clean - // up the entry on its own. - } - } - } - return signalled; - } - - /** - * Send a graceful early-stop signal (SIGTERM) to every child whose - * stored `configHash` differs from `nextConfigHash` AND that hasn't - * already been signalled. The child's runner - * (`installShutdownHandlers`) calls the trainer's internal - * early-stop entry point which preserves the in-flight checkpoint - * before exiting. Returns the list of children we actually - * signalled this call so the SPA can re-spawn them with the new - * bundle. 
+ * Single entry point for HMR rebuilds: per active child, decide + * between callback hot-swap (SIGUSR2) and graceful restart + * (SIGTERM), apply the signal, and report which children landed in + * each bucket so the SPA can update its UI / re-spawn restarted + * runs. + * + * Combines what was previously `notifyCallbackReload` + + * `requestEarlyStopOnMismatch` into one pass so the per-child + * decision is atomic — important because the hot-swap path can + * gracefully degrade into the restart path on platforms (Windows) + * where SIGUSR2 isn't supported, which is hard to express across + * two separate iterations of the registry. * - * If `nextConfigHash` is null (the new bundle has no inspectable - * trainer), every active not-yet-signalled child is SIGTERM'd - * defensively — we can't prove their configs are unaffected. + * Re-signal protection: children already flagged + * `earlyStopRequested` are skipped entirely. The flag is cleared + * naturally when the child exits and is unregistered. * - * Re-signal protection: a second SIGTERM hits the runner's - * emergency `exit(143)` fast-path and would defeat checkpoint - * preservation. Children flagged `earlyStopRequested` here are - * skipped on subsequent rebuilds; the entry is removed from the - * registry when the child exits, so the next spawn starts from a - * clean slate. + * Defensive corner cases: + * - `kill()` returns `false` (process already exited) → drop + * from the targets list, the registry's close handler will + * unregister it. + * - `kill("SIGUSR2")` throws on Windows → degrade to SIGTERM so + * callback edits still take effect (via a full restart) rather + * than silently being ignored. 
*/ - requestEarlyStopOnMismatch( - nextConfigHash: string | null, - ): RestartTarget[] { - const targets: RestartTarget[] = []; + dispatchRebuild(nextConfigHash: string | null): DispatchResult { + const hotSwapTargets: RestartTarget[] = []; + const restartTargets: RestartTarget[] = []; + for (const [pid, entry] of this.entries) { if (entry.earlyStopRequested) continue; - if ( - nextConfigHash === null || - entry.configHash === null || - entry.configHash !== nextConfigHash - ) { - try { - entry.child.kill("SIGTERM"); + const target: RestartTarget = { pid, trainFile: entry.trainFile }; + const matches = + nextConfigHash !== null && + entry.configHash !== null && + entry.configHash === nextConfigHash; + + if (matches) { + const r = safeKill(entry.child, "SIGUSR2"); + if (r === "ok") { + hotSwapTargets.push(target); + continue; + } + if (r === "gone") { + // Child already exited; close handler will unregister. + continue; + } + // Windows fallback: SIGUSR2 isn't supported on win32 — degrade + // to a full restart so callback edits don't silently fail to + // apply. The user-visible result (callbacks reload after a + // brief restart) matches the design intent. + const fallback = safeKill(entry.child, "SIGTERM"); + if (fallback === "ok") { entry.earlyStopRequested = true; - // Push only after a successful kill; a thrown `kill` means - // the child has already exited and is not a real restart - // target (the SPA would otherwise wait forever for an - // exit message that never comes). - targets.push({ pid, trainFile: entry.trainFile }); - } catch { - // child already exited; close handler will clean up. + restartTargets.push(target); } + // "gone" / "unsupported" again → drop silently; the close + // handler (or operator-driven restart) will recover. + continue; } + + // Hash mismatch (or one side is null): graceful restart. 
+ const r = safeKill(entry.child, "SIGTERM"); + if (r === "ok") { + entry.earlyStopRequested = true; + restartTargets.push(target); + } + // "gone": child already exited, drop. "unsupported": can't + // happen for SIGTERM on supported platforms; drop defensively. } - return targets; + + return { hotSwapTargets, restartTargets }; } } diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index e408a4c9..69827133 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -32,18 +32,37 @@ export function RunTraining() { // from Overview mid-stream tears the training stream down without // touching the (always-running) manifest poll. const trainingAbortRef = useRef(null); - // HMR auto-restart bookkeeping. `lastTrainFileRef` carries the same - // `file?` arg into the auto re-spawn; `restartPendingRef` is the - // latch the SSE listener trips when the dev loop SIGTERMs the - // current run for a config-mismatch rebuild; `runningRef` lets the - // listener tell "is this tab the one running training?" apart from - // a passive tab that should ignore the broadcast. + // HMR auto-restart bookkeeping: + // - lastTrainFileRef: carries the same `file?` arg into the auto + // re-spawn. + // - restartPendingRef: latch the SSE listener trips ONLY when *this + // tab's* current child landed in `restartTargets`. Without the + // pid scope, a tab whose run was hot-swapped (other tab's child + // in `restartTargets`) would still latch on the broadcast and + // auto-spawn a duplicate job after its own run completes. + // - runningRef: short-circuit for tabs not running anything. + // - currentPidRef: the spawned child's pid for the run currently + // in flight, set from the `X-Arkor-Train-Pid` response header. 
+ // - hotSwapTimerRef: id for the "hot-swapped" status auto-clear + // timer so unmount-during-flash doesn't leak (or trigger a + // setState-after-unmount warning). const lastTrainFileRef = useRef(undefined); const restartPendingRef = useRef(false); const runningRef = useRef(false); + const currentPidRef = useRef(null); + // Browser `window.setTimeout` returns a numeric handle, not Node's + // `Timeout` object — explicit `number` so TS doesn't pick up the + // Node typing from the global `setTimeout`. + const hotSwapTimerRef = useRef(null); useEffect(() => { - return () => trainingAbortRef.current?.abort(); + return () => { + trainingAbortRef.current?.abort(); + if (hotSwapTimerRef.current !== null) { + clearTimeout(hotSwapTimerRef.current); + hotSwapTimerRef.current = null; + } + }; }, []); useEffect(() => { @@ -74,11 +93,18 @@ export function RunTraining() { }; }, []); - // HMR: listen for rebuild notifications from `arkor dev` and refresh the - // manifest. When a rebuild also early-stopped a running training run, the - // server flags `restart: true`; defer the actual re-invocation until the - // current `streamTraining` resolves so we don't run two cloud jobs at once. + // HMR: listen for rebuild notifications from `arkor dev` and refresh + // the manifest. When a rebuild also early-stopped *this tab's* + // training run, the server includes the spawned pid in + // `restartTargets`; defer the auto re-invocation until the current + // `streamTraining` resolves so we don't run two cloud jobs at once. + // + // Gated by `import.meta.env.DEV`: `/api/dev/events` returns 404 when + // the server runs in production mode, and `EventSource` would then + // retry indefinitely creating constant background traffic. The dev + // build is the only place this endpoint exists. 
useEffect(() => { + if (!import.meta.env.DEV) return; const es = openDevEvents(); const onMessage = (raw: MessageEvent) => { let payload: DevEvent; @@ -100,32 +126,47 @@ export function RunTraining() { error: err instanceof Error ? err.message : String(err), }); }); - if (payload.restart) { - // `/api/dev/events` is a broadcast — every open Studio tab gets - // this event. Only flip the auto-restart latch when *this* tab - // is actually running a stream right now; otherwise a passive - // tab would silently auto-spawn an extra job the next time the - // user clicks Run training, doubling cloud spend. - if (runningRef.current) { - // Training run is early-stopping; the active stream will - // resolve once the next checkpoint lands and the subprocess - // exits cleanly. The `finally` block of `run()` picks up the - // pending flag and re-spawns with the same args. - restartPendingRef.current = true; - setHmrStatus("early-stopping"); - } else { - setHmrStatus("idle"); - } - } else if (payload.hotSwap) { + // Per-child decision based on the spawned pid: a single rebuild + // can produce mixed outcomes (one child hot-swapped, another + // restarted), and `payload.restart` / `payload.hotSwap` reflect + // *aggregate* truth across all active children. Filter to "did + // *my* child land in this bucket?" so a tab whose run was + // hot-swapped doesn't latch onto a sibling tab's restart. + const myPid = currentPidRef.current; + const myRestart = + runningRef.current && + myPid !== null && + (payload.restartTargets?.some((t) => t.pid === myPid) ?? false); + const myHotSwap = + myPid !== null && + (payload.hotSwapTargets?.some((t) => t.pid === myPid) ?? false); + if (myRestart) { + // Training run is early-stopping; the active stream will + // resolve once the next checkpoint lands and the subprocess + // exits cleanly. The `finally` block of `run()` picks up the + // pending flag and re-spawns with the same args. 
+ restartPendingRef.current = true; + setHmrStatus("early-stopping"); + } else if (myHotSwap) { // Callbacks were swapped in place — the cloud-side run is // unaffected. Flash a brief "hot-swapped" indicator so users - // know the new code is live. + // know the new code is live. The previous timer (if any) is + // cleared so two close-together rebuilds don't race for the + // status reset. setHmrStatus("hot-swapped"); - window.setTimeout(() => { + if (hotSwapTimerRef.current !== null) { + clearTimeout(hotSwapTimerRef.current); + } + hotSwapTimerRef.current = window.setTimeout(() => { setHmrStatus((s) => (s === "hot-swapped" ? "idle" : s)); + hotSwapTimerRef.current = null; }, 1500); } else { - setHmrStatus("idle"); + // Nothing pertaining to this tab's child — leave any in- + // progress status spans alone but make sure stale "early- + // stopping" / "restarting" labels from a prior run don't + // linger past the next quiet rebuild. + if (!runningRef.current) setHmrStatus("idle"); } }; es.addEventListener("ready", onMessage); @@ -143,6 +184,11 @@ export function RunTraining() { async function run(file?: string): Promise { runningRef.current = true; lastTrainFileRef.current = file; + // Reset the pid before each spawn so a stale value from a prior + // run can't accidentally match a new HMR restart broadcast in the + // window between this assignment and `streamTraining` invoking + // `onSpawn`. + currentPidRef.current = null; setRunning(true); setLog(""); const ac = new AbortController(); @@ -156,6 +202,9 @@ export function RunTraining() { }, file, ac.signal, + (pid) => { + currentPidRef.current = pid; + }, ); } catch (err) { // Aborts are expected when the user navigates away mid-stream; @@ -169,6 +218,7 @@ export function RunTraining() { ); } finally { runningRef.current = false; + currentPidRef.current = null; if (trainingAbortRef.current === ac) trainingAbortRef.current = null; // Always release the running flag, including the user-initiated // abort path. 
setState on an already-unmounted component is a diff --git a/packages/studio-app/src/lib/api.ts b/packages/studio-app/src/lib/api.ts index 53ed723c..7fabc80f 100644 --- a/packages/studio-app/src/lib/api.ts +++ b/packages/studio-app/src/lib/api.ts @@ -32,8 +32,13 @@ export interface Job { */ export interface ManifestSummary { trainer: { name: string } | null; - /** Present when an inspectable trainer is loaded; otherwise null. */ - configHash?: string | null; + /** + * Stable hash of the trainer's cloud-side `JobConfig`. The server is + * always paired with this SPA in the same package, so the field is + * always present in the wire payload — `null` when no inspectable + * trainer is loaded, a hex string otherwise. Not optional. + */ + configHash: string | null; } export interface ManifestError { @@ -261,6 +266,14 @@ export async function streamTraining( onChunk: (text: string) => void, file?: string, signal?: AbortSignal, + /** + * Called once with the spawned subprocess's pid (or `null` if the + * server didn't include the `X-Arkor-Train-Pid` header). The SPA + * uses this to scope HMR `restart` events to the run *this* call + * actually started, so a passive tab whose own run was hot-swapped + * doesn't latch onto a sibling tab's restart broadcast. + */ + onSpawn?: (pid: number | null) => void, ): Promise { const res = await apiFetch("/api/train", { method: "POST", @@ -268,6 +281,11 @@ export async function streamTraining( body: JSON.stringify({ ...(file ? { file } : {}) }), signal, }); + if (onSpawn) { + const raw = res.headers.get("x-arkor-train-pid"); + const parsed = raw ? Number.parseInt(raw, 10) : NaN; + onSpawn(Number.isFinite(parsed) ? 
parsed : null); + } if (!res.body) return; const reader = res.body.getReader(); const decoder = new TextDecoder(); From e870a3b7cb354f062d3ceae94c6896f4c6a42b24 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 03:23:01 +0900 Subject: [PATCH 09/55] feat: enhance HMR support and JSON serialization in config hashing --- packages/arkor/src/core/configHash.test.ts | 60 ++++++++++++++ packages/arkor/src/core/configHash.ts | 45 ++++++++-- .../arkor/src/core/trainerInspection.test.ts | 83 ++++++++++++++++++- packages/arkor/src/core/trainerInspection.ts | 50 ++++++++--- packages/arkor/src/studio/server.test.ts | 39 +++++++++ packages/arkor/src/studio/server.ts | 34 ++++++-- .../studio-app/src/components/RunTraining.tsx | 23 +++-- packages/studio-app/src/lib/api.test.ts | 73 ++++++++++++++++ packages/studio-app/src/lib/api.ts | 17 ++++ 9 files changed, 395 insertions(+), 29 deletions(-) diff --git a/packages/arkor/src/core/configHash.test.ts b/packages/arkor/src/core/configHash.test.ts index def058ed..018c3aee 100644 --- a/packages/arkor/src/core/configHash.test.ts +++ b/packages/arkor/src/core/configHash.test.ts @@ -48,4 +48,64 @@ describe("hashJobConfig", () => { }; expect(hashJobConfig(a)).toBe(hashJobConfig(b)); }); + + it("treats `undefined` object properties identically to omitted ones (JSON parity)", () => { + // Regression: the previous `stableStringify` delegated to + // `JSON.stringify(undefined)` which returns `undefined` (not a + // string) — concatenated via template literal that became the + // substring `"undefined"` in the hash input. So `{ a: 1 }` and + // `{ a: 1, b: undefined }` produced different hashes even though + // they're indistinguishable on the wire (`JSON.stringify` drops + // `undefined` properties). 
+ const omitted: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + }; + const explicitlyUndefined: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + // `unknown`-typed forwarder fields can legitimately end up + // holding `undefined` if a caller spreads from a partial source. + warmupSteps: undefined, + datasetFormat: undefined, + }; + expect(hashJobConfig(omitted)).toBe(hashJobConfig(explicitlyUndefined)); + }); + + it("normalises `undefined` array slots to null (JSON parity)", () => { + // `JSON.stringify([undefined])` → `"[null]"`. The previous + // implementation produced the literal substring `"[undefined]"` + // instead, which is not even valid JSON. + const a: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + datasetFormat: ["a", undefined, "c"] as unknown, + }; + const b: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + datasetFormat: ["a", null, "c"] as unknown, + }; + expect(hashJobConfig(a)).toBe(hashJobConfig(b)); + }); + + it("ignores function / symbol properties (JSON parity)", () => { + // `JSON.stringify` drops these too. The hash should be insensitive + // to "transparent" callbacks accidentally landing in a forwarded + // config (the SDK separates `callbacks` out, but `unknown` fields + // could leak one). 
+ const fn = () => 0; + const sym = Symbol("foo"); + const a: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + }; + const b: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + warmupSteps: fn as unknown, + loggingSteps: sym as unknown, + }; + expect(hashJobConfig(a)).toBe(hashJobConfig(b)); + }); }); diff --git a/packages/arkor/src/core/configHash.ts b/packages/arkor/src/core/configHash.ts index eb8cda22..712eedb5 100644 --- a/packages/arkor/src/core/configHash.ts +++ b/packages/arkor/src/core/configHash.ts @@ -1,23 +1,54 @@ import { createHash } from "node:crypto"; import type { JobConfig } from "./types"; +/** + * Type-narrowing helper for "this value cannot be represented in JSON". + * Mirrors the cases JSON.stringify silently drops (when in object + * positions) or coerces to `null` (when in array positions): `undefined`, + * functions, and symbols. + */ +function isNonJsonRepresentable(v: unknown): boolean { + return v === undefined || typeof v === "function" || typeof v === "symbol"; +} + /** * Deterministic JSON serialiser: keys sorted at every nesting level so * `{a:1, b:2}` and `{b:2, a:1}` produce the same string. Necessary because * `JSON.stringify` follows insertion order, which isn't stable across * `buildJobConfig` revisions or user-side spread-merge tricks. + * + * Mirrors the JSON wire-format exactly for non-representable values + * (`undefined`, functions, symbols): omitted in object positions, + * serialised as `null` in array positions. The previous implementation + * delegated to `JSON.stringify` which returns the literal value + * `undefined` (not a string) for those — concatenated into the output + * via template literals it became the substring `"undefined"`, which + * is not valid JSON and would silently change the hash if a + * `JobConfig` field ever held one of those values (notably the + * `unknown`-typed forwarder fields). 
*/ function stableStringify(value: unknown): string { - if (value === null || typeof value !== "object") return JSON.stringify(value); + if (value === null) return "null"; + // Top-level non-representable: align with `JSON.stringify(undefined)` + // semantics by collapsing to "null" so the hash input stays valid + // JSON-shaped text rather than the literal substring "undefined". + if (isNonJsonRepresentable(value)) return "null"; + if (typeof value !== "object") return JSON.stringify(value); if (Array.isArray(value)) { - return `[${value.map(stableStringify).join(",")}]`; + // Array slots: non-representable → "null" (matches JSON spec). + const items = value.map((v) => + isNonJsonRepresentable(v) ? "null" : stableStringify(v), + ); + return `[${items.join(",")}]`; } - const keys = Object.keys(value as Record).sort(); + // Object slots: drop non-representable values entirely (matches + // `JSON.stringify({a: undefined}) === "{}"`). + const obj = value as Record; + const keys = Object.keys(obj) + .filter((k) => !isNonJsonRepresentable(obj[k])) + .sort(); const parts = keys.map( - (k) => - `${JSON.stringify(k)}:${stableStringify( - (value as Record)[k], - )}`, + (k) => `${JSON.stringify(k)}:${stableStringify(obj[k])}`, ); return `{${parts.join(",")}}`; } diff --git a/packages/arkor/src/core/trainerInspection.test.ts b/packages/arkor/src/core/trainerInspection.test.ts index df557c6f..8335ef39 100644 --- a/packages/arkor/src/core/trainerInspection.test.ts +++ b/packages/arkor/src/core/trainerInspection.test.ts @@ -1,11 +1,14 @@ -import { describe, it, expect } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import { createArkor } from "./arkor"; import { createTrainer } from "./trainer"; import { findInspectableTrainer, findTrainerInModule, getTrainerInspection, + replaceTrainerCallbacks, + requestTrainerEarlyStop, } from "./trainerInspection"; +import type { Trainer } from "./types"; function brandedTrainer(name: string) { // Real `createTrainer` 
attaches the inspection brand. We only need @@ -115,3 +118,81 @@ describe("findInspectableTrainer (brand-required path)", () => { expect(getTrainerInspection(trainer)).toBeNull(); }); }); + +describe("requestTrainerEarlyStop / replaceTrainerCallbacks brand-missing fallback", () => { + // Regression: previously these helpers asserted the brand was + // present and threw a synchronous TypeError on hand-rolled trainers. + // `runner.ts`'s `extractTrainer` accepts ANY `{start, wait, cancel}` + // shape — that's a documented public path for unbranded trainers — + // so the SIGTERM handler crashed instead of stopping the run. + + it("requestTrainerEarlyStop falls back to trainer.cancel() for unbranded trainers", async () => { + const cancelCalls = vi.fn(async () => {}); + const trainer = { + name: "manual", + start: async () => ({ jobId: "j" }), + wait: async () => ({ job: {}, artifacts: [] }), + cancel: cancelCalls, + } as unknown as Trainer; + + // Must not throw, must resolve, must have called cancel(). + await expect(requestTrainerEarlyStop(trainer)).resolves.toBeUndefined(); + expect(cancelCalls).toHaveBeenCalledTimes(1); + }); + + it("requestTrainerEarlyStop swallows a thrown cancel() so the SIGTERM handler can still settle", async () => { + // The runner's SIGTERM handler chains + // `requestTrainerEarlyStop(...).catch(...).finally(() => process.exit(0))`. + // If the brand-missing fallback let cancel()'s rejection bubble, + // the `.finally` would still fire, but the cancel error would + // surface as an unhandled rejection from the test runner. The + // documented contract for cancel() is best-effort, so swallow. 
+ const trainer = { + name: "manual", + start: async () => ({ jobId: "j" }), + wait: async () => ({ job: {}, artifacts: [] }), + cancel: vi.fn(async () => { + throw new Error("network down"); + }), + } as unknown as Trainer; + + await expect(requestTrainerEarlyStop(trainer)).resolves.toBeUndefined(); + }); + + it("requestTrainerEarlyStop is async-shaped: synchronous throws inside the brand call become rejections", async () => { + // Defense-in-depth: even when the brand IS attached but somehow + // throws synchronously (e.g. a future implementation regression), + // the SIGTERM handler's `.catch` arm should still see it instead + // of the throw escaping past `.finally` and taking the runner + // down. The function is `async`, which wraps any synchronous + // throw inside its body into a rejected promise. + const trainer = brandedTrainer("from-arkor"); + // Replace the brand with a function that throws synchronously. + const KEY = Symbol.for("arkor.trainer.requestEarlyStop"); + Object.defineProperty(trainer, KEY, { + value: () => { + throw new Error("brand impl exploded"); + }, + configurable: true, + }); + await expect(requestTrainerEarlyStop(trainer)).rejects.toThrow( + /brand impl exploded/, + ); + }); + + it("replaceTrainerCallbacks is a no-op (not a throw) for unbranded trainers", () => { + // The HMR pipeline never routes SIGUSR2 to unbranded trainers in + // practice (their `configHash` is null, which forces the + // SIGTERM-restart path), but if a future caller did, it must not + // crash the runner. 
+ const trainer = { + name: "manual", + start: async () => ({ jobId: "j" }), + wait: async () => ({ job: {}, artifacts: [] }), + cancel: async () => {}, + } as unknown as Trainer; + expect(() => + replaceTrainerCallbacks(trainer, { onLog: () => {} }), + ).not.toThrow(); + }); +}); diff --git a/packages/arkor/src/core/trainerInspection.ts b/packages/arkor/src/core/trainerInspection.ts index 9b6fc771..7229ff4d 100644 --- a/packages/arkor/src/core/trainerInspection.ts +++ b/packages/arkor/src/core/trainerInspection.ts @@ -114,10 +114,12 @@ export function attachTrainerCallbackReplacer( /** * Replace the trainer's lifecycle callbacks atomically. The brand is - * unconditionally attached by `createTrainer` in this same SDK package, - * so this can assume the brand is present — there's no documented - * public path that produces a brand-less trainer, and the helper itself - * is never called on user-controlled values. + * attached by `createTrainer`, but `runTrainer`'s `extractTrainer` + * also accepts hand-rolled trainers (any `{ start, wait, cancel }` + * shape) — those don't carry the brand. The HMR pipeline never + * routes SIGUSR2 to such trainers in practice (they always produce + * `configHash: null` upstream, which forces the SIGTERM-restart + * path), so this helper is a no-op for them rather than throwing. */ export function replaceTrainerCallbacks( trainer: Trainer, @@ -125,7 +127,8 @@ export function replaceTrainerCallbacks( ): void { const fn = (trainer as unknown as Record)[ TRAINER_REPLACE_CALLBACKS_KEY - ] as (cbs: Partial) => void; + ] as ((cbs: Partial) => void) | undefined; + if (typeof fn !== "function") return; fn.call(trainer, callbacks); } @@ -154,18 +157,43 @@ export function attachTrainerEarlyStopper( * Resolves once `cancel()` has been accepted by the cloud API, or * after `timeoutMs` if no checkpoint arrived in time. 
* - * The brand is unconditionally attached by `createTrainer` and the - * runner only ever calls this on a discovered SDK trainer — there's no - * branch for "brand missing". + * `createTrainer` attaches the brand unconditionally, but + * `runTrainer`'s `extractTrainer` also accepts hand-rolled trainers + * — any `{ start, wait, cancel }` shape — which legitimately don't + * carry the brand. Falling back to the public `Trainer.cancel()` for + * those is the closest semantic match available without the SDK's + * checkpoint-aware machinery; it's also what the runner's SIGTERM + * handler needs to keep working (the previous "throw if brand + * missing" behaviour caused a synchronous TypeError before the + * handler's `.catch().finally()` chain attached, so SIGTERM crashed + * the runner instead of stopping the run). */ -export function requestTrainerEarlyStop( +// async wrapper (rather than a bare function returning Promise) so +// any *synchronous* throw inside the brand call (or its arguments) +// becomes a rejected promise — the SIGTERM handler's `.catch()` then +// catches it instead of the throw escaping past the `.finally()` +// chain and taking the runner down. +export async function requestTrainerEarlyStop( trainer: Trainer, opts?: RequestEarlyStopOptions, ): Promise { const fn = (trainer as unknown as Record)[ TRAINER_REQUEST_EARLY_STOP_KEY - ] as (opts?: RequestEarlyStopOptions) => Promise; - return fn.call(trainer, opts); + ] as ((opts?: RequestEarlyStopOptions) => Promise) | undefined; + if (typeof fn !== "function") { + // Best-effort fallback for unbranded trainers: trainer.cancel() + // is part of the public Trainer interface, so it's always safe + // to call. Catch/swallow because the documented contract for + // cancel() is "best-effort" and the SIGTERM handler needs the + // returned promise to settle either way. + try { + await trainer.cancel(); + } catch { + // intentionally ignored — see comment above. 
+ } + return; + } + await fn.call(trainer, opts); } /** diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index 000f8764..fea72829 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -134,6 +134,45 @@ describe("Studio server", () => { expect(html.indexOf("arkor-studio-token")).toBeLessThan( html.indexOf(""), ); + // HMR meta tag must NOT appear when no coordinator was supplied. + // The SPA reads this flag to decide whether to open + // `/api/dev/events`; a stray "true" here would make every prod + // session retry against the 404 indefinitely. + expect(html).not.toContain("arkor-hmr-enabled"); + }); + + it("injects when an HMR coordinator is supplied", async () => { + // Regression: the SPA can't tell dev-mode usage from prod-mode + // usage at runtime — `vite build` ships with + // `import.meta.env.DEV === false`, so a build-time DEV gate inside + // the SPA bundle would (wrongly) suppress HMR even in real + // `arkor dev` sessions. The server-side flag is `true` exactly + // when `arkor dev` wired in an HMR coordinator. Verify it lands + // in `` next to the studio-token tag. 
+ const fakeHmr = { + subscribe: () => () => undefined, + getCurrentConfigHash: () => null, + async dispose() {}, + }; + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fakeHmr, + }); + const res = await app.request("/", { + headers: { host: "127.0.0.1:4000" }, + }); + expect(res.status).toBe(200); + const html = await res.text(); + expect(html).toContain( + ``, + ); + expect(html.indexOf("arkor-hmr-enabled")).toBeLessThan( + html.indexOf(""), + ); }); it("serves non-html assets with the correct content-type", async () => { diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 74ef3112..7bc6a08d 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -99,11 +99,31 @@ function htmlAttrEscape(s: string): string { ); } -function injectStudioToken(html: string, token: string): string { - const meta = ``; +/** + * Inject the per-launch studio token (always) and an optional HMR + * feature flag into ``. Both are read by the SPA via + * `` lookups — the token gates `/api/*` requests and + * the HMR flag tells `RunTraining` whether to open + * `/api/dev/events` (which only exists when `arkor dev` wired in an + * HMR coordinator). Without the server-side flag the SPA can't tell + * dev-mode usage from prod-mode usage at runtime: `vite build`'s + * output ships with `import.meta.env.DEV === false`, so any DEV gate + * baked into the bundle would suppress HMR even in real `arkor dev` + * sessions. + */ +function injectStudioMeta( + html: string, + token: string, + hmrEnabled: boolean, +): string { + const tokenTag = ``; + const hmrTag = hmrEnabled + ? 
`` + : ""; + const tags = `${tokenTag}${hmrTag}`; const idx = html.indexOf(""); - if (idx === -1) return `${meta}${html}`; - return `${html.slice(0, idx)}${meta}${html.slice(idx)}`; + if (idx === -1) return `${tags}${html}`; + return `${html.slice(0, idx)}${tags}${html.slice(idx)}`; } export function buildStudioApp(options: StudioServerOptions) { @@ -559,7 +579,11 @@ export function buildStudioApp(options: StudioServerOptions) { const file = await readFile(join(assetsDir, cleaned)); const ext = cleaned.slice(cleaned.lastIndexOf(".") + 1); if (ext === "html") { - const html = injectStudioToken(file.toString("utf8"), studioToken); + const html = injectStudioMeta( + file.toString("utf8"), + studioToken, + Boolean(options.hmr), + ); return new Response(html, { status: 200, headers: { "content-type": CONTENT_TYPES.html! }, diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index 69827133..bef28855 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -1,6 +1,7 @@ import { useEffect, useRef, useState } from "react"; import { fetchManifest, + isHmrEnabled, openDevEvents, streamTraining, type DevEvent, @@ -99,12 +100,16 @@ export function RunTraining() { // `restartTargets`; defer the auto re-invocation until the current // `streamTraining` resolves so we don't run two cloud jobs at once. // - // Gated by `import.meta.env.DEV`: `/api/dev/events` returns 404 when - // the server runs in production mode, and `EventSource` would then - // retry indefinitely creating constant background traffic. The dev - // build is the only place this endpoint exists. + // Gated by `isHmrEnabled()` (server-injected `` flag) rather + // than `import.meta.env.DEV`: the SPA is shipped via `vite build` + // and served by `arkor dev` as static assets, so DEV is `false` in + // every real session. 
The server-side flag is `true` exactly when + // `arkor dev` wired in an HMR coordinator — i.e. when + // `/api/dev/events` actually exists. Without this flag the + // EventSource would either be dead in real dev sessions (DEV gate) + // or retry forever against a 404 (no gate). useEffect(() => { - if (!import.meta.env.DEV) return; + if (!isHmrEnabled()) return; const es = openDevEvents(); const onMessage = (raw: MessageEvent) => { let payload: DevEvent; @@ -204,6 +209,14 @@ export function RunTraining() { ac.signal, (pid) => { currentPidRef.current = pid; + // Clear the "Restarting with updated code…" status as soon + // as the new run starts spawning. Without this the label + // stays pinned for the entire restarted run because + // `setHmrStatus("restarting")` is set in the *prior* run's + // `finally` and nothing else clears it. We only knock out + // "restarting" specifically — "early-stopping" / "hot- + // swapped" should land via their own state transitions. + setHmrStatus((s) => (s === "restarting" ? "idle" : s)); }, ); } catch (err) { diff --git a/packages/studio-app/src/lib/api.test.ts b/packages/studio-app/src/lib/api.test.ts index e7b49d0b..42fa026d 100644 --- a/packages/studio-app/src/lib/api.test.ts +++ b/packages/studio-app/src/lib/api.test.ts @@ -5,6 +5,7 @@ import { fetchJobs, fetchManifest, fetchMe, + isHmrEnabled, streamInferenceContent, streamTraining, } from "./api"; @@ -541,3 +542,75 @@ describe("streamInferenceContent abort", () => { await expect(consume).rejects.toMatchObject({ name: "AbortError" }); }); }); + +describe("isHmrEnabled", () => { + // Regression: a previous version of `RunTraining` gated its + // EventSource subscription on `import.meta.env.DEV`, which is + // baked to `false` by `vite build` and therefore *always* false + // in a real `arkor dev` session (the SPA is shipped as static + // assets). The new server-side `` + // tag is what tells the SPA whether HMR is actually wired in; + // these tests pin the contract. 
+ // + // The package's vitest config doesn't load jsdom (the rest of the + // suite runs in Node), so we stub the minimal `document` API + // `isHmrEnabled` uses — `querySelector('meta[name=...]')` — + // directly on `globalThis`. The reader's contract is just "look + // up a meta tag and return its content === 'true'", which a tiny + // hand-rolled stub covers without dragging the whole DOM in. + function withMetaContent(value: string | null, fn: () => void) { + const fakeDocument = { + querySelector: (selector: string) => { + if (selector !== 'meta[name="arkor-hmr-enabled"]') return null; + if (value === null) return null; + return { getAttribute: () => value }; + }, + }; + const had = "document" in globalThis; + const previous = (globalThis as { document?: unknown }).document; + (globalThis as { document?: unknown }).document = fakeDocument; + try { + fn(); + } finally { + if (had) (globalThis as { document?: unknown }).document = previous; + else delete (globalThis as { document?: unknown }).document; + } + } + + it("returns true when the server-injected meta says HMR is on", () => { + withMetaContent("true", () => { + expect(isHmrEnabled()).toBe(true); + }); + }); + + it("returns false when the meta tag is missing entirely", () => { + // No injection → SPA must NOT open `/api/dev/events` (which + // would 404 and EventSource-retry forever in a non-HMR build). + withMetaContent(null, () => { + expect(isHmrEnabled()).toBe(false); + }); + }); + + it("returns false for any meta content other than the literal `true`", () => { + // Defensive: don't fail open on a malformed/legacy server that + // injects an empty value or a placeholder. 
+ withMetaContent("", () => expect(isHmrEnabled()).toBe(false)); + withMetaContent("false", () => expect(isHmrEnabled()).toBe(false)); + withMetaContent("yes", () => expect(isHmrEnabled()).toBe(false)); + }); + + it("returns false when there is no document at all (Node SSR / module-load probe)", () => { + // The reader is called during component render; in any non-DOM + // host (test fixtures that import the module without jsdom, a + // hypothetical SSR pre-render) it must return false rather than + // throwing on `document` being undefined. + const had = "document" in globalThis; + const previous = (globalThis as { document?: unknown }).document; + delete (globalThis as { document?: unknown }).document; + try { + expect(isHmrEnabled()).toBe(false); + } finally { + if (had) (globalThis as { document?: unknown }).document = previous; + } + }); +}); diff --git a/packages/studio-app/src/lib/api.ts b/packages/studio-app/src/lib/api.ts index 7fabc80f..a1f67075 100644 --- a/packages/studio-app/src/lib/api.ts +++ b/packages/studio-app/src/lib/api.ts @@ -55,6 +55,23 @@ function readStudioToken(): string { const STUDIO_TOKEN = readStudioToken(); +/** + * Whether `arkor dev` wired in an HMR coordinator at server boot. + * The studio server emits `` + * into `index.html` only when `options.hmr` is set, so we can tell + * dev-mode usage from prod-mode usage at runtime — `vite build`'s + * output ships with `import.meta.env.DEV === false`, so a build-time + * gate inside the SPA bundle would (wrongly) suppress HMR even in + * real `arkor dev` sessions. `RunTraining` consults this flag before + * opening `/api/dev/events`; without it, the EventSource would retry + * forever against the 404 the server returns for non-HMR builds. 
+ */ +export function isHmrEnabled(): boolean { + if (typeof document === "undefined") return false; + const meta = document.querySelector('meta[name="arkor-hmr-enabled"]'); + return meta?.getAttribute("content") === "true"; +} + /** * `fetch` with the per-launch CSRF token attached. The token is read once at * module load from the `` tag the Studio server injects into From 0a459df3dfd3159d9868b940929d031a557195fb Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 05:41:42 +0900 Subject: [PATCH 10/55] feat: improve HMR handling and prevent unnecessary cloud job spawning on unmounted views --- packages/arkor/src/studio/hmr.test.ts | 55 +++++++++++++++++++ packages/arkor/src/studio/hmr.ts | 51 +++++++++++++++-- .../studio-app/src/components/RunTraining.tsx | 23 +++++++- 3 files changed, 122 insertions(+), 7 deletions(-) diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index b91a4762..b3f03f07 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -176,6 +176,61 @@ describe("createHmrCoordinator", () => { expect(events.length).toBe(countAfterDispose); }); + it("the cached lastEvent reflects the LATEST source under rapid back-to-back edits", async () => { + // Regression: the BUNDLE_END handler used to fire + // `emitBuildSucceeded` without awaiting, so two quick rebuilds + // could run `inspectBundle` concurrently and broadcast out of + // order — leaving `lastEvent` pointing at the older snapshot. + // We can't deterministically synthesise a race against rolldown's + // real watcher, but we *can* assert the user-visible invariant: + // after a sequence of edits, the cached state must match the + // last write. The new sequence-number guard inside + // `emitBuildSucceeded` drops stale inspection results so the + // final broadcast always wins. 
+ mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => events.push(e)); + try { + await nextEvent(events, (e) => e.type === "ready"); + // Two source edits in quick succession. Both must result in a + // broadcast eventually, and `lastEvent.hash` must end up + // matching the file content of the FINAL write — not the + // first one (which would prove the older inspection raced + // past the newer one's broadcast). + writeFileSync( + join(cwd, "src/arkor/index.ts"), + FAKE_MANIFEST.replace(`"alpha"`, `"beta"`), + ); + const v2 = await nextEvent( + events, + (e) => e.type === "rebuild", + 4000, + ); + writeFileSync( + join(cwd, "src/arkor/index.ts"), + FAKE_MANIFEST.replace(`"alpha"`, `"gamma"`), + ); + // Wait for any rebuild whose hash differs from v2's. Without + // the seq guard the older inspection could clobber the cached + // state with v2 again, so this would time out. + const v3 = await nextEvent( + events, + (e) => e.type === "rebuild" && e.hash !== v2.hash, + 4000, + ); + // Settle: give any in-flight inspection time to land so we can + // assert the final cached state really is v3, not a late v2 + // overwrite. + await new Promise((r) => setTimeout(r, 250)); + expect(events[events.length - 1]?.hash).toBe(v3.hash); + } finally { + await hmr.dispose(); + } + }); + it("getCurrentConfigHash() returns the latest cached event's hash", async () => { // Regression: `/api/train` previously called `readManifestSummary` // and ran a redundant rebuild per spawn (racing the watcher). 
diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 00b10aaa..852e6494 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -125,6 +125,34 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { * doesn't reconnect on application-level errors. */ let entryWaitTimer: ReturnType | null = null; + /** + * Monotonically incrementing build sequence number. Bumped on every + * `BUNDLE_END` *before* the inspection awaits, so when an + * inspection eventually resolves it can check whether a newer + * build has started in the meantime and silently drop its stale + * result. + * + * This matters because `inspectBundle` does an asynchronous + * dynamic-import of the just-written artifact. Two rebuilds A → B + * landing within the import window can race, with A's inspection + * resolving *after* B's — the previous "fire-and-forget" code + * would then publish A on top of B and leave `lastEvent` pointing + * at the older `configHash`/`trainerName`. That in turn drove + * `/api/dev/events` to make hot-swap-vs-restart decisions against + * stale routing data and surfaced the wrong trainer name in the + * SPA. + */ + let buildSeq = 0; + /** + * Whether a `ready` event has actually broadcast yet. Tracked + * separately from `firstBuild` because the inspection await means + * the first BUNDLE_END's broadcast can land *after* a second + * BUNDLE_END schedules its own — pinning the type to + * "broadcast-time" rather than "schedule-time" guarantees the SPA + * still sees `ready` first even when the initial inspection loses + * the race. 
+ */ + let firstBroadcast = true; function broadcast(event: HmrEvent): void { lastEvent = event; @@ -139,11 +167,20 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { } } - async function emitBuildSucceeded(eventType: HmrEventType): Promise<void> { + async function emitBuildSucceeded(): Promise<void> { if (disposed) return; + const seq = ++buildSeq; const inspection = await inspectBundle(resolved.outFile); + // Drop stale results: a newer rebuild already started (or + // finished) while our inspection was running. The newer + // inspection will own the broadcast for the latest state; this + // one publishing now would just clobber `lastEvent` with the + // older snapshot. + if (seq !== buildSeq || disposed) return; + const type: HmrEventType = firstBroadcast ? "ready" : "rebuild"; + firstBroadcast = false; broadcast({ - type: eventType, + type, outFile: resolved.outFile, hash: fingerprint(resolved.outFile), configHash: inspection?.configHash ?? null, @@ -190,14 +227,16 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { ...rolldownInputOptions(resolved), output: { file: resolved.outFile, format: "esm" }, }); - let firstBuild = true; watcher.on("event", (event) => { if (event.code === "BUNDLE_END") { // rolldown requires the per-build result to be closed to avoid leaks. event.result.close().catch(() => {}); - const type: HmrEventType = firstBuild ? "ready" : "rebuild"; - firstBuild = false; - void emitBuildSucceeded(type); + // The event type ("ready" vs "rebuild") is decided inside + // `emitBuildSucceeded` *after* the inspection await, based on + // whether any prior broadcast actually landed — see the + // `firstBroadcast` comment for why pinning the type at this + // schedule point would be wrong under inspection races.
+ void emitBuildSucceeded(); } else if (event.code === "ERROR") { event.result.close().catch(() => {}); broadcast({ diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index bef28855..390828b9 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -55,9 +55,22 @@ export function RunTraining() { // `Timeout` object — explicit `number` so TS doesn't pick up the // Node typing from the global `setTimeout`. const hotSwapTimerRef = useRef<number | null>(null); + // Tracks "is this React tree still mounted?". The HMR auto-restart + // path schedules `queueMicrotask(() => run(...))` after the prior + // run's `finally` — without this gate, navigating away during the + // tiny window between scheduling and the microtask running would + // fire a fresh `/api/train` POST from an unmounted view, spawning + // an invisible cloud job the user can't see or stop. + const isMountedRef = useRef(true); useEffect(() => { return () => { + isMountedRef.current = false; + // Defense in depth: clearing the latch here means even if a + // microtask snuck past the `isMountedRef` check (concurrent + // edits to React's effect ordering, future refactors), it + // still finds nothing pending. + restartPendingRef.current = false; trainingAbortRef.current?.abort(); if (hotSwapTimerRef.current !== null) { clearTimeout(hotSwapTimerRef.current); @@ -244,8 +257,16 @@ export function RunTraining() { // `running=false` state first (otherwise the re-entry overlaps).
restartPendingRef.current = false; setHmrStatus("restarting"); + const fileForRestart = lastTrainFileRef.current; queueMicrotask(() => { - void run(lastTrainFileRef.current); + // Don't auto-spawn a fresh /api/train request from an + // unmounted view — the user navigated away in the small + // window between scheduling and running this microtask, so + // their intent was "stop interacting with this view", not + // "kick off another cloud job invisibly". The unmount + // cleanup also clears `restartPendingRef` defensively. + if (!isMountedRef.current) return; + void run(fileForRestart); }); } else { // User-initiated abort takes precedence over a pending HMR From a9adf14781146bae2aa7eb6ba4dabab56364cae4 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 06:12:41 +0900 Subject: [PATCH 11/55] feat: enhance stream handling in server and improve inspection logic in HMR --- packages/arkor/src/studio/hmr.ts | 50 +++++++++++++++-- packages/arkor/src/studio/server.test.ts | 70 ++++++++++++++++++++++++ packages/arkor/src/studio/server.ts | 61 ++++++++++++++++++--- 3 files changed, 168 insertions(+), 13 deletions(-) diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 852e6494..dca03a8d 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -63,10 +63,14 @@ function fingerprint(outFile: string): string { } } +type InspectionResult = { + configHash: string; + trainerName: string; +} | null; + /** * Dynamic-import the freshly-built bundle and pull a `TrainerInspection` - * snapshot off the discovered trainer. Cache-bust the URL so Node's ESM - * loader returns the new module text rather than a stale evaluation. + * snapshot off the discovered trainer. 
* * Walks every entry shape `runner.ts` accepts (named `arkor`, named * `trainer`, `default` Arkor manifest, `default.trainer`) via the @@ -76,15 +80,40 @@ function fingerprint(outFile: string): string { * null` and the SPA would unnecessarily SIGTERM-restart on every * rebuild. * + * Cache-bust by file mtime+size rather than `Date.now()`: + * + * - Node's ESM loader caches every dynamically-imported URL for the + * lifetime of the process and never evicts. A `?t=Date.now()` + * suffix produces a unique URL per call, so a long `arkor dev` + * session would accumulate one module record per BUNDLE_END — + * unbounded memory growth. + * - Mtime+size keys the cache to "the actual bytes in this file", + * so spurious watcher events that don't change content reuse the + * prior module record. The leak shrinks from "one entry per + * keystroke" to "one entry per actual rebuild", which for a + * realistic dev session (hundreds of saves over hours) is bounded + * by the number of distinct file states the user produces — and + * that's fundamentally what HMR has to track to surface up-to- + * date trainer state. There's no public Node API for evicting an + * ESM module record, so this is the tightest bound we can offer + * without spawning a child process per inspection. + * * Best-effort: a missing/malformed manifest or a thrown user * constructor returns `null` and the caller treats the rebuild as * "config-unknown". */ -async function inspectBundle( - outFile: string, -): Promise<{ configHash: string; trainerName: string } | null> { +async function inspectBundle(outFile: string): Promise { try { - const url = `${pathToFileURL(outFile).href}?t=${Date.now()}`; + let key = "0-0"; + try { + const s = statSync(outFile); + key = `${s.mtimeMs.toFixed(0)}-${s.size}`; + } catch { + // outFile vanished between the BUNDLE_END and our stat — + // fall through to the import attempt; it'll throw and we'll + // return null cleanly. 
+ } + const url = `${pathToFileURL(outFile).href}?t=${key}`; const mod = (await import(url)) as Record; const inspection = findInspectableTrainer(mod); if (!inspection) return null; @@ -239,6 +268,15 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { void emitBuildSucceeded(); } else if (event.code === "ERROR") { event.result.close().catch(() => {}); + // Bump the seq so a still-in-flight `emitBuildSucceeded` + // from a *prior* BUNDLE_END drops its broadcast when its + // inspection finally resolves. Without this, the older + // success would land on top of this error and clobber + // `lastEvent`/`configHash`, leaving the SPA showing a + // healthy rebuild while the actual latest build state is + // a compile error. The successful-rebuild path bumps the + // same counter inside `emitBuildSucceeded`. + buildSeq += 1; broadcast({ type: "error", message: diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index fea72829..4c5b8d78 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -682,6 +682,76 @@ process.exit(0); } await expect(reader.cancel()).resolves.toBeUndefined(); }); + + it("/api/train survives cancellation while the child is still streaming output", async () => { + // Regression: the previous implementation registered raw + // `controller.enqueue(...)` listeners on `child.stdout` / + // `child.stderr` and an unguarded `controller.close()` in + // `child.on("close")`. After the client cancelled the + // ReadableStream, those handlers kept firing — and calling + // `enqueue` / `close` on a closed controller throws "Invalid + // state". The throw escaped the request pipeline as an + // unhandled exception. The fix tracks a `closed` flag, removes + // the child listeners on cancel, and try/catches the post- + // cancel enqueue paths defensively. 
+ await writeCredentials(ANON_CREDS); + const fakeBin = join(trainCwd, "fake-bin.mjs"); + // Bin spits a chunk every ~5 ms forever. We cancel while it's + // mid-stream so the child is *still alive* when listeners are + // removed — the previous bug only surfaced in this window. + writeFileSync( + fakeBin, + `setInterval(() => process.stdout.write("tick\\n"), 5);\nsetInterval(() => {}, 60_000);\n`, + ); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: fakeBin, + }); + const res = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(res.status).toBe(200); + const reader = res.body!.getReader(); + // Read at least one chunk so the child is definitely streaming + // before we cancel — that's the race window the previous code + // crashed in. + const decoder = new TextDecoder(); + let received = ""; + while (!received.includes("tick")) { + const { value, done } = await reader.read(); + if (done) break; + received += decoder.decode(value, { stream: true }); + } + // Listen for unhandled rejections / uncaught exceptions during + // and shortly after the cancel — before the fix, the child's + // next `data` chunk would synchronously throw inside the + // enqueue callback. + const errors: unknown[] = []; + const onUnhandled = (err: unknown) => errors.push(err); + process.on("uncaughtException", onUnhandled); + process.on("unhandledRejection", onUnhandled); + try { + await reader.cancel(); + // Give the child's interval a few iterations to attempt + // post-cancel writes. The handler must short-circuit on the + // `closed` flag and not crash the worker. 
+ await new Promise((r) => setTimeout(r, 50)); + } finally { + process.off("uncaughtException", onUnhandled); + process.off("unhandledRejection", onUnhandled); + } + expect(errors).toEqual([]); + }); }); describe("auto-anonymous bootstrap", () => { diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 7bc6a08d..4472eabd 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -374,19 +374,66 @@ export function buildStudioApp(options: StudioServerOptions) { cwd: trainCwd, }); activeTrains.register(child, { trainFile, configHash }); - const stream = new ReadableStream({ + // Hoisted out of the `ReadableStream` underlying-source so the + // `start` handler can hand its closure-bound teardown helper to + // the `cancel` handler. `cancel` runs in a separate invocation, + // not through `controller`, so the two need a parent-scope + // rendez-vous variable. + let cancelTeardown: (() => void) | null = null; + const stream = new ReadableStream({ start(controller) { + // After `cancel()` runs, calling `controller.enqueue` / + // `controller.close` on the now-closed controller throws + // ("Invalid state: Controller is closed"). The child + // subprocess keeps emitting `data` and ultimately a `close` + // event for some time after the client disconnects, so each + // forwarder needs its own "are we still attached?" guard. + // Track via a flag plus an explicit listener-removal so the + // event loop also stops dispatching once we've torn down. + let closed = false; + // `child.stdout` is in default (binary) mode, so each `data` + // chunk is a Buffer — and `Buffer extends Uint8Array`, so we + // can pass it straight to `controller.enqueue` without a + // round-trip through `TextEncoder`. The previous code did + // `enc.encode(d)` which implicitly coerced the buffer via + // `String()` — same byte content, but allocates a new array. 
+ const onChunk = (d: Buffer): void => { + if (closed) return; + try { + controller.enqueue(d); + } catch { + // Controller raced us into the closed state — flip the + // flag so subsequent chunks short-circuit. + closed = true; + } + }; const enc = new TextEncoder(); - child.stdout.on("data", (d) => controller.enqueue(enc.encode(d))); - child.stderr.on("data", (d) => controller.enqueue(enc.encode(d))); - child.on("close", (code) => { + const onClose = (code: number | null): void => { activeTrains.unregister(child.pid); - controller.enqueue(enc.encode(`\n---\nexit=${code}\n`)); - controller.close(); - }); + child.stdout.off("data", onChunk); + child.stderr.off("data", onChunk); + if (closed) return; + closed = true; + try { + controller.enqueue(enc.encode(`\n---\nexit=${code}\n`)); + controller.close(); + } catch { + // already cancelled; nothing more to do. + } + }; + child.stdout.on("data", onChunk); + child.stderr.on("data", onChunk); + child.on("close", onClose); + cancelTeardown = () => { + closed = true; + child.stdout.off("data", onChunk); + child.stderr.off("data", onChunk); + child.off("close", onClose); + }; }, cancel() { activeTrains.unregister(child.pid); + cancelTeardown?.(); // `ChildProcess.kill()` can throw (ESRCH if the process has // already exited between this handler's invocation and the // signal delivery). 
A throw here would surface as an unhandled From b242e2ad705440776edacb884ff287c53faf8f3d Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 06:50:06 +0900 Subject: [PATCH 12/55] feat: add waitForStableEvents function to improve event handling stability in HMR tests --- packages/arkor/src/studio/hmr.test.ts | 71 +++++++++++++++++---------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index b3f03f07..3ac2ded1 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -3,6 +3,7 @@ import { mkdirSync, mkdtempSync, rmSync, + statSync, writeFileSync, } from "node:fs"; import { tmpdir } from "node:os"; @@ -48,6 +49,31 @@ function nextEvent( }); } +/** + * Resolve once `events.length` has gone `quietWindowMs` without + * growing. Used to wait out spurious watcher events on noisier file + * systems (Windows polling / macOS FSEvents coalescing) before + * asserting the cached state. + */ +function waitForStableEvents( + events: HmrEvent[], + quietWindowMs: number, +): Promise<void> { + return new Promise((resolve) => { + let lastLength = events.length; + let stableSince = Date.now(); + const tick = () => { + if (events.length !== lastLength) { + lastLength = events.length; + stableSince = Date.now(); + } + if (Date.now() - stableSince >= quietWindowMs) return resolve(); + setTimeout(tick, 50); + }; + tick(); + }); +} + describe("createHmrCoordinator", () => { it("emits a `ready` event after the first successful build", async () => { mkdirSync(join(cwd, "src/arkor"), { recursive: true }); @@ -184,9 +210,9 @@ describe("createHmrCoordinator", () => { // We can't deterministically synthesise a race against rolldown's // real watcher, but we *can* assert the user-visible invariant: // after a sequence of edits, the cached state must match the - // last write.
The new sequence-number guard inside - // `emitBuildSucceeded` drops stale inspection results so the - // final broadcast always wins. + // bytes that are actually on disk. The new sequence-number guard + // inside `emitBuildSucceeded` drops stale inspection results so + // whichever BUNDLE_END landed last broadcasts last. mkdirSync(join(cwd, "src/arkor"), { recursive: true }); writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); @@ -195,37 +221,30 @@ describe("createHmrCoordinator", () => { hmr.subscribe((e) => events.push(e)); try { await nextEvent(events, (e) => e.type === "ready"); - // Two source edits in quick succession. Both must result in a - // broadcast eventually, and `lastEvent.hash` must end up - // matching the file content of the FINAL write — not the - // first one (which would prove the older inspection raced - // past the newer one's broadcast). writeFileSync( join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST.replace(`"alpha"`, `"beta"`), ); - const v2 = await nextEvent( - events, - (e) => e.type === "rebuild", - 4000, - ); + await nextEvent(events, (e) => e.type === "rebuild", 4000); writeFileSync( join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST.replace(`"alpha"`, `"gamma"`), ); - // Wait for any rebuild whose hash differs from v2's. Without - // the seq guard the older inspection could clobber the cached - // state with v2 again, so this would time out. - const v3 = await nextEvent( - events, - (e) => e.type === "rebuild" && e.hash !== v2.hash, - 4000, - ); - // Settle: give any in-flight inspection time to land so we can - // assert the final cached state really is v3, not a late v2 - // overwrite. - await new Promise((r) => setTimeout(r, 250)); - expect(events[events.length - 1]?.hash).toBe(v3.hash); + // Wait for the watcher to settle — any rebuild that's going to + // fire (including spurious extras from FSEvents on macOS or + // chokidar polling on Windows) lands within this window. 
The + // assertion then compares the cached `lastEvent.hash` against + // the *actual* fingerprint of the on-disk artefact, not a + // captured "last expected" hash from earlier in the test — + // that earlier capture was brittle on Windows where rolldown + // routinely emits a 4th BUNDLE_END after the explicit edits + // settle, producing a slightly different output byte (a + // change in the bundled comment header is enough to bump + // mtime + size). + await waitForStableEvents(events, 750); + const stat = statSync(join(cwd, ".arkor/build/index.mjs")); + const expectedHash = `${stat.mtimeMs.toFixed(0)}-${stat.size}`; + expect(events[events.length - 1]?.hash).toBe(expectedHash); } finally { await hmr.dispose(); } From 33ac9d03c95360674dbfc9e1f662254e4cbf9a6e Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 07:50:46 +0900 Subject: [PATCH 13/55] feat: implement module cache busting functions and tests for improved HMR stability --- .../arkor/src/core/moduleCacheBust.test.ts | 63 +++++++++++++++++++ packages/arkor/src/core/moduleCacheBust.ts | 42 +++++++++++++ packages/arkor/src/core/runnerSignals.ts | 10 ++- packages/arkor/src/core/trainer.test.ts | 10 +++ packages/arkor/src/core/trainer.ts | 16 +++++ packages/arkor/src/studio/hmr.ts | 17 ++--- packages/arkor/src/studio/manifest.ts | 14 ++++- .../studio-app/src/components/RunTraining.tsx | 12 +++- 8 files changed, 166 insertions(+), 18 deletions(-) create mode 100644 packages/arkor/src/core/moduleCacheBust.test.ts create mode 100644 packages/arkor/src/core/moduleCacheBust.ts diff --git a/packages/arkor/src/core/moduleCacheBust.test.ts b/packages/arkor/src/core/moduleCacheBust.test.ts new file mode 100644 index 00000000..66255833 --- /dev/null +++ b/packages/arkor/src/core/moduleCacheBust.test.ts @@ -0,0 +1,63 @@ +import { describe, it, expect, beforeEach, afterEach } from "vitest"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { 
tmpdir } from "node:os"; +import { join } from "node:path"; +import { pathToFileURL } from "node:url"; +import { + moduleCacheBustKey, + moduleCacheBustUrl, +} from "./moduleCacheBust"; + +let dir: string; + +beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), "arkor-cachebust-test-")); +}); + +afterEach(() => { + rmSync(dir, { recursive: true, force: true }); +}); + +describe("moduleCacheBustKey", () => { + it("is stable across calls when the file hasn't changed", () => { + // Regression: Node's ESM loader never evicts module records, and + // a `Date.now()` cache-bust would produce a fresh URL on every + // call → unbounded leak across long `arkor dev` sessions + // (5 s `/api/manifest` polls + every save firing SIGUSR2). + // mtime+size keying must collapse repeat reads of unchanged + // bytes onto the same key so the loader serves from cache. + const file = join(dir, "stable.mjs"); + writeFileSync(file, "export const v = 1;"); + const k1 = moduleCacheBustKey(file); + const k2 = moduleCacheBustKey(file); + expect(k1).toBe(k2); + expect(k1).toMatch(/^\d+-\d+$/); + }); + + it("changes when the file content changes (different size)", () => { + const file = join(dir, "growing.mjs"); + writeFileSync(file, "v1"); + const before = moduleCacheBustKey(file); + writeFileSync(file, "version-two"); + const after = moduleCacheBustKey(file); + expect(after).not.toBe(before); + }); + + it("returns a stable fallback (\"0-0\") for missing files instead of throwing", () => { + // The eventual `await import(url)` will throw on a missing + // file; the helper itself should produce a value rather than + // bubbling the stat error and turning every consumer into a + // try/catch site. 
+ expect(moduleCacheBustKey(join(dir, "does-not-exist.mjs"))).toBe("0-0"); + }); +}); + +describe("moduleCacheBustUrl", () => { + it("returns a fully-qualified file URL with the cache-bust query attached", () => { + const file = join(dir, "u.mjs"); + writeFileSync(file, "export const x = 1;"); + const url = moduleCacheBustUrl(file); + expect(url.startsWith(pathToFileURL(file).href + "?t=")).toBe(true); + expect(url).toMatch(/\?t=\d+-\d+$/); + }); +}); diff --git a/packages/arkor/src/core/moduleCacheBust.ts b/packages/arkor/src/core/moduleCacheBust.ts new file mode 100644 index 00000000..21abea8b --- /dev/null +++ b/packages/arkor/src/core/moduleCacheBust.ts @@ -0,0 +1,42 @@ +import { statSync } from "node:fs"; +import { pathToFileURL } from "node:url"; + +/** + * Build a content-derived cache-bust query for `await import(url + "?t=" + key)`. + * + * Why this matters: Node's ESM loader caches every dynamically-imported + * URL for the lifetime of the process and exposes no API to evict a + * record. A naive `?t=Date.now()` cache-bust produces a fresh URL on + * every call, so a long-running `arkor dev` session — where the SPA + * polls `/api/manifest` every few seconds and every save fires + * `BUNDLE_END` + SIGUSR2 — accumulates one module record per call, + * unbounded. + * + * Keying on `mtime + size` collapses repeated reads of the same bytes + * onto the same URL, which Node's loader then serves from its existing + * cache record. The leak shrinks from "one entry per call" to "one + * entry per actual file change", which is the tightest bound we can + * offer without spawning a child process per import. + * + * Falls back to a stable literal on stat failure so the eventual + * `import()` (which will throw on a missing file) gets to surface its + * own clean error rather than us inventing a noisy timestamp here. 
+ */ +export function moduleCacheBustKey(filePath: string): string { + try { + const s = statSync(filePath); + return `${s.mtimeMs.toFixed(0)}-${s.size}`; + } catch { + return "0-0"; + } +} + +/** + * Convenience: full file URL with the cache-bust key already + * appended. The `as const`-style template is small enough to inline + * but doing it in one place keeps the URL shape uniform across the + * three callers (`hmr.ts`, `manifest.ts`, `runnerSignals.ts`). + */ +export function moduleCacheBustUrl(filePath: string): string { + return `${pathToFileURL(filePath).href}?t=${moduleCacheBustKey(filePath)}`; +} diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index 0638ddfa..8bd53173 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -1,4 +1,4 @@ -import { pathToFileURL } from "node:url"; +import { moduleCacheBustUrl } from "./moduleCacheBust"; import { findInspectableTrainer, replaceTrainerCallbacks, @@ -74,7 +74,13 @@ export function installCallbackReloadHandler( entryPath: string, ): () => void { const handler = (): void => { - const url = `${pathToFileURL(entryPath).href}?t=${Date.now()}`; + // mtime+size cache-bust (vs `Date.now()`): Node's ESM loader + // never evicts module records, so a long `arkor start` session + // with frequent SIGUSR2 reloads would accumulate one record per + // signal forever. Keying on the actual artefact bytes collapses + // no-op signals onto the same URL — the leak is bounded to "one + // per real edit", which is fundamentally what HMR has to retain. 
+ const url = moduleCacheBustUrl(entryPath); void (async () => { try { const mod = (await import(url)) as Record; diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 0b9fcb24..d43ce904 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1600,6 +1600,16 @@ describe("createTrainer (early stop)", () => { // Tiny timeout so the test doesn't actually wait 5 minutes. await requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); expect(cancelCalls).toBe(1); + // Regression: the timeout fallback used to leave + // `earlyStopRequested = true` and `startedJob.status = + // "running"`. A subsequent `requestEarlyStop()` call would + // then re-arm a fresh timer and re-issue cancel even though + // the early-stop already fired. With the latch reset and + // local terminal-status update mirroring the + // checkpoint-triggered branch, the second call hits the + // TERMINAL_STATUSES short-circuit and is a true no-op. + await requestTrainerEarlyStop(trainer, { timeoutMs: 5 }); + expect(cancelCalls).toBe(1); } finally { globalThis.fetch = original; } diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index add5c8f0..e4906289 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -496,6 +496,22 @@ export function createTrainer( .cancel() .catch(() => {}) .finally(() => { + // Mirror the checkpoint-triggered early-stop branch: reset + // the latch and reflect the cancellation locally so a + // second `requestEarlyStop()` call is a no-op (instead of + // re-arming a fresh timer + re-issuing cancel) and so + // `wait()`'s eventual resolution exposes a terminal status. + // Without this, a long-lived trainer left in + // `earlyStopRequested = true` would re-cancel on every + // future checkpoint event for the rest of its lifetime. 
+ earlyStopRequested = false; + if (startedJob && !TERMINAL_STATUSES.has(startedJob.status)) { + startedJob = { + ...startedJob, + status: "cancelled", + completedAt: new Date().toISOString(), + }; + } if (active) active.resolve(); if (earlyStopDeferred === active) earlyStopDeferred = null; }); diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index dca03a8d..969efe0b 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -1,7 +1,7 @@ import { existsSync, statSync } from "node:fs"; -import { pathToFileURL } from "node:url"; import { watch, type RolldownWatcher } from "rolldown"; import { hashJobConfig } from "../core/configHash"; +import { moduleCacheBustUrl } from "../core/moduleCacheBust"; import { BUILD_DEFAULTS, resolveBuildEntry, @@ -104,17 +104,10 @@ type InspectionResult = { */ async function inspectBundle(outFile: string): Promise { try { - let key = "0-0"; - try { - const s = statSync(outFile); - key = `${s.mtimeMs.toFixed(0)}-${s.size}`; - } catch { - // outFile vanished between the BUNDLE_END and our stat — - // fall through to the import attempt; it'll throw and we'll - // return null cleanly. 
- } - const url = `${pathToFileURL(outFile).href}?t=${key}`; - const mod = (await import(url)) as Record; + const mod = (await import(moduleCacheBustUrl(outFile))) as Record< + string, + unknown + >; const inspection = findInspectableTrainer(mod); if (!inspection) return null; return { diff --git a/packages/arkor/src/studio/manifest.ts b/packages/arkor/src/studio/manifest.ts index 3699f8b4..51cf51de 100644 --- a/packages/arkor/src/studio/manifest.ts +++ b/packages/arkor/src/studio/manifest.ts @@ -1,6 +1,6 @@ -import { pathToFileURL } from "node:url"; import { runBuild } from "../cli/commands/build"; import { hashJobConfig } from "../core/configHash"; +import { moduleCacheBustUrl } from "../core/moduleCacheBust"; import { findTrainerInModule, getTrainerInspection, @@ -40,8 +40,16 @@ const EMPTY: ManifestSummary = { trainer: null, configHash: null }; export async function summariseBuiltManifest( outFile: string, ): Promise { - const url = `${pathToFileURL(outFile).href}?t=${Date.now()}`; - const mod = (await import(url)) as Record; + // mtime+size cache-bust (vs `Date.now()`): the SPA polls + // `/api/manifest` every ~5 s, so a `Date.now()` suffix would + // accumulate one ESM module record per poll across a long + // `arkor dev` session — Node's loader has no eviction. Keying on + // the artefact bytes collapses unchanged-poll reads onto the + // existing record. 
+ const mod = (await import(moduleCacheBustUrl(outFile))) as Record< + string, + unknown + >; // Walk every trainer export shape `runner.ts` accepts via the // shared helper (named `arkor`, named `trainer`, default Arkor // manifest, `default.trainer`) so manifest summary, HMR routing, diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index 390828b9..8f2489cc 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -124,7 +124,17 @@ export function RunTraining() { useEffect(() => { if (!isHmrEnabled()) return; const es = openDevEvents(); - const onMessage = (raw: MessageEvent) => { + // Typed as `Event` (not `MessageEvent`) because the same handler + // is registered for the `error` event, which EventSource fires + // as a plain `Event` on connection failures (server crashed, + // browser dropped the SSE) — those carry no `.data`. Custom + // server-sent events (`event: ready` / `event: rebuild` / the + // SSE `event: error` frame the HMR server emits) all arrive as + // `MessageEvent` instances, so we narrow before reading + // `.data`. EventSource will auto-retry connection failures, so + // there's nothing to do for them other than not crash. 
+ const onMessage = (raw: Event) => { + if (!(raw instanceof MessageEvent)) return; let payload: DevEvent; try { payload = JSON.parse(raw.data) as DevEvent; From c8f5b5b6487d87e7739d9a730f6d123dc66dac06 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 08:19:31 +0900 Subject: [PATCH 14/55] feat: preserve last successful config hash across ERROR events in HMR --- packages/arkor/src/studio/hmr.test.ts | 40 +++++++++++++++++++++++++++ packages/arkor/src/studio/hmr.ts | 39 ++++++++++++++++++++++---- 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index 3ac2ded1..02ec82ac 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -282,4 +282,44 @@ describe("createHmrCoordinator", () => { await hmr.dispose(); } }); + + it("getCurrentConfigHash() preserves the last-success hash across an ERROR event", async () => { + // Regression: previously `getCurrentConfigHash()` returned + // `lastEvent?.configHash ?? null`. After an ERROR landed, + // `lastEvent` was the error event (no `configHash`) so the + // getter went null — even though `.arkor/build/index.mjs` still + // held the previous *successful* bundle bytes (ERROR doesn't + // overwrite the output). A child spawned via `/api/train` in + // that window would register `configHash: null`, and the next + // successful BUNDLE_END would diff against null → SIGTERM + // restart instead of SIGUSR2 hot-swap, defeating callback + // hot-swap for the rest of the session. The fix tracks the + // last *successful* hash separately from `lastEvent`. 
+ mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => events.push(e)); + try { + const ready = await nextEvent(events, (e) => e.type === "ready"); + const successHash = hmr.getCurrentConfigHash(); + // Sanity: ready event's configHash matches the getter. + expect(successHash).toBe(ready.configHash ?? null); + // Inject a syntax error to force a watcher ERROR event. + writeFileSync( + join(cwd, "src/arkor/index.ts"), + "this is not { valid javascript = ;", + ); + await nextEvent(events, (e) => e.type === "error", 4000); + // After the error, the cached `lastEvent` is the error frame + // — but the on-disk artifact still holds the previous + // success. The getter must return that previous-success hash + // so any `/api/train` spawn during this window still gets a + // useful spawn-time hash for the *next* rebuild's routing. + expect(hmr.getCurrentConfigHash()).toBe(successHash); + } finally { + await hmr.dispose(); + } + }); }); diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 969efe0b..6a873104 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -175,6 +175,21 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { * the race. */ let firstBroadcast = true; + /** + * Cached `configHash` of the last *successful* build, **independent + * of `lastEvent`**. `lastEvent` tracks every broadcast (including + * `error`) for the cached-replay-on-late-subscribe contract, but a + * transient build error must not blank out the spawn-time hash that + * `/api/train` reads via `getCurrentConfigHash()`. 
The on-disk + * `.arkor/build/index.mjs` doesn't change on ERROR, so a child + * spawned during an error state is running the *previous* successful + * bundle — and the next BUNDLE_END's hash should be compared + * against THAT. Without this separate cache, the whole rebuild gets + * routed through SIGTERM-restart and SIGUSR2 hot-swap stops working + * for the rest of the session whenever the user briefly broke their + * source. + */ + let lastSuccessConfigHash: string | null = null; function broadcast(event: HmrEvent): void { lastEvent = event; @@ -201,11 +216,18 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { if (seq !== buildSeq || disposed) return; const type: HmrEventType = firstBroadcast ? "ready" : "rebuild"; firstBroadcast = false; + const configHash = inspection?.configHash ?? null; + // BUNDLE_END always reflects what's now on disk — even when the + // bundle is unbranded (`configHash === null`), that's the + // current truth. Capture it so `/api/train` spawning during a + // *subsequent* transient error still has the right spawn-time + // hash to compare against the next successful rebuild. + lastSuccessConfigHash = configHash; broadcast({ type, outFile: resolved.outFile, hash: fingerprint(resolved.outFile), - configHash: inspection?.configHash ?? null, + configHash, trainerName: inspection?.trainerName ?? null, }); } @@ -291,10 +313,17 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { }; }, getCurrentConfigHash() { - // `lastEvent` is `null` until the first BUNDLE_END (or null again - // if the most recent emission was an `error`); both cases are - // legitimate "we don't know the hash yet" signals to the caller. - return lastEvent?.configHash ?? null; + // Returns the hash of the *last successful* build, NOT + // `lastEvent.configHash`. 
The two diverge after an ERROR: + // `lastEvent` becomes the error event (no `configHash`), but + // `.arkor/build/index.mjs` still holds the previous successful + // bundle bytes — and a child spawned in that window is running + // those bytes. Returning the cached success hash keeps + // `/api/train` registering accurate spawn-time hashes so the + // next successful BUNDLE_END can route hot-swap vs restart + // correctly. `null` only before the first successful build (or + // a build that wasn't inspectable). + return lastSuccessConfigHash; }, async dispose() { disposed = true; From 1d0cc83fea2dce23ccbdc886743773e0118cc2ff Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 08:35:47 +0900 Subject: [PATCH 15/55] feat: implement cleanup hook detachment and reset functionality for improved test isolation --- packages/arkor/src/cli/cleanupHooks.test.ts | 74 ++++++++++++++++++--- packages/arkor/src/cli/cleanupHooks.ts | 62 +++++++++++++++-- packages/arkor/src/cli/commands/dev.test.ts | 27 ++------ 3 files changed, 124 insertions(+), 39 deletions(-) diff --git a/packages/arkor/src/cli/cleanupHooks.test.ts b/packages/arkor/src/cli/cleanupHooks.test.ts index 9428481f..fa39a88f 100644 --- a/packages/arkor/src/cli/cleanupHooks.test.ts +++ b/packages/arkor/src/cli/cleanupHooks.test.ts @@ -1,11 +1,14 @@ import { afterEach, describe, expect, it, vi } from "vitest"; -import { registerCleanupHook } from "./cleanupHooks"; +import { + __resetCleanupHooksForTests, + registerCleanupHook, +} from "./cleanupHooks"; // Each test that emits a signal also installs new listeners on -// `process` for the lifetime of this worker. We can't `process.off` -// the listeners (they're closures inside `registerCleanupHook`) but -// we can ensure each test fires its own per-registration handler and -// process.exit is mocked so the worker survives. +// `process` for the lifetime of this worker. 
Auto-detach inside the +// handlers covers the fire-then-cleanup case; `__resetCleanupHooksForTests` +// covers tests whose registration never fires (still need their +// listeners off the worker before the next test runs). let exitSpy: ReturnType | null = null; let stdoutSpy: ReturnType | null = null; @@ -15,6 +18,7 @@ afterEach(() => { stdoutSpy?.mockRestore(); exitSpy = null; stdoutSpy = null; + __resetCleanupHooksForTests(); }); function mockExit(): number[] { @@ -77,6 +81,54 @@ describe("registerCleanupHook", () => { expect(codes).toEqual([0]); }); + it("auto-detaches its process listeners after firing so they don't accumulate", () => { + // Regression: previously each `registerCleanupHook` call left + // `process.on('exit', ...)` and per-signal listeners armed + // forever. A long-lived Node worker that re-arms hooks (vitest + // running many tests, or any future caller that re-registers on + // each iteration) tripped Node's + // `MaxListenersExceededWarning`. Fix: each handler synchronously + // detaches its registration after invoking `run()`. + const exitBefore = process.listeners("exit").length; + const sigintBefore = process.listeners("SIGINT").length; + const sigtermBefore = process.listeners("SIGTERM").length; + const sighupBefore = process.listeners("SIGHUP").length; + + registerCleanupHook({ + cleanup: () => {}, + exitOnSignal: false, + }); + + expect(process.listeners("exit").length).toBe(exitBefore + 1); + expect(process.listeners("SIGINT").length).toBe(sigintBefore + 1); + expect(process.listeners("SIGTERM").length).toBe(sigtermBefore + 1); + expect(process.listeners("SIGHUP").length).toBe(sighupBefore + 1); + + // Firing one signal must detach BOTH that registration's signal + // listener AND its sibling exit listener — the registration is + // done after first fire regardless of which channel triggered it. 
+ process.emit("SIGINT", "SIGINT"); + + expect(process.listeners("exit").length).toBe(exitBefore); + expect(process.listeners("SIGINT").length).toBe(sigintBefore); + expect(process.listeners("SIGTERM").length).toBe(sigtermBefore); + expect(process.listeners("SIGHUP").length).toBe(sighupBefore); + }); + + it("__resetCleanupHooksForTests detaches every still-armed registration", () => { + // Test-only escape hatch for registrations whose handler never + // fires inside the test (no signal emitted) — without it, those + // listeners would persist across the vitest worker's test queue. + const exitBefore = process.listeners("exit").length; + registerCleanupHook({ cleanup: () => {}, exitOnSignal: false }); + registerCleanupHook({ cleanup: () => {}, exitOnSignal: true }); + expect(process.listeners("exit").length).toBe(exitBefore + 2); + + __resetCleanupHooksForTests(); + + expect(process.listeners("exit").length).toBe(exitBefore); + }); + it("is idempotent against repeated signals (done latch + bounded exit)", async () => { let invocations = 0; registerCleanupHook({ @@ -93,12 +145,12 @@ describe("registerCleanupHook", () => { await flushMicrotasks(); await flushMicrotasks(); - // Cleanup body runs once even if the signal fires multiple times. + // Cleanup body runs once even if the signal fires multiple times + // (auto-detach removes the listener after first fire; the `done` + // latch is the secondary defence in case detach is racy). expect(invocations).toBe(1); - // Exit may be called multiple times (once per signal handler - // that armed it), but the mock no-ops so the worker survives — - // verify at least one exit fired. - expect(codes.length).toBeGreaterThanOrEqual(1); - expect(codes[0]).toBe(0); + // First SIGINT fires the handler → exit(0); follow-ups hit no + // listener after auto-detach, so codes has exactly one entry. 
+ expect(codes).toEqual([0]); }); }); diff --git a/packages/arkor/src/cli/cleanupHooks.ts b/packages/arkor/src/cli/cleanupHooks.ts index 1e4cd33e..3d6ffa11 100644 --- a/packages/arkor/src/cli/cleanupHooks.ts +++ b/packages/arkor/src/cli/cleanupHooks.ts @@ -35,6 +35,19 @@ export interface CleanupHookOptions { */ const inFlightCleanups = new Set>(); +/** + * Detachers for every still-armed registration. The signal/exit + * handlers each call their own detacher synchronously after invoking + * `run()` so a long-lived worker that calls `registerCleanupHook` + * many times (vitest reusing the same Node worker across tests, or a + * future caller that re-arms hooks dynamically) doesn't pile up + * `process.on(...)` listeners and trip Node's + * `MaxListenersExceededWarning`. Test code can also call + * `__resetCleanupHooksForTests()` to detach every still-armed + * registration up-front for explicit isolation. + */ +const attachedHandlers = new Set<() => void>(); + /** * Register a cleanup hook that fires on `process.exit` and on * SIGINT / SIGTERM / SIGHUP. Used by `runDev` to dispose long-lived @@ -44,9 +57,10 @@ const inFlightCleanups = new Set>(); * * Per-registration signal listeners (rather than a singleton): each * `runDev()` invocation gets its own listener wired to its own - * `done` latch. This matches the old behaviour and keeps test - * isolation simple (vitest's per-test cleanup doesn't have to reach - * into module state). + * `done` latch. Listeners auto-detach as soon as their handler fires + * (the `done` latch makes any later invocation a no-op anyway), so + * a process that goes through many register → fire cycles doesn't + * accumulate stale listeners on `process`. 
* * `process.on("exit", ...)` listeners cannot be async — Node fires * them right before the process terminates and discards any returned @@ -77,18 +91,20 @@ export function registerCleanupHook(options: CleanupHookOptions): void { return promise; }; - process.on("exit", () => { + const exitHandler = () => { void run(); - }); - + detach(); + }; + const signalHandlers = new Map<(typeof TERMINATING_SIGNALS)[number], () => void>(); for (const sig of TERMINATING_SIGNALS) { - process.on(sig, () => { + signalHandlers.set(sig, () => { // Sync cleanup body fires inside this `run()` call before the // returned promise resolves; that preserves "side effect is // observable right after the handler returns" for sync // cleanups like `unlinkSync` (and the existing tests that // assert on it). const my = run(); + detach(); if (!options.exitOnSignal) return; // Wait for THIS hook's tail and every other in-flight cleanup // (siblings registered in the same process) before exiting. @@ -101,4 +117,36 @@ export function registerCleanupHook(options: CleanupHookOptions): void { ]).then(() => process.exit(0)); }); } + + let detached = false; + const detach = () => { + if (detached) return; + detached = true; + process.off("exit", exitHandler); + for (const sig of TERMINATING_SIGNALS) { + const handler = signalHandlers.get(sig); + if (handler) process.off(sig, handler); + } + attachedHandlers.delete(detach); + }; + attachedHandlers.add(detach); + + process.on("exit", exitHandler); + for (const sig of TERMINATING_SIGNALS) { + const handler = signalHandlers.get(sig); + if (handler) process.on(sig, handler); + } +} + +/** + * Detach every still-armed registration. Test-only escape hatch: a + * vitest worker reuses the same Node process across many tests, and + * each `registerCleanupHook` call leaves listeners attached until + * something fires them. Call this from `afterEach` to keep the + * worker's `process` listener counts flat. 
+ */ +export function __resetCleanupHooksForTests(): void { + for (const detach of [...attachedHandlers]) detach(); + attachedHandlers.clear(); + inFlightCleanups.clear(); } diff --git a/packages/arkor/src/cli/commands/dev.test.ts b/packages/arkor/src/cli/commands/dev.test.ts index 47299303..bc043c95 100644 --- a/packages/arkor/src/cli/commands/dev.test.ts +++ b/packages/arkor/src/cli/commands/dev.test.ts @@ -31,6 +31,7 @@ import { writeCredentials, type AnonymousCredentials, } from "../../core/credentials"; +import { __resetCleanupHooksForTests } from "../cleanupHooks"; import { ensureCredentialsForStudio, runDev } from "./dev"; /** @@ -558,15 +559,6 @@ describe("ensureCredentialsForStudio", () => { }); describe("runDev", () => { - // Track exit/signal listeners we add via scheduleStudioTokenCleanup so - // we can remove them between tests; otherwise vitest's worker would - // accumulate listeners and Node's MaxListenersExceededWarning would - // fire by the third test. - const ORIG_EXIT_LISTENERS = process.listeners("exit").length; - const ORIG_SIGINT_LISTENERS = process.listeners("SIGINT").length; - const ORIG_SIGTERM_LISTENERS = process.listeners("SIGTERM").length; - const ORIG_SIGHUP_LISTENERS = process.listeners("SIGHUP").length; - beforeEach(async () => { vi.mocked(serve).mockClear(); vi.mocked(open).mockClear(); @@ -583,18 +575,11 @@ describe("runDev", () => { }); afterEach(() => { - // Trim the exit/signal listeners runDev installed each iteration to - // keep vitest's worker tidy across tests. - const trim = (ev: string, keep: number) => { - const all = process.listeners(ev as never); - for (let i = keep; i < all.length; i++) { - process.removeListener(ev as never, all[i] as never); - } - }; - trim("exit", ORIG_EXIT_LISTENERS); - trim("SIGINT", ORIG_SIGINT_LISTENERS); - trim("SIGTERM", ORIG_SIGTERM_LISTENERS); - trim("SIGHUP", ORIG_SIGHUP_LISTENERS); + // Each `runDev()` arms exit/signal hooks via `registerCleanupHook`. 
+ // Tests whose handler never fires would leak listeners across the + // vitest worker's queue; this detaches every still-armed + // registration so Node's MaxListenersExceededWarning doesn't trip. + __resetCleanupHooksForTests(); }); it("persists the studio token and starts the server on the requested port", async () => { From bd8461c3cfab7e25aca01c5dbef7eec0bdad2fc2 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 08:47:09 +0900 Subject: [PATCH 16/55] feat: inject HMR-enabled flag alongside CSRF token in index.html for consistent SPA behavior --- packages/studio-app/src/lib/api.ts | 8 ++++++++ packages/studio-app/vite.config.ts | 24 +++++++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/packages/studio-app/src/lib/api.ts b/packages/studio-app/src/lib/api.ts index a1f67075..738ec3f9 100644 --- a/packages/studio-app/src/lib/api.ts +++ b/packages/studio-app/src/lib/api.ts @@ -65,6 +65,14 @@ const STUDIO_TOKEN = readStudioToken(); * real `arkor dev` sessions. `RunTraining` consults this flag before * opening `/api/dev/events`; without it, the EventSource would retry * forever against the 404 the server returns for non-HMR builds. + * + * The Vite SPA dev workflow (`pnpm --filter @arkor/studio-app dev`) + * serves its own `index.html`, so the SPA's `vite.config.ts` plugin + * also injects this meta alongside the studio-token meta — that way + * a single meta-presence check covers both the production-built SPA + * (served by `arkor dev`) and the Vite-served dev SPA, instead of + * needing a separate `import.meta.env.DEV` fallback that would diverge + * between dev workflows. 
*/ export function isHmrEnabled(): boolean { if (typeof document === "undefined") return false; diff --git a/packages/studio-app/vite.config.ts b/packages/studio-app/vite.config.ts index 51e50dcf..b5d40c8e 100644 --- a/packages/studio-app/vite.config.ts +++ b/packages/studio-app/vite.config.ts @@ -25,10 +25,22 @@ function htmlAttrEscape(s: string): string { } /** - * Inject the per-launch Studio CSRF token into served `index.html` so the - * SPA's `apiFetch` can attach it. `arkor dev` writes the token to - * `~/.arkor/studio-token` on launch; we re-read on every request so that - * starting `arkor dev` after Vite is picked up on the next reload. + * Inject the per-launch Studio CSRF token (and the HMR-enabled flag) + * into served `index.html` so the SPA's `apiFetch` can attach the + * token, and `isHmrEnabled()` can light up the `/api/dev/events` + * subscription. `arkor dev` writes the token to `~/.arkor/studio-token` + * on launch; we re-read on every request so that starting `arkor dev` + * after Vite is picked up on the next reload. + * + * Why also inject `arkor-hmr-enabled` here: the SPA reads the meta to + * decide whether to open the SSE channel, and `buildStudioApp` only + * emits it when HMR is wired in. Vite serves its own `index.html` (so + * the runtime backend never gets to inject anything), and the only + * realistic backend for Vite-served pages is `arkor dev` (Vite proxies + * `/api` to :4000), which always boots with HMR. Pairing the two + * meta tags keeps both the production SPA (served by `arkor dev`) and + * the Vite dev workflow (`pnpm --filter @arkor/studio-app dev`) + * behaving the same way: HMR active whenever the token is. * * `apply: "serve"` constrains this to the dev server. 
If it ran during * `vite build` it would bake the current per-launch token into `dist/ @@ -51,7 +63,9 @@ function arkorStudioToken(): Plugin { return html; } if (!token) return html; - const meta = ``; + const tokenMeta = ``; + const hmrMeta = ``; + const meta = `${tokenMeta}${hmrMeta}`; const idx = html.indexOf(""); if (idx === -1) return `${meta}${html}`; return `${html.slice(0, idx)}${meta}${html.slice(idx)}`; From c4968706e00f99bd736a15579ab0f85b7f03c5f7 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 09:06:45 +0900 Subject: [PATCH 17/55] feat: enhance early-stop handling to support custom trainers in shutdown process --- packages/arkor/src/core/runnerSignals.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index 8bd53173..4542ac44 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -43,8 +43,10 @@ export function installShutdownHandlers(trainer: Trainer): () => void { ); // Drive the trainer's internal early-stop entry point via the // `Symbol.for("arkor.trainer.requestEarlyStop")` brand attached by - // `createTrainer`. The runner only reaches this handler with a - // discovered SDK trainer, so the brand is guaranteed to be present. + // `createTrainer`. `runTrainer` also accepts hand-rolled + // `{ start, wait, cancel }` trainers; for those the brand is + // absent and `requestTrainerEarlyStop` transparently falls back + // to `trainer.cancel()` (best-effort, matches the public contract). requestTrainerEarlyStop(trainer) .catch((err: unknown) => { const msg = err instanceof Error ? 
err.message : String(err); From 366ea23f38702f7852c0c5b2281b98d9a53d2622 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 09:24:55 +0900 Subject: [PATCH 18/55] feat: enhance signal handling to prevent crashes on unsupported platforms and improve hash consistency for JobConfig --- packages/arkor/src/core/configHash.test.ts | 22 +++++++++++ packages/arkor/src/core/configHash.ts | 13 +++++++ .../arkor/src/core/moduleCacheBust.test.ts | 13 ++++--- packages/arkor/src/core/moduleCacheBust.ts | 23 ++++++++---- packages/arkor/src/core/runnerSignals.test.ts | 37 +++++++++++++++++++ packages/arkor/src/core/runnerSignals.ts | 17 ++++++++- 6 files changed, 112 insertions(+), 13 deletions(-) diff --git a/packages/arkor/src/core/configHash.test.ts b/packages/arkor/src/core/configHash.test.ts index 018c3aee..43e1433f 100644 --- a/packages/arkor/src/core/configHash.test.ts +++ b/packages/arkor/src/core/configHash.test.ts @@ -89,6 +89,28 @@ describe("hashJobConfig", () => { expect(hashJobConfig(a)).toBe(hashJobConfig(b)); }); + it("honors `toJSON()` like JSON.stringify (Date, etc.)", () => { + // Regression: `JSON.stringify({ d: new Date(0) })` serialises + // `d` as `"1970-01-01T00:00:00.000Z"`, but a naive recursive + // walker would serialise the Date as `{}` (no enumerable own + // keys). A `JobConfig` whose `unknown`-typed forwarder field + // ever holds a Date (or any object with `toJSON`) would then + // produce a hash that disagrees with the wire-format payload, + // causing spurious "configHash changed" → SIGTERM restarts. 
+ const date = new Date("2024-01-01T00:00:00.000Z"); + const a: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + warmupSteps: date as unknown, + }; + const b: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + warmupSteps: "2024-01-01T00:00:00.000Z" as unknown, + }; + expect(hashJobConfig(a)).toBe(hashJobConfig(b)); + }); + it("ignores function / symbol properties (JSON parity)", () => { // `JSON.stringify` drops these too. The hash should be insensitive // to "transparent" callbacks accidentally landing in a forwarded diff --git a/packages/arkor/src/core/configHash.ts b/packages/arkor/src/core/configHash.ts index 712eedb5..56f2b4a7 100644 --- a/packages/arkor/src/core/configHash.ts +++ b/packages/arkor/src/core/configHash.ts @@ -34,6 +34,19 @@ function stableStringify(value: unknown): string { // JSON-shaped text rather than the literal substring "undefined". if (isNonJsonRepresentable(value)) return "null"; if (typeof value !== "object") return JSON.stringify(value); + // `JSON.stringify` calls `value.toJSON(key)` first when present, then + // serialises the return value. The canonical example is `Date`, which + // becomes its ISO string. Without this branch a `Date` would hash as + // `{}` (no enumerable keys) and a `JobConfig` whose `unknown`-typed + // forwarder field happened to hold one would diverge from the + // wire-format payload — leading to bogus configHash drift and + // unnecessary SIGTERM restarts on every rebuild. + const maybeToJSON = (value as { toJSON?: unknown }).toJSON; + if (typeof maybeToJSON === "function") { + return stableStringify( + (maybeToJSON as (key?: string) => unknown).call(value), + ); + } if (Array.isArray(value)) { // Array slots: non-representable → "null" (matches JSON spec). 
const items = value.map((v) => diff --git a/packages/arkor/src/core/moduleCacheBust.test.ts b/packages/arkor/src/core/moduleCacheBust.test.ts index 66255833..4960b391 100644 --- a/packages/arkor/src/core/moduleCacheBust.test.ts +++ b/packages/arkor/src/core/moduleCacheBust.test.ts @@ -31,7 +31,9 @@ describe("moduleCacheBustKey", () => { const k1 = moduleCacheBustKey(file); const k2 = moduleCacheBustKey(file); expect(k1).toBe(k2); - expect(k1).toMatch(/^\d+-\d+$/); + // mtimeMs-ctimeMs-size; mtimeMs/ctimeMs may carry sub-ms precision + // (no `toFixed(0)`) so digits include an optional fractional part. + expect(k1).toMatch(/^[\d.]+-[\d.]+-\d+$/); }); it("changes when the file content changes (different size)", () => { @@ -43,12 +45,13 @@ describe("moduleCacheBustKey", () => { expect(after).not.toBe(before); }); - it("returns a stable fallback (\"0-0\") for missing files instead of throwing", () => { + it("returns a stable fallback (\"0-0-0\") for missing files instead of throwing", () => { // The eventual `await import(url)` will throw on a missing // file; the helper itself should produce a value rather than // bubbling the stat error and turning every consumer into a - // try/catch site. - expect(moduleCacheBustKey(join(dir, "does-not-exist.mjs"))).toBe("0-0"); + // try/catch site. Three zeros — one each for mtimeMs, ctimeMs, + // size — to keep the shape uniform with the success branch. 
+ expect(moduleCacheBustKey(join(dir, "does-not-exist.mjs"))).toBe("0-0-0"); }); }); @@ -58,6 +61,6 @@ describe("moduleCacheBustUrl", () => { writeFileSync(file, "export const x = 1;"); const url = moduleCacheBustUrl(file); expect(url.startsWith(pathToFileURL(file).href + "?t=")).toBe(true); - expect(url).toMatch(/\?t=\d+-\d+$/); + expect(url).toMatch(/\?t=[\d.]+-[\d.]+-\d+$/); }); }); diff --git a/packages/arkor/src/core/moduleCacheBust.ts b/packages/arkor/src/core/moduleCacheBust.ts index 21abea8b..da238291 100644 --- a/packages/arkor/src/core/moduleCacheBust.ts +++ b/packages/arkor/src/core/moduleCacheBust.ts @@ -12,11 +12,20 @@ import { pathToFileURL } from "node:url"; * `BUNDLE_END` + SIGUSR2 — accumulates one module record per call, * unbounded. * - * Keying on `mtime + size` collapses repeated reads of the same bytes - * onto the same URL, which Node's loader then serves from its existing - * cache record. The leak shrinks from "one entry per call" to "one - * entry per actual file change", which is the tightest bound we can - * offer without spawning a child process per import. + * Keying on `mtimeMs + ctimeMs + size` collapses repeated reads of the + * same bytes onto the same URL, which Node's loader then serves from + * its existing cache record. The leak shrinks from "one entry per + * call" to "one entry per actual file change", which is the tightest + * bound we can offer without spawning a child process per import. + * + * `mtimeMs` is kept at full sub-millisecond precision (no rounding): + * a previous `toFixed(0)` collapsed two distinct edits that landed in + * the same millisecond and produced an identically-sized output onto + * the same key, which made Node's loader return the *stale* module + * for the second edit (HMR/manifest staleness on fast filesystems). + * `ctimeMs` is included as belt-and-braces against the (rare) case + * where mtime collides but ctime moves — `touch -m` and some build + * tools update one without the other. 
* * Falls back to a stable literal on stat failure so the eventual * `import()` (which will throw on a missing file) gets to surface its @@ -25,9 +34,9 @@ import { pathToFileURL } from "node:url"; export function moduleCacheBustKey(filePath: string): string { try { const s = statSync(filePath); - return `${s.mtimeMs.toFixed(0)}-${s.size}`; + return `${s.mtimeMs}-${s.ctimeMs}-${s.size}`; } catch { - return "0-0"; + return "0-0-0"; } } diff --git a/packages/arkor/src/core/runnerSignals.test.ts b/packages/arkor/src/core/runnerSignals.test.ts index aeac9ab4..ebd6f29b 100644 --- a/packages/arkor/src/core/runnerSignals.test.ts +++ b/packages/arkor/src/core/runnerSignals.test.ts @@ -169,6 +169,43 @@ describe("installCallbackReloadHandler", () => { } }); + it("returns a no-op disposer when SIGUSR2 registration throws (Windows fallback)", () => { + // Regression: `process.on("SIGUSR2", ...)` can throw at + // registration time on platforms that don't support the signal + // (notably Windows). Previously this would surface as a hard + // crash at `arkor start` boot. The handler now wraps the + // registration in try/catch and degrades to a no-op disposer so + // the rest of the runner stays up — the server's + // `safeKill(child, "SIGUSR2")` already detects the same + // condition and falls back to SIGTERM-restart there. + const trainer = makeTrainer(); + const file = join(cwd, "entry.mjs"); + writeFileSync(file, "export const x = 1;\n"); + + const realOn = process.on.bind(process); + const onSpy = vi + .spyOn(process, "on") + .mockImplementation(((event: string, listener: (...args: unknown[]) => void) => { + if (event === "SIGUSR2") { + throw new Error("ENOSYS: function not implemented"); + } + return realOn(event as never, listener as never); + }) as typeof process.on); + + let dispose: (() => void) | undefined; + try { + // Must not throw despite the SIGUSR2 registration failure. 
+ dispose = installCallbackReloadHandler(trainer, file); + expect(typeof dispose).toBe("function"); + // No listener was attached, so the disposer is a no-op; calling + // it must not throw either (mirroring the success-path contract + // for tests that always invoke the disposer in `finally`). + expect(() => dispose?.()).not.toThrow(); + } finally { + onSpy.mockRestore(); + } + }); + it("logs a skip warning when the bundle has no inspectable trainer", async () => { const trainer = makeTrainer(); const file = join(cwd, "no-trainer.mjs"); diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index 4542ac44..f00f51d8 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -103,7 +103,22 @@ export function installCallbackReloadHandler( } })(); }; - process.on(CALLBACK_RELOAD_SIGNAL, handler); + // `process.on('SIGUSR2', ...)` can throw at registration time on + // platforms that don't support the signal (notably Windows: libuv's + // signal-wrap returns ENOSYS for SIGUSR2 on win32 and the error + // escapes to userland on some Node versions). The server-side + // `trainRegistry.safeKill(child, "SIGUSR2")` already detects this + // ("unsupported" → falls back to SIGTERM-restart), so an unarmed + // listener here is the documented contract on those platforms — + // quietly degrade to a no-op disposer rather than crashing + // `arkor start` at boot. 
+ try { + process.on(CALLBACK_RELOAD_SIGNAL, handler); + } catch { + return () => { + // no-op: handler was never attached + }; + } return () => { process.off(CALLBACK_RELOAD_SIGNAL, handler); }; From 88e098d40df6de33b55e39e2b62d2d33b3b13c3f Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 09:36:24 +0900 Subject: [PATCH 19/55] feat: update cache-bust logic to use mtime, ctime, and size for improved stability and memory management --- .../arkor/src/core/moduleCacheBust.test.ts | 2 +- packages/arkor/src/core/runnerSignals.ts | 13 +++--- packages/arkor/src/studio/hmr.test.ts | 4 +- packages/arkor/src/studio/hmr.ts | 42 ++++++++++++------- packages/arkor/src/studio/manifest.ts | 6 +-- 5 files changed, 41 insertions(+), 26 deletions(-) diff --git a/packages/arkor/src/core/moduleCacheBust.test.ts b/packages/arkor/src/core/moduleCacheBust.test.ts index 4960b391..f4aaca00 100644 --- a/packages/arkor/src/core/moduleCacheBust.test.ts +++ b/packages/arkor/src/core/moduleCacheBust.test.ts @@ -24,7 +24,7 @@ describe("moduleCacheBustKey", () => { // a `Date.now()` cache-bust would produce a fresh URL on every // call → unbounded leak across long `arkor dev` sessions // (5 s `/api/manifest` polls + every save firing SIGUSR2). - // mtime+size keying must collapse repeat reads of unchanged + // mtime+ctime+size keying must collapse repeat reads of unchanged // bytes onto the same key so the loader serves from cache. 
const file = join(dir, "stable.mjs"); writeFileSync(file, "export const v = 1;"); diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index f00f51d8..f5404b0b 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -76,12 +76,13 @@ export function installCallbackReloadHandler( entryPath: string, ): () => void { const handler = (): void => { - // mtime+size cache-bust (vs `Date.now()`): Node's ESM loader - // never evicts module records, so a long `arkor start` session - // with frequent SIGUSR2 reloads would accumulate one record per - // signal forever. Keying on the actual artefact bytes collapses - // no-op signals onto the same URL — the leak is bounded to "one - // per real edit", which is fundamentally what HMR has to retain. + // mtime+ctime+size cache-bust (vs `Date.now()`): Node's ESM + // loader never evicts module records, so a long `arkor start` + // session with frequent SIGUSR2 reloads would accumulate one + // record per signal forever. Keying on the actual artefact bytes + // (via `moduleCacheBustUrl`) collapses no-op signals onto the + // same URL — the leak is bounded to "one per real edit", which + // is fundamentally what HMR has to retain. const url = moduleCacheBustUrl(entryPath); void (async () => { try { diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index 02ec82ac..391fe46d 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -240,10 +240,10 @@ describe("createHmrCoordinator", () => { // routinely emits a 4th BUNDLE_END after the explicit edits // settle, producing a slightly different output byte (a // change in the bundled comment header is enough to bump - // mtime + size). + // mtime + ctime + size). 
await waitForStableEvents(events, 750); const stat = statSync(join(cwd, ".arkor/build/index.mjs")); - const expectedHash = `${stat.mtimeMs.toFixed(0)}-${stat.size}`; + const expectedHash = `${stat.mtimeMs}-${stat.ctimeMs}-${stat.size}`; expect(events[events.length - 1]?.hash).toBe(expectedHash); } finally { await hmr.dispose(); diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 6a873104..3ce2fc91 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -16,8 +16,9 @@ export interface HmrEvent { type: HmrEventType; outFile?: string; /** - * Short fingerprint of the bundle artefact (mtime + size). Subscribers - * use this to dedupe replays of the same successful build. + * Short fingerprint of the bundle artefact (mtime + ctime + size, + * mirroring `core/moduleCacheBust.ts`'s key shape). Subscribers use + * this to dedupe replays of the same successful build. */ hash?: string; /** @@ -57,8 +58,19 @@ export type HmrOptions = BuildEntryOptions; function fingerprint(outFile: string): string { try { const s = statSync(outFile); - return `${s.mtimeMs.toFixed(0)}-${s.size}`; + // Mirrors `moduleCacheBustKey`'s success-branch shape so the + // broadcast hash and the import URL move together — and so two + // distinct edits within the same millisecond that produce + // identically-sized output don't collide and silently dedup at + // the SPA layer. `ctimeMs` is the belt-and-braces guard for the + // (rare) `touch -m`-style case where mtime stays put. + return `${s.mtimeMs}-${s.ctimeMs}-${s.size}`; } catch { + // Different fallback than `moduleCacheBustKey`'s "0-0-0": that + // helper is for a URL query where the eventual `import()` is + // expected to surface its own missing-file error, but here a + // stable literal would let SPA dedup swallow genuinely-fresh + // events when stat racily fails. Force a unique value instead. 
return Date.now().toString(36); } } @@ -80,23 +92,25 @@ type InspectionResult = { * null` and the SPA would unnecessarily SIGTERM-restart on every * rebuild. * - * Cache-bust by file mtime+size rather than `Date.now()`: + * Cache-bust by file mtime+ctime+size (via `moduleCacheBustUrl`) + * rather than `Date.now()`: * * - Node's ESM loader caches every dynamically-imported URL for the * lifetime of the process and never evicts. A `?t=Date.now()` * suffix produces a unique URL per call, so a long `arkor dev` * session would accumulate one module record per BUNDLE_END — * unbounded memory growth. - * - Mtime+size keys the cache to "the actual bytes in this file", - * so spurious watcher events that don't change content reuse the - * prior module record. The leak shrinks from "one entry per - * keystroke" to "one entry per actual rebuild", which for a - * realistic dev session (hundreds of saves over hours) is bounded - * by the number of distinct file states the user produces — and - * that's fundamentally what HMR has to track to surface up-to- - * date trainer state. There's no public Node API for evicting an - * ESM module record, so this is the tightest bound we can offer - * without spawning a child process per inspection. + * - The composite key (`mtimeMs-ctimeMs-size`) keys the cache to + * "the actual bytes in this file", so spurious watcher events + * that don't change content reuse the prior module record. The + * leak shrinks from "one entry per keystroke" to "one entry per + * actual rebuild", which for a realistic dev session (hundreds + * of saves over hours) is bounded by the number of distinct file + * states the user produces — and that's fundamentally what HMR + * has to track to surface up-to-date trainer state. There's no + * public Node API for evicting an ESM module record, so this is + * the tightest bound we can offer without spawning a child + * process per inspection. 
* * Best-effort: a missing/malformed manifest or a thrown user * constructor returns `null` and the caller treats the rebuild as diff --git a/packages/arkor/src/studio/manifest.ts b/packages/arkor/src/studio/manifest.ts index 51cf51de..70be440e 100644 --- a/packages/arkor/src/studio/manifest.ts +++ b/packages/arkor/src/studio/manifest.ts @@ -40,12 +40,12 @@ const EMPTY: ManifestSummary = { trainer: null, configHash: null }; export async function summariseBuiltManifest( outFile: string, ): Promise { - // mtime+size cache-bust (vs `Date.now()`): the SPA polls + // mtime+ctime+size cache-bust (vs `Date.now()`): the SPA polls // `/api/manifest` every ~5 s, so a `Date.now()` suffix would // accumulate one ESM module record per poll across a long // `arkor dev` session — Node's loader has no eviction. Keying on - // the artefact bytes collapses unchanged-poll reads onto the - // existing record. + // the artefact bytes (via `moduleCacheBustUrl`) collapses + // unchanged-poll reads onto the existing record. const mod = (await import(moduleCacheBustUrl(outFile))) as Record< string, unknown From 63fef23a2dc76064a3ac3000245cdda3d1af1132 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 09:45:37 +0900 Subject: [PATCH 20/55] feat: enhance HMR coordinator comments for clarity on lazy initialization and signal handling --- packages/arkor/src/cli/commands/dev.ts | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/packages/arkor/src/cli/commands/dev.ts b/packages/arkor/src/cli/commands/dev.ts index 077fe8f7..14047a5b 100644 --- a/packages/arkor/src/cli/commands/dev.ts +++ b/packages/arkor/src/cli/commands/dev.ts @@ -205,11 +205,21 @@ export async function runDev(options: DevOptions = {}): Promise { const studioToken = randomBytes(32).toString("base64url"); // HMR coordinator: a long-lived rolldown watcher over the user's - // `src/arkor` graph. 
Lazy-started on first `/api/dev/events` connection so - // an `arkor dev` launched in an unbuilt project doesn't immediately fail. - // Registered before the studio-token cleanup so the latter remains the - // most-recently-attached signal listener (existing tests rely on this - // ordering to find the token-removal handler). + // `src/arkor` graph. The coordinator itself is lazy (`subscribe()` + // is what starts the watcher, not `createHmrCoordinator`), but + // `buildStudioApp` registers its per-rebuild signal-dispatch + // subscriber unconditionally — that subscriber needs to run on + // every BUNDLE_END regardless of whether any SSE client is + // connected, so it can SIGUSR2/SIGTERM active `/api/train` + // children and keep `lastSuccessConfigHash` warm for spawn-time + // capture. Net effect: the watcher starts at server boot. An + // `arkor dev` launched in an unbuilt project doesn't fail immediately + // because `startWatcher` falls through to a poll loop that waits + // for the entry file to appear (see `hmr.ts:entryWaitTimer`). + // + // Registered before the studio-token cleanup so the latter remains + // the most-recently-attached signal listener (existing tests rely + // on this ordering to find the token-removal handler). 
const hmr = createHmrCoordinator({ cwd: process.cwd() }); scheduleHmrCleanup(hmr); From 2057c7a71f46a78e6860d3cebae742758ce9bee2 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 16:34:12 +0900 Subject: [PATCH 21/55] feat: update trainer detection logic to support additional export shapes and improve error handling in signal management --- .../arkor/src/core/trainerInspection.test.ts | 18 ++++++++-- packages/arkor/src/core/trainerInspection.ts | 17 ++++++++-- .../arkor/src/studio/trainRegistry.test.ts | 34 ++++++++++++++++++- packages/arkor/src/studio/trainRegistry.ts | 20 ++++++++--- 4 files changed, 79 insertions(+), 10 deletions(-) diff --git a/packages/arkor/src/core/trainerInspection.test.ts b/packages/arkor/src/core/trainerInspection.test.ts index 8335ef39..ae830017 100644 --- a/packages/arkor/src/core/trainerInspection.test.ts +++ b/packages/arkor/src/core/trainerInspection.test.ts @@ -53,15 +53,29 @@ describe("findTrainerInModule (trainer-shape walk)", () => { expect(found).toBe(trainer); }); - it("finds shape #4: default.trainer nested", () => { + it("finds shape #4: default IS the Trainer", () => { + // Regression: `runner.ts`'s `extractTrainer` accepts + // `export default createTrainer(...)` directly (the trainer + // object itself becomes `mod.default`), but Studio's manifest / + // HMR walk previously skipped this shape. Result: a project that + // ran fine under `arkor start` showed as "no trainer" in Studio + // and HMR forced a SIGTERM-restart on every rebuild because + // `configHash` came back null. 
const trainer = brandedTrainer("d"); + const found = findTrainerInModule({ default: trainer }); + expect(found).toBe(trainer); + }); + + it("finds shape #5: default.trainer nested", () => { + const trainer = brandedTrainer("e"); const found = findTrainerInModule({ default: { trainer } }); expect(found).toBe(trainer); }); - it("works for hand-rolled (unbranded) trainers in any of the four shapes", () => { + it("works for hand-rolled (unbranded) trainers in any of the five shapes", () => { const trainer = unbrandedTrainer("manual"); expect(findTrainerInModule({ trainer })?.name).toBe("manual"); + expect(findTrainerInModule({ default: trainer })?.name).toBe("manual"); expect(findTrainerInModule({ default: { trainer } })?.name).toBe("manual"); }); diff --git a/packages/arkor/src/core/trainerInspection.ts b/packages/arkor/src/core/trainerInspection.ts index 7229ff4d..de3a7497 100644 --- a/packages/arkor/src/core/trainerInspection.ts +++ b/packages/arkor/src/core/trainerInspection.ts @@ -225,11 +225,17 @@ function isTrainerLike(value: unknown): value is TrainerLike { * manifest UI displays the trainer's `name` for hand-rolled trainers * too, even when HMR can't compute a `configHash` for them. * - * The four supported shapes: + * The five supported shapes (mirroring `runner.ts`'s `extractTrainer`): * 1. `export const arkor = createArkor({ trainer })` * 2. `export const trainer = createTrainer(...)` (bare named export) * 3. `export default createArkor({ trainer })` - * 4. `export default { trainer: createTrainer(...) }` + * 4. `export default createTrainer(...)` (default IS a Trainer) + * 5. `export default { trainer: createTrainer(...) }` + * + * Without shape #4 a project that default-exports a Trainer would run + * fine under `arkor start` but show as "no trainer" in Studio's + * manifest, with `configHash: null` forcing every HMR rebuild down the + * SIGTERM-restart path instead of the SIGUSR2 hot-swap path. 
*/ export function findTrainerInModule( mod: Record, @@ -245,7 +251,12 @@ export function findTrainerInModule( if (isArkor(mod.default) && (mod.default as Arkor).trainer) { candidates.push((mod.default as Arkor).trainer); } - // 4: default.trainer nested + // 4: default IS the Trainer itself. The `isTrainerLike` filter + // below sorts this out from cases 3/5 (an Arkor manifest doesn't + // have `start`/`wait`/`cancel`, nor does a plain `{ trainer }` + // wrapper), so pushing `mod.default` unconditionally is safe. + if (mod.default !== undefined) candidates.push(mod.default); + // 5: default.trainer nested if ( mod.default && typeof mod.default === "object" && diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index a08cd51a..95f7a36c 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -112,7 +112,9 @@ describe("TrainRegistry", () => { const reg = new TrainRegistry(); const dead = fakeChild(601); dead.kill.mockImplementation(() => { - throw new Error("ESRCH"); + const err = new Error("kill ESRCH") as Error & { code?: string }; + err.code = "ESRCH"; + throw err; }); reg.register(dead as unknown as ChildProcess, { configHash: "stale", @@ -123,6 +125,36 @@ describe("TrainRegistry", () => { expect(result.restartTargets).toEqual([]); }); + it("dispatchRebuild classifies ESRCH on the hash-match branch as 'gone' (no SIGTERM fallback)", () => { + // Regression: `safeKill` previously treated any thrown error as + // `"unsupported"`, which on the hash-match branch triggers a + // SIGTERM fallback (intended for Windows + SIGUSR2 unsupported). + // POSIX `kill(2)` raises `ESRCH` for an already-exited child — + // classifying that as "unsupported" caused a needless SIGTERM + // attempt against a dead PID. 
Now ESRCH routes through the + // "gone" branch (no fallback, no restart-target push) so the + // child is dropped silently for the close handler to reap. + const reg = new TrainRegistry(); + const goneOnSigusr2 = fakeChild(801); + goneOnSigusr2.kill.mockImplementation(() => { + const err = new Error("kill ESRCH") as Error & { code?: string }; + err.code = "ESRCH"; + throw err; + }); + reg.register(goneOnSigusr2 as unknown as ChildProcess, { + configHash: "match", + trainFile: "/tmp/g.ts", + }); + const result = reg.dispatchRebuild("match"); + // No hot-swap (SIGUSR2 failed), no restart (correctly classified + // as gone, NOT routed into the SIGTERM fallback path). + expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toEqual([]); + // Single SIGUSR2 attempt — no SIGTERM fallback was issued. + expect(goneOnSigusr2.kill).toHaveBeenCalledTimes(1); + expect(goneOnSigusr2.kill).toHaveBeenCalledWith("SIGUSR2"); + }); + it("dispatchRebuild omits dead-on-kill children when kill returns false (no throw)", () => { // Regression: `ChildProcess.kill()` returns `false` (without // throwing) when the target process is already gone. The previous diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index 97e1140a..caf68166 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -55,17 +55,29 @@ export interface DispatchResult { * Outcome of a single `child.kill(signal)` call. * * - `"ok"`: signal was delivered. - * - `"gone"`: process was already exited (`kill` returned `false`); no - * real signal was sent. + * - `"gone"`: process was already exited. Surfaces both as `kill` + * returning `false` (Node's mapped form) and as a thrown `ESRCH` + * (a race where the child exits between the `entries` lookup and + * the `kill` call — POSIX `kill(2)` raises `ESRCH` for + * non-existent PIDs and Node propagates it on some versions). 
* - `"unsupported"`: the platform doesn't support this signal kind - * (Windows + `SIGUSR2`); `kill` threw. + * (Windows + `SIGUSR2` → `ENOSYS`; bad signal name → `EINVAL`); + * `kill` threw with that error code. */ type KillResult = "ok" | "gone" | "unsupported"; function safeKill(child: ChildProcess, signal: NodeJS.Signals): KillResult { try { return child.kill(signal) ? "ok" : "gone"; - } catch { + } catch (err) { + // `ESRCH` ("no such process") means the child already exited — + // semantically identical to `kill returning false`. Mis-classifying + // it as `"unsupported"` would route a hash-match hot-swap candidate + // into the SIGTERM fallback, which then also no-ops (also gone) but + // costs a needless restart-bucket inclusion until the close handler + // unregisters the child. + const code = (err as NodeJS.ErrnoException | null)?.code; + if (code === "ESRCH") return "gone"; return "unsupported"; } } From 8c72d89a4aae17afac42a4aee0684190220b9446 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 19:21:44 +0900 Subject: [PATCH 22/55] fix: prevent watcher from crashing on undefined result during ERROR events in HMR by using optional chaining --- packages/arkor/src/studio/hmr.ts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 3ce2fc91..6cd5d158 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -296,7 +296,17 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { // schedule point would be wrong under inspection races. void emitBuildSucceeded(); } else if (event.code === "ERROR") { - event.result.close().catch(() => {}); + // Rolldown's ERROR events don't always carry a `result` — + // when the failure is in the parse/resolve phase there's + // no per-build output to close, so `event.result` is + // `undefined`. 
Calling `.close()` then would throw + // synchronously, escape this listener, and permanently + // wedge the watcher so the SPA stays on the prior `error` + // state forever even after the user fixes their code. + // Optional-chain so we still close any result that *is* + // present (avoiding the leak rolldown warns about) without + // blowing up the watcher when none is. + event.result?.close().catch(() => {}); // Bump the seq so a still-in-flight `emitBuildSucceeded` // from a *prior* BUNDLE_END drops its broadcast when its // inspection finally resolves. Without this, the older From 0f8e55cc828061932e3bcf5add6fbde6417e18d3 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 19:30:15 +0900 Subject: [PATCH 23/55] refactor: simplify cleanup hook exit logic by directly using in-flight cleanups array for process exit --- packages/arkor/src/cli/cleanupHooks.ts | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/packages/arkor/src/cli/cleanupHooks.ts b/packages/arkor/src/cli/cleanupHooks.ts index 3d6ffa11..19f203f3 100644 --- a/packages/arkor/src/cli/cleanupHooks.ts +++ b/packages/arkor/src/cli/cleanupHooks.ts @@ -103,18 +103,20 @@ export function registerCleanupHook(options: CleanupHookOptions): void { // observable right after the handler returns" for sync // cleanups like `unlinkSync` (and the existing tests that // assert on it). - const my = run(); + run(); detach(); if (!options.exitOnSignal) return; - // Wait for THIS hook's tail and every other in-flight cleanup - // (siblings registered in the same process) before exiting. - // Settled promises pass through Promise.allSettled in a single - // microtask, so a process whose hooks are all synchronous - // exits effectively immediately. 
- void Promise.allSettled([ - my, - ...inFlightCleanups, - ]).then(() => process.exit(0)); + // Wait for every in-flight cleanup (this hook's tail + any + // siblings registered in the same process) before exiting. + // The promise this hook's `run()` just produced is already in + // `inFlightCleanups` (added inside `run()` itself), so the + // spread captures it without us needing to also push the + // returned value separately. Settled promises pass through + // Promise.allSettled in a single microtask, so a process whose + // hooks are all synchronous exits effectively immediately. + void Promise.allSettled([...inFlightCleanups]).then(() => + process.exit(0), + ); }); } From 414fe37a188757a2bd4d9c71bea075029c1417a5 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 20:18:55 +0900 Subject: [PATCH 24/55] feat: enhance JSON serialization in hashJobConfig to ensure key context is preserved and improve signal handling for concurrent reloads --- packages/arkor/src/core/configHash.test.ts | 30 +++++++++ packages/arkor/src/core/configHash.ts | 39 +++++++---- packages/arkor/src/core/runnerSignals.test.ts | 66 ++++++++++++++++++- packages/arkor/src/core/runnerSignals.ts | 20 ++++++ packages/arkor/src/studio/server.ts | 59 +++++++++++++++-- 5 files changed, 194 insertions(+), 20 deletions(-) diff --git a/packages/arkor/src/core/configHash.test.ts b/packages/arkor/src/core/configHash.test.ts index 43e1433f..63875e9b 100644 --- a/packages/arkor/src/core/configHash.test.ts +++ b/packages/arkor/src/core/configHash.test.ts @@ -111,6 +111,36 @@ describe("hashJobConfig", () => { expect(hashJobConfig(a)).toBe(hashJobConfig(b)); }); + it("threads the property key through to user-defined `toJSON(key)` (JSON parity)", () => { + // Regression: `JSON.stringify` calls `value.toJSON(key)` with + // the hosting property name (or array index as string), so a + // `toJSON` that branches on the key produces different output + // 
depending on where the value lives in the tree. The previous + // `stableStringify` called `toJSON()` without the key argument, + // so the hash diverged from the wire-format payload for any + // user object whose serialiser depends on context. + // + // The fixture's `toJSON(key)` returns `"key=<key>"`. Compare + // against an explicit string field holding what JSON.stringify + // would produce — matching hashes prove the key reached toJSON. + const ctx = { + toJSON(key: string) { + return `key=${key}`; + }, + }; + const a: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + warmupSteps: ctx as unknown, + }; + const b: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + warmupSteps: "key=warmupSteps" as unknown, + }; + expect(hashJobConfig(a)).toBe(hashJobConfig(b)); + }); + it("ignores function / symbol properties (JSON parity)", () => { // `JSON.stringify` drops these too. The hash should be insensitive // to "transparent" callbacks accidentally landing in a forwarded diff --git a/packages/arkor/src/core/configHash.ts b/packages/arkor/src/core/configHash.ts index 56f2b4a7..4d074acf 100644 --- a/packages/arkor/src/core/configHash.ts +++ b/packages/arkor/src/core/configHash.ts @@ -27,41 +27,54 @@ function isNonJsonRepresentable(v: unknown): boolean { * `JobConfig` field ever held one of those values (notably the * `unknown`-typed forwarder fields). */ -function stableStringify(value: unknown): string { +function stableStringify(value: unknown, key: string = ""): string { if (value === null) return "null"; // Top-level non-representable: align with `JSON.stringify(undefined)` // semantics by collapsing to "null" so the hash input stays valid // JSON-shaped text rather than the literal substring "undefined".
if (isNonJsonRepresentable(value)) return "null"; if (typeof value !== "object") return JSON.stringify(value); - // `JSON.stringify` calls `value.toJSON(key)` first when present, then - // serialises the return value. The canonical example is `Date`, which - // becomes its ISO string. Without this branch a `Date` would hash as - // `{}` (no enumerable keys) and a `JobConfig` whose `unknown`-typed - // forwarder field happened to hold one would diverge from the - // wire-format payload — leading to bogus configHash drift and - // unnecessary SIGTERM restarts on every rebuild. + // `JSON.stringify` calls `value.toJSON(key)` first when present + // (passing `""` at the top level, the property name in object + // positions, the index-as-string in array positions), then + // serialises the return value. The canonical example is `Date`, + // which becomes its ISO string. Without this branch a `Date` + // would hash as `{}` (no enumerable keys) and a `JobConfig` whose + // `unknown`-typed forwarder field happened to hold one would + // diverge from the wire-format payload — leading to bogus + // configHash drift and unnecessary SIGTERM restarts on every + // rebuild. The `key` argument is threaded through recursion so + // user-side `toJSON(key)` implementations that branch on the + // hosting property/index see the same value JSON.stringify would + // give them. const maybeToJSON = (value as { toJSON?: unknown }).toJSON; if (typeof maybeToJSON === "function") { return stableStringify( - (maybeToJSON as (key?: string) => unknown).call(value), + (maybeToJSON as (key: string) => unknown).call(value, key), + key, ); } if (Array.isArray(value)) { // Array slots: non-representable → "null" (matches JSON spec). - const items = value.map((v) => - isNonJsonRepresentable(v) ? 
"null" : stableStringify(v), + // Index-as-string keys mirror `JSON.stringify`'s behaviour for + // array elements (per the ECMAScript spec, `SerializeJSONArray` + // calls `SerializeJSONProperty` with the index converted to a + // string). + const items = value.map((v, i) => + isNonJsonRepresentable(v) ? "null" : stableStringify(v, String(i)), ); return `[${items.join(",")}]`; } // Object slots: drop non-representable values entirely (matches - // `JSON.stringify({a: undefined}) === "{}"`). + // `JSON.stringify({a: undefined}) === "{}"`). Property names are + // passed as the recursion key so a nested `toJSON(key)` sees the + // hosting field name. const obj = value as Record; const keys = Object.keys(obj) .filter((k) => !isNonJsonRepresentable(obj[k])) .sort(); const parts = keys.map( - (k) => `${JSON.stringify(k)}:${stableStringify(obj[k])}`, + (k) => `${JSON.stringify(k)}:${stableStringify(obj[k], k)}`, ); return `{${parts.join(",")}}`; } diff --git a/packages/arkor/src/core/runnerSignals.test.ts b/packages/arkor/src/core/runnerSignals.test.ts index ebd6f29b..a10922a0 100644 --- a/packages/arkor/src/core/runnerSignals.test.ts +++ b/packages/arkor/src/core/runnerSignals.test.ts @@ -25,10 +25,16 @@ afterEach(() => { function makeTrainer(): Trainer & { __earlyStop: { calls: number }; - __replace: { lastCallbacks: Partial | null }; + __replace: { + lastCallbacks: Partial | null; + calls: number; + }; } { const earlyStop = { calls: 0 }; - const replace = { lastCallbacks: null as Partial | null }; + const replace = { + lastCallbacks: null as Partial | null, + calls: 0, + }; const trainer: Trainer = { name: "n", async start() { @@ -45,6 +51,7 @@ function makeTrainer(): Trainer & { // — there are no public methods on `Trainer` for either any more. 
attachTrainerCallbackReplacer(trainer, (cbs) => { replace.lastCallbacks = cbs; + replace.calls += 1; }); attachTrainerEarlyStopper(trainer, async () => { earlyStop.calls += 1; @@ -206,6 +213,61 @@ describe("installCallbackReloadHandler", () => { } }); + it("drops a stale reload's result when a newer SIGUSR2 starts before the import resolves", async () => { + // Regression: each SIGUSR2 starts a fire-and-forget + // `import()` + `replaceTrainerCallbacks`. Two same-`configHash` + // rebuilds firing back-to-back can race — the earlier import's + // bytes sometimes resolve *after* the newer one, and + // `replaceTrainerCallbacks` overwrites the freshly-loaded + // callbacks with the prior version. The fix version-gates each + // reload via a monotonic `loadSeq`; this test pins the contract + // by firing two signals back-to-back and asserting that + // `replaceTrainerCallbacks` was invoked exactly **once** — + // proving the older IIFE dropped its result at the + // `seq !== loadSeq` check before reaching the replace call. + const trainer = makeTrainer(); + attachTrainerInspection(trainer, () => ({ + name: "n", + config: { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + }, + callbacks: {}, + })); + + const file = writeUserBundle("v1"); + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + const stderrSpy = vi + .spyOn(process.stderr, "write") + .mockImplementation((() => true) as typeof process.stderr.write); + const dispose = installCallbackReloadHandler(trainer, file); + try { + // First signal — captures seq=1 inside the IIFE. + process.emit("SIGUSR2", "SIGUSR2"); + // Rewrite the bundle to v2 BEFORE letting either import + // resolve. mtime+ctime+size change → distinct cache-bust URL. + writeUserBundle("v2"); + // Second signal — captures seq=2, bumps loadSeq to 2. 
+ process.emit("SIGUSR2", "SIGUSR2"); + // Generous fixed wait so both imports definitely settle — + // we can't poll on `lastCallbacks !== null` because the v1 + // IIFE might land first and short-circuit our wait, hiding + // the count assertion below. + await new Promise((r) => setTimeout(r, 200)); + // Without the seq guard, both IIFEs would call + // `replaceTrainerCallbacks` and `calls` would be 2. With the + // guard, the older IIFE's `seq !== loadSeq` short-circuit + // skips the replace call entirely. + expect(trainer.__replace.calls).toBe(1); + } finally { + dispose(); + stdoutSpy.mockRestore(); + stderrSpy.mockRestore(); + } + }); + it("logs a skip warning when the bundle has no inspectable trainer", async () => { const trainer = makeTrainer(); const file = join(cwd, "no-trainer.mjs"); diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index f5404b0b..404ad9e3 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -75,7 +75,24 @@ export function installCallbackReloadHandler( trainer: Trainer, entryPath: string, ): () => void { + /** + * Monotonic counter for sequencing concurrent SIGUSR2 reloads. + * Bumped synchronously inside the signal handler *before* the + * dynamic-import await begins, so each in-flight reload knows its + * arrival order. When the import resolves, the IIFE compares its + * captured `seq` against `loadSeq` and silently drops the result + * if a newer signal already started a newer reload — without this, + * two same-`configHash` rebuilds firing back-to-back can race on + * the import: the earlier import's bytes (now stale on disk) + * resolve *after* the newer one, and `replaceTrainerCallbacks` + * overwrites the freshly-loaded callbacks with the prior version, + * leaving the running job out of sync until the next rebuild. + * Mirrors the `buildSeq` guard in `studio/hmr.ts`'s + * `emitBuildSucceeded`. 
+ */ + let loadSeq = 0; const handler = (): void => { + const seq = ++loadSeq; // mtime+ctime+size cache-bust (vs `Date.now()`): Node's ESM // loader never evicts module records, so a long `arkor start` // session with frequent SIGUSR2 reloads would accumulate one @@ -87,6 +104,9 @@ export function installCallbackReloadHandler( void (async () => { try { const mod = (await import(url)) as Record; + // A newer SIGUSR2 already started its own import while we + // were awaiting; drop our result so the latest edit wins. + if (seq !== loadSeq) return; const callbacks = extractCallbacks(mod); if (!callbacks) { process.stderr.write( diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 4472eabd..19c74ec3 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -1,4 +1,5 @@ -import { spawn } from "node:child_process"; +import { spawn, type ChildProcessByStdio } from "node:child_process"; +import type { Readable, Writable } from "node:stream"; import { readFile, realpath } from "node:fs/promises"; import { timingSafeEqual } from "node:crypto"; import { Hono } from "hono"; @@ -369,10 +370,32 @@ export function buildStudioApp(options: StudioServerOptions) { : null; const args = [trainBinPath, "start"]; if (trainFile) args.push(trainFile); - const child = spawn(process.execPath, args, { - stdio: "pipe", - cwd: trainCwd, - }); + // `spawn()` is mostly async (filesystem failures surface as the + // child's `error` event), but Node can still throw synchronously + // for argument-shape problems (e.g. invalid stdio descriptor on + // unusual platforms). Catch both paths so an `/api/train` POST + // can never hang the SPA — sync throws return a clean 500, async + // 'error' events forward into the stream and close it (handled + // inside the ReadableStream `start()` below). 
+ // `ChildProcessByStdio` is the + // specific overload return for `stdio: "pipe"` — narrows + // `child.stdout` / `child.stderr` away from the nullable + // `Readable | null` of the general `ChildProcess` type. + // `ReturnType` would land on the union and force + // a `?.` everywhere downstream. + let child: ChildProcessByStdio; + try { + child = spawn(process.execPath, args, { + stdio: "pipe", + cwd: trainCwd, + }); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + return c.json( + { error: `Failed to spawn training subprocess: ${msg}` }, + 500, + ); + } activeTrains.register(child, { trainFile, configHash }); // Hoisted out of the `ReadableStream` underlying-source so the // `start` handler can hand its closure-bound teardown helper to @@ -421,14 +444,40 @@ export function buildStudioApp(options: StudioServerOptions) { // already cancelled; nothing more to do. } }; + // `error` event fires when async spawn machinery surfaces a + // failure (ENOENT for the executable, EACCES, EAGAIN under + // resource exhaustion, etc.). Without this listener the + // ReadableStream would never close — the SPA would hang + // waiting for output that never arrives. Forward the error + // text into the stream body, close, and unregister the + // child. Node's contract is: if 'error' fires, 'close' may + // or may not follow — both paths are guarded by the `closed` + // flag and the `unregister` call is idempotent. + const onError = (err: Error): void => { + activeTrains.unregister(child.pid); + child.stdout.off("data", onChunk); + child.stderr.off("data", onChunk); + if (closed) return; + closed = true; + try { + controller.enqueue( + enc.encode(`\n---\nerror=${err.message}\n`), + ); + controller.close(); + } catch { + // already cancelled; nothing more to do. 
+ } + }; child.stdout.on("data", onChunk); child.stderr.on("data", onChunk); child.on("close", onClose); + child.on("error", onError); cancelTeardown = () => { closed = true; child.stdout.off("data", onChunk); child.stderr.off("data", onChunk); child.off("close", onClose); + child.off("error", onError); }; }, cancel() { From 7114f9f2204d21ab765ab95d8ddfdb436ec3df9f Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 22:43:26 +0900 Subject: [PATCH 25/55] feat: improve JSON serialization in hashJobConfig to handle undefined values correctly and enhance HMR signal dispatch for all successful build events --- packages/arkor/src/core/configHash.test.ts | 50 +++++++++++ packages/arkor/src/core/configHash.ts | 98 +++++++++++----------- packages/arkor/src/core/trainer.ts | 14 ++-- packages/arkor/src/studio/server.test.ts | 83 ++++++++++++++++++ packages/arkor/src/studio/server.ts | 14 +++- 5 files changed, 203 insertions(+), 56 deletions(-) diff --git a/packages/arkor/src/core/configHash.test.ts b/packages/arkor/src/core/configHash.test.ts index 63875e9b..4d7566f1 100644 --- a/packages/arkor/src/core/configHash.test.ts +++ b/packages/arkor/src/core/configHash.test.ts @@ -141,6 +141,56 @@ describe("hashJobConfig", () => { expect(hashJobConfig(a)).toBe(hashJobConfig(b)); }); + it("omits an object property whose `toJSON(key)` returns undefined (JSON parity)", () => { + // Regression: `JSON.stringify({ a: { toJSON: () => undefined } })` + // produces `"{}"` — `toJSON` returning `undefined` is the spec's + // "skip me" signal in object position. The previous + // `stableStringify` collapsed every non-representable value to + // the literal string `"null"` at recursion time, so the same + // input hashed as `{"a":null}` instead of `{}`. 
That divergence + // forced unnecessary SIGTERM restarts whenever a `JobConfig` + // field's serialiser opted out — `configHash` would diverge from + // the wire-format payload (which DOES omit the field). + const omitting = { + toJSON() { + return undefined; + }, + }; + const a: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + warmupSteps: omitting as unknown, + }; + const b: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + }; + expect(hashJobConfig(a)).toBe(hashJobConfig(b)); + }); + + it("substitutes `null` for an array element whose `toJSON(idx)` returns undefined (JSON parity)", () => { + // Sibling contract: in array position, `JSON.stringify` writes + // `null` for a `toJSON()→undefined` element (it can't drop the + // slot without shifting indices). The `stableStringify` boundary + // for arrays maps the omit sentinel to `"null"`. + const omitting = { + toJSON() { + return undefined; + }, + }; + const a: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + datasetFormat: ["a", omitting, "c"] as unknown, + }; + const b: JobConfig = { + model: "m", + datasetSource: { type: "huggingface", name: "x" }, + datasetFormat: ["a", null, "c"] as unknown, + }; + expect(hashJobConfig(a)).toBe(hashJobConfig(b)); + }); + it("ignores function / symbol properties (JSON parity)", () => { // `JSON.stringify` drops these too. The hash should be insensitive // to "transparent" callbacks accidentally landing in a forwarded diff --git a/packages/arkor/src/core/configHash.ts b/packages/arkor/src/core/configHash.ts index 4d074acf..fb76d1f1 100644 --- a/packages/arkor/src/core/configHash.ts +++ b/packages/arkor/src/core/configHash.ts @@ -1,52 +1,50 @@ import { createHash } from "node:crypto"; import type { JobConfig } from "./types"; -/** - * Type-narrowing helper for "this value cannot be represented in JSON". 
- * Mirrors the cases JSON.stringify silently drops (when in object - * positions) or coerces to `null` (when in array positions): `undefined`, - * functions, and symbols. - */ -function isNonJsonRepresentable(v: unknown): boolean { - return v === undefined || typeof v === "function" || typeof v === "symbol"; -} - /** * Deterministic JSON serialiser: keys sorted at every nesting level so * `{a:1, b:2}` and `{b:2, a:1}` produce the same string. Necessary because * `JSON.stringify` follows insertion order, which isn't stable across * `buildJobConfig` revisions or user-side spread-merge tricks. * - * Mirrors the JSON wire-format exactly for non-representable values - * (`undefined`, functions, symbols): omitted in object positions, - * serialised as `null` in array positions. The previous implementation - * delegated to `JSON.stringify` which returns the literal value - * `undefined` (not a string) for those — concatenated into the output - * via template literals it became the substring `"undefined"`, which - * is not valid JSON and would silently change the hash if a - * `JobConfig` field ever held one of those values (notably the - * `unknown`-typed forwarder fields). + * Returns `string | undefined`. `undefined` is the "omit me from my + * containing object" sentinel — it propagates from any value + * `JSON.stringify` would silently drop in object position + * (`undefined`, functions, symbols, *and* objects whose `toJSON(key)` + * returns one of those). Callers sit at three boundaries: + * + * - Top level: `hashJobConfig` collapses `undefined` to `"null"` + * so the digest input stays a valid hash string. + * - Array slots: the map below substitutes `"null"` (matches + * `JSON.stringify([undefined]) === "[null]"`). + * - Object slots: the loop filters the key out entirely (matches + * `JSON.stringify({a: undefined}) === "{}"`). 
+ * + * The previous implementation collapsed every non-representable to + * the literal string `"null"` at recursion time, which leaked into + * object slots as `{"a":null}` instead of the JSON-correct `{}` — + * making `configHash` diverge from the wire-format payload for + * `JobConfig` fields whose `toJSON(key)` happened to return + * `undefined` (the spec-defined "skip me" signal). That divergence + * forces unnecessary SIGTERM restarts on every rebuild. */ -function stableStringify(value: unknown, key: string = ""): string { +function stableStringify(value: unknown, key: string = ""): string | undefined { if (value === null) return "null"; - // Top-level non-representable: align with `JSON.stringify(undefined)` - // semantics by collapsing to "null" so the hash input stays valid - // JSON-shaped text rather than the literal substring "undefined". - if (isNonJsonRepresentable(value)) return "null"; + // Non-representable values: omit (undefined return) so each caller's + // boundary handler chooses the right substitution per its position. + if (value === undefined || typeof value === "function" || typeof value === "symbol") { + return undefined; + } if (typeof value !== "object") return JSON.stringify(value); // `JSON.stringify` calls `value.toJSON(key)` first when present // (passing `""` at the top level, the property name in object // positions, the index-as-string in array positions), then - // serialises the return value. The canonical example is `Date`, - // which becomes its ISO string. Without this branch a `Date` - // would hash as `{}` (no enumerable keys) and a `JobConfig` whose - // `unknown`-typed forwarder field happened to hold one would - // diverge from the wire-format payload — leading to bogus - // configHash drift and unnecessary SIGTERM restarts on every - // rebuild. The `key` argument is threaded through recursion so + // serialises the return value. Canonical example: `Date` → ISO + // string. 
The `key` argument is threaded through recursion so // user-side `toJSON(key)` implementations that branch on the - // hosting property/index see the same value JSON.stringify would - // give them. + // hosting property/index see the same value JSON.stringify would. + // If `toJSON` returns `undefined`, that propagates as the omit + // sentinel — the spec-defined "skip me" path. const maybeToJSON = (value as { toJSON?: unknown }).toJSON; if (typeof maybeToJSON === "function") { return stableStringify( @@ -60,22 +58,20 @@ function stableStringify(value: unknown, key: string = ""): string { // array elements (per the ECMAScript spec, `SerializeJSONArray` // calls `SerializeJSONProperty` with the index converted to a // string). - const items = value.map((v, i) => - isNonJsonRepresentable(v) ? "null" : stableStringify(v, String(i)), - ); + const items = value.map((v, i) => stableStringify(v, String(i)) ?? "null"); return `[${items.join(",")}]`; } - // Object slots: drop non-representable values entirely (matches - // `JSON.stringify({a: undefined}) === "{}"`). Property names are - // passed as the recursion key so a nested `toJSON(key)` sees the - // hosting field name. + // Object slots: skip keys whose serialised value is `undefined` + // (matches `JSON.stringify({a: undefined}) === "{}"`). Property + // names are passed as the recursion key so a nested `toJSON(key)` + // sees the hosting field name. 
const obj = value as Record; - const keys = Object.keys(obj) - .filter((k) => !isNonJsonRepresentable(obj[k])) - .sort(); - const parts = keys.map( - (k) => `${JSON.stringify(k)}:${stableStringify(obj[k], k)}`, - ); + const parts: string[] = []; + for (const k of Object.keys(obj).sort()) { + const serialised = stableStringify(obj[k], k); + if (serialised === undefined) continue; + parts.push(`${JSON.stringify(k)}:${serialised}`); + } return `{${parts.join(",")}}`; } @@ -86,8 +82,10 @@ function stableStringify(value: unknown, key: string = ""): string { * full restart with `requestEarlyStop`). */ export function hashJobConfig(config: JobConfig): string { - return createHash("sha256") - .update(stableStringify(config)) - .digest("hex") - .slice(0, 16); + // Top-level fallback to `"null"` so a pathological config that + // serialises to `undefined` (top-level `toJSON` returning + // undefined, etc.) still produces a deterministic digest input + // rather than crashing `createHash.update(undefined)`. + const serialised = stableStringify(config) ?? "null"; + return createHash("sha256").update(serialised).digest("hex").slice(0, 16); } diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index e4906289..dec1a263 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -146,11 +146,15 @@ export function createTrainer( let scope: { orgSlug: string; projectSlug: string } | null = null; let clientPromise: Promise | null = null; - // Mutable callbacks slot. Each `dispatch()` invocation reads this fresh, - // so `replaceCallbacks(...)` takes effect on the next event. Events - // already mid-await keep their old reference until they resolve, which - // matches the "replace, don't interrupt" contract documented on - // `Trainer.replaceCallbacks`. + // Mutable callbacks slot. 
Each `dispatch()` invocation reads this + // fresh, so the rotation triggered by the + // `Symbol.for("arkor.trainer.replaceCallbacks")` brand + // (`replaceTrainerCallbacks` in `core/trainerInspection.ts`) takes + // effect on the next event. Events already mid-await keep their + // old reference until they resolve, which matches the "replace, + // don't interrupt" contract. Public `Trainer` deliberately doesn't + // expose this — it's a dev-only HMR primitive driven by the + // SIGUSR2 path in `core/runnerSignals.ts`. let currentCallbacks: Partial = input.callbacks ?? {}; // Early-stop state. `requestEarlyStop()` arms the latch; the next diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index 4c5b8d78..01ed4d23 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -1705,6 +1705,89 @@ process.exit(0); expect(fake.subscriberCount).toBe(1); }); + it("dispatches HMR signals for `ready` events too (not only `rebuild`)", async () => { + // Regression: previously the dispatch fired only on + // `rebuild`, so a child started via `/api/train` *before* + // the watcher's first successful BUNDLE_END (the very first + // success is broadcast as `ready`, and the entry-wait recovery + // path also emits `ready`) would never get SIGUSR2/SIGTERM- + // routed when that build eventually landed — leaving it + // running a stale or empty artifact. Exercise the contract + // here by spawning a hanging child, then emitting `ready` + // with a different `configHash`; dispatch should pick up the + // mismatch and surface restart targets in the SSE frame. + await writeCredentials(ANON_CREDS); + const hangingBin = join(trainCwd, "hanging-bin.mjs"); + // setInterval keeps the event loop alive without trapping + // SIGTERM, so dispatch's kill returns the child to the OS. 
+ writeFileSync(hangingBin, "setInterval(() => {}, 1000);\n"); + + const fake = fakeHmr("h1"); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: hangingBin, + hmr: fake.coordinator, + }); + + const trainRes = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(trainRes.status).toBe(200); + const pid = Number(trainRes.headers.get("x-arkor-train-pid")); + + const sseRes = await app.request( + `/api/dev/events?studioToken=${encodeURIComponent(STUDIO_TOKEN)}`, + { headers: { host: "127.0.0.1:4000" } }, + ); + const reader = sseRes.body!.getReader(); + const decoder = new TextDecoder(); + + try { + // `configHash` = "h2" mismatches the spawn-time "h1" → SIGTERM + // path → `restartTargets` should be non-empty in the SSE frame. + fake.emit({ + type: "ready", + outFile: "/tmp/x.mjs", + hash: "abc", + configHash: "h2", + trainerName: "t", + }); + + let received = ""; + while (!received.includes("\n\n")) { + const { value, done } = await reader.read(); + if (done) break; + received += decoder.decode(value, { stream: true }); + } + expect(received).toContain("event: ready"); + // The dispatch augmentation marker — would be absent if the + // `event.type !== "error"` filter regressed back to gating on + // `=== "rebuild"`, and `restart`/`restartTargets` would never + // appear on a `ready` frame. + expect(received).toContain('"restart":true'); + expect(received).toContain(`"pid":${pid}`); + } finally { + await reader.cancel(); + // Best-effort cleanup if dispatch's SIGTERM hasn't reaped + // the child yet (signal delivery is async in the kernel). 
+ try { + process.kill(pid, "SIGKILL"); + } catch { + // already gone + } + } + }); + it("forwards rebuild events as SSE frames", async () => { const fake = fakeHmr(); const app = buildStudioApp({ diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 19c74ec3..358c469d 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -545,7 +545,19 @@ export function buildStudioApp(options: StudioServerOptions) { // fresh signal pass against the same rebuild. hmr.subscribe((event) => { let augmented: AugmentedEvent = event; - if (event.type === "rebuild" && activeTrains.size > 0) { + // Route dispatch through every *successful* build event, not + // just `rebuild`. The coordinator emits the very first + // successful compile as `ready` (and the entry-wait recovery + // path also broadcasts `ready` when a fresh-scaffold project's + // entry file first appears). A child started via `/api/train` + // before the first `ready` (e.g. the SPA fired Run Training + // immediately after `arkor dev` booted, while the watcher's + // initial BUNDLE_END was still in flight) would otherwise + // never get SIGUSR2/SIGTERM-routed when that build lands — + // leaving it stuck on a stale or empty artifact until the + // next edit triggers a `rebuild`. Filtering by "not error" + // is forward-compatible with any new successful event types. + if (event.type !== "error" && activeTrains.size > 0) { // Single per-child decision pass: hash match → SIGUSR2 (with // a Windows fallback to SIGTERM since win32 doesn't deliver // SIGUSR2), hash mismatch → SIGTERM. 
The registry returns From aee40ffe7c5e270bf5b3c45f983453c5d0a08210 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 5 May 2026 23:29:54 +0900 Subject: [PATCH 26/55] feat: implement pre-spawn event buffering in RunTraining to handle HMR dispatch during startup, ensuring accurate signal management and preventing stale code execution --- .../studio-app/src/components/RunTraining.tsx | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index 8f2489cc..19f69353 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -55,6 +55,18 @@ export function RunTraining() { // `Timeout` object — explicit `number` so TS doesn't pick up the // Node typing from the global `setTimeout`. const hotSwapTimerRef = useRef(null); + // SSE events that arrived during the startup window — after `run()` + // set `runningRef.current = true` but before `streamTraining`'s + // `onSpawn` populated `currentPidRef`. The per-pid filter would + // otherwise drop any HMR dispatch landing in this window because + // `myPid === null`, leaving the user on stale code: a config- + // changing rebuild fires immediately after the Run click → server + // SIGTERMs the just-started child → exit reaches us → no auto- + // restart latch. Buffer here, drain in `onSpawn` once we know our + // pid so the per-pid decision can run retroactively. Cleared in + // `run()`'s `finally` (and on unmount) so a failed spawn doesn't + // leak entries into the next run. + const pendingPreSpawnEventsRef = useRef([]); // Tracks "is this React tree still mounted?". 
The HMR auto-restart // path schedules `queueMicrotask(() => run(...))` after the prior // run's `finally` — without this gate, navigating away during the @@ -71,6 +83,7 @@ export function RunTraining() { // edits to React's effect ordering, future refactors), it // still finds nothing pending. restartPendingRef.current = false; + pendingPreSpawnEventsRef.current = []; trainingAbortRef.current?.abort(); if (hotSwapTimerRef.current !== null) { clearTimeout(hotSwapTimerRef.current); @@ -161,6 +174,14 @@ export function RunTraining() { // *my* child land in this bucket?" so a tab whose run was // hot-swapped doesn't latch onto a sibling tab's restart. const myPid = currentPidRef.current; + // Pre-spawn race: if we've started a run but `onSpawn` hasn't + // populated our pid yet, the dispatch result for our own child + // would be silently ignored. Stash the payload and let + // `onSpawn` re-run the per-pid decision once the pid arrives. + if (myPid === null && runningRef.current) { + pendingPreSpawnEventsRef.current.push(payload); + return; + } const myRestart = runningRef.current && myPid !== null && @@ -240,6 +261,38 @@ export function RunTraining() { // "restarting" specifically — "early-stopping" / "hot- // swapped" should land via their own state transitions. setHmrStatus((s) => (s === "restarting" ? "idle" : s)); + // Drain any HMR events that landed in the pre-spawn race + // window. Apply the same per-pid decision retroactively now + // that the pid is known. Restart wins over hot-swap (a + // stale child got SIGTERM'd → must re-spawn), so collapse + // the buffer's findings into a single decision rather than + // dispatching every buffered event verbatim. 
+ const buffered = pendingPreSpawnEventsRef.current; + pendingPreSpawnEventsRef.current = []; + let restartHit = false; + let hotSwapHit = false; + for (const ev of buffered) { + if (ev.restartTargets?.some((t) => t.pid === pid)) { + restartHit = true; + break; + } + if (ev.hotSwapTargets?.some((t) => t.pid === pid)) { + hotSwapHit = true; + } + } + if (restartHit) { + restartPendingRef.current = true; + setHmrStatus("early-stopping"); + } else if (hotSwapHit) { + setHmrStatus("hot-swapped"); + if (hotSwapTimerRef.current !== null) { + clearTimeout(hotSwapTimerRef.current); + } + hotSwapTimerRef.current = window.setTimeout(() => { + setHmrStatus((s) => (s === "hot-swapped" ? "idle" : s)); + hotSwapTimerRef.current = null; + }, 1500); + } }, ); } catch (err) { @@ -255,6 +308,11 @@ export function RunTraining() { } finally { runningRef.current = false; currentPidRef.current = null; + // Drop any pre-spawn buffer entries that survived a failed + // run (spawn errored before `onSpawn` could drain). Without + // this they'd be carried into the next run and falsely match + // the new pid only by luck. + pendingPreSpawnEventsRef.current = []; if (trainingAbortRef.current === ac) trainingAbortRef.current = null; // Always release the running flag, including the user-initiated // abort path. 
setState on an already-unmounted component is a From 430be8152ee83cdfff64719a403326d0226ba43c Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Wed, 6 May 2026 00:31:01 +0900 Subject: [PATCH 27/55] fix: resolve early-stop latch issue in trainer to ensure immediate settlement on terminal events, preventing delayed shutdown during SIGTERM --- packages/arkor/src/cli/cleanupHooks.test.ts | 55 ++++++++++ packages/arkor/src/cli/cleanupHooks.ts | 35 ++++-- packages/arkor/src/core/trainer.test.ts | 114 ++++++++++++++++++++ packages/arkor/src/core/trainer.ts | 42 ++++++-- 4 files changed, 225 insertions(+), 21 deletions(-) diff --git a/packages/arkor/src/cli/cleanupHooks.test.ts b/packages/arkor/src/cli/cleanupHooks.test.ts index fa39a88f..7e90f1ad 100644 --- a/packages/arkor/src/cli/cleanupHooks.test.ts +++ b/packages/arkor/src/cli/cleanupHooks.test.ts @@ -81,6 +81,61 @@ describe("registerCleanupHook", () => { expect(codes).toEqual([0]); }); + it("waits for sibling async cleanups even when the exit-owning hook is registered FIRST", async () => { + // Regression: even with the in-flight set in place, the + // exit-owning hook's signal handler used to take its + // `[...inFlightCleanups]` snapshot synchronously inside the + // listener body. Node's EventEmitter dispatches signal listeners + // in registration order, so when the exit-owning hook is wired + // up *first*, its handler takes the snapshot before any sibling + // hook (registered later) gets a chance to run its handler and + // add its own in-flight promise. Result: `Promise.allSettled` + // resolved on the snapshot of just-this-hook's promise → exit + // fired → siblings' async cleanup got cut off mid-flight. + // + // The order in the existing "waits for an async sibling + // cleanup" test happens to dodge this bug by registering the + // async hook first, so its handler runs first and seeds + // inFlightCleanups before the exit-owner takes its snapshot. 
+ // This test inverts the order to actually exercise the + // queueMicrotask-deferred snapshot fix. + const order: string[] = []; + let resolveSlow!: () => void; + const slow = new Promise((resolve) => { + resolveSlow = resolve; + }); + + // Register exit-owner FIRST. + registerCleanupHook({ + cleanup: () => { + order.push("sync-cleanup"); + }, + exitOnSignal: true, + }); + // Sibling async cleanup registered AFTER. With the old code, + // its promise wouldn't make it into the exit-owner's snapshot. + registerCleanupHook({ + cleanup: () => + slow.then(() => { + order.push("async-cleanup-finished"); + }), + }); + + const codes = mockExit(); + process.emit("SIGINT", "SIGINT"); + + // Sync ran inline; async pending; exit must NOT have fired. + expect(order).toEqual(["sync-cleanup"]); + expect(codes).toEqual([]); + + resolveSlow(); + await flushMicrotasks(); + await flushMicrotasks(); + + expect(order).toEqual(["sync-cleanup", "async-cleanup-finished"]); + expect(codes).toEqual([0]); + }); + it("auto-detaches its process listeners after firing so they don't accumulate", () => { // Regression: previously each `registerCleanupHook` call left // `process.on('exit', ...)` and per-signal listeners armed diff --git a/packages/arkor/src/cli/cleanupHooks.ts b/packages/arkor/src/cli/cleanupHooks.ts index 19f203f3..f63b171f 100644 --- a/packages/arkor/src/cli/cleanupHooks.ts +++ b/packages/arkor/src/cli/cleanupHooks.ts @@ -106,17 +106,30 @@ export function registerCleanupHook(options: CleanupHookOptions): void { run(); detach(); if (!options.exitOnSignal) return; - // Wait for every in-flight cleanup (this hook's tail + any - // siblings registered in the same process) before exiting. - // The promise this hook's `run()` just produced is already in - // `inFlightCleanups` (added inside `run()` itself), so the - // spread captures it without us needing to also push the - // returned value separately. 
Settled promises pass through - // Promise.allSettled in a single microtask, so a process whose - // hooks are all synchronous exits effectively immediately. - void Promise.allSettled([...inFlightCleanups]).then(() => - process.exit(0), - ); + // Snapshot `inFlightCleanups` AFTER every other signal listener + // for this signal has run. Node's EventEmitter dispatches + // listeners synchronously in registration order, so if the + // exit-owning hook happens to be registered *first*, taking the + // snapshot here in the listener body would miss promises that + // sibling hooks are about to add when their listeners run a + // few sync steps later. `queueMicrotask` defers past the end of + // the current sync turn (where `process.emit` finishes + // dispatching all listeners), so the snapshot includes every + // sibling's freshly-registered promise. Without this, an + // `arkor dev` whose `scheduleStudioTokenCleanup` (exitOnSignal: + // true) was registered before `scheduleHmrCleanup` (async + // dispose) would `process.exit(0)` mid-`hmr.dispose()` and + // leak the rolldown watcher. + // + // Settled promises pass through `Promise.allSettled` in a + // single microtask, so a process whose hooks are all + // synchronous still exits effectively immediately (one extra + // microtask round-trip). 
+ queueMicrotask(() => { + void Promise.allSettled([...inFlightCleanups]).then(() => + process.exit(0), + ); + }); }); } diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index d43ce904..a0e92dd4 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1526,6 +1526,120 @@ describe("createTrainer (early stop)", () => { expect(stopPromiseResult).toBe("resolved"); }); + it("resolves the early-stop latch when the run hits a terminal event before the next checkpoint", async () => { + // Regression: previously `requestEarlyStop()`'s deferred was + // only resolved by (a) the checkpoint-triggered cancel branch + // or (b) the timeout fallback. If the run reached + // `training.completed` / `training.failed` *before* another + // checkpoint landed (a common case for short jobs or runs that + // had already saved their last checkpoint when SIGTERM arrived), + // the deferred stayed pending until the (default 5-min) timeout + // fired — the SIGTERM handler in `installShutdownHandlers` + // awaits that promise before exit, so shutdown was delayed up to + // `timeoutMs`. Both terminal branches now settle the latch + // explicitly so the signal path completes immediately when the + // job is already terminal. + await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + // started → log (arms early-stop) → completed; no checkpoint.saved + // in between, so the checkpoint-triggered resolution path is *not* + // exercised — only the new terminal-branch settlement is. 
+ const sse = [ + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + `id: 2\nevent: training.log\ndata: ${JSON.stringify({ + type: "training.log", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:02Z", + step: 1, + loss: 0.5, + })}\n\n`, + `id: 3\nevent: training.completed\ndata: ${JSON.stringify({ + type: "training.completed", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:03Z", + artifacts: [], + })}\n\n`, + ]; + + let cancelCalls = 0; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(sseStream(sse), { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + cancelCalls += 1; + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + let stopResolved = false; + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + callbacks: { + onLog: () => { + // Long timeout: if the fix regresses, this test would + // hang for ~60s before the timer fires. With the + // terminal-branch settlement, the deferred resolves the + // moment `training.completed` lands. 
+ void requestTrainerEarlyStop(trainer, { + timeoutMs: 60_000, + }).then(() => { + stopResolved = true; + }); + }, + }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + const result = await trainer.wait(); + // Flush microtasks so the .then() chain off `requestEarlyStop` + // observes the resolution before we assert. + await new Promise((r) => setImmediate(r)); + expect(result.job.status).toBe("completed"); + // No cancel POST was issued — the terminal branch just + // releases the latch; it doesn't cancel a run that already + // completed on its own. + expect(cancelCalls).toBe(0); + // The latch resolved via the terminal handler, not via the + // 60-second timeout. (The test would simply time out long + // before the timeout fired if this regressed.) + expect(stopResolved).toBe(true); + } finally { + globalThis.fetch = original; + } + }); + it("falls back to immediate cancel when no checkpoint arrives within timeoutMs", async () => { await writeState( { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index dec1a263..5a15b932 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -169,6 +169,29 @@ export function createTrainer( } | null = null; let earlyStopRequested = false; + /** + * Drop the early-stop latch (clear timer + resolve deferred + reset + * the request flag). Called from any path that means "wait()'s + * cancel-after-checkpoint promise is no longer waiting on anything" + * — the checkpoint-driven cancel branch, the terminal `completed` + * / `failed` branches, and the up-front guard in + * `requestEarlyStop()` when the job is already terminal. 
Without + * this called from terminal branches, a `requestEarlyStop()` armed + * mid-run that races a `training.completed` / `training.failed` + * before the next `checkpoint.saved` would leave the deferred + * pending until the (default 5-min) timeout fires — the SIGTERM + * handler in `installShutdownHandlers` would block on that promise + * and delay shutdown for up to `timeoutMs`. + */ + function settleEarlyStopLatch(): void { + if (earlyStopDeferred) { + if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); + earlyStopDeferred.resolve(); + earlyStopDeferred = null; + } + earlyStopRequested = false; + } + async function getClient(): Promise { if (!clientPromise) { clientPromise = (async () => { @@ -306,10 +329,7 @@ export function createTrainer( status: "cancelled", completedAt: event.timestamp, }; - if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); - earlyStopDeferred.resolve(); - earlyStopDeferred = null; - earlyStopRequested = false; + settleEarlyStopLatch(); return { terminal: true, artifacts: terminalResult?.artifacts ?? [] }; } return { terminal: false, artifacts: terminalResult?.artifacts ?? [] }; @@ -322,6 +342,10 @@ export function createTrainer( }; const artifacts = (event.artifacts ?? []) as unknown[]; await callbacks.onCompleted?.({ job: startedJob, artifacts }); + // Job already terminal — release any armed early-stop latch + // so a SIGTERM handler awaiting `requestEarlyStop()` settles + // immediately rather than blocking until the timeout fires. + settleEarlyStopLatch(); return { terminal: true, artifacts }; } case "training.failed": { @@ -332,6 +356,9 @@ export function createTrainer( completedAt: event.timestamp, }; await callbacks.onFailed?.({ job: startedJob, error: event.error }); + // Symmetric to the `completed` branch above — terminal status + // settles the latch even though the run failed. 
+ settleEarlyStopLatch(); return { terminal: true, artifacts: [] }; } } @@ -473,12 +500,7 @@ export function createTrainer( ): Promise { // Nothing in flight: cleanup any prior latch and resolve. if (!startedJob || !scope || TERMINAL_STATUSES.has(startedJob.status)) { - if (earlyStopDeferred) { - if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); - earlyStopDeferred.resolve(); - earlyStopDeferred = null; - } - earlyStopRequested = false; + settleEarlyStopLatch(); return; } // Idempotent: a second call piggybacks on the first. From c7022c4cf1039b2417f9193ef34d5d003ed630d8 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Wed, 6 May 2026 00:54:57 +0900 Subject: [PATCH 28/55] feat: implement restart grace window in RunTraining to handle late SSE events, ensuring accurate restart behavior and preventing stale code execution after child process exits --- .../studio-app/src/components/RunTraining.tsx | 98 ++++++++++++++++--- 1 file changed, 86 insertions(+), 12 deletions(-) diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index 19f69353..d3a9241c 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -67,6 +67,22 @@ export function RunTraining() { // `run()`'s `finally` (and on unmount) so a failed spawn doesn't // leak entries into the next run. const pendingPreSpawnEventsRef = useRef([]); + // Grace window after the train stream closes during which the SSE + // handler can still latch a *late* restart event onto our just- + // exited child. 
The `/api/train` stream and `/api/dev/events` SSE + // are independent connections — under the race where the child + // exits before the matching `rebuild` event lands on the SSE + // channel (fast child exit, network jitter), `run()`'s finally + // would synchronously settle "no restart" and the user would be + // left on stale code despite the server-side SIGTERM. This timer + // defers the no-restart decision and keeps `currentPidRef` set so + // the SSE handler can still match per-pid; if the late event + // arrives within the window it sets `restartPendingRef` and the + // timer's callback fires the auto-restart from there. The window + // is short (a few hundred ms) — well under user perception for + // the no-restart outcome but long enough to absorb realistic + // cross-connection delivery skew. + const restartGraceTimerRef = useRef(null); // Tracks "is this React tree still mounted?". The HMR auto-restart // path schedules `queueMicrotask(() => run(...))` after the prior // run's `finally` — without this gate, navigating away during the @@ -89,6 +105,10 @@ export function RunTraining() { clearTimeout(hotSwapTimerRef.current); hotSwapTimerRef.current = null; } + if (restartGraceTimerRef.current !== null) { + clearTimeout(restartGraceTimerRef.current); + restartGraceTimerRef.current = null; + } }; }, []); @@ -182,8 +202,18 @@ export function RunTraining() { pendingPreSpawnEventsRef.current.push(payload); return; } + // Don't gate `myRestart` on `runningRef.current`: the + // `/api/train` stream and `/api/dev/events` SSE channel are + // independent connections, so a fast child exit can race the + // SSE delivery and flip `runningRef` to false JUST BEFORE the + // matching `rebuild` event lands here. Per-pid filtering via + // `currentPidRef` is what scopes the latch to *this tab's* + // child; `run()`'s finally keeps `currentPidRef` set during a + // brief grace window after the train stream closes for + // exactly this reason. 
Without dropping the `runningRef` + // gate, post-exit restart events would silently no-op and + // leave the tab on stale code. const myRestart = - runningRef.current && myPid !== null && (payload.restartTargets?.some((t) => t.pid === myPid) ?? false); const myHotSwap = @@ -307,7 +337,14 @@ export function RunTraining() { ); } finally { runningRef.current = false; - currentPidRef.current = null; + // DO NOT null `currentPidRef` here — the SSE handler needs to + // be able to match per-pid during the post-exit grace window + // below to catch a `rebuild` event that races behind the + // train stream's close on the separate connection. Captured + // here so the grace timer can detect "a new run started + // during the window" by comparing the current ref against + // `pidAtExit` and skipping its cleanup in that case. + const pidAtExit = currentPidRef.current; // Drop any pre-spawn buffer entries that survived a failed // run (spawn errored before `onSpawn` could drain). Without // this they'd be carried into the next run and falsely match @@ -318,12 +355,27 @@ export function RunTraining() { // abort path. setState on an already-unmounted component is a // no-op in React 18+, so the unmount-cleanup case handles itself. setRunning(false); - if (restartPendingRef.current && !ac.signal.aborted) { - // HMR-driven auto-restart: the dev loop SIGTERM'd the previous - // run because the rebuild changed cloud-side config. Re-spawn - // with the same args after a microtask so React commits the - // `running=false` state first (otherwise the re-entry overlaps). + + if (ac.signal.aborted) { + // User Stop wins over any pending or in-flight HMR restart — + // clear everything synchronously and skip the grace window + // so the tab really settles instead of bouncing back up. 
+ restartPendingRef.current = false; + currentPidRef.current = null; + setHmrStatus("idle"); + if (restartGraceTimerRef.current !== null) { + clearTimeout(restartGraceTimerRef.current); + restartGraceTimerRef.current = null; + } + return; + } + + if (restartPendingRef.current) { + // Fast path: SSE event already landed before exit. Queue the + // restart on the next microtask instead of waiting for the grace + // window so the common case has no perceptible delay. restartPendingRef.current = false; + currentPidRef.current = null; setHmrStatus("restarting"); const fileForRestart = lastTrainFileRef.current; queueMicrotask(() => { @@ -336,12 +388,34 @@ export function RunTraining() { if (!isMountedRef.current) return; void run(fileForRestart); }); - } else { - // User-initiated abort takes precedence over a pending HMR - // restart — clear the latch so a Stop click really stops. - restartPendingRef.current = false; - setHmrStatus("idle"); + return; } + + // Slow path: SSE rebuild event might still be in flight on a + // separate connection. Defer the "no restart" decision so the + // SSE handler has time to land and flip `restartPendingRef`. + // `currentPidRef` stays set for the grace window so that + // late event can still match per-pid. + if (restartGraceTimerRef.current !== null) { + clearTimeout(restartGraceTimerRef.current); + } + restartGraceTimerRef.current = window.setTimeout(() => { + restartGraceTimerRef.current = null; + // A new run started during the window (overwrote the pid). + // Leave its lifecycle alone — its own finally will manage + // the cleanup eventually. 
+ if (currentPidRef.current !== pidAtExit) return; + currentPidRef.current = null; + if (!isMountedRef.current) return; + if (restartPendingRef.current) { + restartPendingRef.current = false; + setHmrStatus("restarting"); + const fileForRestart = lastTrainFileRef.current; + void run(fileForRestart); + } else { + setHmrStatus("idle"); + } + }, 250); } } From 21ffd87e3229ebfb90f9f31aa15203fb3a390956 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Fri, 8 May 2026 19:05:11 +0900 Subject: [PATCH 29/55] Refactor flushMicrotasks function documentation for clarity and accuracy regarding the use of setImmediate and its role in the cleanupHooks process. --- packages/arkor/src/cli/commands/dev.test.ts | 25 +++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/packages/arkor/src/cli/commands/dev.test.ts b/packages/arkor/src/cli/commands/dev.test.ts index bc043c95..aec80771 100644 --- a/packages/arkor/src/cli/commands/dev.test.ts +++ b/packages/arkor/src/cli/commands/dev.test.ts @@ -35,13 +35,24 @@ import { __resetCleanupHooksForTests } from "../cleanupHooks"; import { ensureCredentialsForStudio, runDev } from "./dev"; /** - * Yield long enough for the cleanupHooks coordinator to settle its - * `Promise.allSettled(...)` chain and dispatch `process.exit(0)`. Two - * `setImmediate`-equivalent ticks cover the typical case (one for the - * `Promise.resolve(...)` wrapping inside `run()`, one for the - * `.then(() => process.exit(0))`); using `setImmediate` instead of - * `Promise.resolve` ensures the exit microtask actually runs before - * we resume. 
+ * Yield one `setImmediate` tick — enough for the cleanupHooks + * coordinator's `Promise.allSettled(...).then(() => process.exit(0))` + * chain to drain when there are no async cleanups in flight (the + * common case in this file: signal handler → queueMicrotask → + * already-resolved `allSettled` → `.then` → `process.exit(0)`, + * which all collapses into the single macrotask boundary that + * `setImmediate` yields to). + * + * `setImmediate` is the right primitive (vs `Promise.resolve` / + * `queueMicrotask`) because we need the event loop to actually + * turn — the `process.exit` mock fires inside a `.then` callback + * scheduled from a previous microtask checkpoint, and a microtask- + * only flush would resume *before* that callback gets to run. + * + * Tests that drive a chain with extra microtask hops (e.g. async + * sibling cleanups whose promises also pass through + * `Promise.allSettled`) await this helper twice in a row — see + * the cleanupHooks tests. */ function flushMicrotasks(): Promise { return new Promise((resolve) => setImmediate(resolve)); From e93488e33df52fc586fc5b031673674dcc92faf6 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Fri, 8 May 2026 19:13:28 +0900 Subject: [PATCH 30/55] Update KillResult documentation in trainRegistry.ts to clarify the meaning of "unsupported" and its handling of various error cases in the safeKill function. 
--- packages/arkor/src/studio/trainRegistry.ts | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index caf68166..90f7e777 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -60,9 +60,17 @@ export interface DispatchResult { * (a race where the child exits between the `entries` lookup and * the `kill` call — POSIX `kill(2)` raises `ESRCH` for * non-existent PIDs and Node propagates it on some versions). - * - `"unsupported"`: the platform doesn't support this signal kind - * (Windows + `SIGUSR2` → `ENOSYS`; bad signal name → `EINVAL`); - * `kill` threw with that error code. + * - `"unsupported"`: any *other* `kill` throw — i.e. the signal + * couldn't be delivered for a reason that isn't "process is gone". + * The motivating case is the platform not supporting this signal + * kind (Windows + `SIGUSR2` → `ENOSYS`; bad signal name → + * `EINVAL`), which `dispatchRebuild` falls back to SIGTERM-restart + * for. The bucket is intentionally a catch-all rather than a + * whitelist of error codes: rare cases like `EPERM` (lost the + * right to signal a re-parented child) and platform-specific + * surprises take the same conservative fallback — try the next + * signal, otherwise drop the entry — which is what callers want + * from "kill failed for some non-recoverable reason". */ type KillResult = "ok" | "gone" | "unsupported"; @@ -75,7 +83,8 @@ function safeKill(child: ChildProcess, signal: NodeJS.Signals): KillResult { // it as `"unsupported"` would route a hash-match hot-swap candidate // into the SIGTERM fallback, which then also no-ops (also gone) but // costs a needless restart-bucket inclusion until the close handler - // unregisters the child. + // unregisters the child. Every other throw collapses into + // `"unsupported"` per the type doc above. 
const code = (err as NodeJS.ErrnoException | null)?.code; if (code === "ESRCH") return "gone"; return "unsupported"; From c71d0d32dc0c77015363efd59674da90afd2ac70 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Fri, 8 May 2026 19:30:55 +0900 Subject: [PATCH 31/55] Enhance error handling in streamTraining to fail fast on non-2xx responses, ensuring accurate user feedback on failures. Update RunTraining component to correctly manage mounted state in React StrictMode. --- .../studio-app/src/components/RunTraining.tsx | 11 +++++ packages/studio-app/src/lib/api.test.ts | 47 +++++++++++++++++++ packages/studio-app/src/lib/api.ts | 24 ++++++++++ 3 files changed, 82 insertions(+) diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index d3a9241c..42cbd621 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -92,6 +92,17 @@ export function RunTraining() { const isMountedRef = useRef(true); useEffect(() => { + // Re-arm the mounted flag every time the effect (re-)runs. + // React StrictMode (enabled in `main.tsx` for dev) intentionally + // runs setup → cleanup → setup once on mount to surface + // ordering bugs; without this re-arm the cleanup's + // `isMountedRef.current = false` would persist into the second + // setup, making the ref permanently false while the component + // is actually mounted. The HMR auto-restart paths guarded by + // `isMountedRef.current` would then silently no-op in every + // Vite dev session even though they work fine in `vite build` + // output (StrictMode's double-effect is a dev-only behaviour). 
+ isMountedRef.current = true; return () => { isMountedRef.current = false; // Defense in depth: clearing the latch here means even if a diff --git a/packages/studio-app/src/lib/api.test.ts b/packages/studio-app/src/lib/api.test.ts index 42fa026d..f195055d 100644 --- a/packages/studio-app/src/lib/api.test.ts +++ b/packages/studio-app/src/lib/api.test.ts @@ -362,6 +362,53 @@ describe("streamTraining", () => { expect(JSON.parse(captured)).toEqual({}); }); + it("throws with the response body when /api/train returns non-2xx", async () => { + // Regression: previously `streamTraining` ignored `res.ok` and + // proceeded to call `onSpawn` + read the body even on 403/500 + // failures. The SPA would treat a failed spawn (auth rejection, + // server-side EACCES surfacing as 500, etc.) as a normal + // completion — `onChunk` got nothing, `onSpawn` was called with + // a `null` pid, and `run()` resolved cleanly. The user saw an + // idle UI with no log line and no clue what went wrong. Fail + // fast so the caller's catch path surfaces the server's reason. + globalThis.fetch = vi.fn( + async () => + new Response("anonymous tokens disabled", { + status: 403, + statusText: "Forbidden", + }), + ) as typeof fetch; + const onChunkCalls: string[] = []; + let onSpawnCalls = 0; + await expect( + streamTraining( + (t) => onChunkCalls.push(t), + undefined, + undefined, + () => { + onSpawnCalls += 1; + }, + ), + ).rejects.toThrow(/403.*anonymous tokens disabled/); + // The body must NOT have been streamed and `onSpawn` must NOT + // have been called with the bogus null pid — both would mislead + // the SPA into treating the failure as a successful run. + expect(onChunkCalls).toEqual([]); + expect(onSpawnCalls).toBe(0); + }); + + it("falls back to the bare status when the error response body is empty", async () => { + // Belt-and-braces for upstreams that return non-2xx with no + // body. 
The status code is enough to surface the failure to + // the user; we just don't want a `: undefined` suffix. + globalThis.fetch = vi.fn( + async () => new Response(null, { status: 500, statusText: "Server Error" }), + ) as typeof fetch; + await expect(streamTraining(() => undefined)).rejects.toThrow( + /^\/api\/train failed \(500 Server Error\)$/, + ); + }); + it("returns silently when the response has no body (e.g. 204 from upstream)", async () => { globalThis.fetch = vi.fn( async () => new Response(null, { status: 204 }), diff --git a/packages/studio-app/src/lib/api.ts b/packages/studio-app/src/lib/api.ts index 738ec3f9..a089fa3f 100644 --- a/packages/studio-app/src/lib/api.ts +++ b/packages/studio-app/src/lib/api.ts @@ -306,6 +306,30 @@ export async function streamTraining( body: JSON.stringify({ ...(file ? { file } : {}) }), signal, }); + // Fail fast on non-2xx so a failed spawn (auth 403, validation 400, + // server-side spawn EACCES surfacing as 500, etc.) doesn't slip + // through as a "successful" silent run. Without this, the SPA + // would call `onSpawn(null)` (the failure response carries no + // `X-Arkor-Train-Pid`), then hit `!res.body` or read an empty + // body and resolve as if the run completed cleanly — leaving the + // user looking at an idle UI and no log output. Read the body + // text for diagnostics so the caller's error log shows the + // server's reason instead of a bare status code. + if (!res.ok) { + let detail = ""; + try { + detail = (await res.text()).trim(); + } catch { + // Body unreadable (already consumed, network gone, etc.) — + // surface the status alone rather than masking the failure + // entirely. + } + throw new Error( + detail + ? `/api/train failed (${res.status} ${res.statusText}): ${detail}` + : `/api/train failed (${res.status} ${res.statusText})`, + ); + } if (onSpawn) { const raw = res.headers.get("x-arkor-train-pid"); const parsed = raw ? 
Number.parseInt(raw, 10) : NaN; From 4facf014a8753450f94f55337d03bf0295efce32 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Fri, 8 May 2026 19:52:46 +0900 Subject: [PATCH 32/55] Improve error handling in HMR subscription to prevent crashes from throwing subscribers during lastEvent replay. Update TrainRegistry to backfill configHash for pre-ready spawns, avoiding unnecessary SIGTERM restarts and optimizing rebuild dispatch logic. --- packages/arkor/src/studio/hmr.test.ts | 40 +++++++++++++++ packages/arkor/src/studio/hmr.ts | 21 +++++++- packages/arkor/src/studio/server.ts | 28 ++++++++--- .../arkor/src/studio/trainRegistry.test.ts | 50 +++++++++++++++---- packages/arkor/src/studio/trainRegistry.ts | 20 ++++++++ 5 files changed, 140 insertions(+), 19 deletions(-) diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index 391fe46d..8334c090 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -182,6 +182,46 @@ describe("createHmrCoordinator", () => { } }); + it("subscribe()'s lastEvent replay swallows a throwing subscriber so initialization keeps working", async () => { + // Regression: `subscribe()` synchronously replays `lastEvent` to + // a fresh subscriber for the late-mount-cached-state contract. + // Previously the replay had no try/catch, so a subscriber that + // threw during that one call (typical case: an SSE controller + // that closed mid-replay — `controller.enqueue` on a closed + // stream throws) propagated out of `subscribe()` and broke + // whoever just registered. `broadcast()` already swallowed + // subscriber throws defensively; this test pins the symmetric + // contract on `subscribe()`. 
+ mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const firstEvents: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => firstEvents.push(e)); + try { + await nextEvent(firstEvents, (e) => e.type === "ready"); + // A subscriber whose body throws on the cached-state replay. + const throwingSubscriber = (): void => { + throw new Error("controller closed"); + }; + // Must not throw out of subscribe(); must still return a + // working unsubscribe. + let unsubscribe: () => void = () => undefined; + expect(() => { + unsubscribe = hmr.subscribe(throwingSubscriber); + }).not.toThrow(); + expect(typeof unsubscribe).toBe("function"); + // Confirm the coordinator is still healthy: a *new* subscriber + // (after the throwing one) still receives the cached replay. + const recoveryEvents: HmrEvent[] = []; + hmr.subscribe((e) => recoveryEvents.push(e)); + expect(recoveryEvents.length).toBeGreaterThanOrEqual(1); + unsubscribe(); + } finally { + await hmr.dispose(); + } + }); + it("stops broadcasting after dispose()", async () => { mkdirSync(join(cwd, "src/arkor"), { recursive: true }); writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 6cd5d158..14cad357 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -330,7 +330,26 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { return { subscribe(fn) { subscribers.add(fn); - if (lastEvent) fn(lastEvent); + // Replay the last broadcast so a late-mounting subscriber (an + // `/api/dev/events` SSE client opening after the first BUNDLE_END, + // or `buildStudioApp`'s dispatch subscriber registering after + // entry-wait recovery) sees current state without waiting for + // the next rebuild. 
+ // + // Wrapped in the same defensive try/catch as `broadcast` so a + // throw inside the subscriber (typically an SSE controller that + // closed mid-replay — `controller.enqueue` on a closed stream + // throws) doesn't propagate out of `subscribe()` and crash + // whoever just registered. One bad subscriber must not be able + // to break HMR initialisation for the rest of the process. + if (lastEvent) { + try { + fn(lastEvent); + } catch { + // Swallow — subscribers own their own teardown; we just + // shouldn't poison their `subscribe()` call site. + } + } startWatcher(); return () => { subscribers.delete(fn); diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 358c469d..b63b7898 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -431,10 +431,26 @@ export function buildStudioApp(options: StudioServerOptions) { } }; const enc = new TextEncoder(); - const onClose = (code: number | null): void => { - activeTrains.unregister(child.pid); + // Detach every listener this stream wired onto `child`. Called + // from `onClose` / `onError` themselves (so once one fires the + // closure references — controller, TextEncoder — drop and the + // subprocess record can be GC'd promptly even if the other + // event also queues), and from `cancelTeardown` for the + // client-side cancel path. Removing only the `data` listeners + // (as the previous code did) left `close` / `error` attached + // to the dead ChildProcess, which kept their closures pinned + // until the process object itself was reaped — meaningful + // memory pressure for an `arkor dev` session that spawns many + // children over hours. 
+ const detachListeners = (): void => { child.stdout.off("data", onChunk); child.stderr.off("data", onChunk); + child.off("close", onClose); + child.off("error", onError); + }; + const onClose = (code: number | null): void => { + activeTrains.unregister(child.pid); + detachListeners(); if (closed) return; closed = true; try { @@ -455,8 +471,7 @@ export function buildStudioApp(options: StudioServerOptions) { // flag and the `unregister` call is idempotent. const onError = (err: Error): void => { activeTrains.unregister(child.pid); - child.stdout.off("data", onChunk); - child.stderr.off("data", onChunk); + detachListeners(); if (closed) return; closed = true; try { @@ -474,10 +489,7 @@ export function buildStudioApp(options: StudioServerOptions) { child.on("error", onError); cancelTeardown = () => { closed = true; - child.stdout.off("data", onChunk); - child.stderr.off("data", onChunk); - child.off("close", onClose); - child.off("error", onError); + detachListeners(); }; }, cancel() { diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index 95f7a36c..66af0954 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -64,18 +64,48 @@ describe("TrainRegistry", () => { expect(b.kill).toHaveBeenCalledWith("SIGTERM"); }); - it("dispatchRebuild SIGTERMs children whose stored hash is null", () => { - // A spawn that raced an in-flight build can land with `configHash: - // null`. It must not be hot-swapped — even if the new bundle's hash - // is known, we have no proof the spawned subprocess is running the - // same config. 
+ it("dispatchRebuild backfills the hash and skips dispatch when the spawn-time hash was null", () => { + // Regression: previously a child registered with `configHash: + // null` (spawn happened *before* the HMR watcher emitted its + // first successful build, so `getCurrentConfigHash()` returned + // null) was treated as a hash mismatch on the next event with + // a real hash and SIGTERM-restarted. Since the dispatch now + // fires on `ready` events too, that turned every "click Run + // before the watcher's first BUNDLE_END" into a spurious + // cancel+restart cycle (extra GPU spend / job churn) triggered + // purely by startup timing rather than any actual code change. + // The fix backfills the entry's hash with the first known value + // and skips signal dispatch — the child either already loaded + // the right bundle or surfaces its own load error; future + // rebuilds compare against the backfilled hash like any other. const reg = new TrainRegistry(); - const a = fakeChild(301); - reg.register(a as unknown as ChildProcess, { configHash: null }); - const result = reg.dispatchRebuild("h"); + const c = fakeChild(401); + reg.register(c as unknown as ChildProcess, { + configHash: null, + trainFile: "/tmp/preready.ts", + }); + const result = reg.dispatchRebuild("first-real-hash"); + // Neither bucket — no signal sent, nothing for the SPA to react to. expect(result.hotSwapTargets).toEqual([]); - expect(result.restartTargets).toHaveLength(1); - expect(a.kill).toHaveBeenCalledWith("SIGTERM"); + expect(result.restartTargets).toEqual([]); + expect(c.kill).not.toHaveBeenCalled(); + // A subsequent dispatch with the SAME hash must take the hot- + // swap path (proves the backfill landed; without it this would + // STILL be null vs "first-real-hash" → SIGTERM). 
+ const second = reg.dispatchRebuild("first-real-hash"); + expect(second.hotSwapTargets).toEqual([ + { pid: 401, trainFile: "/tmp/preready.ts" }, + ]); + expect(second.restartTargets).toEqual([]); + expect(c.kill).toHaveBeenCalledWith("SIGUSR2"); + // And a different hash on a later rebuild now correctly routes + // to SIGTERM-restart (backfilled hash is real). + c.kill.mockClear(); + const third = reg.dispatchRebuild("second-hash"); + expect(third.restartTargets).toEqual([ + { pid: 401, trainFile: "/tmp/preready.ts" }, + ]); + expect(c.kill).toHaveBeenCalledWith("SIGTERM"); }); it("unregister removes the child from the policy decisions", () => { diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index 90f7e777..1c4dc4b1 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -158,6 +158,26 @@ export class TrainRegistry { for (const [pid, entry] of this.entries) { if (entry.earlyStopRequested) continue; const target: RestartTarget = { pid, trainFile: entry.trainFile }; + // Pre-ready spawn: this child was registered via `/api/train` + // *before* the HMR watcher's first successful build, so its + // recorded `configHash` is `null`. Now that a real hash has + // arrived, treat the current build as the child's baseline: + // backfill the hash and skip signal dispatch entirely. The + // alternative (treating `null !== nextHash` as a real + // mismatch) would SIGTERM-restart the just-started child even + // though the config never actually changed — a spurious + // cancel+restart cycle that costs GPU budget for no benefit + // and that's triggered purely by startup timing (user clicked + // Run before the watcher's initial BUNDLE_END landed). The + // SIGUSR2 path is also wrong here: the child might still be + // mid-flight on its initial bundle import, and racing a + // reload signal against that load wastes work. 
Future rebuilds + // against this entry now compare against the backfilled hash + // like any other child. + if (entry.configHash === null && nextConfigHash !== null) { + entry.configHash = nextConfigHash; + continue; + } const matches = nextConfigHash !== null && entry.configHash !== null && From 5f1316f91f45ccab1db9c8cfae85f20625225193 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Fri, 8 May 2026 20:53:59 +0900 Subject: [PATCH 33/55] Implement robust error handling in trainer callbacks to ensure early-stop latch is settled even when user callbacks throw. Update TrainRegistry to track early-stop requests, preventing double SIGTERM signals during HMR cycles and enhancing stability during training operations. --- packages/arkor/src/core/trainer.test.ts | 123 ++++++++++++++++++ packages/arkor/src/core/trainer.ts | 30 +++-- packages/arkor/src/studio/server.ts | 13 ++ .../arkor/src/studio/trainRegistry.test.ts | 27 ++++ packages/arkor/src/studio/trainRegistry.ts | 17 +++ 5 files changed, 201 insertions(+), 9 deletions(-) diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index a0e92dd4..1ef1e4fa 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1640,6 +1640,129 @@ describe("createTrainer (early stop)", () => { } }); + it("settles the early-stop latch even when the user's onCompleted callback throws", async () => { + // Regression: previously `settleEarlyStopLatch()` was called + // *after* awaiting `callbacks.onCompleted` / `onFailed`. A + // thrown user callback propagated out of `dispatch()` before + // the settle ran, leaving `earlyStopDeferred` pending — the + // SIGTERM handler in `installShutdownHandlers` would block on + // that promise until the (default 5-min) timeout fired, + // delaying shutdown for a user-code bug. 
Wrapping in + // `try/finally` ensures the latch is released regardless, + // while preserving the throw's propagation through `wait()` so + // callers still see the original error. + await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + const sse = [ + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + `id: 2\nevent: training.log\ndata: ${JSON.stringify({ + type: "training.log", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:02Z", + step: 1, + loss: 0.5, + })}\n\n`, + `id: 3\nevent: training.completed\ndata: ${JSON.stringify({ + type: "training.completed", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:03Z", + artifacts: [], + })}\n\n`, + ]; + + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(sseStream(sse), { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + let stopResolved = false; + let stopRejected = false; + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + callbacks: { + onLog: () => { + // Arm early-stop with a long timeout — if the latch + // isn't released by `finally`, this would hang for the + // full 60 seconds. 
+ void requestTrainerEarlyStop(trainer, { + timeoutMs: 60_000, + }).then( + () => { + stopResolved = true; + }, + () => { + stopRejected = true; + }, + ); + }, + onCompleted: () => { + throw new Error("user callback boom"); + }, + }, + }, + { + baseUrl: "http://mock", + credentials: creds, + cwd, + reconnectDelayMs: 1, + // `wait()` catches dispatch throws and routes them through + // its reconnect loop; with the default unbounded retry the + // user-callback throw above would loop forever and the test + // would just time out. Cap retries at 0 so the first thrown + // dispatch surfaces as a `wait()` rejection — that lets us + // observe the *latch* settlement (the actual contract under + // test) cleanly. + maxReconnectAttempts: 0, + }, + ); + + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + // The user-callback throw is wrapped by `handleFailure` after + // `maxReconnectAttempts: 0` exhausts; the original error is + // preserved as `cause`. We just need wait() to settle so the + // test doesn't hang — the *body* of the assertion is the + // latch state below. + await expect(trainer.wait()).rejects.toThrow(); + // The latch must have settled (via `finally`) BEFORE wait() + // rejected. Without the `try/finally` around `onCompleted` + // the latch would still be armed → `stopResolved` stays + // false → the test fails (rather than timing out, since + // `maxReconnectAttempts: 0` already unblocks wait()). 
+ await new Promise((r) => setImmediate(r)); + expect(stopResolved).toBe(true); + expect(stopRejected).toBe(false); + } finally { + globalThis.fetch = original; + } + }); + it("falls back to immediate cancel when no checkpoint arrives within timeoutMs", async () => { await writeState( { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index 5a15b932..d2f6b4f5 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -341,11 +341,19 @@ export function createTrainer( completedAt: event.timestamp, }; const artifacts = (event.artifacts ?? []) as unknown[]; - await callbacks.onCompleted?.({ job: startedJob, artifacts }); - // Job already terminal — release any armed early-stop latch - // so a SIGTERM handler awaiting `requestEarlyStop()` settles - // immediately rather than blocking until the timeout fires. - settleEarlyStopLatch(); + // `try/finally` so the latch settles even when the user's + // `onCompleted` callback throws: otherwise a thrown + // callback would leave `earlyStopDeferred` pending and the + // SIGTERM handler awaiting `requestEarlyStop()` would block + // until the timeout (default 5 min). The throw still + // propagates through `dispatch()` → `wait()` so callers see + // the original error — we just don't strand the shutdown + // path along with it. + try { + await callbacks.onCompleted?.({ job: startedJob, artifacts }); + } finally { + settleEarlyStopLatch(); + } return { terminal: true, artifacts }; } case "training.failed": { @@ -355,10 +363,14 @@ export function createTrainer( error: event.error, completedAt: event.timestamp, }; - await callbacks.onFailed?.({ job: startedJob, error: event.error }); - // Symmetric to the `completed` branch above — terminal status - // settles the latch even though the run failed. 
- settleEarlyStopLatch(); + // Symmetric to the `completed` branch above — terminal + // status settles the latch even when the run failed *and* + // the user's `onFailed` callback itself throws. + try { + await callbacks.onFailed?.({ job: startedJob, error: event.error }); + } finally { + settleEarlyStopLatch(); + } return { terminal: true, artifacts: [] }; } } diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index b63b7898..cd4dceee 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -493,8 +493,21 @@ export function buildStudioApp(options: StudioServerOptions) { }; }, cancel() { + // Capture the early-stop flag *before* unregistering: the + // unregister wipes the entry, after which we can't tell + // whether HMR's `dispatchRebuild` had already SIGTERMed + // this child. If it had, sending another SIGTERM here + // would land as the *second* signal on the runner side and + // trigger `installShutdownHandlers`' emergency `exit(143)` + // fast-path — which bypasses the checkpoint-preserving + // early-stop + cloud `cancel()` flow and can leave the + // cloud run alive while the local subprocess dies. The HMR + // path is already driving the child to a clean exit, so we + // just unregister + detach listeners and let it run. + const earlyStopInFlight = activeTrains.isEarlyStopRequested(child.pid); activeTrains.unregister(child.pid); cancelTeardown?.(); + if (earlyStopInFlight) return; // `ChildProcess.kill()` can throw (ESRCH if the process has // already exited between this handler's invocation and the // signal delivery). 
A throw here would surface as an unhandled diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index 66af0954..d1c230b9 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -108,6 +108,33 @@ describe("TrainRegistry", () => { expect(c.kill).toHaveBeenCalledWith("SIGTERM"); }); + it("isEarlyStopRequested reflects the dispatchRebuild SIGTERM flag", () => { + // Regression: `/api/train`'s ReadableStream `cancel()` consults + // this flag to avoid sending a *second* SIGTERM to a child that + // HMR's `dispatchRebuild` already SIGTERMed for early-stop. A + // double-SIGTERM hits `installShutdownHandlers`' emergency + // `exit(143)` fast-path, bypassing the checkpoint-preserving + // cancel flow and potentially leaving the cloud run alive. + const reg = new TrainRegistry(); + const a = fakeChild(901); + reg.register(a as unknown as ChildProcess, { + configHash: "h1", + trainFile: "/tmp/a.ts", + }); + expect(reg.isEarlyStopRequested(901)).toBe(false); + // Mismatched hash → SIGTERM → flag flips on. + reg.dispatchRebuild("h2"); + expect(reg.isEarlyStopRequested(901)).toBe(true); + // Defensive cases: non-numeric / unknown / never-registered pid. + expect(reg.isEarlyStopRequested(undefined)).toBe(false); + expect(reg.isEarlyStopRequested(99999)).toBe(false); + // Once the child unregisters (close handler) the flag effectively + // resets — subsequent queries return false rather than retaining + // stale state. 
+ reg.unregister(901); + expect(reg.isEarlyStopRequested(901)).toBe(false); + }); + it("unregister removes the child from the policy decisions", () => { const reg = new TrainRegistry(); const a = fakeChild(401); diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index 1c4dc4b1..6b284741 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -116,6 +116,23 @@ export class TrainRegistry { if (typeof pid === "number") this.entries.delete(pid); } + /** + * Whether `dispatchRebuild` has already issued a graceful-restart + * SIGTERM to this child as part of an HMR cycle. Consulted by + * `/api/train`'s ReadableStream `cancel()` handler so a client- + * driven cancel (tab close, navigation, aborted fetch) doesn't + * pile a second SIGTERM on top of an in-progress early-stop — + * the runner's `installShutdownHandlers` interprets a second + * SIGTERM as the emergency `exit(143)` fast-path, which bypasses + * the checkpoint-preserving early-stop + `cancel()` flow and + * leaves the cloud-side run live while the local subprocess + * dies. Defeats the main safety goal of the HMR restart logic. + */ + isEarlyStopRequested(pid: number | undefined): boolean { + if (typeof pid !== "number") return false; + return this.entries.get(pid)?.earlyStopRequested ?? false; + } + get size(): number { return this.entries.size; } From 8b037c5246c96ea8a856ba399593bdcbe09dd72b Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Fri, 8 May 2026 22:22:24 +0900 Subject: [PATCH 34/55] Refactor child process management in buildStudioApp to maintain data listeners during cancellation. This prevents deadlocks by ensuring the OS pipe continues to drain while the child process exits gracefully. Update comments for clarity on the rationale behind listener management. 
--- packages/arkor/src/studio/server.ts | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index cd4dceee..dfac2099 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -488,8 +488,24 @@ export function buildStudioApp(options: StudioServerOptions) { child.on("close", onClose); child.on("error", onError); cancelTeardown = () => { + // Don't detach data listeners here: the child stays alive + // for some time after the SPA cancels — either because + // we're skipping `child.kill()` for an in-progress + // HMR early-stop, or because `child.kill()`'s SIGTERM + // triggers a graceful checkpoint+exit that takes + // seconds. During that window the child keeps writing + // logs to its stdout/stderr pipes; if our `data` + // listeners are gone, Node stops draining the OS pipe, + // the buffer fills, and the child's next `write()` + // blocks indefinitely — deadlocking the very graceful + // exit we're trying to preserve. The `closed` flag + // already makes `enqueue`/`close` a no-op so the + // controller-closed race stays safe; the eventual + // `onClose` / `onError` listeners detach everything + // (via `detachListeners()`) when the child finally + // exits. That timing — at-exit, not at-cancel — is the + // correct moment to break the closure refs for GC. closed = true; - detachListeners(); }; }, cancel() { @@ -503,7 +519,10 @@ export function buildStudioApp(options: StudioServerOptions) { // early-stop + cloud `cancel()` flow and can leave the // cloud run alive while the local subprocess dies. The HMR // path is already driving the child to a clean exit, so we - // just unregister + detach listeners and let it run. + // just unregister + flip `closed` (via `cancelTeardown`) + // and let it run. 
The data listeners stay attached so the + // OS pipe keeps draining while the child checkpoints — + // see `cancelTeardown` for the backpressure rationale. const earlyStopInFlight = activeTrains.isEarlyStopRequested(child.pid); activeTrains.unregister(child.pid); cancelTeardown?.(); From e30ef7458cf0b3ba91680ab7d128748057afa632 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Fri, 8 May 2026 23:54:50 +0900 Subject: [PATCH 35/55] feat: add end-to-end tests for Studio HMR functionality This commit introduces a new test suite for the Hot Module Replacement (HMR) feature in Arkor Studio. It includes tests for the registration of the HMR meta tag and verifies that editing the trainer configuration triggers the appropriate rebuild events. The tests ensure that the system behaves correctly when interacting with the SSE API, enhancing the reliability of the HMR implementation. --- e2e/studio/src/specs/hmr.spec.ts | 281 +++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 e2e/studio/src/specs/hmr.spec.ts diff --git a/e2e/studio/src/specs/hmr.spec.ts b/e2e/studio/src/specs/hmr.spec.ts new file mode 100644 index 00000000..1371e47e --- /dev/null +++ b/e2e/studio/src/specs/hmr.spec.ts @@ -0,0 +1,281 @@ +import { writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { expect, test } from "../harness/fixture"; + +/** + * Rewrite the seeded `src/arkor/index.ts` with a new trainer `name` + * (and arbitrary content tail to bump mtime + size beyond any + * sub-millisecond resolution noise on fast filesystems). We rewrite + * the WHOLE file (not append) so rolldown's incremental cache can't + * reuse the prior module record and skip the rebuild. + * + * Two key shape differences from `seedFixture.ts`'s `seedManifest`: + * + * 1. 
The trainer carries the `Symbol.for("arkor.trainer.inspect")` + * brand so `findInspectableTrainer` (used by `studio/hmr.ts`'s + * `inspectBundle`) can read its name + config — without the + * brand, every SSE rebuild frame gets `trainerName: null` and + * the SSE-level test below can't distinguish the post-edit + * rebuild from the cached initial-build replay. The seed + * fixture skips the brand because its existing tests only + * exercise the `/api/manifest` path (which uses + * `findTrainerInModule`, brand-less) — extending it would + * couple every test to inspection internals it doesn't care + * about. + * + * 2. The brand returns a real `JobConfig` shape (`model` + + * `datasetSource` set), not the seed's empty placeholder, so + * `hashJobConfig` produces a stable non-empty `configHash`. + * `studio/server.ts`'s `dispatchRebuild` consults that hash to + * route between SIGUSR2 hot-swap and SIGTERM restart; the + * existing E2E only tests the boot path so it never needs a + * real config there. + * + * `Symbol.for` keys round-trip across the dev process / built + * bundle realm boundary because they live in the global symbol + * registry — same mechanism `core/trainerInspection.ts` documents + * for the runtime CLI / `.arkor/build/index.mjs` split. 
+ */ +function rewriteManifest(projectDir: string, name: string): void { + const path = join(projectDir, "src", "arkor", "index.ts"); + writeFileSync( + path, + [ + 'const TRAINER_INSPECT_KEY = Symbol.for("arkor.trainer.inspect");', + "const trainer = {", + ` name: ${JSON.stringify(name)},`, + " start: async () => ({ id: 'e2e-job', url: '' }),", + " wait: async () => ({ status: 'completed' as const }),", + " cancel: async () => {},", + "};", + "Object.defineProperty(trainer, TRAINER_INSPECT_KEY, {", + " value: () => ({", + " name: trainer.name,", + " config: {", + ' model: "studio-e2e-model",', + ' datasetSource: { type: "huggingface" as const, name: "studio-e2e-dataset" },', + " },", + " callbacks: {},", + " }),", + " enumerable: false,", + "});", + 'export const arkor = { _kind: "arkor" as const, trainer };', + "export default arkor;", + `// rewritten-${name}-${Date.now()}`, + "", + ].join("\n"), + ); +} + +interface SseFrame { + event: string; + data: string; +} + +/** + * Open `/api/dev/events`, parse incoming SSE frames, and resolve when + * `predicate` first returns true. Cleans up the underlying body + * reader on resolve / reject so the Hono server's connection bookkeeping + * doesn't leak between tests. + * + * `arkor dev` requires the studio token via the query param (EventSource + * can't set headers); the same allow-list governs `fetch()` here. 
+ */ +async function awaitSseFrame( + studioUrl: string, + token: string, + predicate: (frame: SseFrame) => boolean, + timeoutMs: number, +): Promise<SseFrame> { + const url = `${studioUrl}/api/dev/events?studioToken=${encodeURIComponent(token)}`; + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + let res: Response; + try { + res = await fetch(url, { signal: controller.signal }); + } catch (err) { + clearTimeout(timeout); + throw new Error( + `SSE connect failed for ${url}: ${(err as Error).message}`, + ); + } + if (!res.ok || !res.body) { + clearTimeout(timeout); + throw new Error( + `SSE connect returned ${res.status} ${res.statusText}; body=${ + res.body ? "present" : "missing" + }`, + ); + } + const reader = res.body.getReader(); + const decoder = new TextDecoder(); + let buf = ""; + try { + while (true) { + const { value, done } = await reader.read(); + if (done) { + throw new Error("SSE stream ended before predicate matched"); + } + buf += decoder.decode(value, { stream: true }); + // Frames are terminated by a blank line (`\n\n`). Split, keep + // the trailing partial in `buf` for the next iteration. + const parts = buf.split("\n\n"); + buf = parts.pop() ?? ""; + for (const raw of parts) { + if (!raw) continue; + let event = ""; + let data = ""; + for (const line of raw.split("\n")) { + if (line.startsWith("event: ")) event = line.slice(7); + else if (line.startsWith("data: ")) data = line.slice(6); + } + const frame: SseFrame = { event, data }; + if (predicate(frame)) return frame; + } + } + } finally { + clearTimeout(timeout); + // Cancel rather than just release: cancel propagates to the Hono + // ReadableStream's `cancel()` handler so the server unsubscribes + // this listener from the HMR coordinator promptly. Otherwise the + // listener lingers until the next dispose, which can produce + // cross-test bleed when running with `--repeat-each`.
+ await reader.cancel().catch(() => {}); + } +} + +test.describe("Studio HMR", () => { + test("/api/dev/events is registered with the hmr-enabled meta tag", async ({ + page, + studio, + }) => { + // Boot-time wiring: `arkor dev` always wires up the HMR + // coordinator, so the served HTML must carry both the + // studio-token meta and the hmr-enabled meta. Without the + // hmr-enabled tag, `isHmrEnabled()` returns false in the SPA + // and the auto-restart / hot-swap paths silently no-op. + await page.goto(studio.url); + const hmrMeta = page.locator('meta[name="arkor-hmr-enabled"]'); + await expect(hmrMeta).toHaveCount(1); + await expect(hmrMeta).toHaveAttribute("content", "true"); + + // Endpoint sanity-check: a GET without the studio token must 403 + // (regression for the CSRF allow-list — `eventStreamPathPattern` + // permits the query-token form, but a raw GET stays gated). + const noToken = await fetch(`${studio.url}/api/dev/events`); + expect(noToken.status).toBe(403); + }); + + test("editing src/arkor/index.ts broadcasts a rebuild SSE frame with the new trainer name", async ({ + studio, + fixturePaths, + }) => { + // The edit is written BEFORE we subscribe, so the stream may + // open with a replayed frame from an earlier build (e.g. the + // cached `ready` from the watcher's initial BUNDLE_END for the + // seeded name). Matching on event type alone would race and + // could accept that stale frame, so the predicate explicitly + // requires the post-edit trainer name and ignores every frame + // that doesn't carry it. + const newName = "studio-e2e-trainer-edited"; + rewriteManifest(fixturePaths.projectDir, newName); + + const frame = await awaitSseFrame( + studio.url, + studio.token, + (f) => { + if (f.event !== "rebuild" && f.event !== "ready") return false; + // Some replays have empty data; skip those.
+ if (!f.data) return false; + try { + const parsed = JSON.parse(f.data) as { + trainerName?: string | null; + }; + return parsed.trainerName === newName; + } catch { + return false; + } + }, + // Generous: rolldown's first cold build on a fresh project + // can take 1–2s on a slow CI runner; the post-edit rebuild is + // typically faster (incremental) but we don't want to flake on + // a noisy host. + 20_000, + ); + + expect(frame.event === "rebuild" || frame.event === "ready").toBe(true); + const parsed = JSON.parse(frame.data) as { + outFile?: string; + trainerName?: string | null; + configHash?: string | null; + }; + expect(parsed.trainerName).toBe(newName); + // The artefact path is also part of the contract: HMR consumers + // (including the runner subprocess on SIGUSR2) re-import the + // bundle by `outFile`. A regression that drops it would silently + // disable hot-swap. + expect(parsed.outFile).toMatch(/\.arkor[\\/]build[\\/]index\.mjs$/); + }); + + test("/api/manifest reflects the edited trainer name after a save", async ({ + studio, + fixturePaths, + }) => { + // End-to-end through the Hono `/api/manifest` route, which + // dynamic-imports the freshly-built artefact via + // `summariseBuiltManifest`. The HMR rebuild must have completed + // *and* the cache-bust URL must reflect the new bytes for this + // assertion to pass — exercises the rebuild → write artefact → + // re-import → return summary chain end-to-end. + const newName = `studio-e2e-trainer-renamed-${Date.now()}`; + rewriteManifest(fixturePaths.projectDir, newName); + + await expect + .poll( + async () => { + const res = await fetch(`${studio.url}/api/manifest`, { + headers: { "X-Arkor-Studio-Token": studio.token }, + }); + if (!res.ok) return null; + const body = (await res.json()) as { + trainer?: { name?: string } | null; + }; + return body.trainer?.name ?? null; + }, + { + // Same 20s budget as the SSE test for the same reason: the + // first rebuild after spawn can be slow on cold CI. 
Keep + // the poll interval modest so we don't hammer the dev + // loop's `runBuild` faster than it can settle. + timeout: 20_000, + intervals: [200, 400, 800, 1500], + }, + ) + .toBe(newName); + }); + + test("the SPA Run Training caption updates without a page reload after a save", async ({ + page, + studio, + fixturePaths, + }) => { + // End-to-end browser proof: the SPA's RunTraining component + // subscribes to `/api/dev/events`, calls `fetchManifest()` on + // each rebuild, and re-renders the trainer caption. Reloading + // the page would mask any regression in that subscription path, + // so we explicitly DO NOT navigate again after the edit. + await page.goto(studio.url); + await expect(page.getByText(/studio-e2e-trainer/).first()).toBeVisible(); + + const newName = `studio-e2e-trainer-live-${Date.now()}`; + rewriteManifest(fixturePaths.projectDir, newName); + + // The new name should appear without a navigation. Match by + // substring rather than exact text so the surrounding "Trainer + // from src/arkor/index.ts" caption decoration doesn't + // need to be replicated here. + await expect(page.getByText(newName).first()).toBeVisible({ + timeout: 20_000, + }); + }); +}); From 89a817150cd984354b47b3569b132f975f062d42 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 01:20:29 +0900 Subject: [PATCH 36/55] feat: implement artifact hash tracking in HMR for accurate rebuild management This commit introduces a new mechanism to track the on-disk build artifact fingerprint during the Hot Module Replacement (HMR) process. It ensures that the system can accurately determine whether the artifact has changed between the spawn and rebuild phases. This enhancement prevents stale configurations from being used in subsequent rebuilds, thereby maintaining alignment between the cloud-side JobConfig and the actual loaded configuration. 
Additionally, it includes tests to verify the correct behavior of the new artifact hash handling in various scenarios. --- packages/arkor/src/core/trainer.test.ts | 95 +++++++++++++++++++ packages/arkor/src/core/trainer.ts | 80 +++++++++++++--- packages/arkor/src/studio/hmr.ts | 35 +++++++ packages/arkor/src/studio/server.test.ts | 26 ++++- packages/arkor/src/studio/server.ts | 24 ++++- .../arkor/src/studio/trainRegistry.test.ts | 95 ++++++++++++++----- packages/arkor/src/studio/trainRegistry.ts | 95 +++++++++++++++---- 7 files changed, 392 insertions(+), 58 deletions(-) diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 1ef1e4fa..0f1ef06a 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1865,6 +1865,101 @@ describe("createTrainer (early stop)", () => { await requestTrainerEarlyStop(trainer, { timeoutMs: 1 }); }); + it("waits out an in-flight start() so a SIGTERM during create-job can still cancel the new job", async () => { + // Codex P1 regression: `start()` sets `scope` *before* awaiting + // `client.createJob`, so there's a real window where the cloud + // job is being created but `startedJob` is still null. If a + // runner-side SIGTERM lands in that window, an immediate + // "no-op" early-stop would let `installShutdownHandlers` exit + // the process — leaving the just-created cloud job running + // with no cancel POST. The fix is to await the in-flight + // `start()` promise inside `requestEarlyStop()` so the cancel + // path sees a definite job id (or a definite start failure). 
+ await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + let cancelCalls = 0; + let releaseCreateJob!: () => void; + const createJobReleased = new Promise<void>((resolve) => { + releaseCreateJob = resolve; + }); + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + // Hold createJob open so we can fire `requestEarlyStop` + // mid-flight. Once the test releases the gate, return a + // valid job — that establishes the post-create state + // requestEarlyStop should then act on (cancel POST). + await createJobReleased; + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + cancelCalls += 1; + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + // Fire start() but DON'T await — its createJob is gated. + const startPromise = trainer.start(); + // Yield once so the start microtasks queue up to the + // `await client.createJob`. + await new Promise((r) => setImmediate(r)); + // requestEarlyStop fires while start() is mid-flight. With + // the fix it awaits start() rather than no-op'ing immediately.
+ // Tiny `timeoutMs` so once `start()` resolves the latch's + // timeout-fallback fires the cancel POST quickly — there's no + // SSE stream in this test, so the checkpoint-driven path + // never arrives. We're testing the "stop awaited start()" leg + // of the contract, not the checkpoint plumbing. + const stopPromise = requestTrainerEarlyStop(trainer, { + timeoutMs: 50, + }); + // Sanity: stop hasn't resolved yet — it's blocked on + // start() which is blocked on createJob. + let stopSettled = false; + void stopPromise.then(() => { + stopSettled = true; + }); + await new Promise((r) => setImmediate(r)); + expect(stopSettled).toBe(false); + // Release createJob → start() resolves → stop() proceeds. + releaseCreateJob(); + await startPromise; + await stopPromise; + // The deciding behaviour: cancel POST was issued because the + // stop awaited start() and saw a real job id. Without the + // in-flight gate, stop would have returned immediately on + // the null `startedJob`, no cancel POST, cloud job orphaned. + expect(cancelCalls).toBe(1); + } finally { + globalThis.fetch = original; + } + }); + it("replaceTrainerCallbacks (internal HMR brand) swaps the dispatched callbacks on the next event", async () => { await writeState( { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index d2f6b4f5..e42a9b3a 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -145,6 +145,16 @@ export function createTrainer( let startedJob: TrainingJob | null = null; let scope: { orgSlug: string; projectSlug: string } | null = null; let clientPromise: Promise | null = null; + // In-flight `start()` promise: non-null between the first + // `client.createJob` call and the `startedJob` assignment. 
Lets + // `requestEarlyStop()` detect the "scope set but startedJob still + // null" window (`scope` is needed by `client.createJob` so we set + // it before the await) and wait out the create-job POST so a + // SIGTERM landing in that window can still drive a clean cancel + // once the job id materialises. Without this gate the early-stop + // path would no-op, the runner would `process.exit(0)`, and the + // newly created cloud job would orphan with no cancel POST. + let startInFlight: Promise<TrainingJob> | null = null; // Mutable callbacks slot. Each `dispatch()` invocation reads this // fresh, so the rotation triggered by the @@ -381,18 +391,46 @@ export function createTrainer( async start() { if (startedJob) return { jobId: startedJob.id }; - const client = await getClient(); - const state = await resolveProjectState(client); - scope = { orgSlug: state.orgSlug, projectSlug: state.projectSlug }; - - const { job } = await client.createJob({ - orgSlug: state.orgSlug, - projectSlug: state.projectSlug, - name: input.name, - config, - }); - startedJob = job; - return { jobId: job.id }; + // Already-pending start: reuse the in-flight promise so a + // concurrent caller (notably `requestEarlyStop` awaiting it + // to close the SIGTERM-during-create-job race) doesn't issue + // a second `client.createJob` POST. Every caller awaits the + // same promise, so they all resolve to the same created job. + if (startInFlight) { + const job = await startInFlight; + return { jobId: job.id }; + } + // Track the pending creation so `requestEarlyStop()` can + // detect the "started but not yet recorded" window and wait + // out the `client.createJob` POST. We set `scope` *before* + // the await (it's needed by the await itself), so a SIGTERM + // landing during the await would otherwise see + // `!startedJob && scope` and exit immediately — leaving the + // newly created cloud job uncancelled.
+ const startPromise = (async () => { + const client = await getClient(); + const state = await resolveProjectState(client); + scope = { orgSlug: state.orgSlug, projectSlug: state.projectSlug }; + const { job } = await client.createJob({ + orgSlug: state.orgSlug, + projectSlug: state.projectSlug, + name: input.name, + config, + }); + startedJob = job; + return job; + })(); + startInFlight = startPromise; + try { + const job = await startPromise; + return { jobId: job.id }; + } finally { + // Clear regardless of resolve/reject so a failed start can + // be retried (the caller decides), and a successful one + // doesn't pin a stale promise on the trainer for the rest + // of its lifetime. + startInFlight = null; + } }, async wait(): Promise { @@ -510,6 +548,24 @@ export function createTrainer( async function requestEarlyStop( opts: RequestEarlyStopOptions = {}, ): Promise { + // SIGTERM-during-create-job race: a runner-side SIGTERM can land + // between `start()`'s `scope = { … }` assignment and its + // `client.createJob(...)` resolution, with `startedJob` still + // null but a real cloud job about to exist. Treating that window + // as "nothing in flight" would `process.exit(0)` immediately + // after this returns, leaving the newly created cloud job + // running with no cancel POST. Awaiting `startInFlight` collapses + // the race onto a definite startedJob (success) or a definite + // start failure (rejection) — either way the branches below + // can decide on real state. Swallow the rejection: if `start()` + // failed there's nothing to cancel anyway. + if (startInFlight) { + try { + await startInFlight; + } catch { + // intentionally ignored — failed start has no job to cancel + } + } // Nothing in flight: cleanup any prior latch and resolve. 
if (!startedJob || !scope || TERMINAL_STATUSES.has(startedJob.status)) { settleEarlyStopLatch(); diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 14cad357..e030db5a 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -50,6 +50,21 @@ export interface HmrCoordinator { * scaffold) or the latest event was an `error`. */ getCurrentConfigHash(): string | null; + /** + * Synchronous fingerprint of the on-disk build artefact RIGHT NOW + * (fresh stat, not cached). Used by `/api/train`'s registry entry + * so HMR routing in the pre-ready-spawn case (`configHash === null`) + * can compare against the rebuild's `event.hash` to tell whether + * the child read the same bytes. Without this gate, an edit + * landing between spawn and the watcher's first BUNDLE_END would + * silently teach the registry to use the post-edit `configHash` + * as the child's baseline — later same-hash rebuilds would then + * hot-swap callbacks into a child whose cloud-side `JobConfig` + * was actually spawned against an older version, leaving the + * cloud run on a stale config. `null` when stat fails (artefact + * doesn't exist yet, fresh project never built). + */ + getCurrentArtifactHash(): string | null; dispose(): Promise; } @@ -368,6 +383,26 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { // a build that wasn't inspectable). return lastSuccessConfigHash; }, + getCurrentArtifactHash() { + // Fresh stat — not the cached `lastEvent.hash`. The cached + // hash describes the bytes the watcher last broadcast about, + // but the on-disk artefact may be newer (a BUNDLE_END is + // queued, file already written, inspection still pending) or + // older (next BUNDLE_END hasn't fired yet but the user just + // edited and saved). For the registry's pre-ready-spawn gate + // we want "what bytes will the child's `await import()` see + // RIGHT NOW", which only the live `fingerprint(outFile)` + // gives. 
Null falls through `fingerprint`'s catch when the + // file doesn't exist yet — equivalent to "child can't load + // anything", which dispatchRebuild treats as a forced + // SIGTERM-restart. + try { + statSync(resolved.outFile); + } catch { + return null; + } + return fingerprint(resolved.outFile); + }, async dispose() { disposed = true; subscribers.clear(); diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index 01ed4d23..78c13e02 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -152,6 +152,7 @@ describe("Studio server", () => { const fakeHmr = { subscribe: () => () => undefined, getCurrentConfigHash: () => null, + getCurrentArtifactHash: () => null, async dispose() {}, }; const app = buildStudioApp({ @@ -611,6 +612,7 @@ process.exit(0); getCurrentCalls += 1; return "spawn-time-hash"; }, + getCurrentArtifactHash: () => "spawn-artefact-hash", async dispose() {}, }; const fakeBin = join(trainCwd, "fake-bin.mjs"); @@ -691,9 +693,16 @@ process.exit(0); // ReadableStream, those handlers kept firing — and calling // `enqueue` / `close` on a closed controller throws "Invalid // state". The throw escaped the request pipeline as an - // unhandled exception. The fix tracks a `closed` flag, removes - // the child listeners on cancel, and try/catches the post- - // cancel enqueue paths defensively. + // unhandled exception. The fix flips a `closed` flag in + // `cancelTeardown` and try/catches the post-cancel enqueue + // paths defensively. NOTE: cancel intentionally does NOT + // detach the `data` listeners — leaving them attached keeps + // the OS pipe draining while the child checkpoints / exits + // gracefully (otherwise a full pipe back-pressures and + // deadlocks the very graceful exit we're preserving). + // `onClose` / `onError` detach all listeners when the child + // finally exits. See `cancelTeardown` in `studio/server.ts` + // for the full backpressure rationale. 
await writeCredentials(ANON_CREDS); const fakeBin = join(trainCwd, "fake-bin.mjs"); // Bin spits a chunk every ~5 ms forever. We cancel while it's @@ -1552,6 +1561,11 @@ process.exit(0); // spawned-config snapshot. const subs = new Set<(e: HmrEvent) => void>(); let currentConfigHash: string | null = initialConfigHash; + // Match the real coordinator's behaviour: a stable artefact + // fingerprint at spawn time. Tests that exercise the + // pre-ready-spawn path (configHash null, then a real hash) + // can override via `setArtifactHash`. + let currentArtifactHash: string | null = "fake-artefact-hash"; const coordinator: HmrCoordinator = { subscribe(fn) { subs.add(fn); @@ -1562,6 +1576,9 @@ process.exit(0); getCurrentConfigHash() { return currentConfigHash; }, + getCurrentArtifactHash() { + return currentArtifactHash; + }, async dispose() { subs.clear(); }, @@ -1574,6 +1591,9 @@ process.exit(0); setConfigHash(hash: string | null) { currentConfigHash = hash; }, + setArtifactHash(hash: string | null) { + currentArtifactHash = hash; + }, get subscriberCount() { return subs.size; }, diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index dfac2099..ff0fac9f 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -368,6 +368,19 @@ export function buildStudioApp(options: StudioServerOptions) { const configHash: string | null = options.hmr ? options.hmr.getCurrentConfigHash() : null; + // Spawn-time fingerprint of the on-disk build artefact. Only the + // pre-ready-spawn case in `dispatchRebuild` consults it: when a + // rebuild lands while the child's `configHash` is still null, + // backfilling the new hash is only safe if the artefact bytes + // the child loaded (= the bytes on disk *now*, at spawn) are + // the same bytes the new hash describes. 
Without this gate, an + // edit landing between spawn and the watcher's first BUNDLE_END + // would silently align the registry with a config the child + // never actually loaded → cloud-side `JobConfig` drift on + // subsequent same-hash hot-swaps. + const spawnArtifactHash: string | null = options.hmr + ? options.hmr.getCurrentArtifactHash() + : null; const args = [trainBinPath, "start"]; if (trainFile) args.push(trainFile); // `spawn()` is mostly async (filesystem failures surface as the @@ -396,7 +409,7 @@ export function buildStudioApp(options: StudioServerOptions) { 500, ); } - activeTrains.register(child, { trainFile, configHash }); + activeTrains.register(child, { trainFile, configHash, spawnArtifactHash }); // Hoisted out of the `ReadableStream` underlying-source so the // `start` handler can hand its closure-bound teardown helper to // the `cancel` handler. `cancel` runs in a separate invocation, @@ -608,8 +621,15 @@ export function buildStudioApp(options: StudioServerOptions) { // both buckets so the SPA can react per-child rather than // assuming one global outcome. const nextHash = event.configHash ?? null; + // The artefact fingerprint travels in the same broadcast + // (`event.hash`). Pass it through so the registry can gate + // the pre-ready-spawn backfill on whether the bytes the + // child loaded match what this rebuild's hash describes + // — see `dispatchRebuild`'s comment for why a null entry + // hash + matching artefact is the only safe backfill path. + const nextArtifactHash = event.hash ?? 
null; const { hotSwapTargets, restartTargets } = - activeTrains.dispatchRebuild(nextHash); + activeTrains.dispatchRebuild(nextHash, nextArtifactHash); augmented = { ...event, hotSwap: hotSwapTargets.length > 0, diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index d1c230b9..cf3c15ef 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -64,50 +64,101 @@ describe("TrainRegistry", () => { expect(b.kill).toHaveBeenCalledWith("SIGTERM"); }); - it("dispatchRebuild backfills the hash and skips dispatch when the spawn-time hash was null", () => { - // Regression: previously a child registered with `configHash: - // null` (spawn happened *before* the HMR watcher emitted its - // first successful build, so `getCurrentConfigHash()` returned - // null) was treated as a hash mismatch on the next event with - // a real hash and SIGTERM-restarted. Since the dispatch now - // fires on `ready` events too, that turned every "click Run - // before the watcher's first BUNDLE_END" into a spurious - // cancel+restart cycle (extra GPU spend / job churn) triggered - // purely by startup timing rather than any actual code change. - // The fix backfills the entry's hash with the first known value - // and skips signal dispatch — the child either already loaded - // the right bundle or surfaces its own load error; future - // rebuilds compare against the backfilled hash like any other. + it("dispatchRebuild backfills the hash and skips dispatch when the spawn-time artefact matches the new build", () => { + // Pre-ready spawn (configHash: null) is the "user clicked Run + // before the watcher's first BUNDLE_END" case. 
Whether it's + // safe to backfill the new hash as the child's baseline depends + // on whether the on-disk artefact has changed between spawn + // and now: if `spawnArtifactHash === nextArtifactHash`, the + // child read exactly the bytes the new hash describes → + // backfill + skip dispatch (no spurious cancel+restart cycle). + // Otherwise — see the next test — SIGTERM-restart so cloud + // and child stay aligned. const reg = new TrainRegistry(); const c = fakeChild(401); reg.register(c as unknown as ChildProcess, { configHash: null, trainFile: "/tmp/preready.ts", + spawnArtifactHash: "art-v1", }); - const result = reg.dispatchRebuild("first-real-hash"); + const result = reg.dispatchRebuild("first-real-hash", "art-v1"); // Neither bucket — no signal sent, nothing for the SPA to react to. expect(result.hotSwapTargets).toEqual([]); expect(result.restartTargets).toEqual([]); expect(c.kill).not.toHaveBeenCalled(); - // A subsequent dispatch with the SAME hash must take the hot- - // swap path (proves the backfill landed; without it this would - // STILL be null vs "first-real-hash" → SIGTERM). - const second = reg.dispatchRebuild("first-real-hash"); + // A subsequent dispatch with the SAME config hash must take the + // hot-swap path (proves the backfill landed; without it this + // would STILL be null vs "first-real-hash" → SIGTERM). + const second = reg.dispatchRebuild("first-real-hash", "art-v2"); expect(second.hotSwapTargets).toEqual([ { pid: 401, trainFile: "/tmp/preready.ts" }, ]); expect(second.restartTargets).toEqual([]); expect(c.kill).toHaveBeenCalledWith("SIGUSR2"); - // And a different hash on a later rebuild now correctly routes - // to SIGTERM-restart (backfilled hash is real). + // And a different config hash on a later rebuild now correctly + // routes to SIGTERM-restart (backfilled hash is real). 
c.kill.mockClear(); - const third = reg.dispatchRebuild("second-hash"); + const third = reg.dispatchRebuild("second-hash", "art-v3"); expect(third.restartTargets).toEqual([ { pid: 401, trainFile: "/tmp/preready.ts" }, ]); expect(c.kill).toHaveBeenCalledWith("SIGTERM"); }); + it("dispatchRebuild SIGTERM-restarts a pre-ready spawn when the artefact has changed since spawn", () => { + // Codex P2 regression: an edit landing between spawn and the + // watcher's first BUNDLE_END means the bytes the child loaded + // differ from what the new `configHash` describes. Backfilling + // unconditionally would silently teach the registry to use the + // post-edit hash as the child's baseline — later same-hash + // rebuilds would then hot-swap callbacks into a child whose + // cloud-side `JobConfig` was actually spawned against an older + // version, leaving the cloud run on a stale config. The artefact + // fingerprint mismatch (`art-stale` vs `art-fresh`) is the + // signal that the child loaded older bytes; SIGTERM-restart + // forces a clean re-spawn against the freshly-built artefact. + const reg = new TrainRegistry(); + const c = fakeChild(411); + reg.register(c as unknown as ChildProcess, { + configHash: null, + trainFile: "/tmp/preready-stale.ts", + spawnArtifactHash: "art-stale", + }); + const result = reg.dispatchRebuild("real-hash", "art-fresh"); + // SIGTERM-restart: the child's bytes are stale relative to the + // new build. Hot-swap would be unsafe (config drift); skip + // would leave the child running with no future correction + // path (the registry would treat "real-hash" as the baseline + // even though the child never loaded that build). 
+ expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toEqual([ + { pid: 411, trainFile: "/tmp/preready-stale.ts" }, + ]); + expect(c.kill).toHaveBeenCalledWith("SIGTERM"); + }); + + it("dispatchRebuild SIGTERM-restarts a pre-ready spawn when no artefact existed at spawn time", () => { + // Companion to the "artefact has changed" test: a fresh project + // never built before spawn means `coordinator.getCurrentArtifactHash()` + // returned `null`. The child's `await import` likely failed; we + // can't prove its config matches anything. Conservative + // SIGTERM-restart so the SPA re-spawns once the new bundle is + // on disk. + const reg = new TrainRegistry(); + const c = fakeChild(421); + reg.register(c as unknown as ChildProcess, { + configHash: null, + trainFile: "/tmp/preready-fresh.ts", + spawnArtifactHash: null, // no artefact when /api/train fired + }); + const result = reg.dispatchRebuild("first-real-hash", "art-fresh"); + expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toEqual([ + { pid: 421, trainFile: "/tmp/preready-fresh.ts" }, + ]); + expect(c.kill).toHaveBeenCalledWith("SIGTERM"); + }); + it("isEarlyStopRequested reflects the dispatchRebuild SIGTERM flag", () => { // Regression: `/api/train`'s ReadableStream `cancel()` consults // this flag to avoid sending a *second* SIGTERM to a child that diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index 6b284741..1309f22a 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -21,6 +21,21 @@ export interface ActiveTrain { * build). A null entry forces SIGTERM on the next rebuild because we * can't prove the configs match. */ configHash: string | null; + /** + * Fingerprint (mtime+ctime+size, see `core/moduleCacheBust.ts`) of + * the on-disk `.arkor/build/index.mjs` at spawn time. 
Used **only** + * to gate the pre-ready-spawn backfill: if a rebuild eventually + * fires while `configHash` is still null and this fingerprint + * matches the rebuild's artefact, the child is provably reading + * the same bundle bytes the new hash describes — safe to backfill + * `configHash` and skip dispatch. A mismatch (or null here) means + * the on-disk artefact has changed between spawn and rebuild + * (user edited mid-spawn, fresh project never built, …) so the + * child is running stale bytes and we MUST SIGTERM-restart to + * keep cloud-side `JobConfig` aligned with what the child + * actually loaded. Null when HMR isn't enabled or stat failed. + */ + spawnArtifactHash: string | null; /** * `true` once we've already SIGTERM'd this child for an HMR-driven * early-stop. Subsequent rebuilds (which can land before the child @@ -102,12 +117,26 @@ export class TrainRegistry { register( child: ChildProcess, - init: Omit<ActiveTrain, "child" | "earlyStopRequested">, + init: Omit< + ActiveTrain, + "child" | "earlyStopRequested" | "spawnArtifactHash" + > & { + // Optional in the signature so tests / future callers that + // don't track the on-disk artefact fingerprint (e.g. an HMR- + // disabled server, a hand-rolled fake) can omit it. Defaults + // to `null`, which forces the pre-ready-spawn branch to fall + // through to SIGTERM-restart on the next non-null rebuild — + // the safe choice when we genuinely don't know what bytes + // the child loaded. Real `/api/train` calls in HMR mode + // capture this from `coordinator.getCurrentArtifactHash()`. + spawnArtifactHash?: string | null; + }, ): void { if (typeof child.pid !== "number") return; this.entries.set(child.pid, { child, ...init, + spawnArtifactHash: init.spawnArtifactHash ?? null, earlyStopRequested: false, }); } @@ -168,7 +197,16 @@ export class TrainRegistry { * callback edits still take effect (via a full restart) rather * than silently being ignored.
*/ - dispatchRebuild(nextConfigHash: string | null): DispatchResult { + dispatchRebuild( + nextConfigHash: string | null, + // Defaults to `null` so tests / pre-existing callers that don't + // pass the artefact hash get the conservative behaviour: the + // pre-ready-spawn branch's `artefactsAgree` check is `false`, + // so a null entry hash falls through to SIGTERM-restart. Real + // dispatch from `/api/train`'s HMR subscriber threads + // `event.hash` here so the backfill optimisation activates. + nextArtifactHash: string | null = null, + ): DispatchResult { const hotSwapTargets: RestartTarget[] = []; const restartTargets: RestartTarget[] = []; @@ -177,23 +215,42 @@ export class TrainRegistry { const target: RestartTarget = { pid, trainFile: entry.trainFile }; // Pre-ready spawn: this child was registered via `/api/train` // *before* the HMR watcher's first successful build, so its - // recorded `configHash` is `null`. Now that a real hash has - // arrived, treat the current build as the child's baseline: - // backfill the hash and skip signal dispatch entirely. The - // alternative (treating `null !== nextHash` as a real - // mismatch) would SIGTERM-restart the just-started child even - // though the config never actually changed — a spurious - // cancel+restart cycle that costs GPU budget for no benefit - // and that's triggered purely by startup timing (user clicked - // Run before the watcher's initial BUNDLE_END landed). The - // SIGUSR2 path is also wrong here: the child might still be - // mid-flight on its initial bundle import, and racing a - // reload signal against that load wastes work. Future rebuilds - // against this entry now compare against the backfilled hash - // like any other child. - if (entry.configHash === null && nextConfigHash !== null) { - entry.configHash = nextConfigHash; - continue; + // recorded `configHash` is `null`. 
Whether the rebuild's new + // hash describes the same bytes the child actually loaded + // depends on whether the on-disk artefact has changed between + // spawn and now. Tie the decision to the artefact fingerprint: + // + // - `entry.spawnArtifactHash === nextArtifactHash` → child + // read the same bytes the new hash describes. Safe to + // backfill `configHash`; future rebuilds compare against + // the backfilled value like any other child. This is the + // common case (user clicked Run before the SPA had + // refreshed its manifest, but the on-disk artefact is the + // same one the watcher just settled on). + // + // - artefact fingerprints differ (or one side is null) → + // the bytes the child loaded don't match the new hash. + // SIGTERM-restart so the cloud-side `JobConfig` and the + // child's actual config are guaranteed to align. Without + // this gate, an edit landing between spawn and the first + // BUNDLE_END would silently teach the registry to use the + // post-edit hash as the child's baseline — later + // same-hash rebuilds would then hot-swap callbacks into + // a child whose cloud-side `JobConfig` was *actually* + // spawned against an older version, leaving the cloud + // run on a stale config. 
+ const isPreReadySpawn = + entry.configHash === null && nextConfigHash !== null; + if (isPreReadySpawn) { + const artefactsAgree = + entry.spawnArtifactHash !== null && + nextArtifactHash !== null && + entry.spawnArtifactHash === nextArtifactHash; + if (artefactsAgree) { + entry.configHash = nextConfigHash; + continue; + } + // fall through to the mismatch / SIGTERM-restart path below } const matches = nextConfigHash !== null && From d5b89df283463b812c1066184f00fdcf746c0eef Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 08:58:19 +0900 Subject: [PATCH 37/55] refactor: update comments in HMR test to clarify edit and subscribe sequence This commit refines the comments in the Hot Module Replacement (HMR) test suite, explaining the rationale behind the order of editing and subscribing. The changes aim to enhance understanding of the predicate filtering process and the implications of cached initial-build events, ensuring clarity for future developers working on the HMR functionality. --- e2e/studio/src/specs/hmr.spec.ts | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/e2e/studio/src/specs/hmr.spec.ts b/e2e/studio/src/specs/hmr.spec.ts index 1371e47e..c837794a 100644 --- a/e2e/studio/src/specs/hmr.spec.ts +++ b/e2e/studio/src/specs/hmr.spec.ts @@ -170,13 +170,16 @@ test.describe("Studio HMR", () => { studio, fixturePaths, }) => { - // Subscribe FIRST so the cached `ready` event from the watcher's - // initial BUNDLE_END is consumed before we trigger the new - // rebuild. Without draining the cached frame we'd race: if the - // initial inspection finished before our subscribe arrives, the - // first frame we see could be the stale `ready` for the seeded - // name and the predicate would match the wrong build. The - // predicate explicitly requires the post-edit name to dodge that. + // Edit BEFORE subscribing, then let the predicate filter out + // pre-edit replays. 
The watcher may already have a cached + // initial-build `ready` (with the seed name) by the time we + // connect; subscribing first then editing would force a + // drain step. Going edit → subscribe is simpler: the + // predicate explicitly requires `trainerName === newName`, + // which only the post-edit BUNDLE_END can satisfy — any + // cached or in-flight frame for the seed name fails the + // predicate and `awaitSseFrame` keeps reading until the + // matching one arrives. const newName = "studio-e2e-trainer-edited"; rewriteManifest(fixturePaths.projectDir, newName); From 88612dfffcae55ff5b9df59319f265d2b7882ac0 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 09:10:52 +0900 Subject: [PATCH 38/55] refactor: improve header handling in buildStudioApp response This commit refines the response header management in the buildStudioApp function. It ensures that the TRAIN_PID_HEADER is omitted entirely when the child process ID is not a number, rather than sending an empty string. This change clarifies the communication contract with the SPA, allowing for cleaner handling of the absence of a process ID and improving overall response clarity. --- packages/arkor/src/studio/server.ts | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index ff0fac9f..81e1e25b 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -558,14 +558,25 @@ export function buildStudioApp(options: StudioServerOptions) { // `/api/dev/events` broadcasts `restartTargets` / `hotSwapTargets`. // Without this, a passive tab whose run was hot-swapped could // misread a sibling tab's restart event as its own. - const pidHeader = typeof child.pid === "number" ? 
String(child.pid) : ""; - return new Response(stream, { - status: 200, - headers: { - "content-type": "text/plain; charset=utf-8", - [TRAIN_PID_HEADER]: pidHeader, - }, - }); + // + // Header is OMITTED entirely (rather than sent as an empty + // string) when `child.pid` isn't a number — that case happens + // when the OS hasn't assigned a pid by the time `spawn()` + // returns and the child's async `error` event will fire shortly + // (per-Node-docs `subprocess.pid` is `undefined` for + // failed-spawn children). "Header absent" is the unambiguous + // signal the SPA can read; an empty string would force callers + // to special-case `""` vs missing for the same condition. The + // SPA's `raw ? Number.parseInt(raw, 10) : NaN` handler treats + // both cases identically, but absent-only is the cleaner wire + // contract. + const headers: Record<string, string> = { + "content-type": "text/plain; charset=utf-8", + }; + if (typeof child.pid === "number") { + headers[TRAIN_PID_HEADER] = String(child.pid); + } + return new Response(stream, { status: 200, headers }); }); // `/api/dev/events` — SSE stream of HMR rebuild / error notifications. From eeb7f2a001ed3b8251d9d8b648dc72a1b1d766e3 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 09:50:18 +0900 Subject: [PATCH 39/55] feat: implement HMR fast path for manifest retrieval This commit introduces a performance optimization in the manifest retrieval process by implementing a fast path for Hot Module Replacement (HMR) scenarios. When HMR is enabled and the pre-built artifact exists, the system skips the `runBuild()` call, reducing unnecessary CPU usage and preventing race conditions with the watcher. The changes include updates to the `readManifestSummary` function and corresponding tests to ensure correct behavior in both HMR-enabled and fresh scaffold scenarios.
--- packages/arkor/src/studio/manifest.ts | 31 ++++++- packages/arkor/src/studio/server.test.ts | 110 +++++++++++++++++++++++ packages/arkor/src/studio/server.ts | 19 +++- 3 files changed, 158 insertions(+), 2 deletions(-) diff --git a/packages/arkor/src/studio/manifest.ts b/packages/arkor/src/studio/manifest.ts index 70be440e..2ed73ccf 100644 --- a/packages/arkor/src/studio/manifest.ts +++ b/packages/arkor/src/studio/manifest.ts @@ -1,3 +1,4 @@ +import { existsSync } from "node:fs"; import { runBuild } from "../cli/commands/build"; import { hashJobConfig } from "../core/configHash"; import { moduleCacheBustUrl } from "../core/moduleCacheBust"; @@ -71,6 +72,28 @@ export async function summariseBuiltManifest( }; } +export interface ReadManifestOptions { + /** + * HMR-aware fast path: when set and the file exists, skip the + * `runBuild()` call and inspect this artefact directly. The HMR + * coordinator already keeps `.arkor/build/index.mjs` continuously + * fresh via its rolldown watcher, so re-running `runBuild()` on + * every `/api/manifest` poll (every ~5 s + on every rebuild SSE + * event) is wasted CPU AND races the watcher writing to the + * same path. Pre-existence is checked with `existsSync` so the + * very first poll on a fresh scaffold (watcher's first + * BUNDLE_END hasn't completed yet) still bootstraps via + * `runBuild()`. Once the file appears, subsequent polls skip + * the rebuild. + * + * Pass `coordinator.outFile`-equivalent (e.g. + * `resolveBuildEntry({ cwd }).outFile`) here when the server has + * an active `HmrCoordinator`; leave undefined when HMR is off so + * the build path runs as before. + */ + prebuiltOutFile?: string; +} + /** * Build the user's `src/arkor/index.ts` and import the artifact to * extract a serialisable summary of its manifest. The Studio UI hits @@ -78,11 +101,17 @@ export async function summariseBuiltManifest( * trainer name today; deploy / eval slots when those primitives land). 
* * Each call rebuilds and re-imports so edits to the user's source - * surface without restarting Studio. + * surface without restarting Studio. When `prebuiltOutFile` is + * supplied (HMR-enabled servers), the `runBuild()` step is bypassed + * — see `ReadManifestOptions.prebuiltOutFile` for the rationale. */ export async function readManifestSummary( cwd: string, + opts: ReadManifestOptions = {}, ): Promise { + if (opts.prebuiltOutFile && existsSync(opts.prebuiltOutFile)) { + return summariseBuiltManifest(opts.prebuiltOutFile); + } const { outFile } = await runBuild({ cwd, quiet: true }); return summariseBuiltManifest(outFile); } diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index 78c13e02..e6792d1f 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -1341,6 +1341,116 @@ process.exit(0); const body = (await res.json()) as { trainer: unknown }; expect(body.trainer).toBeNull(); }); + + it("skips runBuild() when HMR is enabled and the watcher's artefact already exists", async () => { + // Regression: previously every `/api/manifest` poll triggered a + // fresh `runBuild()` even with HMR active, so the SPA's + // ~5 s polling + per-rebuild SSE refetch would re-bundle on + // every poll AND race the watcher writing to the same + // `.arkor/build/index.mjs`. The fast path inspects the + // pre-existing artefact directly when HMR's coordinator is + // wired in. We assert by pre-writing a hand-rolled artefact + // bundle and verifying `/api/manifest` returns its trainer + // *without* the source file existing — `runBuild()` would + // throw on the missing entry, so a 200 here proves we never + // called it. + await writeCredentials(ANON_CREDS); + // Write the artefact that the HMR watcher would have produced. + // Mirrors the seed fixture's shape: `_kind: "arkor"` + trainer + // with the four required methods. 
+ mkdirSync(join(trainCwd, ".arkor/build"), { recursive: true }); + writeFileSync( + join(trainCwd, ".arkor/build/index.mjs"), + `const trainer = { + name: "hmr-fast-path", + start: async () => ({ jobId: "j" }), + wait: async () => ({ job: {}, artifacts: [] }), + cancel: async () => {}, + }; + export const arkor = { _kind: "arkor", trainer }; + export default arkor; + `, + ); + // Notice: NO `src/arkor/index.ts`. `runBuild()` would fail with + // "Build entry not found" — the test fails if the fast path + // regresses and falls through to it. + const fakeHmr = { + subscribe: () => () => undefined, + getCurrentConfigHash: () => null, + getCurrentArtifactHash: () => null, + async dispose() {}, + }; + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fakeHmr, + }); + const res = await app.request("/api/manifest", { + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + }, + }); + expect(res.status).toBe(200); + const body = (await res.json()) as { + trainer: { name: string } | null; + }; + expect(body.trainer).toEqual({ name: "hmr-fast-path" }); + }); + + it("falls back to runBuild() when HMR is enabled but the watcher hasn't produced an artefact yet", async () => { + // Companion to the fast-path test: on a fresh scaffold the + // watcher's first BUNDLE_END may not have completed by the + // time the SPA's first /api/manifest poll lands. Without the + // existsSync gate we'd `await import(missing)` and 400 + // forever (the watcher's later writes don't retroactively + // make this poll succeed); with the gate we bootstrap via + // `runBuild()` for that single call. 
+ await writeCredentials(ANON_CREDS); + mkdirSync(join(trainCwd, "src/arkor"), { recursive: true }); + writeFileSync( + join(trainCwd, "src/arkor/index.ts"), + `export const arkor = Object.freeze({ + _kind: "arkor", + trainer: { + name: "fallback-build", + start: async () => ({ jobId: "j" }), + wait: async () => ({ job: {}, artifacts: [] }), + cancel: async () => {}, + }, + });`, + ); + // No pre-existing `.arkor/build/index.mjs` — the artefact + // doesn't exist. `existsSync` is false → `runBuild()` runs. + const fakeHmr = { + subscribe: () => () => undefined, + getCurrentConfigHash: () => null, + getCurrentArtifactHash: () => null, + async dispose() {}, + }; + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fakeHmr, + }); + const res = await app.request("/api/manifest", { + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + }, + }); + expect(res.status).toBe(200); + const body = (await res.json()) as { + trainer: { name: string } | null; + }; + expect(body.trainer).toEqual({ name: "fallback-build" }); + }); }); describe("/api/inference/chat", () => { diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 81e1e25b..aa5144b4 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -16,6 +16,7 @@ import { recordDeprecation, tapDeprecation } from "../core/deprecation"; import { SDK_VERSION } from "../core/version"; import { ensureProjectState } from "../core/projectState"; import { readState } from "../core/state"; +import { resolveBuildEntry } from "../core/rolldownConfig"; import { readManifestSummary } from "./manifest"; import type { HmrCoordinator, HmrEvent } from "./hmr"; import { TrainRegistry, type RestartTarget } from "./trainRegistry"; @@ -258,9 +259,25 @@ export function buildStudioApp(options: StudioServerOptions) { return new Response(body, { status: 
res.status, headers }); }); + // Pre-resolved outFile for the HMR fast path. The path is + // deterministic per cwd (defaults from `BUILD_DEFAULTS`), so we + // compute it once at app build time rather than on every request. + // Only used when HMR is enabled — `readManifestSummary` falls + // back to `runBuild()` when this is undefined or the file doesn't + // exist yet (fresh scaffold pre-watcher-bootstrap). + const hmrOutFile = options.hmr + ? resolveBuildEntry({ cwd: trainCwd }).outFile + : undefined; app.get("/api/manifest", async (c) => { try { - const manifest = await readManifestSummary(trainCwd); + // HMR-aware fast path: when `arkor dev` wired in a coordinator, + // skip the per-request `runBuild()` and read the watcher's + // already-built artefact. Without this every SPA poll + // (~5 s + per-rebuild SSE refetch) would re-bundle and race + // the watcher writing to the same `.arkor/build/index.mjs`. + const manifest = await readManifestSummary(trainCwd, { + prebuiltOutFile: hmrOutFile, + }); return c.json(manifest); } catch (err) { // The user's `src/arkor/index.ts` may not exist yet (fresh scaffold) or From bc5485b6836d885c9d82a79041914bb890936d6b Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 10:29:59 +0900 Subject: [PATCH 40/55] feat: implement SIGKILL for user-initiated training cancellations This commit modifies the training cancellation mechanism to use SIGKILL instead of the default SIGTERM for user-initiated stops. This change ensures that training processes are terminated immediately, preventing unnecessary resource consumption during background execution. The implementation includes updates to the buildStudioApp function and adds tests to verify the new behavior, addressing a regression introduced by previous graceful shutdown handling. Additionally, it refines the handling of process termination on Windows to ensure correct behavior across platforms. 
--- packages/arkor/src/studio/server.test.ts | 65 +++++++++++++++++++ packages/arkor/src/studio/server.ts | 25 ++++++- .../arkor/src/studio/trainRegistry.test.ts | 47 ++++++++++++++ packages/arkor/src/studio/trainRegistry.ts | 42 ++++++++---- 4 files changed, 166 insertions(+), 13 deletions(-) diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index e6792d1f..c5841655 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -642,6 +642,71 @@ process.exit(0); expect(getCurrentCalls).toBe(1); }); + it("/api/train cancel sends SIGKILL so user-initiated stop bypasses the runner's graceful early-stop", async () => { + // Regression: a default `child.kill()` sends SIGTERM, which + // the runner's `installShutdownHandlers` now interprets as a + // graceful early-stop request (wait for the next checkpoint, + // up to ~5 min). For HMR-driven cancels that's correct, but + // for a Stop-training click the user wants the run STOPPED + // immediately — leaving it running in the background for + // minutes consuming GPU spend silently is a regression + // introduced by this PR's graceful-shutdown work. We assert + // SIGKILL by giving the bin a SIGTERM no-op handler: SIGTERM + // would be swallowed and the bin would stay alive; SIGKILL + // is uncatchable and reaps the process unconditionally. + // Probe liveness with `process.kill(pid, 0)` (ESRCH ⇒ gone). + await writeCredentials(ANON_CREDS); + const hangingBin = join(trainCwd, "hanging-bin.mjs"); + writeFileSync( + hangingBin, + // SIGTERM swallowed; setInterval keeps the event loop + // alive forever absent SIGKILL. 
+ `process.on("SIGTERM", () => {}); + setInterval(() => {}, 60_000); + `, + ); + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: hangingBin, + }); + const res = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(res.status).toBe(200); + const pid = Number(res.headers.get("x-arkor-train-pid")); + expect(Number.isFinite(pid)).toBe(true); + + // Trigger the cancel() handler. + await res.body!.cancel(); + + // Give the OS a moment to deliver SIGKILL and reap. + await new Promise((r) => setTimeout(r, 300)); + + // `process.kill(pid, 0)` is the standard "is this pid alive?" + // probe — sends signal 0 (no-op) but the syscall still + // surfaces ESRCH for non-existent pids. SIGKILL → reaped → + // ESRCH. SIGTERM (with the bin's no-op handler) → still + // alive → no throw → test fails. + let probeError: NodeJS.ErrnoException | null = null; + try { + process.kill(pid, 0); + } catch (e) { + probeError = e as NodeJS.ErrnoException; + } + expect(probeError).not.toBeNull(); + expect(probeError?.code).toBe("ESRCH"); + }); + it("/api/train cancel handler doesn't crash when child.kill() throws", async () => { // Regression: `ReadableStream.cancel()` called `child.kill()` // without a try/catch. If the child had already exited (ESRCH diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index aa5144b4..c9c95412 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -557,6 +557,29 @@ export function buildStudioApp(options: StudioServerOptions) { activeTrains.unregister(child.pid); cancelTeardown?.(); if (earlyStopInFlight) return; + // SIGKILL (not the default SIGTERM) for user-initiated + // aborts. 
The runner's `installShutdownHandlers` now treats + // a single SIGTERM as the HMR-driven "graceful early-stop" + // signal — wait for the next checkpoint (up to ~5 min + // timeout) before exiting. That semantics is right for the + // HMR path but wrong for a Stop-training click: the user + // wants the run STOPPED, not left running in the background + // for minutes consuming GPU/cloud spend while the UI has + // already settled to idle. SIGKILL is uncatchable so the + // child dies immediately, eliminating the + // unregister-before-graceful-exit window where a fast new + // run could overlap an old one untracked by HMR routing. + // + // Trade-off: the runner can't POST `/v1/jobs/:id/cancel` to + // cloud-api on its way out (its early-stop chain is + // bypassed). The cloud-side job is left orphaned until the + // server reaper / TTL kicks in. This matches the pre-PR + // behaviour (the runner had no signal handler at all then; + // SIGTERM also killed it without cloud cancel). Sending a + // direct `/v1/jobs/:id/cancel` from the server here is a + // separate follow-up — would need the jobId, which the + // server doesn't currently parse out of stdout. + // // `ChildProcess.kill()` can throw (ESRCH if the process has // already exited between this handler's invocation and the // signal delivery). A throw here would surface as an unhandled @@ -564,7 +587,7 @@ export function buildStudioApp(options: StudioServerOptions) { // handler — swallow it; the close handler above has already // taken the entry out of the registry. try { - child.kill(); + child.kill("SIGKILL"); } catch { // already gone; nothing to clean up. 
} diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index cf3c15ef..935f95f3 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -325,6 +325,53 @@ describe("TrainRegistry", () => { expect(respawn.kill).toHaveBeenCalledTimes(1); }); + it("dispatchRebuild on win32 routes hash-matches directly to SIGTERM-restart (skips SIGUSR2 attempt)", () => { + // Regression: Node's `child.kill("SIGUSR2")` on Windows is + // documented to **forcefully terminate** the process (treats + // any unknown POSIX signal as SIGKILL-equivalent) and STILL + // returns `true` like a successful delivery. `safeKill` would + // then report `"ok"` → entry lands in `hotSwapTargets` → SPA + // shows "hot-swap" and skips restart, but the child is already + // dead. The Codex P1 fix gates the SIGUSR2 attempt behind + // `process.platform !== "win32"` so win32 routes straight to + // SIGTERM-restart, surfacing a real restart target the SPA can + // act on. + const originalPlatform = Object.getOwnPropertyDescriptor( + process, + "platform", + ); + Object.defineProperty(process, "platform", { + value: "win32", + configurable: true, + }); + try { + const reg = new TrainRegistry(); + const a = fakeChild(951); + a.kill.mockReturnValue(true); // win32 reports success even for SIGUSR2 + reg.register(a as unknown as ChildProcess, { + configHash: "match", + trainFile: "/tmp/win.ts", + }); + const result = reg.dispatchRebuild("match"); + // Restart bucket only — hot-swap is unsafe on win32 even + // when kill() reported "ok". + expect(result.hotSwapTargets).toEqual([]); + expect(result.restartTargets).toEqual([ + { pid: 951, trainFile: "/tmp/win.ts" }, + ]); + // SIGUSR2 was NEVER attempted: the platform gate skipped it + // entirely and went straight to the SIGTERM fallback path. 
+ // (Without the gate, SIGUSR2 would have fired first and been + // misclassified as a successful hot-swap.) + expect(a.kill).toHaveBeenCalledTimes(1); + expect(a.kill).toHaveBeenCalledWith("SIGTERM"); + } finally { + if (originalPlatform) { + Object.defineProperty(process, "platform", originalPlatform); + } + } + }); + it("dispatchRebuild degrades to SIGTERM-restart when SIGUSR2 is unsupported (Windows)", () => { // Regression: Node's win32 build doesn't deliver SIGUSR2 (it // throws "ENOSYS" inside `child.kill('SIGUSR2')`). The previous diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index 1309f22a..f03b9f3b 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -258,19 +258,37 @@ export class TrainRegistry { entry.configHash === nextConfigHash; if (matches) { - const r = safeKill(entry.child, "SIGUSR2"); - if (r === "ok") { - hotSwapTargets.push(target); - continue; - } - if (r === "gone") { - // Child already exited; close handler will unregister. - continue; + // On Windows, Node's `child.kill(signal)` for any unknown + // POSIX signal (including SIGUSR2) is documented to + // **forcefully terminate** the process — same effect as + // SIGKILL — and `kill()` returns `true` like a successful + // delivery. `safeKill` would then report `"ok"`, the entry + // would land in `hotSwapTargets`, and the SPA would never + // schedule a restart even though the child is *dead*. Skip + // the SIGUSR2 attempt on win32 entirely and route directly + // to the SIGTERM-restart path so the SPA learns about the + // pending restart and re-spawns when the exit line arrives. + // The user-visible outcome (callbacks reload after a brief + // restart) matches the design intent on platforms where + // the in-place hot-swap simply isn't available. 
+ if (process.platform !== "win32") { + const r = safeKill(entry.child, "SIGUSR2"); + if (r === "ok") { + hotSwapTargets.push(target); + continue; + } + if (r === "gone") { + // Child already exited; close handler will unregister. + continue; + } + // Cross-platform safety net: SIGUSR2 reported `"unsupported"` + // on a non-win32 platform (rare — `ENOSYS` from libuv signal + // wrap on exotic builds, future Node versions removing the + // signal, etc.). Same fallback as the win32 skip above: + // route to SIGTERM-restart so callback edits still take + // effect via a full restart instead of silently being + // ignored. } - // Windows fallback: SIGUSR2 isn't supported on win32 — degrade - // to a full restart so callback edits don't silently fail to - // apply. The user-visible result (callbacks reload after a - // brief restart) matches the design intent. const fallback = safeKill(entry.child, "SIGTERM"); if (fallback === "ok") { entry.earlyStopRequested = true; From 315c55fac477ac71b26d22df05c9fb9e1ee2a0ec Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 19:43:10 +0900 Subject: [PATCH 41/55] fix: ensure early-stop checkpoint artifacts are correctly returned in trainer wait() result This commit addresses a regression where the artifacts from early-stop checkpoints were not being returned correctly in the `wait()` method of the trainer. The implementation now directly returns the checkpoint's artifacts, ensuring that the outputs are preserved as intended. Additionally, it includes tests to verify this behavior, preventing future regressions related to artifact handling during early-stop scenarios. 
--- packages/arkor/src/core/trainer.test.ts | 99 +++++++++++++++++++++++++ packages/arkor/src/core/trainer.ts | 14 +++- packages/arkor/src/studio/hmr.test.ts | 57 ++++++++++++++ packages/arkor/src/studio/hmr.ts | 61 +++++++++------ 4 files changed, 206 insertions(+), 25 deletions(-) diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 0f1ef06a..82651610 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1424,6 +1424,105 @@ describe("createTrainer (early stop)", () => { expect(result.job.completedAt).toBe("2026-01-01T00:00:03Z"); }); + it("early-stop checkpoint branch returns the checkpoint's artifacts in wait()'s result", async () => { + // Regression: the early-stop terminal return used + // `terminalResult?.artifacts ?? []`, but `wait()` always calls + // `dispatch(parsed, null)` so `terminalResult` was forever + // null → `wait()` resolved with `artifacts: []` even though + // the checkpoint event carries the very artefacts the + // early-stop existed to *preserve* (the whole point of the + // graceful-stop-at-next-checkpoint pattern is to keep that + // work). Now we return `event.artifacts` directly so the + // checkpoint's outputs make it into the resolved result. 
+ await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + const checkpointArtifacts = [ + { kind: "lora_adapter" as const, path: "/checkpoints/step-10/" }, + { kind: "metric" as const, name: "loss", value: 0.42 }, + ]; + const sse = [ + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + `id: 2\nevent: training.log\ndata: ${JSON.stringify({ + type: "training.log", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:02Z", + step: 1, + loss: 0.5, + })}\n\n`, + `id: 3\nevent: checkpoint.saved\ndata: ${JSON.stringify({ + type: "checkpoint.saved", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:03Z", + step: 10, + artifacts: checkpointArtifacts, + })}\n\n`, + ]; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? 
"GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(sseStream(sse), { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + callbacks: { + onLog: () => { + void requestTrainerEarlyStop(trainer, { timeoutMs: 60_000 }); + }, + }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + const original = globalThis.fetch; + globalThis.fetch = fetcher; + let result: Awaited>; + try { + result = await trainer.wait(); + } finally { + globalThis.fetch = original; + } + // The artefacts the checkpoint event carried must travel + // through to the wait() result — that's the whole point of + // graceful-stop-at-next-checkpoint preserving the in-flight + // work. + expect(result.artifacts).toEqual(checkpointArtifacts); + // Sibling assertion: status is still terminal (covered more + // thoroughly in the dedicated test above; this one just + // ensures we didn't accidentally regress the status while + // changing the artefacts return). 
+ expect(result.job.status).toBe("cancelled"); + }); + it("early-stop checkpoint branch still resolves the deferred when cancel() throws", async () => { // Regression: previously, an `await trainer.cancel()` that threw // (network failure / cloud-api 5xx during the cancel POST) would diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index e42a9b3a..d634f861 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -340,7 +340,19 @@ export function createTrainer( completedAt: event.timestamp, }; settleEarlyStopLatch(); - return { terminal: true, artifacts: terminalResult?.artifacts ?? [] }; + // Return the *checkpoint's* artifacts (the ones the user + // just saved) — that's the work HMR went out of its way + // to preserve before issuing cancel(). The previous + // `terminalResult?.artifacts ?? []` always resolved to + // `[]` because `wait()` calls `dispatch(parsed, null)` so + // `terminalResult` is never populated. Effect: an + // HMR-driven early-stop resolved `wait()` with empty + // `artifacts` even though the checkpoint event carried + // the very artifacts the early-stop existed to keep. + return { + terminal: true, + artifacts: (event.artifacts ?? []) as unknown[], + }; } return { terminal: false, artifacts: terminalResult?.artifacts ?? [] }; } diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index 8334c090..c03adb4c 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -323,6 +323,63 @@ describe("createHmrCoordinator", () => { } }); + it("getCurrentArtifactHash() returns null when the artefact doesn't exist (vs a Date.now() fallback)", async () => { + // Regression: a previous implementation did + // `statSync(...) ; return fingerprint(...)`. 
Two stat calls + // means a race window where the file disappears between them: + // the existence check passes, then `fingerprint`'s catch + // branch substitutes `Date.now().toString(36)` (its + // freshness-forcing fallback for SSE dedup), and the getter + // returns a non-null, non-artefact-derived hash. That + // silently breaks `dispatchRebuild`'s pre-ready-spawn gate + // which relies on null === "no artefact, force restart". + // The fix uses `fingerprintOrNull` — single statSync, true + // null on failure. + // + // We assert the getter on a project that has NEVER built + // (no `.arkor/build/index.mjs` ever existed). The bug-fix + // version returns null; the broken version's leftover would + // have been Date.now()-derived non-null. + mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const hmr = createHmrCoordinator({ cwd }); + try { + // No subscribe() yet — watcher hasn't started, so no + // BUNDLE_END has written the artefact. The on-disk + // `.arkor/build/index.mjs` doesn't exist. + expect(hmr.getCurrentArtifactHash()).toBeNull(); + } finally { + await hmr.dispose(); + } + }); + + it("getCurrentArtifactHash() returns a stable mtime/ctime/size hash once the artefact exists", async () => { + // Companion to the null-on-missing test: when the artefact + // *does* exist (watcher's first BUNDLE_END landed), the + // getter returns the same `mtimeMs-ctimeMs-size` shape the + // SSE event's `hash` field uses. Symmetric value lets + // `dispatchRebuild` compare `entry.spawnArtifactHash` against + // `event.hash` directly for the pre-ready-spawn backfill + // decision. 
+ mkdirSync(join(cwd, "src/arkor"), { recursive: true }); + writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); + + const events: HmrEvent[] = []; + const hmr = createHmrCoordinator({ cwd }); + hmr.subscribe((e) => events.push(e)); + try { + const ready = await nextEvent(events, (e) => e.type === "ready"); + const artifactHash = hmr.getCurrentArtifactHash(); + // Same shape as the SSE event's `hash` field — both feed + // through the same `mtimeMs-ctimeMs-size` formula. + expect(artifactHash).toBe(ready.hash ?? null); + expect(artifactHash).toMatch(/^[\d.]+-[\d.]+-\d+$/); + } finally { + await hmr.dispose(); + } + }); + it("getCurrentConfigHash() preserves the last-success hash across an ERROR event", async () => { // Regression: previously `getCurrentConfigHash()` returned // `lastEvent?.configHash ?? null`. After an ERROR landed, diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index e030db5a..9bcbdcba 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -70,26 +70,37 @@ export interface HmrCoordinator { export type HmrOptions = BuildEntryOptions; -function fingerprint(outFile: string): string { +/** + * Single-stat fingerprint with a clean `null` on failure — used by + * `getCurrentArtifactHash()` whose contract is "return a fingerprint + * derived from the artefact bytes, or `null` if no artefact". A + * separate exists-check + `fingerprint()` here would race: the file + * could disappear between the two stats and `fingerprint()`'s + * `Date.now()` fallback would return a non-null hash that doesn't + * describe any real bytes, silently violating the contract. 
+ */ +function fingerprintOrNull(outFile: string): string | null { try { const s = statSync(outFile); - // Mirrors `moduleCacheBustKey`'s success-branch shape so the - // broadcast hash and the import URL move together — and so two - // distinct edits within the same millisecond that produce - // identically-sized output don't collide and silently dedup at - // the SPA layer. `ctimeMs` is the belt-and-braces guard for the - // (rare) `touch -m`-style case where mtime stays put. + // Same shape as `fingerprint()`'s success branch — `ctimeMs` is + // the belt-and-braces guard for `touch -m`-style edits where + // mtime stays put. return `${s.mtimeMs}-${s.ctimeMs}-${s.size}`; } catch { - // Different fallback than `moduleCacheBustKey`'s "0-0-0": that - // helper is for a URL query where the eventual `import()` is - // expected to surface its own missing-file error, but here a - // stable literal would let SPA dedup swallow genuinely-fresh - // events when stat racily fails. Force a unique value instead. - return Date.now().toString(36); + return null; } } +function fingerprint(outFile: string): string { + // Delegate to `fingerprintOrNull` and substitute a freshness- + // forcing token on stat failure. The `Date.now()` fallback + // matters here (vs the "0-0-0" sentinel `moduleCacheBustKey` + // uses): SPA-side SSE dedup keys off this hash, so a stable + // literal during a racy stat would silently swallow genuinely- + // fresh broadcast events. + return fingerprintOrNull(outFile) ?? Date.now().toString(36); +} + type InspectionResult = { configHash: string; trainerName: string; @@ -391,17 +402,19 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { // older (next BUNDLE_END hasn't fired yet but the user just // edited and saved). For the registry's pre-ready-spawn gate // we want "what bytes will the child's `await import()` see - // RIGHT NOW", which only the live `fingerprint(outFile)` - // gives. 
Null falls through `fingerprint`'s catch when the - // file doesn't exist yet — equivalent to "child can't load - // anything", which dispatchRebuild treats as a forced - // SIGTERM-restart. - try { - statSync(resolved.outFile); - } catch { - return null; - } - return fingerprint(resolved.outFile); + // RIGHT NOW". + // + // `fingerprintOrNull` does ONE statSync and returns null on + // failure — preserving the documented contract. A previous + // implementation here did `statSync(...)` first and then + // called `fingerprint()` (which has a `Date.now()` fallback + // baked in for SSE dedup uniqueness). That double-stat + // raced: if the file disappeared between the two calls we'd + // return a Date.now()-derived hash that doesn't describe any + // real bytes, silently violating the "null on stat failure" + // contract dispatchRebuild relies on for its SIGTERM-restart + // routing. + return fingerprintOrNull(resolved.outFile); }, async dispose() { disposed = true; From 40ca9b44f4b4bb2ab80033c6590af5179b8687d0 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 20:19:29 +0900 Subject: [PATCH 42/55] feat: capture cloud-side job ID for user-initiated cancellations This commit introduces functionality to capture the cloud-side job ID from the runner's stdout when a job starts. The `TrainRegistry` is updated to store this job ID, allowing the server to issue a cancel request to the cloud API before terminating the local subprocess with SIGKILL. This ensures that user-initiated stops do not leave cloud jobs running unnecessarily. The implementation includes updates to the `buildStudioApp` and `TrainRegistry` classes, enhancing the cancellation mechanism and preventing resource wastage. 
--- packages/arkor/src/studio/server.ts | 60 ++++++++++++++++++++++ packages/arkor/src/studio/trainRegistry.ts | 51 +++++++++++++++++- 2 files changed, 110 insertions(+), 1 deletion(-) diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index c9c95412..413ee417 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -26,6 +26,18 @@ import { TrainRegistry, type RestartTarget } from "./trainRegistry"; * reads this off `Response.headers` and uses it to scope HMR * `restart` events to the run *this* tab actually started. */ const TRAIN_PID_HEADER = "x-arkor-train-pid"; +/** + * Anchor for picking the cloud-side job id out of the runner's + * stdout. `core/runner.ts` prints exactly one `Started job <id>\n` + * line after `trainer.start()` resolves; the server intercepts + * that line in `/api/train`'s stdout forwarder so it can POST + * `/v1/jobs/:id/cancel` to cloud-api on user-initiated cancel + * (SIGKILL bypasses the runner's own shutdown handlers — see the + * cancel() comment for the full rationale). The id capture group + * matches everything up to the trailing newline so cloud-api + * formats can change without rev'ing this regex. + */ +const STARTED_JOB_PATTERN = /Started job (\S+)/; const DEPRECATION_HEADERS = ["Deprecation", "Sunset", "Warning"] as const; function copyDeprecationHeaders(from: Headers, to: Headers): void { @@ -452,6 +464,18 @@ export function buildStudioApp(options: StudioServerOptions) { // `String()` — same byte content, but allocates a new array. const onChunk = (d: Buffer): void => { if (closed) return; + // Watch for the runner's `Started job <id>` line so the + // cancel handler (below) can POST `/v1/jobs/:id/cancel` + // to cloud-api on user-initiated abort. SIGKILL bypasses + // the runner's `installShutdownHandlers`, so without + // this server-side cancel a Stop click would leave the + // cloud job running until TTL/reaper.
Only parse until + // the id is recorded — the runner prints the line + // exactly once, right after `start()` resolves. + if (activeTrains.getJobId(child.pid) === null) { + const m = STARTED_JOB_PATTERN.exec(d.toString("utf8")); + if (m && m[1]) activeTrains.recordJobId(child.pid, m[1]); + } try { controller.enqueue(d); } catch { @@ -554,9 +578,45 @@ export function buildStudioApp(options: StudioServerOptions) { // OS pipe keeps draining while the child checkpoints — // see `cancelTeardown` for the backpressure rationale. const earlyStopInFlight = activeTrains.isEarlyStopRequested(child.pid); + // Capture the cloud job id BEFORE unregistering — once the + // entry is gone, `getJobId(pid)` returns null and the + // fire-and-forget POST below would no-op. + const jobIdForCancel = activeTrains.getJobId(child.pid); activeTrains.unregister(child.pid); cancelTeardown?.(); if (earlyStopInFlight) return; + // Fire-and-forget cloud-side cancel so the cloud job is + // released even though the SIGKILL below bypasses the + // runner's `installShutdownHandlers` (which would + // otherwise issue cancel itself via the graceful + // early-stop chain). Best-effort: we don't await because + // user-cancel UX should be snappy — the SIGKILL kills the + // local subprocess regardless of whether the cloud POST + // succeeded, and a transient cloud-api blip just means the + // job sits in "running" until the cloud reaper / TTL + // catches it (same fallback as a network drop). `jobId` + // is null when the runner never emitted its `Started job` + // line (early spawn failure, race against a fast cancel, + // custom user bin); skip the POST in that case. 
+ if (jobIdForCancel) { + void (async () => { + try { + const state = await readState(trainCwd); + if (!state) return; // no scope, can't address the job + const rpc = createRpc(); + await rpc.v1.jobs[":id"].cancel.$post({ + param: { id: jobIdForCancel }, + query: { + orgSlug: state.orgSlug, + projectSlug: state.projectSlug, + }, + }); + } catch { + // Best-effort: cloud-api transient failure or scope + // drift. Cloud reaper / TTL is the safety net. + } + })(); + } // SIGKILL (not the default SIGTERM) for user-initiated // aborts. The runner's `installShutdownHandlers` now treats // a single SIGTERM as the HMR-driven "graceful early-stop" diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index f03b9f3b..dc651e66 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -46,6 +46,19 @@ export interface ActiveTrain { * internal to the registry; consumers shouldn't manage it. */ earlyStopRequested?: boolean; + /** + * Cloud-side job id, captured by parsing the runner's + * `Started job <id>` stdout line shortly after spawn. Populated + * via `recordJobId(pid, id)` on the first matching chunk; null + * before that or for runs whose stdout we never saw the line on + * (early spawn failure, custom user bins, etc.). The + * `/api/train` cancel handler reads this to fire a fire-and-forget + * `POST /v1/jobs/:id/cancel` before SIGKILLing the subprocess — + * SIGKILL bypasses the runner's `installShutdownHandlers`, so + * without this server-side cancel the cloud-side job would live + * until the cloud reaper / TTL fires (continued GPU spend).
+ */ + jobId: string | null; } export interface RestartTarget { @@ -119,7 +132,7 @@ export class TrainRegistry { child: ChildProcess, init: Omit< ActiveTrain, - "child" | "earlyStopRequested" | "spawnArtifactHash" + "child" | "earlyStopRequested" | "spawnArtifactHash" | "jobId" > & { // Optional in the signature so tests / future callers that // don't track the on-disk artefact fingerprint (e.g. an HMR- @@ -138,6 +151,11 @@ export class TrainRegistry { ...init, spawnArtifactHash: init.spawnArtifactHash ?? null, earlyStopRequested: false, + // `jobId` starts null — populated later by `recordJobId(pid, + // id)` when the server's stdout parser sees the runner's + // `Started job <id>` line. Tests that don't exercise the + // cancel-POST path can leave it null. + jobId: null, }); } @@ -145,6 +163,37 @@ export class TrainRegistry { if (typeof pid === "number") this.entries.delete(pid); } + /** + * Record the cloud-side job id for an active child. Called by the + * server's `/api/train` stdout parser the first time it spots + * `Started job <id>` in the runner's output. Idempotent: a + * second call with the same pid + id is a no-op (the runner + * only prints the line once anyway). Unknown pids are silently + * dropped (the child may have already exited and unregistered). + */ + recordJobId(pid: number | undefined, jobId: string): void { + if (typeof pid !== "number") return; + const entry = this.entries.get(pid); + if (!entry) return; + entry.jobId = jobId; + } + + /** + * Read the recorded cloud-side job id for a pid. `/api/train`'s + * cancel handler consults this to POST `/v1/jobs/:id/cancel` + * before SIGKILLing the local subprocess — without that POST, + * a user-initiated stop would leave the cloud job running + * until TTL (the SIGKILL bypasses the runner's `installShutdownHandlers` + * so the runner can't issue cancel itself).
Returns null when + * the pid is unknown or the runner hasn't printed its + * `Started job` line yet (early spawn failure, race against + * a fast cancel, custom user bins). + */ + getJobId(pid: number | undefined): string | null { + if (typeof pid !== "number") return null; + return this.entries.get(pid)?.jobId ?? null; + } + /** * Whether `dispatchRebuild` has already issued a graceful-restart * SIGTERM to this child as part of an HMR cycle. Consulted by From 8cce117042af402747712898266217a3016bbcfa Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 20:24:52 +0900 Subject: [PATCH 43/55] test: add regression test for cloud job cancellation on SIGKILL This commit introduces a new test to verify that the server correctly issues a cancel POST to the cloud API when a training job is stopped using SIGKILL. The test ensures that the cloud job is released properly, preventing unnecessary resource consumption. It captures the job ID from the runner's stdout and checks that the cancel request is made with the correct parameters, reinforcing the functionality added in previous commits. --- packages/arkor/src/studio/server.test.ts | 113 +++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index c5841655..dc04d16b 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -642,6 +642,119 @@ process.exit(0); expect(getCurrentCalls).toBe(1); }); + it("/api/train cancel POSTs cloud /v1/jobs/:id/cancel so the cloud job is released even though SIGKILL bypasses the runner's shutdown handlers", async () => { + // Regression: SIGKILL kills the runner without giving its + // `installShutdownHandlers` a chance to issue the cloud + // `cancel()` POST itself. 
Without a server-side equivalent + // the cloud job sits in "running" until TTL/reaper, so a + // user clicking "Stop training" silently keeps consuming + // GPU spend. The fix parses the runner's `Started job ` + // stdout line, records the id on the registry entry, and + // fires a fire-and-forget POST to cloud-api on cancel + // *before* SIGKILLing. + await writeCredentials(ANON_CREDS); + // The cancel POST reads scope from `.arkor/state.json` (not + // from the anon creds' orgSlug — that's a different code + // path). Pre-seed so the POST can address the cloud job. + await writeState( + { + orgSlug: "cancel-test-org", + projectSlug: "cancel-test-project", + projectId: "p-cancel", + }, + trainCwd, + ); + // Bin prints the canonical "Started job " line then + // hangs (just like the real runner after `start()` resolves). + // The id is the same kind of identifier cloud-api would + // mint — opaque string we'll verify shows up in the cancel + // POST URL below. + const FAKE_JOB_ID = "j-cancel-test"; + const fakeBin = join(trainCwd, "started-job-bin.mjs"); + writeFileSync( + fakeBin, + `process.stdout.write("Started job ${FAKE_JOB_ID}\\n"); + process.on("SIGTERM", () => {}); + setInterval(() => {}, 60_000); + `, + ); + // Capture the cloud-api requests so we can verify the + // server's cancel POST landed with the right job id + + // scope. The default fetch in this suite would 404 our POST + // and leave it as `cancelCalls === 0`. + let cancelHits: Array<{ url: string; method: string }> = []; + const ORIG_FETCH = globalThis.fetch; + globalThis.fetch = (async ( + input: Parameters[0], + init?: Parameters[1], + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? 
"GET"; + if ( + method === "POST" && + url.includes(`/v1/jobs/${FAKE_JOB_ID}/cancel`) + ) { + cancelHits.push({ url, method }); + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + // Pass-through default: anything else 404s — which would + // surface as a test-side failure if our cancel POST + // doesn't match the expected URL shape. + return new Response("not found", { status: 404 }); + }) as typeof fetch; + + try { + const app = buildStudioApp({ + baseUrl: "http://mock-cloud-api", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: fakeBin, + }); + const trainRes = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(trainRes.status).toBe(200); + // Read enough of the body to ensure the runner's + // `Started job ` chunk has been processed by the + // server's stdout parser (without this, cancel could + // race ahead of the parser and find no jobId on the + // registry → no cancel POST → false test failure). + const reader = trainRes.body!.getReader(); + const decoder = new TextDecoder(); + let buf = ""; + while (!buf.includes(`Started job ${FAKE_JOB_ID}`)) { + const { value, done } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + } + // Trigger cancel — should fire the cloud POST + SIGKILL. + await reader.cancel(); + // Fire-and-forget: give the void IIFE a tick to actually + // dispatch the fetch + receive the 200 response. + await new Promise((r) => setTimeout(r, 200)); + + expect(cancelHits).toHaveLength(1); + expect(cancelHits[0]?.url).toContain(`/v1/jobs/${FAKE_JOB_ID}/cancel`); + // Scope is required by the cloud-api contract — comes from + // `.arkor/state.json` (seeded above), not the anon creds. 
+ expect(cancelHits[0]?.url).toContain("orgSlug=cancel-test-org"); + expect(cancelHits[0]?.url).toContain("projectSlug=cancel-test-project"); + } finally { + globalThis.fetch = ORIG_FETCH; + } + }); + it("/api/train cancel sends SIGKILL so user-initiated stop bypasses the runner's graceful early-stop", async () => { // Regression: a default `child.kill()` sends SIGTERM, which // the runner's `installShutdownHandlers` now interprets as a From 6970542a7c5c3a40e77e269cdeb5d6e3570ed2c5 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 21:12:01 +0900 Subject: [PATCH 44/55] fix: reject early-stop deferred on cancel error to prevent silent failures This commit updates the early-stop mechanism in the trainer to reject the deferred promise when an error occurs during the cancellation process. Previously, errors were swallowed, leading to silent failures where the cloud job continued running despite the local cancellation. The change ensures that cancellation errors are propagated, allowing the shutdown handler to log the failure, thus improving visibility and operator intervention capabilities. Additionally, the test suite is updated to verify this behavior, preventing future regressions. 
--- packages/arkor/src/core/trainer.test.ts | 75 +++++++++++++-------- packages/arkor/src/core/trainer.ts | 58 +++++++++++----- packages/arkor/src/studio/server.ts | 90 ++++++++++++++++++------- 3 files changed, 156 insertions(+), 67 deletions(-) diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 82651610..5be7fbeb 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1523,14 +1523,17 @@ describe("createTrainer (early stop)", () => { expect(result.job.status).toBe("cancelled"); }); - it("early-stop checkpoint branch still resolves the deferred when cancel() throws", async () => { + it("early-stop checkpoint branch rejects the deferred when cancel() throws (visible to shutdown handler)", async () => { // Regression: previously, an `await trainer.cancel()` that threw - // (network failure / cloud-api 5xx during the cancel POST) would - // propagate out of the dispatch and leave `earlyStopDeferred` - // pending forever. The runner's SIGTERM handler awaits that - // promise before exiting, so the subprocess would hang on - // shutdown. The fix swallows the cancel throw best-effort and - // still marks the run terminal locally so the deferred resolves. + // (network failure / cloud-api 5xx during the cancel POST) was + // *swallowed*, the deferred resolved cleanly, and the runner + // exited 0 — the UI declared the run cancelled while the cloud + // job kept running, orphaning GPU spend with no visible error. + // The fix REJECTS the deferred so the runner's + // `installShutdownHandlers` `.catch()` writes the failure to + // stderr, surfacing the issue to the operator. The latch is + // still always settled (resolved or rejected), so shutdown + // doesn't hang waiting for a checkpoint that will never come. 
await writeState( { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, cwd, @@ -1582,6 +1585,14 @@ describe("createTrainer (early stop)", () => { throw new Error(`unexpected fetch: ${method} ${url}`); }) as typeof fetch; + // Capture the very-first armed early-stop promise so we can + // assert its settlement state below. The trainer is mutually + // recursive with the callback (`onLog` calls + // `requestTrainerEarlyStop(trainer, ...)`), so we declare it + // first as `let` and assign in a second step. + let armedPromise: Promise | null = null; + let armedResult: "resolved" | "rejected" | "pending" = "pending"; + let armedError: unknown = null; const trainer = createTrainer( { name: "run", @@ -1589,7 +1600,24 @@ describe("createTrainer (early stop)", () => { dataset: { type: "huggingface", name: "x" }, callbacks: { onLog: () => { - void requestTrainerEarlyStop(trainer, { timeoutMs: 60_000 }); + // Arm exactly once and capture the returned promise. + // requestTrainerEarlyStop is idempotent across repeat + // calls, but we only need the FIRST armed deferred — + // the cancel-throw rejects exactly that promise. + if (armedPromise === null) { + armedPromise = requestTrainerEarlyStop(trainer, { + timeoutMs: 60_000, + }); + armedPromise.then( + () => { + armedResult = "resolved"; + }, + (err: unknown) => { + armedResult = "rejected"; + armedError = err; + }, + ); + } }, }, }, @@ -1597,32 +1625,23 @@ describe("createTrainer (early stop)", () => { ); const original = globalThis.fetch; globalThis.fetch = fetcher; - let stopPromiseResult: "resolved" | "rejected" | "pending" = "pending"; - const stopPromise = new Promise((resolve) => { - // Don't drive the early-stop ourselves — `onLog` arms it. We - // just want to verify that whichever code path ultimately drives - // it sees a resolved deferred even though cancel() throws. 
- const tick = setInterval(() => { - // Probe the trainer's state via a fresh requestEarlyStop call: - // once the cancel-after-checkpoint branch ran, status is - // "cancelled" and this returns instantly. - void requestTrainerEarlyStop(trainer, { timeoutMs: 1 }).then(() => { - clearInterval(tick); - stopPromiseResult = "resolved"; - resolve(); - }); - }, 25); - }); try { await trainer.wait(); - // Wait for the probe to confirm the deferred resolves. - await stopPromise; + // Flush microtasks so the .then(resolve, reject) handler + // observes the settlement before we assert. + await new Promise((r) => setImmediate(r)); } finally { globalThis.fetch = original; } - // cancel() was attempted (and threw) but the deferred still resolved. + // cancel() was attempted (and threw). expect(cancelAttempts).toBe(1); - expect(stopPromiseResult).toBe("resolved"); + // The armed deferred REJECTED — the runner's `.catch()` would + // see this error and log it to stderr instead of silently + // exiting 0. Critically: it didn't hang on "pending"; the + // failure case still settles, just via reject not resolve. + expect(armedResult).toBe("rejected"); + expect(armedError).toBeInstanceOf(TypeError); + expect((armedError as Error).message).toBe("fetch failed"); }); it("resolves the early-stop latch when the run hits a terminal event before the next checkpoint", async () => { diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index d634f861..de66c0c4 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -175,6 +175,7 @@ export function createTrainer( let earlyStopDeferred: { promise: Promise; resolve: () => void; + reject: (err: unknown) => void; timer: NodeJS.Timeout | null; } | null = null; let earlyStopRequested = false; @@ -313,20 +314,28 @@ export function createTrainer( // Early-stop latch: a checkpoint just landed, so the in-flight work // is durable. Cancel the cloud job and end `wait()` cleanly. 
if (earlyStopRequested && earlyStopDeferred) { - // Best-effort `cancel()` — swallow throws so the deferred - // *always* resolves and the SIGTERM handler waiting on - // `requestEarlyStop()` can exit. Letting an error propagate - // here would leave the deferred pending and the runner - // process hung on shutdown; the local `startedJob.status` - // is set to `cancelled` regardless so subsequent - // `requestEarlyStop` calls see the terminal-status - // short-circuit. The cookbook already calls `cancel()` - // best-effort, so users tolerating a transient cloud-api - // failure here matches the documented contract. + // Capture the cancel error (if any) but DON'T swallow + // silently — propagate via the deferred's reject path so + // the runner's `installShutdownHandlers` `.catch()` writes + // the failure to stderr. The previous swallow let a + // transient cloud-api failure during early-stop appear + // as a clean cancel: the local runner exited 0, the UI + // declared the run cancelled, but the cloud job kept + // running (continued GPU spend). Keeping the error + // visible to the shutdown handler lets the operator see + // it and intervene. + // + // We still mark `startedJob.status` terminal locally + // either way — from the runner's perspective the run is + // over, and a subsequent `requestEarlyStop()` call must + // hit the `TERMINAL_STATUSES.has(...)` short-circuit + // (re-arming a fresh latch on a dead run would hang + // shutdown). + let cancelError: unknown = null; try { await trainer.cancel(); - } catch { - // intentionally ignored — see comment above. + } catch (err) { + cancelError = err; } // Reflect the cancellation locally so `wait()`'s resolved // `TrainingResult.job.status` is a terminal status (per the @@ -339,7 +348,19 @@ export function createTrainer( status: "cancelled", completedAt: event.timestamp, }; - settleEarlyStopLatch(); + if (cancelError !== null) { + // Reject (not resolve) the latch. 
Mirrors the success + // path's bookkeeping (clear timer, null out shared + // slot, drop the request flag) so a follow-up + // `requestEarlyStop()` won't piggyback on the rejected + // promise. + if (earlyStopDeferred.timer) clearTimeout(earlyStopDeferred.timer); + earlyStopDeferred.reject(cancelError); + earlyStopDeferred = null; + earlyStopRequested = false; + } else { + settleEarlyStopLatch(); + } // Return the *checkpoint's* artifacts (the ones the user // just saved) — that's the work HMR went out of its way // to preserve before issuing cancel(). The previous @@ -588,8 +609,10 @@ export function createTrainer( earlyStopRequested = true; let resolveFn!: () => void; - const promise = new Promise((resolve) => { + let rejectFn!: (err: unknown) => void; + const promise = new Promise((resolve, reject) => { resolveFn = resolve; + rejectFn = reject; }); const timeoutMs = opts.timeoutMs ?? DEFAULT_EARLY_STOP_TIMEOUT_MS; const timer = setTimeout(() => { @@ -625,7 +648,12 @@ export function createTrainer( // `Timer.unref` keeps the early-stop timer from blocking process exit // when the host runtime finishes for unrelated reasons. timer.unref?.(); - earlyStopDeferred = { promise, resolve: resolveFn, timer }; + earlyStopDeferred = { + promise, + resolve: resolveFn, + reject: rejectFn, + timer, + }; return promise; } diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 413ee417..6b53d0c9 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -27,17 +27,28 @@ import { TrainRegistry, type RestartTarget } from "./trainRegistry"; * `restart` events to the run *this* tab actually started. */ const TRAIN_PID_HEADER = "x-arkor-train-pid"; /** - * Anchor for picking the cloud-side job id out of the runner's - * stdout. 
`core/runner.ts` prints exactly one `Started job \n` - * line after `trainer.start()` resolves; the server intercepts - * that line in `/api/train`'s stdout forwarder so it can POST - * `/v1/jobs/:id/cancel` to cloud-api on user-initiated cancel - * (SIGKILL bypasses the runner's own shutdown handlers — see the - * cancel() comment for the full rationale). The id capture group - * matches everything up to the trailing newline so cloud-api - * formats can change without rev'ing this regex. + * Strict full-line match for the runner's `Started job ` line. + * `core/runner.ts` prints exactly that text — `process.stdout.write(\`Started job ${jobId}\n\`)` — + * after `trainer.start()` resolves; the server's `/api/train` + * stdout forwarder line-buffers chunks (chunk boundaries are + * arbitrary, so a substring scan against raw chunks would miss + * splits) and applies this regex to each complete line so it can + * POST `/v1/jobs/:id/cancel` to cloud-api on user-initiated + * cancel (SIGKILL bypasses the runner's own shutdown handlers — + * see the `cancel()` comment for the full rationale). + * + * Anchors `^…$` matter for two reasons: + * - Avoid false matches when a user `console.log` happens to + * contain the substring "Started job " *before* the + * runner's canonical line lands; once we record an id we + * stop scanning, so a stray earlier match would stick and + * Stop-training would POST cancel for the wrong (or + * non-existent) job. + * - Restrict the id capture to non-whitespace, mirroring what + * `runner.ts` writes (cloud-api job ids are word-shaped, + * never contain spaces). */ -const STARTED_JOB_PATTERN = /Started job (\S+)/; +const STARTED_JOB_PATTERN = /^Started job (\S+)$/; const DEPRECATION_HEADERS = ["Deprecation", "Sunset", "Warning"] as const; function copyDeprecationHeaders(from: Headers, to: Headers): void { @@ -462,6 +473,21 @@ export function buildStudioApp(options: StudioServerOptions) { // round-trip through `TextEncoder`. 
The previous code did + // `enc.encode(d)` which implicitly coerced the buffer via + // `String()` — same byte content, but allocates a new array. + // Carry-over buffer for line-oriented job-id extraction. + // Stream chunk boundaries are arbitrary — the runner's + // single-line `Started job <id>` write can land split + // across two `data` events, in which case a per-chunk + // regex would never match and the cancel POST chain + // would never fire (cloud-job orphan on Stop). We + // accumulate text until a newline, parse the complete + // line, and keep any trailing partial for the next + // chunk. Cleared the moment the id is recorded so a + // chatty bin doesn't pin memory after the marker has + // landed; capped at 4 KiB regardless to bound a + // misbehaving bin that never emits a newline before the + // marker (the canonical line is well under 100 bytes). + let stdoutLineBuf = ""; + const STARTED_JOB_BUFFER_CAP = 4096; const onChunk = (d: Buffer): void => { if (closed) return; // Watch for the runner's `Started job <id>` line so the @@ -469,12 +495,29 @@ export function buildStudioApp(options: StudioServerOptions) { // to cloud-api on user-initiated abort. SIGKILL bypasses // the runner's `installShutdownHandlers`, so without // this server-side cancel a Stop click would leave the - // cloud job running until TTL/reaper. Only parse until - // the id is recorded — the runner prints the line - // exactly once, right after `start()` resolves. + // cloud job running until TTL/reaper. Stop scanning + // once the id is recorded — the runner prints the line + // exactly once. 
if (activeTrains.getJobId(child.pid) === null) { - const m = STARTED_JOB_PATTERN.exec(d.toString("utf8")); - if (m && m[1]) activeTrains.recordJobId(child.pid, m[1]); + stdoutLineBuf += d.toString("utf8"); + let nl = stdoutLineBuf.indexOf("\n"); + while (nl !== -1) { + // Strip a possible \r so CRLF-emitting bins (rare for + // Node `process.stdout.write` but defensive) match + // the same anchored pattern. + const line = stdoutLineBuf.slice(0, nl).replace(/\r$/, ""); + stdoutLineBuf = stdoutLineBuf.slice(nl + 1); + const m = STARTED_JOB_PATTERN.exec(line); + if (m && m[1]) { + activeTrains.recordJobId(child.pid, m[1]); + stdoutLineBuf = ""; + break; + } + nl = stdoutLineBuf.indexOf("\n"); + } + if (stdoutLineBuf.length > STARTED_JOB_BUFFER_CAP) { + stdoutLineBuf = stdoutLineBuf.slice(-STARTED_JOB_BUFFER_CAP); + } } try { controller.enqueue(d); @@ -630,15 +673,14 @@ export function buildStudioApp(options: StudioServerOptions) { // unregister-before-graceful-exit window where a fast new // run could overlap an old one untracked by HMR routing. // - // Trade-off: the runner can't POST `/v1/jobs/:id/cancel` to - // cloud-api on its way out (its early-stop chain is - // bypassed). The cloud-side job is left orphaned until the - // server reaper / TTL kicks in. This matches the pre-PR - // behaviour (the runner had no signal handler at all then; - // SIGTERM also killed it without cloud cancel). Sending a - // direct `/v1/jobs/:id/cancel` from the server here is a - // separate follow-up — would need the jobId, which the - // server doesn't currently parse out of stdout. + // The cloud-side job is released by the fire-and-forget + // POST above (we recorded the runner's `Started job ` + // line on the registry; the IIFE looks it up here). SIGKILL + // alone would have left the cloud job orphaned until + // TTL/reaper because the runner can't POST cancel itself + // when the kernel reaps it without warning. 
Together — + // server-side cancel POST + SIGKILL — give snappy local + // teardown AND eventual cloud-side release. // // `ChildProcess.kill()` can throw (ESRCH if the process has // already exited between this handler's invocation and the From 8ed9eb304af85235c87109cf600ccafdb2f5b8bf Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sat, 9 May 2026 21:54:27 +0900 Subject: [PATCH 45/55] fix: update exit codes for signal handling in cleanup hooks This commit modifies the exit codes returned by the cleanup hooks to align with POSIX standards, ensuring that the process exits with the correct code based on the signal received (e.g., SIGINT results in exit code 130). This change allows parent shells and orchestrators to accurately distinguish between user interruptions and normal completions. Additionally, new tests are added to verify the correct exit codes for various signals, enhancing the robustness of the cleanup mechanism. --- packages/arkor/src/cli/cleanupHooks.test.ts | 51 ++++- packages/arkor/src/cli/cleanupHooks.ts | 26 ++- packages/arkor/src/cli/commands/dev.test.ts | 10 +- packages/arkor/src/studio/server.test.ts | 223 ++++++++++++++++++++ packages/arkor/src/studio/server.ts | 96 +++++---- 5 files changed, 364 insertions(+), 42 deletions(-) diff --git a/packages/arkor/src/cli/cleanupHooks.test.ts b/packages/arkor/src/cli/cleanupHooks.test.ts index 7e90f1ad..ab90031e 100644 --- a/packages/arkor/src/cli/cleanupHooks.test.ts +++ b/packages/arkor/src/cli/cleanupHooks.test.ts @@ -78,7 +78,11 @@ describe("registerCleanupHook", () => { await flushMicrotasks(); expect(order).toEqual(["sync-cleanup", "async-cleanup-finished"]); - expect(codes).toEqual([0]); + // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) so parent + // shells / orchestrators can distinguish "user interrupted" + // from "ran to completion (0)" — see SIGNAL_EXIT_CODE in + // cleanupHooks.ts. 
+ expect(codes).toEqual([130]); }); it("waits for sibling async cleanups even when the exit-owning hook is registered FIRST", async () => { @@ -133,7 +137,44 @@ describe("registerCleanupHook", () => { await flushMicrotasks(); expect(order).toEqual(["sync-cleanup", "async-cleanup-finished"]); - expect(codes).toEqual([0]); + // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) so parent + // shells / orchestrators can distinguish "user interrupted" + // from "ran to completion (0)" — see SIGNAL_EXIT_CODE in + // cleanupHooks.ts. + expect(codes).toEqual([130]); + }); + + it("exits with the POSIX 128+signo code for each terminating signal (130/143/129)", async () => { + // Regression: the exit-owning hook used to always + // `process.exit(0)`, regardless of which signal fired the + // shutdown. Parent shells / orchestrators / CI runners that + // gate on signal-style nonzero status would mis-classify a + // Ctrl-C (SIGINT) as a clean run — `arkor dev || cleanup` + // would skip the cleanup branch and leave whatever it owned + // unreaped. POSIX convention is 128 + signo (SIGINT=2 → 130, + // SIGTERM=15 → 143, SIGHUP=1 → 129); SIGNAL_EXIT_CODE in + // cleanupHooks.ts pins the mapping. + const cases: Array<["SIGINT" | "SIGTERM" | "SIGHUP", number]> = [ + ["SIGINT", 130], + ["SIGTERM", 143], + ["SIGHUP", 129], + ]; + for (const [sig, expected] of cases) { + registerCleanupHook({ cleanup: () => {}, exitOnSignal: true }); + const codes = mockExit(); + process.emit(sig, sig); + // queueMicrotask + Promise.allSettled chain — two flushes + // mirror the existing tests. + await flushMicrotasks(); + await flushMicrotasks(); + expect(codes, `signal ${sig}`).toEqual([expected]); + // Reset for the next iteration's hook registration so the + // new SIGNAL_EXIT_CODE doesn't get clobbered by leftover + // listeners. 
+ __resetCleanupHooksForTests(); + exitSpy?.mockRestore(); + exitSpy = null; + } }); it("auto-detaches its process listeners after firing so they don't accumulate", () => { @@ -206,6 +247,10 @@ describe("registerCleanupHook", () => { expect(invocations).toBe(1); // First SIGINT fires the handler → exit(0); follow-ups hit no // listener after auto-detach, so codes has exactly one entry. - expect(codes).toEqual([0]); + // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) so parent + // shells / orchestrators can distinguish "user interrupted" + // from "ran to completion (0)" — see SIGNAL_EXIT_CODE in + // cleanupHooks.ts. + expect(codes).toEqual([130]); }); }); diff --git a/packages/arkor/src/cli/cleanupHooks.ts b/packages/arkor/src/cli/cleanupHooks.ts index f63b171f..a37062f6 100644 --- a/packages/arkor/src/cli/cleanupHooks.ts +++ b/packages/arkor/src/cli/cleanupHooks.ts @@ -1,5 +1,22 @@ const TERMINATING_SIGNALS = ["SIGINT", "SIGTERM", "SIGHUP"] as const; +/** + * POSIX-style exit code for a signal-terminated process: `128 + signo`. + * Parent shells / orchestrators rely on this to distinguish "user + * interrupted" (nonzero) from "ran to completion" (zero) — exiting 0 + * for a SIGINT'd `arkor dev` would make CI / shell loops / `&&` + * chains misclassify the interruption as success. The numbers below + * are the canonical signo values from POSIX (1=HUP, 2=INT, 15=TERM). + */ +const SIGNAL_EXIT_CODE: Record< + (typeof TERMINATING_SIGNALS)[number], + number +> = { + SIGHUP: 129, + SIGINT: 130, + SIGTERM: 143, +}; + export interface CleanupHookOptions { /** * Idempotent cleanup body. Wrapped with a `done` guard so a noisy @@ -106,6 +123,13 @@ export function registerCleanupHook(options: CleanupHookOptions): void { run(); detach(); if (!options.exitOnSignal) return; + // Capture which signal triggered shutdown so the exit code + // below reflects "interrupted by SIG" (POSIX 128 + signo) + // rather than "ran to completion" (0). 
Parent shells / + // orchestrators / CI runners distinguish these — a script + // that runs `arkor dev || cleanup_on_failure` would otherwise + // mis-classify a Ctrl-C as success and skip its cleanup. + const exitCode = SIGNAL_EXIT_CODE[sig]; // Snapshot `inFlightCleanups` AFTER every other signal listener // for this signal has run. Node's EventEmitter dispatches // listeners synchronously in registration order, so if the @@ -127,7 +151,7 @@ export function registerCleanupHook(options: CleanupHookOptions): void { // microtask round-trip). queueMicrotask(() => { void Promise.allSettled([...inFlightCleanups]).then(() => - process.exit(0), + process.exit(exitCode), ); }); }); diff --git a/packages/arkor/src/cli/commands/dev.test.ts b/packages/arkor/src/cli/commands/dev.test.ts index aec80771..a18aca8c 100644 --- a/packages/arkor/src/cli/commands/dev.test.ts +++ b/packages/arkor/src/cli/commands/dev.test.ts @@ -712,7 +712,10 @@ describe("runDev", () => { // Exit fires after `Promise.allSettled(asyncCleanups)` resolves — // a few microticks later. Flush to let the queued exit run. await flushMicrotasks(); - expect(exitSpy).toHaveBeenCalledWith(0); + // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) — see + // SIGNAL_EXIT_CODE in cleanupHooks.ts. Parent shells need + // the nonzero code to distinguish interrupt from clean exit. + expect(exitSpy).toHaveBeenCalledWith(130); } finally { exitSpy.mockRestore(); } @@ -754,7 +757,10 @@ describe("runDev", () => { // ran (best-effort `unlinkSync` swallows ENOENT) and the // exit-on-signal arm fired (after async cleanup tails settle). await flushMicrotasks(); - expect(exitSpy).toHaveBeenCalledWith(0); + // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) — see + // SIGNAL_EXIT_CODE in cleanupHooks.ts. Parent shells need + // the nonzero code to distinguish interrupt from clean exit. 
+ expect(exitSpy).toHaveBeenCalledWith(130); } finally { exitSpy.mockRestore(); } diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index dc04d16b..0954e327 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -642,6 +642,108 @@ process.exit(0); expect(getCurrentCalls).toBe(1); }); + it("/api/train job-id parser ignores stderr so a `Started job ` line on stderr can't hijack the cancel POST", async () => { + // Regression: the job-id detector used to consume both + // stdout AND stderr through a shared `onChunk` + shared + // line buffer. A user `console.error("Started job ")` + // on stderr would then poison the buffer first; the real + // stdout marker arrives later but our `getJobId(...) === null` + // gate has already short-circuited subsequent scans, so + // Stop-training POSTs cancel for the wrong (decoy) job and + // the real one keeps running — silent cloud orphan. + // Splitting into a stdout-only `onStdoutChunk` parser and a + // forward-only `onStderrChunk` makes stderr unable to + // populate `jobId` regardless of what the user logs there. + await writeCredentials(ANON_CREDS); + await writeState( + { + orgSlug: "stderr-test-org", + projectSlug: "stderr-test-project", + projectId: "p-stderr", + }, + trainCwd, + ); + // Bin emits a decoy `Started job ` to STDERR first + // (would poison the shared buffer), then the canonical real + // line to STDOUT, then hangs. With the split we expect the + // real id to win; with the bug the decoy would win. + const REAL_JOB_ID = "real-job-id"; + const DECOY_JOB_ID = "decoy-from-stderr"; + const fakeBin = join(trainCwd, "stderr-decoy-bin.mjs"); + writeFileSync( + fakeBin, + `process.stderr.write("Started job ${DECOY_JOB_ID}\\n"); + // Slight delay so stderr lands first. 
+ setTimeout(() => { + process.stdout.write("Started job ${REAL_JOB_ID}\\n"); + }, 30); + process.on("SIGTERM", () => {}); + setInterval(() => {}, 60_000); + `, + ); + let cancelHits: Array<{ url: string }> = []; + const ORIG_FETCH = globalThis.fetch; + globalThis.fetch = (async ( + input: Parameters[0], + init?: Parameters[1], + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && /\/v1\/jobs\/[^/]+\/cancel/.test(url)) { + cancelHits.push({ url }); + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + return new Response("not found", { status: 404 }); + }) as typeof fetch; + + try { + const app = buildStudioApp({ + baseUrl: "http://mock-cloud-api", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: fakeBin, + }); + const trainRes = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(trainRes.status).toBe(200); + // Drain until the REAL line is in the body. Both the + // decoy and the real line forward through to the SPA log + // stream, so both bytes show up here regardless of which + // (if any) the parser captures. + const reader = trainRes.body!.getReader(); + const decoder = new TextDecoder(); + let buf = ""; + while (!buf.includes(`Started job ${REAL_JOB_ID}`)) { + const { value, done } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + } + await reader.cancel(); + await new Promise((r) => setTimeout(r, 200)); + + // The cancel POST must target the REAL id. With the bug + // the decoy would have been recorded first → cancelHits[0] + // would contain `decoy-from-stderr` instead. 
+ expect(cancelHits).toHaveLength(1); + expect(cancelHits[0]?.url).toContain(`/v1/jobs/${REAL_JOB_ID}/cancel`); + expect(cancelHits[0]?.url).not.toContain(DECOY_JOB_ID); + } finally { + globalThis.fetch = ORIG_FETCH; + } + }); + it("/api/train cancel POSTs cloud /v1/jobs/:id/cancel so the cloud job is released even though SIGKILL bypasses the runner's shutdown handlers", async () => { // Regression: SIGKILL kills the runner without giving its // `installShutdownHandlers` a chance to issue the cloud @@ -2013,6 +2115,127 @@ process.exit(0); expect(fake.subscriberCount).toBe(1); }); + it("/api/train cancel still fires cloud cancel POST + SIGKILL even when HMR has already requested early-stop", async () => { + // Regression: the cancel handler used to short-circuit + // (`if (earlyStopInFlight) return;`) when HMR's + // `dispatchRebuild` had already SIGTERMed the child for a + // graceful checkpoint-wait early-stop. That gate was added + // to avoid a second SIGTERM piling on top of the first + // (which would have triggered the runner's `exit(143)` + // emergency path and broken cloud cancel POSTing). With + // SIGKILL replacing the user-stop SIGTERM, the + // double-signal worry no longer applies — and the gate + // turned a Stop click during HMR's graceful window into a + // total no-op, leaving the run alive until checkpoint / + // 5-min timeout. Manual stop now overrides HMR's graceful + // path: server POSTs cloud cancel + SIGKILLs the + // subprocess regardless of `isEarlyStopRequested`. + await writeCredentials(ANON_CREDS); + await writeState( + { + orgSlug: "manual-override-org", + projectSlug: "manual-override-project", + projectId: "p-manual", + }, + trainCwd, + ); + const FAKE_JOB_ID = "manual-stop-during-hmr"; + const fakeBin = join(trainCwd, "manual-during-hmr-bin.mjs"); + // SIGTERM no-op so HMR's graceful SIGTERM doesn't terminate + // the bin — we need it alive so the subsequent manual + // cancel actually has something to SIGKILL. 
+ writeFileSync( + fakeBin, + `process.stdout.write("Started job ${FAKE_JOB_ID}\\n"); + process.on("SIGTERM", () => {}); + setInterval(() => {}, 60_000); + `, + ); + let cancelHits: Array<{ url: string }> = []; + const ORIG_FETCH = globalThis.fetch; + globalThis.fetch = (async ( + input: Parameters[0], + init?: Parameters[1], + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && /\/v1\/jobs\/[^/]+\/cancel/.test(url)) { + cancelHits.push({ url }); + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + return new Response("not found", { status: 404 }); + }) as typeof fetch; + + try { + const fake = fakeHmr("h1"); + const app = buildStudioApp({ + baseUrl: "http://mock-cloud-api", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: fakeBin, + hmr: fake.coordinator, + }); + const trainRes = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(trainRes.status).toBe(200); + const pid = Number(trainRes.headers.get("x-arkor-train-pid")); + // Drain until the parser has recorded the job id. + const reader = trainRes.body!.getReader(); + const decoder = new TextDecoder(); + let buf = ""; + while (!buf.includes(`Started job ${FAKE_JOB_ID}`)) { + const { value, done } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + } + // Emit an HMR mismatch — server's dispatch SIGTERMs the + // bin and sets `earlyStopRequested = true` on the entry. + // The bin's SIGTERM no-op keeps it alive so the manual + // cancel below has a target. 
+ fake.emit({ + type: "ready", + outFile: "/tmp/x.mjs", + hash: "abc", + configHash: "h2", // mismatch with spawn-time "h1" + trainerName: "t", + }); + // Let the dispatch run + signal land. + await new Promise((r) => setTimeout(r, 80)); + + // Manual cancel — old code would have early-returned; new + // code POSTs cloud cancel + SIGKILLs. + await reader.cancel(); + await new Promise((r) => setTimeout(r, 250)); + + // Cloud cancel POST landed for the right job. + expect(cancelHits).toHaveLength(1); + expect(cancelHits[0]?.url).toContain(`/v1/jobs/${FAKE_JOB_ID}/cancel`); + // And the bin is dead — SIGKILL bypassed its SIGTERM + // no-op (which had been masking HMR's earlier SIGTERM). + let probeError: NodeJS.ErrnoException | null = null; + try { + process.kill(pid, 0); + } catch (e) { + probeError = e as NodeJS.ErrnoException; + } + expect(probeError?.code).toBe("ESRCH"); + } finally { + globalThis.fetch = ORIG_FETCH; + } + }); + it("dispatches HMR signals for `ready` events too (not only `rebuild`)", async () => { // Regression: previously the dispatch fired only on // `rebuild`, so a child started via `/api/train` *before* diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 6b53d0c9..8ea4c011 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -473,6 +473,19 @@ export function buildStudioApp(options: StudioServerOptions) { // round-trip through `TextEncoder`. The previous code did // `enc.encode(d)` which implicitly coerced the buffer via // `String()` — same byte content, but allocates a new array. + // Forward a chunk to the SPA stream. Shared between the + // stdout and stderr listeners — both paths surface as + // request body bytes for the SPA's log view. + const forward = (d: Buffer): void => { + if (closed) return; + try { + controller.enqueue(d); + } catch { + // Controller raced us into the closed state — flip the + // flag so subsequent chunks short-circuit. 
+ closed = true; + } + }; // Carry-over buffer for line-oriented job-id extraction. // Stream chunk boundaries are arbitrary — the runner's // single-line `Started job ` write can land split @@ -488,16 +501,24 @@ export function buildStudioApp(options: StudioServerOptions) { // marker (the canonical line is well under 100 bytes). let stdoutLineBuf = ""; const STARTED_JOB_BUFFER_CAP = 4096; - const onChunk = (d: Buffer): void => { + // STDOUT-ONLY job-id parser. The runner writes the canonical + // `Started job ` line via `process.stdout.write` — never + // stderr — so a single shared buffer across both pipes + // would mis-match in two ways: + // 1. A user `console.error("Started job ")` would + // poison the buffer first; the real stdout marker + // arrives later but our `getJobId(...) === null` gate + // has already short-circuited subsequent scans, so + // Stop-training POSTs cancel for the wrong (or + // non-existent) job. + // 2. Interleaved stderr bytes could land between + // "Started job " and "\n" in the shared buffer, + // breaking the anchored line match → missed match → + // cloud cancel skipped on Stop. + // Two dedicated handlers share `forward` for the byte + // pipeline but only the stdout one runs the parse. + const onStdoutChunk = (d: Buffer): void => { if (closed) return; - // Watch for the runner's `Started job ` line so the - // cancel handler (below) can POST `/v1/jobs/:id/cancel` - // to cloud-api on user-initiated abort. SIGKILL bypasses - // the runner's `installShutdownHandlers`, so without - // this server-side cancel a Stop click would leave the - // cloud job running until TTL/reaper. Stop scanning - // once the id is recorded — the runner prints the line - // exactly once. 
if (activeTrains.getJobId(child.pid) === null) { stdoutLineBuf += d.toString("utf8"); let nl = stdoutLineBuf.indexOf("\n"); @@ -519,13 +540,13 @@ export function buildStudioApp(options: StudioServerOptions) { stdoutLineBuf = stdoutLineBuf.slice(-STARTED_JOB_BUFFER_CAP); } } - try { - controller.enqueue(d); - } catch { - // Controller raced us into the closed state — flip the - // flag so subsequent chunks short-circuit. - closed = true; - } + forward(d); + }; + const onStderrChunk = (d: Buffer): void => { + // Forward only — never scan for `Started job`. See + // `onStdoutChunk` comment for the cross-stream poisoning + // hazards this split prevents. + forward(d); }; const enc = new TextEncoder(); // Detach every listener this stream wired onto `child`. Called @@ -540,8 +561,8 @@ export function buildStudioApp(options: StudioServerOptions) { // memory pressure for an `arkor dev` session that spawns many // children over hours. const detachListeners = (): void => { - child.stdout.off("data", onChunk); - child.stderr.off("data", onChunk); + child.stdout.off("data", onStdoutChunk); + child.stderr.off("data", onStderrChunk); child.off("close", onClose); child.off("error", onError); }; @@ -580,8 +601,8 @@ export function buildStudioApp(options: StudioServerOptions) { // already cancelled; nothing more to do. } }; - child.stdout.on("data", onChunk); - child.stderr.on("data", onChunk); + child.stdout.on("data", onStdoutChunk); + child.stderr.on("data", onStderrChunk); child.on("close", onClose); child.on("error", onError); cancelTeardown = () => { @@ -606,28 +627,31 @@ export function buildStudioApp(options: StudioServerOptions) { }; }, cancel() { - // Capture the early-stop flag *before* unregistering: the - // unregister wipes the entry, after which we can't tell - // whether HMR's `dispatchRebuild` had already SIGTERMed - // this child. 
If it had, sending another SIGTERM here - // would land as the *second* signal on the runner side and - // trigger `installShutdownHandlers`' emergency `exit(143)` - // fast-path — which bypasses the checkpoint-preserving - // early-stop + cloud `cancel()` flow and can leave the - // cloud run alive while the local subprocess dies. The HMR - // path is already driving the child to a clean exit, so we - // just unregister + flip `closed` (via `cancelTeardown`) - // and let it run. The data listeners stay attached so the - // OS pipe keeps draining while the child checkpoints — - // see `cancelTeardown` for the backpressure rationale. - const earlyStopInFlight = activeTrains.isEarlyStopRequested(child.pid); + // The SPA-side cancel is always *user-initiated* — either an + // explicit Stop click or tab-close/navigation, which the + // user just as explicitly chose. HMR-driven SIGTERMs go + // straight from the server to the runner via + // `dispatchRebuild`; they DO NOT trigger this handler + // (the SPA waits for the train stream's `exit=` line and + // schedules auto-restart, never aborting). So manual stop + // takes precedence over any in-flight HMR graceful path: + // we POST cloud cancel + SIGKILL unconditionally. + // + // SIGKILL is uncatchable so the long-standing + // "second-SIGTERM-triggers-exit(143)-fast-path" worry + // (which used to gate this branch on + // `isEarlyStopRequested`) doesn't apply. The runner's + // graceful early-stop chain may have been trying to + // preserve a checkpoint, but the user just said no — keep + // the local subprocess teardown snappy and let the + // server-side cancel POST handle the cloud-side release. + // // Capture the cloud job id BEFORE unregistering — once the // entry is gone, `getJobId(pid)` returns null and the // fire-and-forget POST below would no-op. 
const jobIdForCancel = activeTrains.getJobId(child.pid); activeTrains.unregister(child.pid); cancelTeardown?.(); - if (earlyStopInFlight) return; // Fire-and-forget cloud-side cancel so the cloud job is // released even though the SIGKILL below bypasses the // runner's `installShutdownHandlers` (which would From 48a926324ca565ea5cce7b4b01ac09dd8928b499 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Sun, 10 May 2026 01:00:23 +0900 Subject: [PATCH 46/55] fix: ensure early-stop branch settles on user callback error to prevent SIGTERM hang This commit updates the early-stop mechanism in the trainer to ensure that the early-stop branch executes even if the user's `onCheckpoint` callback throws an error. By capturing the error and re-throwing it after the early-stop logic, the implementation prevents the SIGTERM handler from hanging due to unresolved promises. Additionally, the cancellation process is modified to reject the deferred promise on cancel errors, improving visibility into cancellation failures. Tests are added to verify this behavior and prevent future regressions. --- packages/arkor/src/cli/cleanupHooks.ts | 16 +- packages/arkor/src/core/trainer.test.ts | 215 ++++++++++++++++++++++++ packages/arkor/src/core/trainer.ts | 47 +++++- 3 files changed, 271 insertions(+), 7 deletions(-) diff --git a/packages/arkor/src/cli/cleanupHooks.ts b/packages/arkor/src/cli/cleanupHooks.ts index a37062f6..76805c3e 100644 --- a/packages/arkor/src/cli/cleanupHooks.ts +++ b/packages/arkor/src/cli/cleanupHooks.ts @@ -29,11 +29,19 @@ export interface CleanupHookOptions { cleanup: () => void | Promise; /** * Whether the signal-handler arm of this registration should call - * `process.exit(0)` once every in-flight cleanup (this hook + any + * `process.exit` once every in-flight cleanup (this hook + any * siblings registered in the same process) has settled. 
Use `true` * for the outermost cleanup responsible for terminating the * process; `false` for inner cleanups that should let a sibling * own the exit. Default: `false`. + * + * The exit code is the POSIX `128 + signo` for the signal that + * triggered shutdown — 130 for SIGINT, 143 for SIGTERM, 129 for + * SIGHUP (see `SIGNAL_EXIT_CODE`). Parent shells / orchestrators / + * CI runners distinguish "user interrupted" (nonzero) from "ran + * to completion" (zero) on this — exiting 0 for a Ctrl-C'd + * `arkor dev` would let `arkor dev || cleanup_on_failure` skip + * its cleanup branch. */ exitOnSignal?: boolean; } @@ -41,10 +49,10 @@ export interface CleanupHookOptions { /** * Module-scoped tracker of cleanup promises that haven't settled yet. * The exit-owning hook waits on the union of (its own cleanup) + - * (every other in-flight cleanup) before calling `process.exit(0)`, + * (every other in-flight cleanup) before calling `process.exit(...)`, * so a fire-and-forget async cleanup in a sibling registration — * `hmr.dispose()` is the canonical example — isn't cut off by an - * eager exit. + * eager exit. (Exit code is signal-specific — see `SIGNAL_EXIT_CODE`.) * * Auto-prunes via the `.finally(() => inFlightCleanups.delete(...))` * each `run()` attaches, so the set doesn't grow without bound across @@ -142,7 +150,7 @@ export function registerCleanupHook(options: CleanupHookOptions): void { // sibling's freshly-registered promise. Without this, an // `arkor dev` whose `scheduleStudioTokenCleanup` (exitOnSignal: // true) was registered before `scheduleHmrCleanup` (async - // dispose) would `process.exit(0)` mid-`hmr.dispose()` and + // dispose) would `process.exit(...)` mid-`hmr.dispose()` and // leak the rolldown watcher. 
// // Settled promises pass through `Promise.allSettled` in a diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 5be7fbeb..0799c07a 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -1523,6 +1523,134 @@ describe("createTrainer (early stop)", () => { expect(result.job.status).toBe("cancelled"); }); + it("early-stop branch still settles when the user's onCheckpoint callback throws (no SIGTERM hang)", async () => { + // Regression: the early-stop branch ran AFTER + // `await callbacks.onCheckpoint?.(ctx)`. A user-callback throw + // would propagate out of that await before the early-stop + // cancel + latch settlement could run, leaving + // `earlyStopDeferred` pending. The runner's + // `installShutdownHandlers` awaits that deferred → SIGTERM + // shutdown hangs until the (default 5-min) timeout fallback + // fires. The fix wraps `onCheckpoint` in try/catch, runs the + // early-stop branch unconditionally, then re-throws the + // captured callback error so wait()'s reconnect loop keeps + // its prior semantics. + await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + const sse = [ + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + `id: 2\nevent: training.log\ndata: ${JSON.stringify({ + type: "training.log", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:02Z", + step: 1, + loss: 0.5, + })}\n\n`, + `id: 3\nevent: checkpoint.saved\ndata: ${JSON.stringify({ + type: "checkpoint.saved", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:03Z", + step: 10, + })}\n\n`, + ]; + let cancelCalls = 0; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? 
"GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(sseStream(sse), { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + cancelCalls += 1; + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + let armedPromise: Promise | null = null; + let armedResult: "resolved" | "rejected" | "pending" = "pending"; + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + callbacks: { + onLog: () => { + if (armedPromise === null) { + armedPromise = requestTrainerEarlyStop(trainer, { + timeoutMs: 60_000, + }); + armedPromise.then( + () => { + armedResult = "resolved"; + }, + () => { + armedResult = "rejected"; + }, + ); + } + }, + onCheckpoint: () => { + // User callback throws DURING the checkpoint that + // would normally trigger early-stop. Without the + // try/catch wrap this throw would skip the + // early-stop branch → latch pending → SIGTERM hang + // for up to 60s (our `timeoutMs`). + throw new Error("user onCheckpoint boom"); + }, + }, + }, + { + baseUrl: "http://mock", + credentials: creds, + cwd, + reconnectDelayMs: 1, + // Cap reconnects at 0 so the user-callback throw + // surfaces as a wait() rejection instead of + // looping forever (handleFailure would otherwise + // reconnect after the throw escapes dispatch). 
+ maxReconnectAttempts: 0, + }, + ); + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + // wait() rejects — handleFailure wraps the user callback + // throw because maxReconnectAttempts is 0. + await expect(trainer.wait()).rejects.toThrow(); + // Critical: the latch SETTLED via the early-stop branch + // (resolve), not via the 60-second timeout. The cancel POST + // also fired (early-stop reached the cancel call before the + // throw was re-raised). Together: shutdown wouldn't hang. + await new Promise((r) => setImmediate(r)); + expect(armedResult).toBe("resolved"); + expect(cancelCalls).toBe(1); + } finally { + globalThis.fetch = original; + } + }); + it("early-stop checkpoint branch rejects the deferred when cancel() throws (visible to shutdown handler)", async () => { // Regression: previously, an `await trainer.cancel()` that threw // (network failure / cloud-api 5xx during the cancel POST) was @@ -1970,6 +2098,93 @@ describe("createTrainer (early stop)", () => { } }); + it("timeout fallback rejects the deferred when cancel() throws (visible to shutdown handler)", async () => { + // Companion to the checkpoint-branch reject test: when no + // checkpoint arrives within `timeoutMs`, the timeout fallback + // does its own `trainer.cancel()`. Old code swallowed cancel + // errors and ALWAYS resolved the deferred — same false-success + // failure mode as the checkpoint branch had: local runner + // exits cleanly while the cloud job keeps consuming GPU + // budget. The fix mirrors the checkpoint reject path: capture + // the error and reject the deferred so the runner's + // `.catch()` writes it to stderr. 
+ await writeState( + { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, + cwd, + ); + let streamController: ReadableStreamDefaultController | null = + null; + const stallingStream = new ReadableStream({ + start(controller) { + streamController = controller; + const enc = new TextEncoder(); + controller.enqueue( + enc.encode( + `id: 1\nevent: training.started\ndata: ${JSON.stringify({ + type: "training.started", + jobId: "j-stop", + timestamp: "2026-01-01T00:00:01Z", + })}\n\n`, + ), + ); + }, + }); + + let cancelCalls = 0; + const fetcher: typeof fetch = (async ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && url.includes("/v1/jobs?")) { + return new Response(JSON.stringify({ job: minimalJobRow }), { + status: 201, + headers: { "content-type": "application/json" }, + }); + } + if (method === "GET" && url.includes("/v1/jobs/j-stop/events/stream")) { + return new Response(stallingStream, { + status: 200, + headers: { "content-type": "text/event-stream" }, + }); + } + if (method === "POST" && url.includes("/v1/jobs/j-stop/cancel")) { + cancelCalls += 1; + // Close the stream so wait() exits its loop even though we + // throw on the cancel POST itself. + streamController?.close(); + // Simulate cloud-api unreachable mid-cancel (transport). + throw new TypeError("fetch failed"); + } + throw new Error(`unexpected fetch: ${method} ${url}`); + }) as typeof fetch; + + const trainer = createTrainer( + { + name: "run", + model: "m", + dataset: { type: "huggingface", name: "x" }, + }, + { baseUrl: "http://mock", credentials: creds, cwd, reconnectDelayMs: 1 }, + ); + const original = globalThis.fetch; + globalThis.fetch = fetcher; + try { + await trainer.start(); + // Tiny timeout so the timeout fallback fires fast (no + // checkpoint will land — stream only carries + // training.started). 
The returned promise should REJECT + // because the cancel POST throws. + await expect( + requestTrainerEarlyStop(trainer, { timeoutMs: 5 }), + ).rejects.toThrow(/fetch failed/); + expect(cancelCalls).toBe(1); + } finally { + globalThis.fetch = original; + } + }); + it("is a no-op before start() and resolves immediately", async () => { const trainer = createTrainer( { diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index de66c0c4..0e72b7a7 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -310,7 +310,20 @@ export function createTrainer( infer, artifacts: event.artifacts, }; - await callbacks.onCheckpoint?.(ctx); + // Capture (don't propagate yet) any throw from the user's + // `onCheckpoint`. The early-stop branch below MUST run + // even on a callback throw — without this wrap a thrown + // `onCheckpoint` would skip the cancel + latch settlement, + // leaving the SIGTERM handler waiting on the deferred + // until the (default 5-min) timeout fires. Surface the + // original throw via re-throw at the end so `wait()`'s + // reconnect / failure path keeps its existing semantics. + let onCheckpointError: unknown = null; + try { + await callbacks.onCheckpoint?.(ctx); + } catch (err) { + onCheckpointError = err; + } // Early-stop latch: a checkpoint just landed, so the in-flight work // is durable. Cancel the cloud job and end `wait()` cleanly. if (earlyStopRequested && earlyStopDeferred) { @@ -370,11 +383,21 @@ export function createTrainer( // HMR-driven early-stop resolved `wait()` with empty // `artifacts` even though the checkpoint event carried // the very artifacts the early-stop existed to keep. + // Surface the user's `onCheckpoint` throw (if any) so + // `wait()`'s reconnect / failure path keeps the same + // semantics it had before the wrap — the checkpoint + // workload is preserved, but the user still sees their + // callback error. 
+ if (onCheckpointError !== null) throw onCheckpointError; return { terminal: true, artifacts: (event.artifacts ?? []) as unknown[], }; } + // Same re-throw on the non-early-stop branch: keep + // `wait()`'s reconnect loop seeing the user's original + // callback error so reconnection counters work as before. + if (onCheckpointError !== null) throw onCheckpointError; return { terminal: false, artifacts: terminalResult?.artifacts ?? [] }; } case "training.completed": { @@ -621,9 +644,19 @@ export function createTrainer( // resolves, the checkpoint branch may have nulled out the shared // slot, but this fallback path still owns the deferred it created. const active = earlyStopDeferred; + // Capture (don't swallow) any cancel error so we can surface it + // through the deferred's reject path. Mirrors the checkpoint + // branch — a swallow here lets the runner's + // `installShutdownHandlers` exit "successfully" while the cloud + // job lives on (orphaned GPU spend with zero diagnostic), the + // exact failure mode that a "stop-after-checkpoint" deadline + // exists to PREVENT from going silent. + let cancelError: unknown = null; trainer .cancel() - .catch(() => {}) + .catch((err) => { + cancelError = err; + }) .finally(() => { // Mirror the checkpoint-triggered early-stop branch: reset // the latch and reflect the cancellation locally so a @@ -641,7 +674,15 @@ export function createTrainer( completedAt: new Date().toISOString(), }; } - if (active) active.resolve(); + if (active) { + // Resolve on success, REJECT on cancel failure so the + // SIGTERM handler's `.catch()` writes the error to + // stderr and the operator can see that the cloud job + // may still be live. The latch always settles either + // way — shutdown won't hang. 
+ if (cancelError !== null) active.reject(cancelError); + else active.resolve(); + } if (earlyStopDeferred === active) earlyStopDeferred = null; }); }, timeoutMs); From b8db73a36a8f31c94b33f1838a678d1949823a59 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 12 May 2026 00:10:15 +0900 Subject: [PATCH 47/55] fix: implement POSIX exit codes for second signal handling and improve HMR error state management --- packages/arkor/src/core/runnerSignals.test.ts | 44 ++++++++++ packages/arkor/src/core/runnerSignals.ts | 38 +++++++- packages/arkor/src/core/trainer.ts | 35 +++++++- packages/arkor/src/studio/hmr.ts | 82 +++++++++++++++++- packages/arkor/src/studio/server.test.ts | 86 +++++++++++++++++++ packages/arkor/src/studio/server.ts | 59 +++++++++---- .../studio-app/src/components/RunTraining.tsx | 12 +++ 7 files changed, 332 insertions(+), 24 deletions(-) diff --git a/packages/arkor/src/core/runnerSignals.test.ts b/packages/arkor/src/core/runnerSignals.test.ts index a10922a0..a461255e 100644 --- a/packages/arkor/src/core/runnerSignals.test.ts +++ b/packages/arkor/src/core/runnerSignals.test.ts @@ -84,6 +84,50 @@ describe("installShutdownHandlers", () => { } }); + it("second-signal exit code is per-signal POSIX 128+signo (130 for SIGINT, 129 for SIGHUP)", async () => { + // Regression: the second-signal emergency-exit path used to + // hardcode `process.exit(143)` regardless of which signal + // fired. SIGINT (Ctrl-C twice) and SIGHUP shutdowns then + // looked like SIGTERM exits to parent shells / orchestrators, + // breaking signal-aware logic (e.g. tmux pane behaviour, CI + // job classification, `&&` / `||` chains that distinguish + // user-cancel from clean exit). Mirrors `SIGNAL_EXIT_CODE` in + // `cli/cleanupHooks.ts`. 
+ const cases: Array<["SIGINT" | "SIGTERM" | "SIGHUP", number]> = [ + ["SIGINT", 130], + ["SIGTERM", 143], + ["SIGHUP", 129], + ]; + for (const [sig, expectedExit] of cases) { + const trainer = makeTrainer(); + const exitCodes: number[] = []; + const exitSpy = vi + .spyOn(process, "exit") + .mockImplementation(((code?: number) => { + exitCodes.push(code ?? 0); + return undefined as never; + }) as typeof process.exit); + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + const dispose = installShutdownHandlers(trainer); + try { + process.emit(sig, sig); + await new Promise((r) => setTimeout(r, 10)); + process.emit(sig, sig); + await new Promise((r) => setTimeout(r, 10)); + // First signal exits 0 via the early-stop chain's + // `.finally(() => process.exit(0))`; second signal exits + // with the per-signal POSIX code. + expect(exitCodes, `signal ${sig}`).toContain(expectedExit); + } finally { + dispose(); + exitSpy.mockRestore(); + stdoutSpy.mockRestore(); + } + } + }); + it("second SIGTERM exits 143 without re-invoking requestEarlyStop", async () => { const trainer = makeTrainer(); const exitCodes: number[] = []; diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index 404ad9e3..f79233d3 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -9,6 +9,23 @@ import type { Trainer, TrainerCallbacks } from "./types"; const SHUTDOWN_SIGNALS = ["SIGTERM", "SIGINT", "SIGHUP"] as const; const CALLBACK_RELOAD_SIGNAL = "SIGUSR2" as const; +/** + * POSIX-style exit code for a signal-terminated process: `128 + signo`. + * Used by the second-signal emergency-exit path so the runner's exit + * status reflects which signal actually fired (Ctrl-C vs SIGTERM vs + * SIGHUP), not a single hardcoded 143. Mirrors the SIGNAL_EXIT_CODE + * map in `cli/cleanupHooks.ts`. 
Parent shells / orchestrators / CI + * runners distinguish "user interrupted" by signo on POSIX. + */ +const SECOND_SIGNAL_EXIT_CODE: Record< + (typeof SHUTDOWN_SIGNALS)[number], + number +> = { + SIGHUP: 129, + SIGINT: 130, + SIGTERM: 143, +}; + /** * Two-stage shutdown handling so HMR rebuilds (Studio sends SIGTERM) * preserve the in-flight checkpoint work: @@ -16,8 +33,10 @@ const CALLBACK_RELOAD_SIGNAL = "SIGUSR2" as const; * - 1st signal → `trainer.requestEarlyStop()`. The trainer keeps * running, lets the next `checkpoint.saved` event land, then issues * `cancel()`. - * - 2nd signal → immediate `process.exit(143)`. Escape hatch for an - * impatient operator or a hung early-stop. + * - 2nd signal → immediate `process.exit(POSIX 128+signo)` — + * 130 for SIGINT, 143 for SIGTERM, 129 for SIGHUP. Escape hatch + * for an impatient operator or a hung early-stop. Per-signal + * exit code so parent shells see the actual interruption type. * * The returned dispose function removes the handlers so a normal * `wait()` completion doesn't leave stale listeners behind — important @@ -32,7 +51,20 @@ export function installShutdownHandlers(trainer: Trainer): () => void { process.stdout.write( `Received second ${signal}; exiting without waiting for checkpoint.\n`, ); - process.exit(143); + // POSIX 128 + signo so the parent shell sees the right exit + // status: 130 for SIGINT (Ctrl-C twice), 129 for SIGHUP, + // 143 for SIGTERM. Hardcoding 143 misclassifies SIGINT and + // SIGHUP shutdowns as SIGTERM-style exits and breaks + // signal-aware orchestration. The cast is safe because + // `signal` here is always one of `SHUTDOWN_SIGNALS` (Node's + // `signal` arg matches whatever name we registered the + // handler under). Defaults to 143 for any future signal we + // forget to map. + const code = + SECOND_SIGNAL_EXIT_CODE[ + signal as (typeof SHUTDOWN_SIGNALS)[number] + ] ?? 
143; + process.exit(code); // Explicit return so test mocks of process.exit (which don't // actually terminate the worker) don't fall through into the // early-stop path. diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index 0e72b7a7..b23d9896 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -356,9 +356,27 @@ export function createTrainer( // surface as `status: "running"`, and a subsequent // `requestEarlyStop` would not see the // `TERMINAL_STATUSES.has(...)` short-circuit it relies on. + // + // Status is `"failed"` when the cancel POST itself threw + // (cloud-api transient failure mid-cancel) — labelling + // such runs `"cancelled"` would lie about the cloud-side + // state, which may still be running. `"failed"` is + // terminal too, so the latch / TERMINAL_STATUSES short- + // circuit still works, but `wait()`'s caller can + // distinguish "we cancelled cleanly" from "we tried but + // the cancel may not have landed". The original cancel + // error is also rejected through the deferred below for + // the SIGTERM handler's `.catch()`. startedJob = { ...startedJob, - status: "cancelled", + status: cancelError !== null ? "failed" : "cancelled", + ...(cancelError !== null && { + error: `Early-stop cancel failed: ${ + cancelError instanceof Error + ? cancelError.message + : String(cancelError) + }`, + }), completedAt: event.timestamp, }; if (cancelError !== null) { @@ -668,9 +686,22 @@ export function createTrainer( // future checkpoint event for the rest of its lifetime. earlyStopRequested = false; if (startedJob && !TERMINAL_STATUSES.has(startedJob.status)) { + // Symmetric to the checkpoint branch: `"failed"` (not + // `"cancelled"`) on cancel-throw so we don't lie + // about cloud-side state that may still be running. + // Both branches feed the same TERMINAL_STATUSES + // short-circuit, so re-armed `requestEarlyStop()` + // calls still no-op correctly. 
startedJob = { ...startedJob, - status: "cancelled", + status: cancelError !== null ? "failed" : "cancelled", + ...(cancelError !== null && { + error: `Early-stop cancel failed: ${ + cancelError instanceof Error + ? cancelError.message + : String(cancelError) + }`, + }), completedAt: new Date().toISOString(), }; } diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 9bcbdcba..47f89148 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -1,4 +1,5 @@ -import { existsSync, statSync } from "node:fs"; +import { createHash } from "node:crypto"; +import { existsSync, readFileSync, statSync } from "node:fs"; import { watch, type RolldownWatcher } from "rolldown"; import { hashJobConfig } from "../core/configHash"; import { moduleCacheBustUrl } from "../core/moduleCacheBust"; @@ -21,6 +22,17 @@ export interface HmrEvent { * this to dedupe replays of the same successful build. */ hash?: string; + /** + * Content-derived hash (sha256, truncated) of the artefact bytes. + * Used by `dispatchRebuild`'s pre-ready-spawn equality gate where + * `hash` would over-trigger SIGTERM-restart: a watcher build that + * rewrites identical bytes still bumps mtime/ctime, so two + * timestamp fingerprints differ even though the loaded bytes are + * the same. Comparing this content-hash instead avoids that + * spurious cancel+restart cycle in the "user clicked Run before + * the watcher's first BUNDLE_END landed" case. + */ + contentHash?: string | null; /** * Stable hash of the trainer's cloud-side `JobConfig`. When this is * unchanged across a rebuild, only the in-process callbacks moved and @@ -65,11 +77,56 @@ export interface HmrCoordinator { * doesn't exist yet, fresh project never built). */ getCurrentArtifactHash(): string | null; + /** + * Content-derived hash (sha256, truncated) of the on-disk + * artefact RIGHT NOW. 
Used by `/api/train` to capture a + * spawn-time content-hash for the registry's pre-ready-spawn + * equality gate — paired with the rebuild's `event.contentHash`, + * a mismatch unambiguously means the bytes changed (not just + * timestamps), so `dispatchRebuild` only SIGTERM-restarts when + * the child genuinely loaded different bytes than the new + * configHash describes. `null` on stat/read failure (artefact + * doesn't exist yet, fresh project never built). + */ + getCurrentArtifactContentHash(): string | null; + /** + * Last broadcast event's `type`, or `null` if nothing has been + * broadcast yet. `/api/manifest`'s HMR fast path consults this to + * suppress its "serve last good artefact" behaviour while the + * watcher is in an `error` state — without that gate, the SPA's + * 5 s `/api/manifest` poll would keep getting a 200 stale + * manifest and silently overwrite the SSE-driven build-error UI, + * letting users run with stale code/config while the latest + * source is still failing to compile. + */ + getLastEventType(): HmrEventType | null; dispose(): Promise; } export type HmrOptions = BuildEntryOptions; +/** + * Content-derived fingerprint of the artefact bytes (sha256, first 16 + * hex chars). Used by `dispatchRebuild`'s pre-ready-spawn gate where + * timestamp-based comparison gives false positives: a watcher rebuild + * that produces the same bytes still bumps mtime/ctime, so a child + * spawned just before `ready` would be unnecessarily SIGTERM-restarted + * even though its loaded bytes match the new build's. Hashing a few + * MB of bundle on each call is cheap relative to the GPU cost of a + * spurious cancel+restart cycle. + * + * Returns `null` on stat/read failure so the caller can treat + * "no artefact" as "force restart" (the conservative default). 
+ */ +function contentHashOrNull(outFile: string): string | null { + try { + const bytes = readFileSync(outFile); + return createHash("sha256").update(bytes).digest("hex").slice(0, 16); + } catch { + return null; + } +} + /** * Single-stat fingerprint with a clean `null` on failure — used by * `getCurrentArtifactHash()` whose contract is "return a fingerprint @@ -267,6 +324,11 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { type, outFile: resolved.outFile, hash: fingerprint(resolved.outFile), + // Content hash powers the registry's pre-ready-spawn equality + // gate (timestamp-only would over-trigger SIGTERM-restart on + // identical-bytes rebuilds). Read once here so the broadcast + // and any spawn-time capture reference the same on-disk state. + contentHash: contentHashOrNull(resolved.outFile), configHash, trainerName: inspection?.trainerName ?? null, }); @@ -416,6 +478,24 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { // routing. return fingerprintOrNull(resolved.outFile); }, + getCurrentArtifactContentHash() { + // Companion to `getCurrentArtifactHash` for the registry's + // pre-ready-spawn equality gate. Reads + sha256s the file + // at call time so the result describes the exact bytes the + // just-spawned child will see in its `await import()`. + // Same null-on-failure contract — caller treats null as + // "force restart" (the conservative default). + return contentHashOrNull(resolved.outFile); + }, + getLastEventType() { + // `lastEvent` is the latest broadcast — `ready` / `rebuild` / + // `error`. Returning the type lets `/api/manifest`'s HMR + // fast path skip serving the stale built artefact when the + // watcher is currently in `error` (current source fails to + // compile), so the SPA's poll loop doesn't paper over the + // SSE-surfaced error. + return lastEvent?.type ?? 
null; + }, async dispose() { disposed = true; subscribers.clear(); diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index 0954e327..0532b14d 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -153,6 +153,8 @@ describe("Studio server", () => { subscribe: () => () => undefined, getCurrentConfigHash: () => null, getCurrentArtifactHash: () => null, + getCurrentArtifactContentHash: () => null, + getLastEventType: () => null, async dispose() {}, }; const app = buildStudioApp({ @@ -613,6 +615,8 @@ process.exit(0); return "spawn-time-hash"; }, getCurrentArtifactHash: () => "spawn-artefact-hash", + getCurrentArtifactContentHash: () => "spawn-artefact-content-hash", + getLastEventType: () => null, async dispose() {}, }; const fakeBin = join(trainCwd, "fake-bin.mjs"); @@ -1658,6 +1662,8 @@ process.exit(0); subscribe: () => () => undefined, getCurrentConfigHash: () => null, getCurrentArtifactHash: () => null, + getCurrentArtifactContentHash: () => null, + getLastEventType: () => null, async dispose() {}, }; const app = buildStudioApp({ @@ -1709,6 +1715,8 @@ process.exit(0); subscribe: () => () => undefined, getCurrentConfigHash: () => null, getCurrentArtifactHash: () => null, + getCurrentArtifactContentHash: () => null, + getLastEventType: () => null, async dispose() {}, }; const app = buildStudioApp({ @@ -1731,6 +1739,65 @@ process.exit(0); }; expect(body.trainer).toEqual({ name: "fallback-build" }); }); + + it("returns 400 (not stale 200) while the HMR watcher is in error state", async () => { + // Regression: the HMR fast path served the last-built artefact + // even when the watcher's most recent event was `error`. 
The + // SPA's `/api/manifest` poll runs every ~5s, so a successful + // 200 with stale data would silently overwrite the SSE-driven + // build-error UI within 5s of the user breaking their source — + // they'd then unknowingly run stale code/config while the + // latest edit is still failing to compile. Gating the fast + // path on `getLastEventType() === "error"` keeps both + // channels (poll + SSE) consistent. + await writeCredentials(ANON_CREDS); + mkdirSync(join(trainCwd, ".arkor/build"), { recursive: true }); + // Pre-write a previously-good artefact so the fast path + // *would* otherwise return 200 with it. + writeFileSync( + join(trainCwd, ".arkor/build/index.mjs"), + `const trainer = { + name: "stale-good-build", + start: async () => ({ jobId: "j" }), + wait: async () => ({ job: {}, artifacts: [] }), + cancel: async () => {}, + }; + export const arkor = { _kind: "arkor", trainer }; + export default arkor; + `, + ); + // Coordinator is currently in error state — the latest + // broadcast was a compile failure. + const fakeHmr = { + subscribe: () => () => undefined, + getCurrentConfigHash: () => null, + getCurrentArtifactHash: () => null, + getCurrentArtifactContentHash: () => null, + getLastEventType: () => "error" as const, + async dispose() {}, + }; + const app = buildStudioApp({ + baseUrl: "http://mock", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + hmr: fakeHmr, + }); + const res = await app.request("/api/manifest", { + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + }, + }); + // 400 — the SPA's existing 4xx-handling path renders the + // build-error hint instead of a fake-healthy manifest. + expect(res.status).toBe(400); + const body = (await res.json()) as { error?: string }; + expect(body.error).toMatch(/Build failed/); + // Sanity: the stale artefact name is NOT leaked through. 
+ expect(JSON.stringify(body)).not.toContain("stale-good-build"); + }); }); describe("/api/inference/chat", () => { @@ -1956,6 +2023,9 @@ process.exit(0); // pre-ready-spawn path (configHash null, then a real hash) // can override via `setArtifactHash`. let currentArtifactHash: string | null = "fake-artefact-hash"; + let currentArtifactContentHash: string | null = + "fake-artefact-content-hash"; + let lastEventType: HmrEvent["type"] | null = null; const coordinator: HmrCoordinator = { subscribe(fn) { subs.add(fn); @@ -1969,6 +2039,12 @@ process.exit(0); getCurrentArtifactHash() { return currentArtifactHash; }, + getCurrentArtifactContentHash() { + return currentArtifactContentHash; + }, + getLastEventType() { + return lastEventType; + }, async dispose() { subs.clear(); }, @@ -1976,6 +2052,10 @@ process.exit(0); return { coordinator, emit(event: HmrEvent) { + // Track the latest event type so `getLastEventType()` + // mirrors the real coordinator's `lastEvent?.type` — + // the `/api/manifest` HMR-error gate consults this. + lastEventType = event.type; for (const fn of subs) fn(event); }, setConfigHash(hash: string | null) { @@ -1984,6 +2064,12 @@ process.exit(0); setArtifactHash(hash: string | null) { currentArtifactHash = hash; }, + setArtifactContentHash(hash: string | null) { + currentArtifactContentHash = hash; + }, + setLastEventType(t: HmrEvent["type"] | null) { + lastEventType = t; + }, get subscriberCount() { return subs.size; }, diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 8ea4c011..d0d41de5 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -293,6 +293,20 @@ export function buildStudioApp(options: StudioServerOptions) { : undefined; app.get("/api/manifest", async (c) => { try { + // Surface watcher build errors directly. 
Without this gate the + // HMR fast path below would happily serve the LAST GOOD + // artefact even when the user's current source fails to + // compile — `RunTraining` polls `/api/manifest` every ~5 s, so + // the next poll after a compile error would 200 with stale + // data and silently overwrite the SSE-surfaced error UI. + // Users would then see a "healthy" trainer in the manifest + // and unknowingly run stale code/config while the latest + // edit is still broken. Rejecting with the SSE error message + // keeps the SPA's error state consistent across both + // channels (poll + SSE). + if (options.hmr?.getLastEventType() === "error") { + return c.json({ error: "Build failed; see HMR error frame" }, 400); + } // HMR-aware fast path: when `arkor dev` wired in a coordinator, // skip the per-request `runBuild()` and read the watcher's // already-built artefact. Without this every SPA poll @@ -408,18 +422,25 @@ export function buildStudioApp(options: StudioServerOptions) { const configHash: string | null = options.hmr ? options.hmr.getCurrentConfigHash() : null; - // Spawn-time fingerprint of the on-disk build artefact. Only the - // pre-ready-spawn case in `dispatchRebuild` consults it: when a - // rebuild lands while the child's `configHash` is still null, - // backfilling the new hash is only safe if the artefact bytes - // the child loaded (= the bytes on disk *now*, at spawn) are - // the same bytes the new hash describes. Without this gate, an - // edit landing between spawn and the watcher's first BUNDLE_END - // would silently align the registry with a config the child - // never actually loaded → cloud-side `JobConfig` drift on - // subsequent same-hash hot-swaps. + // Spawn-time CONTENT-hash of the on-disk build artefact. 
Only + // the pre-ready-spawn case in `dispatchRebuild` consults it: + // when a rebuild lands while the child's `configHash` is still + // null, backfilling the new hash is only safe if the artefact + // bytes the child loaded (= the bytes on disk *now*, at spawn) + // are the same bytes the new hash describes. Without this + // gate, an edit landing between spawn and the watcher's first + // BUNDLE_END would silently align the registry with a config + // the child never actually loaded → cloud-side `JobConfig` + // drift on subsequent same-hash hot-swaps. + // + // Content (sha256) rather than mtime+ctime+size: the + // timestamp version had a false-positive failure mode where a + // watcher rebuild that produced identical bytes still bumped + // mtime/ctime, forcing a spurious cancel+restart cycle on a + // pre-ready spawn even though the child's loaded bytes + // actually matched the new build. Content-hash is precise. const spawnArtifactHash: string | null = options.hmr - ? options.hmr.getCurrentArtifactHash() + ? options.hmr.getCurrentArtifactContentHash() : null; const args = [trainBinPath, "start"]; if (trainFile) args.push(trainFile); @@ -798,13 +819,15 @@ export function buildStudioApp(options: StudioServerOptions) { // both buckets so the SPA can react per-child rather than // assuming one global outcome. const nextHash = event.configHash ?? null; - // The artefact fingerprint travels in the same broadcast - // (`event.hash`). Pass it through so the registry can gate - // the pre-ready-spawn backfill on whether the bytes the - // child loaded match what this rebuild's hash describes - // — see `dispatchRebuild`'s comment for why a null entry - // hash + matching artefact is the only safe backfill path. - const nextArtifactHash = event.hash ?? null; + // Content-hash for the pre-ready-spawn equality gate (the + // timestamp `event.hash` would over-trigger SIGTERM-restart + // on identical-bytes rebuilds). 
Both sides of the + // comparison — `entry.spawnArtifactHash` (captured via + // `getCurrentArtifactContentHash()`) and this `event.contentHash` + // — are derived the same way, so a match means the + // child's loaded bytes ARE what the new configHash + // describes. + const nextArtifactHash = event.contentHash ?? null; const { hotSwapTargets, restartTargets } = activeTrains.dispatchRebuild(nextHash, nextArtifactHash); augmented = { diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index 42cbd621..13ee8330 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -188,6 +188,18 @@ export function RunTraining() { if (payload.type === "error") { setManifest({ error: payload.message ?? "Build failed" }); setHmrStatus("idle"); + // Cancel any pending HMR auto-restart latched from a + // previous successful rebuild. Without this, a sequence + // like (rebuild → restartPendingRef=true → user breaks + // the source → error event → child eventually exits) would + // hit `run()`'s finally branch, see the still-set latch, + // and auto-restart from the **previous** artefact even + // though the latest source state is broken — silent + // stale-code background churn until the user notices. + // Clearing here makes the user's broken-state edit the + // source of truth: no auto-restart fires until the next + // successful rebuild re-arms the latch. + restartPendingRef.current = false; return; } // Always refresh the manifest on ready/rebuild. 
From f65561b89ad9cc871d4b048fb8db5a9d6abfb10c Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 12 May 2026 00:50:06 +0900 Subject: [PATCH 48/55] fix: rename spawnArtifactHash to spawnArtifactContentHash for clarity and consistency in HMR handling --- packages/arkor/src/core/runnerSignals.ts | 36 ++++-- packages/arkor/src/studio/hmr.test.ts | 9 +- packages/arkor/src/studio/server.ts | 26 ++-- .../arkor/src/studio/trainRegistry.test.ts | 8 +- packages/arkor/src/studio/trainRegistry.ts | 112 ++++++++++-------- 5 files changed, 111 insertions(+), 80 deletions(-) diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index f79233d3..aadcf5c0 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -45,7 +45,7 @@ const SECOND_SIGNAL_EXIT_CODE: Record< */ export function installShutdownHandlers(trainer: Trainer): () => void { let signalCount = 0; - const handler = (signal: NodeJS.Signals): void => { + const handler = (signal: (typeof SHUTDOWN_SIGNALS)[number]): void => { signalCount += 1; if (signalCount > 1) { process.stdout.write( @@ -55,15 +55,9 @@ export function installShutdownHandlers(trainer: Trainer): () => void { // status: 130 for SIGINT (Ctrl-C twice), 129 for SIGHUP, // 143 for SIGTERM. Hardcoding 143 misclassifies SIGINT and // SIGHUP shutdowns as SIGTERM-style exits and breaks - // signal-aware orchestration. The cast is safe because - // `signal` here is always one of `SHUTDOWN_SIGNALS` (Node's - // `signal` arg matches whatever name we registered the - // handler under). Defaults to 143 for any future signal we - // forget to map. - const code = - SECOND_SIGNAL_EXIT_CODE[ - signal as (typeof SHUTDOWN_SIGNALS)[number] - ] ?? 143; + // signal-aware orchestration. Defaults to 143 for any future + // signal we forget to map. + const code = SECOND_SIGNAL_EXIT_CODE[signal] ?? 
143; process.exit(code); // Explicit return so test mocks of process.exit (which don't // actually terminate the worker) don't fall through into the @@ -86,9 +80,27 @@ export function installShutdownHandlers(trainer: Trainer): () => void { }) .finally(() => process.exit(0)); }; - for (const sig of SHUTDOWN_SIGNALS) process.on(sig, handler); + // Per-signal closure (vs a single shared listener registered on + // every signal): the closure captures `sig` at registration time + // so the handler doesn't depend on whatever Node passes as the + // event arg. Node's documented contract is to pass the signal + // name, but pinning the source via closure keeps the handler + // robust regardless and makes the registration → arg + // relationship explicit at the callsite. Stored in a Map so + // `process.off` can remove the exact closure (anonymous arrow + // would leak the listener since `process.off` matches by + // identity). + const signalHandlers = new Map< + (typeof SHUTDOWN_SIGNALS)[number], + () => void + >(); + for (const sig of SHUTDOWN_SIGNALS) { + const fn = () => handler(sig); + signalHandlers.set(sig, fn); + process.on(sig, fn); + } return () => { - for (const sig of SHUTDOWN_SIGNALS) process.off(sig, handler); + for (const [sig, fn] of signalHandlers) process.off(sig, fn); }; } diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index c03adb4c..22973b29 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -358,10 +358,11 @@ describe("createHmrCoordinator", () => { // Companion to the null-on-missing test: when the artefact // *does* exist (watcher's first BUNDLE_END landed), the // getter returns the same `mtimeMs-ctimeMs-size` shape the - // SSE event's `hash` field uses. Symmetric value lets - // `dispatchRebuild` compare `entry.spawnArtifactHash` against - // `event.hash` directly for the pre-ready-spawn backfill - // decision. + // SSE event's `hash` field uses. 
The two are paired for SSE + // dedup purposes; the pre-ready-spawn registry gate switched + // to content-hash (`getCurrentArtifactContentHash`) to avoid + // identical-bytes/different-timestamps false positives, but + // the timestamp hash stays as the canonical SSE event id. mkdirSync(join(cwd, "src/arkor"), { recursive: true }); writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index d0d41de5..4255f638 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -439,7 +439,7 @@ export function buildStudioApp(options: StudioServerOptions) { // mtime/ctime, forcing a spurious cancel+restart cycle on a // pre-ready spawn even though the child's loaded bytes // actually matched the new build. Content-hash is precise. - const spawnArtifactHash: string | null = options.hmr + const spawnArtifactContentHash: string | null = options.hmr ? options.hmr.getCurrentArtifactContentHash() : null; const args = [trainBinPath, "start"]; @@ -470,7 +470,11 @@ export function buildStudioApp(options: StudioServerOptions) { 500, ); } - activeTrains.register(child, { trainFile, configHash, spawnArtifactHash }); + activeTrains.register(child, { + trainFile, + configHash, + spawnArtifactContentHash, + }); // Hoisted out of the `ReadableStream` underlying-source so the // `start` handler can hand its closure-bound teardown helper to // the `cancel` handler. `cancel` runs in a separate invocation, @@ -822,14 +826,16 @@ export function buildStudioApp(options: StudioServerOptions) { // Content-hash for the pre-ready-spawn equality gate (the // timestamp `event.hash` would over-trigger SIGTERM-restart // on identical-bytes rebuilds). 
Both sides of the - // comparison — `entry.spawnArtifactHash` (captured via - // `getCurrentArtifactContentHash()`) and this `event.contentHash` - // — are derived the same way, so a match means the - // child's loaded bytes ARE what the new configHash - // describes. - const nextArtifactHash = event.contentHash ?? null; - const { hotSwapTargets, restartTargets } = - activeTrains.dispatchRebuild(nextHash, nextArtifactHash); + // comparison — `entry.spawnArtifactContentHash` (captured + // via `getCurrentArtifactContentHash()`) and this + // `event.contentHash` — are derived the same way, so a + // match means the child's loaded bytes ARE what the new + // configHash describes. + const nextArtifactContentHash = event.contentHash ?? null; + const { hotSwapTargets, restartTargets } = activeTrains.dispatchRebuild( + nextHash, + nextArtifactContentHash, + ); augmented = { ...event, hotSwap: hotSwapTargets.length > 0, diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index 935f95f3..324797bb 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -69,7 +69,7 @@ describe("TrainRegistry", () => { // before the watcher's first BUNDLE_END" case. Whether it's // safe to backfill the new hash as the child's baseline depends // on whether the on-disk artefact has changed between spawn - // and now: if `spawnArtifactHash === nextArtifactHash`, the + // and now: if `spawnArtifactContentHash === nextArtifactContentHash`, the // child read exactly the bytes the new hash describes → // backfill + skip dispatch (no spurious cancel+restart cycle). 
// Otherwise — see the next test — SIGTERM-restart so cloud @@ -79,7 +79,7 @@ describe("TrainRegistry", () => { reg.register(c as unknown as ChildProcess, { configHash: null, trainFile: "/tmp/preready.ts", - spawnArtifactHash: "art-v1", + spawnArtifactContentHash: "art-v1", }); const result = reg.dispatchRebuild("first-real-hash", "art-v1"); // Neither bucket — no signal sent, nothing for the SPA to react to. @@ -122,7 +122,7 @@ describe("TrainRegistry", () => { reg.register(c as unknown as ChildProcess, { configHash: null, trainFile: "/tmp/preready-stale.ts", - spawnArtifactHash: "art-stale", + spawnArtifactContentHash: "art-stale", }); const result = reg.dispatchRebuild("real-hash", "art-fresh"); // SIGTERM-restart: the child's bytes are stale relative to the @@ -149,7 +149,7 @@ describe("TrainRegistry", () => { reg.register(c as unknown as ChildProcess, { configHash: null, trainFile: "/tmp/preready-fresh.ts", - spawnArtifactHash: null, // no artefact when /api/train fired + spawnArtifactContentHash: null, // no artefact when /api/train fired }); const result = reg.dispatchRebuild("first-real-hash", "art-fresh"); expect(result.hotSwapTargets).toEqual([]); diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index dc651e66..260ec2cd 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -22,20 +22,28 @@ export interface ActiveTrain { * can't prove the configs match. */ configHash: string | null; /** - * Fingerprint (mtime+ctime+size, see `core/moduleCacheBust.ts`) of - * the on-disk `.arkor/build/index.mjs` at spawn time. Used **only** - * to gate the pre-ready-spawn backfill: if a rebuild eventually - * fires while `configHash` is still null and this fingerprint - * matches the rebuild's artefact, the child is provably reading - * the same bundle bytes the new hash describes — safe to backfill - * `configHash` and skip dispatch. 
A mismatch (or null here) means - * the on-disk artefact has changed between spawn and rebuild - * (user edited mid-spawn, fresh project never built, …) so the - * child is running stale bytes and we MUST SIGTERM-restart to - * keep cloud-side `JobConfig` aligned with what the child - * actually loaded. Null when HMR isn't enabled or stat failed. + * Content hash (sha256, truncated — see `studio/hmr.ts`'s + * `contentHashOrNull`) of the on-disk `.arkor/build/index.mjs` + * at spawn time. Used **only** to gate the pre-ready-spawn + * backfill: if a rebuild eventually fires while `configHash` is + * still null and this content hash equals the rebuild's + * `event.contentHash`, the child is provably reading the same + * bundle bytes the new hash describes — safe to backfill + * `configHash` and skip dispatch. A mismatch (or null here) + * means the on-disk artefact has changed between spawn and + * rebuild (user edited mid-spawn, fresh project never built, …) + * so the child is running stale bytes and we MUST SIGTERM-restart + * to keep cloud-side `JobConfig` aligned with what the child + * actually loaded. + * + * Content-hash (vs the timestamp `mtime+ctime+size` shape used + * by `event.hash` for SSE dedup) avoids a false-positive + * mismatch when a watcher rebuild produces identical bytes — + * timestamps still bump, but content is the same and we + * shouldn't force a spurious cancel+restart cycle. Null when + * HMR isn't enabled or read failed. */ - spawnArtifactHash: string | null; + spawnArtifactContentHash: string | null; /** * `true` once we've already SIGTERM'd this child for an HMR-driven * early-stop. 
Subsequent rebuilds (which can land before the child @@ -132,24 +140,25 @@ export class TrainRegistry { child: ChildProcess, init: Omit< ActiveTrain, - "child" | "earlyStopRequested" | "spawnArtifactHash" | "jobId" + "child" | "earlyStopRequested" | "spawnArtifactContentHash" | "jobId" > & { // Optional in the signature so tests / future callers that - // don't track the on-disk artefact fingerprint (e.g. an HMR- - // disabled server, a hand-rolled fake) can omit it. Defaults - // to `null`, which forces the pre-ready-spawn branch to fall - // through to SIGTERM-restart on the next non-null rebuild — - // the safe choice when we genuinely don't know what bytes - // the child loaded. Real `/api/train` calls in HMR mode - // capture this from `coordinator.getCurrentArtifactHash()`. - spawnArtifactHash?: string | null; + // don't track the on-disk artefact content hash (e.g. an + // HMR-disabled server, a hand-rolled fake) can omit it. + // Defaults to `null`, which forces the pre-ready-spawn + // branch to fall through to SIGTERM-restart on the next + // non-null rebuild — the safe choice when we genuinely + // don't know what bytes the child loaded. Real `/api/train` + // calls in HMR mode capture this from + // `coordinator.getCurrentArtifactContentHash()`. + spawnArtifactContentHash?: string | null; }, ): void { if (typeof child.pid !== "number") return; this.entries.set(child.pid, { child, ...init, - spawnArtifactHash: init.spawnArtifactHash ?? null, + spawnArtifactContentHash: init.spawnArtifactContentHash ?? 
null, earlyStopRequested: false, // `jobId` starts null — populated later by `recordJobId(pid, // id)` when the server's stdout parser sees the runner's @@ -248,13 +257,16 @@ export class TrainRegistry { */ dispatchRebuild( nextConfigHash: string | null, - // Defaults to `null` so tests / pre-existing callers that don't - // pass the artefact hash get the conservative behaviour: the - // pre-ready-spawn branch's `artefactsAgree` check is `false`, - // so a null entry hash falls through to SIGTERM-restart. Real - // dispatch from `/api/train`'s HMR subscriber threads - // `event.hash` here so the backfill optimisation activates. - nextArtifactHash: string | null = null, + // Content hash (sha256-derived; see `studio/hmr.ts`) of the + // freshly-built artefact, paired with `entry.spawnArtifactContentHash` + // for the pre-ready-spawn equality gate. Defaults to `null` so + // tests / pre-existing callers that don't pass a hash get the + // conservative behaviour: a null entry hash falls through to + // SIGTERM-restart. Real dispatch from `/api/train`'s HMR + // subscriber threads `event.contentHash` here so the backfill + // optimisation activates only when the child's loaded bytes + // genuinely match. + nextArtifactContentHash: string | null = null, ): DispatchResult { const hotSwapTargets: RestartTarget[] = []; const restartTargets: RestartTarget[] = []; @@ -267,34 +279,34 @@ export class TrainRegistry { // recorded `configHash` is `null`. Whether the rebuild's new // hash describes the same bytes the child actually loaded // depends on whether the on-disk artefact has changed between - // spawn and now. Tie the decision to the artefact fingerprint: + // spawn and now. Tie the decision to the artefact content + // hash: // - // - `entry.spawnArtifactHash === nextArtifactHash` → child - // read the same bytes the new hash describes. Safe to - // backfill `configHash`; future rebuilds compare against - // the backfilled value like any other child. 
This is the - // common case (user clicked Run before the SPA had + // - `entry.spawnArtifactContentHash === nextArtifactContentHash` + // → child read the same bytes the new hash describes. + // Safe to backfill `configHash`; future rebuilds compare + // against the backfilled value like any other child. This + // is the common case (user clicked Run before the SPA had // refreshed its manifest, but the on-disk artefact is the // same one the watcher just settled on). // - // - artefact fingerprints differ (or one side is null) → - // the bytes the child loaded don't match the new hash. - // SIGTERM-restart so the cloud-side `JobConfig` and the - // child's actual config are guaranteed to align. Without - // this gate, an edit landing between spawn and the first - // BUNDLE_END would silently teach the registry to use the - // post-edit hash as the child's baseline — later - // same-hash rebuilds would then hot-swap callbacks into - // a child whose cloud-side `JobConfig` was *actually* - // spawned against an older version, leaving the cloud - // run on a stale config. + // - content hashes differ (or one side is null) → the bytes + // the child loaded don't match the new hash. SIGTERM-restart + // so the cloud-side `JobConfig` and the child's actual + // config are guaranteed to align. Without this gate, an + // edit landing between spawn and the first BUNDLE_END would + // silently teach the registry to use the post-edit hash as + // the child's baseline — later same-hash rebuilds would + // then hot-swap callbacks into a child whose cloud-side + // `JobConfig` was *actually* spawned against an older + // version, leaving the cloud run on a stale config. 
const isPreReadySpawn = entry.configHash === null && nextConfigHash !== null; if (isPreReadySpawn) { const artefactsAgree = - entry.spawnArtifactHash !== null && - nextArtifactHash !== null && - entry.spawnArtifactHash === nextArtifactHash; + entry.spawnArtifactContentHash !== null && + nextArtifactContentHash !== null && + entry.spawnArtifactContentHash === nextArtifactContentHash; if (artefactsAgree) { entry.configHash = nextConfigHash; continue; From 19e3b8449200b2c306ce79d79f940605e96af0b1 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Mon, 18 May 2026 19:57:02 +0900 Subject: [PATCH 49/55] fix: prevent concurrent token file deletion on failed persist during dev session --- packages/arkor/src/cli/commands/dev.test.ts | 55 +++++ packages/arkor/src/cli/commands/dev.ts | 39 ++- packages/arkor/src/core/runner.ts | 26 +- packages/arkor/src/studio/server.test.ts | 222 +++++++++++++++++- packages/arkor/src/studio/server.ts | 160 +++++++++---- packages/arkor/src/studio/trainRegistry.ts | 43 +++- .../studio-app/src/components/RunTraining.tsx | 9 + 7 files changed, 497 insertions(+), 57 deletions(-) diff --git a/packages/arkor/src/cli/commands/dev.test.ts b/packages/arkor/src/cli/commands/dev.test.ts index a18aca8c..bd00d5fd 100644 --- a/packages/arkor/src/cli/commands/dev.test.ts +++ b/packages/arkor/src/cli/commands/dev.test.ts @@ -4,6 +4,7 @@ import { mkdtempSync, readFileSync, rmSync, + writeFileSync, } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; @@ -766,6 +767,60 @@ describe("runDev", () => { } }); + it("does NOT unlink a pre-existing token file when this process failed to persist its own token (concurrent arkor dev safety)", async () => { + // Regression: a failed-persist `arkor dev` used to unconditionally + // `unlinkSync(studioTokenPath())` on shutdown. 
If a concurrent + // `arkor dev` (different port, same user) had already persisted a + // valid token to the shared path, this run's cleanup would wipe + // it out from under them, breaking that session's Vite SPA dev + // workflow with mystery 403s on /api/*. The fix gates the unlink + // on `tokenPersisted` so a failed-persist run is a no-op at + // shutdown. + if (typeof process.getuid === "function" && process.getuid() === 0) { + // Root bypasses chmod permission checks: skip on root containers. + return; + } + // Pre-place a "concurrent" token (the other dev session's). Body + // content lets us assert byte-equality after cleanup, not just + // file existence, to rule out an unlink+recreate cycle. + const path = studioTokenPath(); + writeFileSync(path, "concurrent-token-value", { mode: 0o600 }); + // Make the FILE unwritable so persistStudioToken's `writeFile` + // throws EACCES, but leave the *directory* writable so unlinkSync + // (which requires dir-write, not file-write perms) would happily + // delete the file if the cleanup hook weren't gated. + chmodSync(path, 0o444); + + const stdoutSpy = vi + .spyOn(process.stdout, "write") + .mockImplementation((() => true) as typeof process.stdout.write); + try { + await expect(runDev({ port: 4207 })).resolves.toBeUndefined(); + } finally { + stdoutSpy.mockRestore(); + } + + const exitSpy = vi + .spyOn(process, "exit") + .mockImplementation(((_code?: number) => { + return undefined as never; + }) as typeof process.exit); + try { + // Restore read perms so we can `readFileSync` to verify content. + chmodSync(path, 0o644); + const sigintListeners = process.listeners("SIGINT"); + const handler = sigintListeners[sigintListeners.length - 1] as () => void; + handler(); + await flushMicrotasks(); + // The pre-existing token is still on disk AND unchanged: this + // failed-persist run did not wipe it. 
+ expect(existsSync(path)).toBe(true); + expect(readFileSync(path, "utf8")).toBe("concurrent-token-value"); + } finally { + exitSpy.mockRestore(); + } + }); + it("registers a cleanup listener that removes the studio-token file on exit", async () => { const stdoutSpy = vi .spyOn(process.stdout, "write") diff --git a/packages/arkor/src/cli/commands/dev.ts b/packages/arkor/src/cli/commands/dev.ts index 14047a5b..d6b924a2 100644 --- a/packages/arkor/src/cli/commands/dev.ts +++ b/packages/arkor/src/cli/commands/dev.ts @@ -172,9 +172,25 @@ async function persistStudioToken(token: string): Promise { return path; } -function scheduleStudioTokenCleanup(path: string): void { +function scheduleStudioTokenCleanup( + path: string, + // Read at cleanup time so a `persistStudioToken` call that's still + // in flight when the user hits Ctrl-C (or one that resolved + // successfully *after* this scheduler ran) has its outcome + // respected. A plain boolean parameter would be captured at hook + // registration time, well before persist resolves. + shouldUnlink: () => boolean, +): void { registerCleanupHook({ cleanup: () => { + // Skip the unlink entirely if THIS process never persisted the + // file. Without this gate, a failed-persist `arkor dev` would + // happily `unlinkSync` on shutdown, and if a concurrent + // `arkor dev` process (different port, same user) had persisted + // a valid token to the same shared path, our cleanup would + // wipe it out from under them, breaking that session's Vite + // SPA dev workflow with mystery 403s on /api/*. + if (!shouldUnlink()) return; try { unlinkSync(path); } catch { @@ -227,14 +243,20 @@ export async function runDev(options: DevOptions = {}): Promise { // is the only one that calls `process.exit(0)` on SIGINT/SIGTERM/SIGHUP // (the HMR hook above only disposes), and `registerCleanupHook` overrides // Node's default "exit on signal" behaviour for any signal it listens - // on. 
If we were to gate this hook behind a successful `persistStudioToken` - // and the persist threw, Ctrl-C would run the HMR dispose and then leave - // the server idle in the foreground — no exit ever fires. Registering - // first means the hook is in place even if persist fails; the cleanup - // body is best-effort (`unlinkSync` in a try/catch) so calling it on a - // file that was never written is a silent no-op. + // on. If we were to gate registration behind a successful + // `persistStudioToken` and the persist threw, Ctrl-C would run the HMR + // dispose and then leave the server idle in the foreground: no exit + // ever fires. + // + // The cleanup body itself, however, gates `unlinkSync` on + // `tokenPersisted` (set only after `persistStudioToken` resolves) so a + // failed-persist run doesn't clobber a concurrent `arkor dev` process's + // valid token at the shared `~/.arkor/studio-token` path. Both + // protections together: hook is always registered (so exits behave), + // but only deletes a file *we* wrote. const tokenPath = studioTokenPath(); - scheduleStudioTokenCleanup(tokenPath); + let tokenPersisted = false; + scheduleStudioTokenCleanup(tokenPath, () => tokenPersisted); // Persisting the token to disk is *only* needed for the Vite SPA dev // workflow. The bundled `:port` flow injects the meta tag at request time @@ -242,6 +264,7 @@ export async function runDev(options: DevOptions = {}): Promise { // locked-down CI / restrictive umask) must not block the server. 
try { await persistStudioToken(studioToken); + tokenPersisted = true; } catch (err) { ui.log.warn( `Could not write ${tokenPath} (${ diff --git a/packages/arkor/src/core/runner.ts b/packages/arkor/src/core/runner.ts index 2d26c53b..efacc557 100644 --- a/packages/arkor/src/core/runner.ts +++ b/packages/arkor/src/core/runner.ts @@ -10,6 +10,27 @@ import type { Trainer } from "./types"; const DEFAULT_ENTRY = "src/arkor/index.ts"; +/** + * Per-spawn nonce that `/api/train` injects via env so the server can + * recognise the runner's `Started job ` line without it being + * forgeable from user code. Captured at module load (i.e. BEFORE + * `runTrainer` does its `await import(userEntry)`) and the env var + * is deleted right after so the dynamically-imported user module + * cannot read it via `process.env`. If a user callback then writes + * `Started job ` to stdout, the line won't carry the nonce + * prefix and the server's anchored regex will reject it: no + * spoofed cloud `cancel()` POST against an attacker-chosen job id. + * + * Null when the runner was launched directly (e.g. `arkor start` from + * a shell), in which case the runner falls back to the plain + * `Started job ` form for backwards compatibility. The server only + * uses the nonce-prefixed form because every server spawn sets the + * env var. + */ +const STARTED_JOB_NONCE: string | null = + process.env.ARKOR_JOB_ID_MARKER_NONCE ?? null; +delete process.env.ARKOR_JOB_ID_MARKER_NONCE; + function isTrainer(value: unknown): value is Trainer { if (!value || typeof value !== "object") return false; const t = value as Record; @@ -61,7 +82,10 @@ export async function runTrainer(file?: string): Promise { const removeCallbackReload = installCallbackReloadHandler(trainer, abs); try { const { jobId } = await trainer.start(); - process.stdout.write(`Started job ${jobId}\n`); + const startedJobPrefix = STARTED_JOB_NONCE + ? 
`[arkor:${STARTED_JOB_NONCE}] ` + : ""; + process.stdout.write(`${startedJobPrefix}Started job ${jobId}\n`); const result = await trainer.wait(); process.stdout.write( `Job ${result.job.id} finished with status=${result.job.status}\n`, diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index 0532b14d..c587332a 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -674,12 +674,20 @@ process.exit(0); const REAL_JOB_ID = "real-job-id"; const DECOY_JOB_ID = "decoy-from-stderr"; const fakeBin = join(trainCwd, "stderr-decoy-bin.mjs"); + // The real runner prefixes its canonical line with the + // per-spawn nonce the server injected via + // ARKOR_JOB_ID_MARKER_NONCE; the decoy on stderr deliberately + // uses the nonce too (worst-case: a user who somehow learned + // the nonce still can't hijack the parser by writing to the + // wrong stream). With the parser correctly stdout-only the + // real line wins regardless. writeFileSync( fakeBin, - `process.stderr.write("Started job ${DECOY_JOB_ID}\\n"); + `const nonce = process.env.ARKOR_JOB_ID_MARKER_NONCE ?? ""; + process.stderr.write(\`[arkor:\${nonce}] Started job ${DECOY_JOB_ID}\\n\`); // Slight delay so stderr lands first. setTimeout(() => { - process.stdout.write("Started job ${REAL_JOB_ID}\\n"); + process.stdout.write(\`[arkor:\${nonce}] Started job ${REAL_JOB_ID}\\n\`); }, 30); process.on("SIGTERM", () => {}); setInterval(() => {}, 60_000); @@ -777,9 +785,15 @@ process.exit(0); // POST URL below. const FAKE_JOB_ID = "j-cancel-test"; const fakeBin = join(trainCwd, "started-job-bin.mjs"); + // Prefix the marker with the per-spawn nonce the server + // injected via ARKOR_JOB_ID_MARKER_NONCE: that's the only + // shape the server's parser accepts, since user code can't + // know the nonce ahead of time (real runner deletes the env + // var before importing user modules). 
writeFileSync( fakeBin, - `process.stdout.write("Started job ${FAKE_JOB_ID}\\n"); + `const nonce = process.env.ARKOR_JOB_ID_MARKER_NONCE ?? ""; + process.stdout.write(\`[arkor:\${nonce}] Started job ${FAKE_JOB_ID}\\n\`); process.on("SIGTERM", () => {}); setInterval(() => {}, 60_000); `, @@ -861,6 +875,202 @@ process.exit(0); } }); + it("/api/train cancel uses the spawn-time scope from the registry even when state.json was deleted mid-training", async () => { + // Regression: the cancel handler used to re-read + // `.arkor/state.json` at stop time to address the cloud cancel + // POST. If the user removed or made the file unreadable + // mid-training (rm -rf .arkor, accidental git clean -fdx, fs + // unmounted), the read returned null and the handler silently + // skipped the POST: the local SIGKILL still tore down the + // subprocess but the cloud job orphaned until TTL/reaper. The + // fix captures `{orgSlug, projectSlug}` on the registry entry + // at spawn time so the cancel POST is decoupled from + // mutable filesystem state. + await writeCredentials(ANON_CREDS); + await writeState( + { + orgSlug: "scope-pin-org", + projectSlug: "scope-pin-project", + projectId: "p-scope-pin", + }, + trainCwd, + ); + const FAKE_JOB_ID = "j-scope-pin"; + const fakeBin = join(trainCwd, "scope-pin-bin.mjs"); + writeFileSync( + fakeBin, + `const nonce = process.env.ARKOR_JOB_ID_MARKER_NONCE ?? ""; + process.stdout.write(\`[arkor:\${nonce}] Started job ${FAKE_JOB_ID}\\n\`); + process.on("SIGTERM", () => {}); + setInterval(() => {}, 60_000); + `, + ); + let cancelHits: Array<{ url: string }> = []; + const ORIG_FETCH = globalThis.fetch; + globalThis.fetch = (async ( + input: Parameters[0], + init?: Parameters[1], + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? 
"GET"; + if (method === "POST" && /\/v1\/jobs\/[^/]+\/cancel/.test(url)) { + cancelHits.push({ url }); + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + return new Response("not found", { status: 404 }); + }) as typeof fetch; + + try { + const app = buildStudioApp({ + baseUrl: "http://mock-cloud-api", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: fakeBin, + }); + const trainRes = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(trainRes.status).toBe(200); + const reader = trainRes.body!.getReader(); + const decoder = new TextDecoder(); + let buf = ""; + while (!buf.includes(`Started job ${FAKE_JOB_ID}`)) { + const { value, done } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + } + // The hostile mid-training mutation: nuke the state file + // that the OLD code would have re-read at cancel time. + rmSync(join(trainCwd, ".arkor"), { recursive: true, force: true }); + // Cancel: under the bug, the handler's state read returns + // null and the cancel POST is silently skipped. With the + // fix, the registry-pinned scope is used and the POST goes + // out anyway. 
+ await reader.cancel(); + await new Promise((r) => setTimeout(r, 200)); + + expect(cancelHits).toHaveLength(1); + expect(cancelHits[0]?.url).toContain(`/v1/jobs/${FAKE_JOB_ID}/cancel`); + expect(cancelHits[0]?.url).toContain("orgSlug=scope-pin-org"); + expect(cancelHits[0]?.url).toContain("projectSlug=scope-pin-project"); + } finally { + globalThis.fetch = ORIG_FETCH; + } + }); + + it("/api/train job-id parser ignores stdout lines that lack the per-spawn nonce prefix so user code can't forge a `Started job` marker", async () => { + // Regression: the parser used to match any `Started job ` + // line in stdout. User code (which runs inside the runner's + // `await import(userEntry)` chain and therefore shares the + // child's stdout) could write `console.log("Started job + // attacker-chosen-id")` before the runner's canonical line + // arrives, the parser would record the attacker's id, and + // Stop-training would POST `/v1/jobs//cancel` + // against a job the attacker picked. The fix injects a + // per-spawn 32-hex nonce via ARKOR_JOB_ID_MARKER_NONCE that + // the server's regex anchors on; runner.ts deletes the env + // var before dynamically importing the user module, so user + // code can't read the nonce via `process.env` either. + await writeCredentials(ANON_CREDS); + await writeState( + { + orgSlug: "nonce-org", + projectSlug: "nonce-project", + projectId: "p-nonce", + }, + trainCwd, + ); + const REAL_JOB_ID = "real-nonce-job"; + const SPOOF_JOB_ID = "attacker-chosen-id"; + const fakeBin = join(trainCwd, "spoof-bin.mjs"); + // Bin first emits an UNPREFIXED spoof on stdout (mimicking + // hostile user code), THEN the real nonce-prefixed canonical + // line. With the fix the spoof is rejected; the real line + // wins and the cancel POST targets the real id. + writeFileSync( + fakeBin, + `const nonce = process.env.ARKOR_JOB_ID_MARKER_NONCE ?? 
""; + process.stdout.write("Started job ${SPOOF_JOB_ID}\\n"); + setTimeout(() => { + process.stdout.write(\`[arkor:\${nonce}] Started job ${REAL_JOB_ID}\\n\`); + }, 30); + process.on("SIGTERM", () => {}); + setInterval(() => {}, 60_000); + `, + ); + let cancelHits: Array<{ url: string }> = []; + const ORIG_FETCH = globalThis.fetch; + globalThis.fetch = (async ( + input: Parameters[0], + init?: Parameters[1], + ) => { + const url = typeof input === "string" ? input : input.toString(); + const method = init?.method ?? "GET"; + if (method === "POST" && /\/v1\/jobs\/[^/]+\/cancel/.test(url)) { + cancelHits.push({ url }); + return new Response(JSON.stringify({ ok: true }), { + status: 200, + headers: { "content-type": "application/json" }, + }); + } + return new Response("not found", { status: 404 }); + }) as typeof fetch; + + try { + const app = buildStudioApp({ + baseUrl: "http://mock-cloud-api", + assetsDir, + autoAnonymous: false, + studioToken: STUDIO_TOKEN, + cwd: trainCwd, + binPath: fakeBin, + }); + const trainRes = await app.request("/api/train", { + method: "POST", + headers: { + host: "127.0.0.1:4000", + "x-arkor-studio-token": STUDIO_TOKEN, + "content-type": "application/json", + }, + body: JSON.stringify({}), + }); + expect(trainRes.status).toBe(200); + const reader = trainRes.body!.getReader(); + const decoder = new TextDecoder(); + let buf = ""; + // Wait for the REAL line (with nonce prefix) to be visible + // in the body. Both lines forward to the SPA log + // regardless of which (if any) the parser captures, so the + // body is a reliable readiness signal. + while (!buf.includes(`Started job ${REAL_JOB_ID}`)) { + const { value, done } = await reader.read(); + if (done) break; + buf += decoder.decode(value, { stream: true }); + } + await reader.cancel(); + await new Promise((r) => setTimeout(r, 200)); + + // Cancel POST landed against the REAL id: the spoof was + // rejected by the anchored nonce-prefixed regex. 
+ expect(cancelHits).toHaveLength(1); + expect(cancelHits[0]?.url).toContain(`/v1/jobs/${REAL_JOB_ID}/cancel`); + expect(cancelHits[0]?.url).not.toContain(SPOOF_JOB_ID); + } finally { + globalThis.fetch = ORIG_FETCH; + } + }); + it("/api/train cancel sends SIGKILL so user-initiated stop bypasses the runner's graceful early-stop", async () => { // Regression: a default `child.kill()` sends SIGTERM, which // the runner's `installShutdownHandlers` now interprets as a @@ -2229,10 +2439,12 @@ process.exit(0); const fakeBin = join(trainCwd, "manual-during-hmr-bin.mjs"); // SIGTERM no-op so HMR's graceful SIGTERM doesn't terminate // the bin — we need it alive so the subsequent manual - // cancel actually has something to SIGKILL. + // cancel actually has something to SIGKILL. Marker uses the + // server-injected nonce prefix so the parser accepts it. writeFileSync( fakeBin, - `process.stdout.write("Started job ${FAKE_JOB_ID}\\n"); + `const nonce = process.env.ARKOR_JOB_ID_MARKER_NONCE ?? ""; + process.stdout.write(\`[arkor:\${nonce}] Started job ${FAKE_JOB_ID}\\n\`); process.on("SIGTERM", () => {}); setInterval(() => {}, 60_000); `, diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 4255f638..2506935c 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -1,7 +1,7 @@ import { spawn, type ChildProcessByStdio } from "node:child_process"; import type { Readable, Writable } from "node:stream"; import { readFile, realpath } from "node:fs/promises"; -import { timingSafeEqual } from "node:crypto"; +import { randomBytes, timingSafeEqual } from "node:crypto"; import { Hono } from "hono"; import { createClient } from "@arkor/cloud-api-client"; import { CloudApiClient, CloudApiError } from "../core/client"; @@ -27,28 +27,31 @@ import { TrainRegistry, type RestartTarget } from "./trainRegistry"; * `restart` events to the run *this* tab actually started. 
*/ const TRAIN_PID_HEADER = "x-arkor-train-pid"; /** - * Strict full-line match for the runner's `Started job ` line. - * `core/runner.ts` prints exactly that text — `process.stdout.write(\`Started job ${jobId}\n\`)` — - * after `trainer.start()` resolves; the server's `/api/train` - * stdout forwarder line-buffers chunks (chunk boundaries are - * arbitrary, so a substring scan against raw chunks would miss - * splits) and applies this regex to each complete line so it can - * POST `/v1/jobs/:id/cancel` to cloud-api on user-initiated - * cancel (SIGKILL bypasses the runner's own shutdown handlers — - * see the `cancel()` comment for the full rationale). + * Build the strict full-line match for the runner's `[arkor:] Started job ` line. * - * Anchors `^…$` matter for two reasons: - * - Avoid false matches when a user `console.log` happens to - * contain the substring "Started job " *before* the - * runner's canonical line lands; once we record an id we - * stop scanning, so a stray earlier match would stick and - * Stop-training would POST cancel for the wrong (or - * non-existent) job. - * - Restrict the id capture to non-whitespace, mirroring what - * `runner.ts` writes (cloud-api job ids are word-shaped, - * never contain spaces). + * `core/runner.ts` prefixes that text with the per-spawn nonce we + * inject via `ARKOR_JOB_ID_MARKER_NONCE`; without the prefix, a + * user `console.log("Started job ")` from inside + * `trainer.start()` / `onCheckpoint` / etc. could land in stdout + * *before* the runner's real line and we'd record the wrong id, so + * Stop-training would then POST `/v1/jobs/:attacker-id/cancel` + * against a job the attacker chose. Anchoring on a 32-hex nonce + * known only to the server + runner (the env var is deleted by + * runner.ts BEFORE the user module is dynamically imported, so the + * user can't read it) closes that hole. + * + * Pattern is per-spawn because the nonce is per-spawn. 
+ * + * Anchors `^…$` and `(\S+)` job-id capture mirror the runner's + * exact write shape (cloud-api job ids never contain whitespace), + * so a chatty bin that wraps the line in other content cannot + * collide either. */ -const STARTED_JOB_PATTERN = /^Started job (\S+)$/; +function buildStartedJobPattern(nonce: string): RegExp { + // Nonce is a 32-char hex string from `randomBytes(16).toString("hex")`, + // i.e. only `[0-9a-f]` (safe to interpolate into the regex literal). + return new RegExp(`^\\[arkor:${nonce}\\] Started job (\\S+)$`); +} const DEPRECATION_HEADERS = ["Deprecation", "Sunset", "Warning"] as const; function copyDeprecationHeaders(from: Headers, to: Headers): void { @@ -442,8 +445,30 @@ export function buildStudioApp(options: StudioServerOptions) { const spawnArtifactContentHash: string | null = options.hmr ? options.hmr.getCurrentArtifactContentHash() : null; + // Capture the cloud-api scope NOW (at spawn time) so the cancel + // handler can POST `/v1/jobs/:id/cancel` without re-reading + // `.arkor/state.json` at stop time. If the user removed or made + // the state file unreadable mid-training, the stop-time read + // would return null and the cancel POST would silently skip: + // local SIGKILL still tears down the subprocess but the cloud + // run orphans. Pinning the scope on the registry entry + // decouples cancel correctness from mutable filesystem state. + const spawnState = await readState(trainCwd); + const spawnScope = spawnState + ? { orgSlug: spawnState.orgSlug, projectSlug: spawnState.projectSlug } + : null; const args = [trainBinPath, "start"]; if (trainFile) args.push(trainFile); + // Per-spawn 16-byte nonce passed via env var so the runner can + // prefix its `Started job ` line with `[arkor:] `. The + // server matches that nonce-prefixed shape (see + // `buildStartedJobPattern` for why). 
32-hex chars of entropy + // guarantees a user-code spoof attempt can't guess the prefix in + // a single shot, and `core/runner.ts` deletes the env var BEFORE + // dynamically importing the user module so user code can't read + // it via `process.env` either. + const startedJobNonce = randomBytes(16).toString("hex"); + const startedJobPattern = buildStartedJobPattern(startedJobNonce); // `spawn()` is mostly async (filesystem failures surface as the // child's `error` event), but Node can still throw synchronously // for argument-shape problems (e.g. invalid stdio descriptor on @@ -462,6 +487,10 @@ export function buildStudioApp(options: StudioServerOptions) { child = spawn(process.execPath, args, { stdio: "pipe", cwd: trainCwd, + env: { + ...process.env, + ARKOR_JOB_ID_MARKER_NONCE: startedJobNonce, + }, }); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); @@ -474,6 +503,7 @@ export function buildStudioApp(options: StudioServerOptions) { trainFile, configHash, spawnArtifactContentHash, + scope: spawnScope, }); // Hoisted out of the `ReadableStream` underlying-source so the // `start` handler can hand its closure-bound teardown helper to @@ -481,6 +511,16 @@ export function buildStudioApp(options: StudioServerOptions) { // not through `controller`, so the two need a parent-scope // rendez-vous variable. let cancelTeardown: (() => void) | null = null; + // Mirror of the cloud `jobId` parsed out of the runner's + // stdout, accessible to both the `start` (parser writes) and + // `cancel` (post-unregister read) handlers. We can't just call + // `activeTrains.getJobId(pid)` from `cancel` because cancel + // unregisters the entry first, so subsequent reads of the + // registry would always be `null` even if the parser races a + // late line in afterwards. This closure variable keeps the id + // observable even after unregister, so the cancel POST poll + // below can pick up a jobId that lands a few ms after Stop. 
+ let parsedJobId: string | null = null; const stream = new ReadableStream({ start(controller) { // After `cancel()` runs, calling `controller.enqueue` / @@ -553,9 +593,14 @@ export function buildStudioApp(options: StudioServerOptions) { // the same anchored pattern. const line = stdoutLineBuf.slice(0, nl).replace(/\r$/, ""); stdoutLineBuf = stdoutLineBuf.slice(nl + 1); - const m = STARTED_JOB_PATTERN.exec(line); + const m = startedJobPattern.exec(line); if (m && m[1]) { activeTrains.recordJobId(child.pid, m[1]); + // Mirror to the parent-scope closure so the cancel + // handler can pick this up even AFTER it called + // `activeTrains.unregister(...)` (the registry + // read would return null post-unregister). + parsedJobId = m[1]; stdoutLineBuf = ""; break; } @@ -671,36 +716,67 @@ export function buildStudioApp(options: StudioServerOptions) { // the local subprocess teardown snappy and let the // server-side cancel POST handle the cloud-side release. // - // Capture the cloud job id BEFORE unregistering — once the - // entry is gone, `getJobId(pid)` returns null and the - // fire-and-forget POST below would no-op. - const jobIdForCancel = activeTrains.getJobId(child.pid); - activeTrains.unregister(child.pid); + // Capture the cloud job id + spawn-time scope BEFORE + // unregistering: once the entry is gone, the getters + // return null and the fire-and-forget POST below would + // no-op. + // + // `pid` is captured once here because the closure below + // runs after `unregister` and we want a stable handle. + const cancelPid = child.pid; + // Scope: pinned at spawn time on the registry entry, NOT + // re-read from `.arkor/state.json` here. A user who + // deleted or made state unreadable mid-training shouldn't + // be able to silently orphan their cloud job by losing + // the cancel-time read. 
+ const scopeForCancel = activeTrains.getScope(cancelPid); + activeTrains.unregister(cancelPid); cancelTeardown?.(); // Fire-and-forget cloud-side cancel so the cloud job is // released even though the SIGKILL below bypasses the // runner's `installShutdownHandlers` (which would // otherwise issue cancel itself via the graceful - // early-stop chain). Best-effort: we don't await because - // user-cancel UX should be snappy — the SIGKILL kills the - // local subprocess regardless of whether the cloud POST - // succeeded, and a transient cloud-api blip just means the - // job sits in "running" until the cloud reaper / TTL - // catches it (same fallback as a network drop). `jobId` - // is null when the runner never emitted its `Started job` - // line (early spawn failure, race against a fast cancel, - // custom user bin); skip the POST in that case. - if (jobIdForCancel) { + // early-stop chain). The IIFE polls for the jobId + // *briefly* before giving up: there's a real race + // window where the user clicks Stop after the cloud + // job has been created but before the runner's + // `Started job ` line has been parsed (cloud + // createJob roundtrip is ~50-200ms; UI clicks can land + // sub-100ms into that window). Polling closes the most + // common case; beyond ~500 ms we accept the cloud-side + // orphan as a follow-up (the cloud reaper / TTL is the + // safety net, and the alternative of querying cloud-api + // for matching jobs at cancel time is brittle in + // multi-tab/multi-spawn scenarios). + if (scopeForCancel) { void (async () => { + // Brief poll on `parsedJobId` (the closure mirror, + // see top-of-handler for why it can't be the + // registry's `getJobId`): the runner's + // `Started job ` line may not have been parsed by + // the time the user clicked Stop. Most runs hit it + // within ~50-200 ms of spawn (cloud createJob + // roundtrip), so polling for up to ~500 ms catches + // nearly all races. 
Beyond that we accept the + // cloud-side orphan as a documented follow-up: cloud + // reaper / TTL is the safety net, and the + // alternative (querying cloud-api for matching jobs + // at cancel time) is brittle for multi-tab / + // multi-spawn cases. + if (parsedJobId === null) { + const start = Date.now(); + while (parsedJobId === null && Date.now() - start < 500) { + await new Promise((r) => setTimeout(r, 25)); + } + } + if (parsedJobId === null) return; try { - const state = await readState(trainCwd); - if (!state) return; // no scope, can't address the job const rpc = createRpc(); await rpc.v1.jobs[":id"].cancel.$post({ - param: { id: jobIdForCancel }, + param: { id: parsedJobId }, query: { - orgSlug: state.orgSlug, - projectSlug: state.projectSlug, + orgSlug: scopeForCancel.orgSlug, + projectSlug: scopeForCancel.projectSlug, }, }); } catch { diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index 260ec2cd..baad7386 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -67,6 +67,20 @@ export interface ActiveTrain { * until the cloud reaper / TTL fires (continued GPU spend). */ jobId: string | null; + /** + * Cloud-api scope (org + project slugs) captured at spawn time + * from `.arkor/state.json`. Pinned on the registry entry so the + * `/api/train` cancel handler can address the cloud cancel POST + * without re-reading the filesystem at stop time. Without this + * pin, a user who deleted or made unreadable `.arkor/state.json` + * mid-training would have their manual stop silently skip the + * cancel POST (state read returns null, handler bails) and + * the cloud job would orphan. Null when `/api/train` ran without + * state (auto-anonymous bootstrap failed, etc.); cancel POST is + * skipped then too, but the SIGKILL still tears down the local + * subprocess. 
+ */ + scope: { orgSlug: string; projectSlug: string } | null; } export interface RestartTarget { @@ -140,7 +154,11 @@ export class TrainRegistry { child: ChildProcess, init: Omit< ActiveTrain, - "child" | "earlyStopRequested" | "spawnArtifactContentHash" | "jobId" + | "child" + | "earlyStopRequested" + | "spawnArtifactContentHash" + | "jobId" + | "scope" > & { // Optional in the signature so tests / future callers that // don't track the on-disk artefact content hash (e.g. an @@ -152,6 +170,12 @@ export class TrainRegistry { // calls in HMR mode capture this from // `coordinator.getCurrentArtifactContentHash()`. spawnArtifactContentHash?: string | null; + // Optional too: tests don't need scope for HMR-routing + // assertions. Real `/api/train` calls in production pass a + // non-null scope captured from `.arkor/state.json` so the + // cancel POST can address the cloud job without re-reading + // the filesystem at stop time. + scope?: { orgSlug: string; projectSlug: string } | null; }, ): void { if (typeof child.pid !== "number") return; @@ -159,6 +183,7 @@ export class TrainRegistry { child, ...init, spawnArtifactContentHash: init.spawnArtifactContentHash ?? null, + scope: init.scope ?? null, earlyStopRequested: false, // `jobId` starts null — populated later by `recordJobId(pid, // id)` when the server's stdout parser sees the runner's @@ -203,6 +228,22 @@ export class TrainRegistry { return this.entries.get(pid)?.jobId ?? null; } + /** + * Read the spawn-time cloud-api scope for a pid. Paired with + * `getJobId` by `/api/train`'s cancel handler to build the cloud + * cancel POST URL without re-reading `.arkor/state.json` at stop + * time: if the file was deleted or made unreadable mid-training, + * the read would return null and the cancel POST would silently + * skip, orphaning the cloud run. Captured at spawn time, immutable + * for the entry's lifetime. 
+ */ + getScope( + pid: number | undefined, + ): { orgSlug: string; projectSlug: string } | null { + if (typeof pid !== "number") return null; + return this.entries.get(pid)?.scope ?? null; + } + /** * Whether `dispatchRebuild` has already issued a graceful-restart * SIGTERM to this child as part of an HMR cycle. Consulted by diff --git a/packages/studio-app/src/components/RunTraining.tsx b/packages/studio-app/src/components/RunTraining.tsx index 13ee8330..c286b4f8 100644 --- a/packages/studio-app/src/components/RunTraining.tsx +++ b/packages/studio-app/src/components/RunTraining.tsx @@ -200,6 +200,15 @@ export function RunTraining() { // source of truth: no auto-restart fires until the next // successful rebuild re-arms the latch. restartPendingRef.current = false; + // Same hazard via the pre-spawn buffer: a `rebuild` event + // that landed before `onSpawn` populated the pid is parked + // in `pendingPreSpawnEventsRef`. If the user then breaks + // the source and the next event is `error`, `onSpawn`'s + // later drain would still find the stale restart target + // and latch `restartPendingRef = true` → auto-restart + // against the broken state. Drop the buffer alongside the + // latch so the error event is the new source of truth. + pendingPreSpawnEventsRef.current = []; return; } // Always refresh the manifest on ready/rebuild. From 77fb46a22f0500a2001232f3ef3a2a7b6d311930 Mon Sep 17 00:00:00 2001 From: k-taro56 <121674121+k-taro56@users.noreply.github.com> Date: Tue, 19 May 2026 00:11:34 +0900 Subject: [PATCH 50/55] fix: improve comments for clarity and consistency across various files - Updated comments in trainRegistry.test.ts to enhance readability and understanding of the test cases. - Clarified comments in trainRegistry.ts regarding the behavior of signals and process management. - Enhanced documentation in templates.ts to ensure consistency in phrasing and clarity. 
- Improved comments in RunTraining.tsx to better explain the logic and flow of the component. - Refined comments in api.test.ts and api.ts to provide clearer context and reasoning for the code. - Adjusted comments in baseModels.test.ts to clarify the case-sensitivity of model IDs. --- AGENTS.md | 20 ++--- docs/concepts/studio.mdx | 2 +- docs/ja/studio/jobs.mdx | 10 +-- e2e/studio/src/specs/hmr.spec.ts | 12 +-- packages/arkor/src/cli/cleanupHooks.test.ts | 14 ++-- packages/arkor/src/cli/cleanupHooks.ts | 16 ++-- packages/arkor/src/cli/commands/dev.test.ts | 38 ++++----- packages/arkor/src/cli/commands/dev.ts | 10 +-- packages/arkor/src/core/configHash.test.ts | 8 +- packages/arkor/src/core/configHash.ts | 6 +- .../arkor/src/core/moduleCacheBust.test.ts | 4 +- packages/arkor/src/core/moduleCacheBust.ts | 6 +- packages/arkor/src/core/projectState.test.ts | 6 +- packages/arkor/src/core/rolldownConfig.ts | 6 +- packages/arkor/src/core/runner.test.ts | 16 ++-- packages/arkor/src/core/runnerSignals.test.ts | 14 ++-- packages/arkor/src/core/runnerSignals.ts | 12 +-- packages/arkor/src/core/schemas.test.ts | 2 +- packages/arkor/src/core/trainer.test.ts | 72 ++++++++-------- packages/arkor/src/core/trainer.ts | 50 +++++------ .../arkor/src/core/trainerInspection.test.ts | 10 +-- packages/arkor/src/core/trainerInspection.ts | 24 +++--- packages/arkor/src/studio/hmr.test.ts | 28 +++---- packages/arkor/src/studio/hmr.ts | 58 ++++++------- packages/arkor/src/studio/manifest.ts | 6 +- packages/arkor/src/studio/server.test.ts | 82 +++++++++---------- packages/arkor/src/studio/server.ts | 78 +++++++++--------- .../arkor/src/studio/trainRegistry.test.ts | 24 +++--- packages/arkor/src/studio/trainRegistry.ts | 46 +++++------ packages/cli-internal/src/templates.ts | 20 ++--- .../studio-app/src/components/RunTraining.tsx | 34 ++++---- packages/studio-app/src/lib/api.test.ts | 16 ++-- packages/studio-app/src/lib/api.ts | 20 ++--- .../studio-app/src/lib/baseModels.test.ts | 2 +- 34 files 
changed, 386 insertions(+), 386 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 5171011f..986e792b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -62,15 +62,15 @@ cd my-arkor-app && pnpm dev # Studio at http://127.0. `arkor dev` generates a 32-byte base64url token per launch ([packages/arkor/src/cli/commands/dev.ts](packages/arkor/src/cli/commands/dev.ts)) and: -1. Passes it to `buildStudioApp({ studioToken })`. The Hono server validates every `/api/*` request via `X-Arkor-Studio-Token` header (or `?studioToken=` query for `EventSource`, which can't set headers). Comparison uses `timingSafeEqual`. The query-token allow-list lives in `eventStreamPathPattern` in [packages/arkor/src/studio/server.ts](packages/arkor/src/studio/server.ts) — currently `/api/jobs/:id/events` and `/api/dev/events`. **Adding to that regex is CSRF-sensitive: each entry must be a GET stream-only route, never a mutation endpoint.** -2. Persists it to `~/.arkor/studio-token` (mode 0600) so the SPA dev workflow (`pnpm --filter @arkor/studio-app dev`) can read it via the `arkor-studio-token` Vite plugin in [packages/studio-app/vite.config.ts](packages/studio-app/vite.config.ts), which injects `` into `index.html` on each request. Persistence failure must NOT block server start (read-only `$HOME` on Docker, etc.) — just warn. +1. Passes it to `buildStudioApp({ studioToken })`. The Hono server validates every `/api/*` request via `X-Arkor-Studio-Token` header (or `?studioToken=` query for `EventSource`, which can't set headers). Comparison uses `timingSafeEqual`. The query-token allow-list lives in `eventStreamPathPattern` in [packages/arkor/src/studio/server.ts](packages/arkor/src/studio/server.ts), currently `/api/jobs/:id/events` and `/api/dev/events`. **Adding to that regex is CSRF-sensitive: each entry must be a GET stream-only route, never a mutation endpoint.** +2. 
Persists it to `~/.arkor/studio-token` (mode 0600) so the SPA dev workflow (`pnpm --filter @arkor/studio-app dev`) can read it via the `arkor-studio-token` Vite plugin in [packages/studio-app/vite.config.ts](packages/studio-app/vite.config.ts), which injects `` into `index.html` on each request. Persistence failure must NOT block server start (read-only `$HOME` on Docker, etc.); just warn. 3. Cleans up on `exit`/SIGINT/SIGTERM/SIGHUP via `unlinkSync`. -`/api/*` middleware also enforces a host-header allow-list (`127.0.0.1`/`localhost`) for DNS-rebinding defence. **CORS is intentionally NOT configured** — the SPA is same-origin so reflecting `*` would let "simple" cross-origin POSTs reach handlers. The token check rejects those; cross-origin tabs cannot read the SPA's ``. +`/api/*` middleware also enforces a host-header allow-list (`127.0.0.1`/`localhost`) for DNS-rebinding defence. **CORS is intentionally NOT configured**: the SPA is same-origin so reflecting `*` would let "simple" cross-origin POSTs reach handlers. The token check rejects those; cross-origin tabs cannot read the SPA's ``. -The whole point: prevents another browser tab on the same machine from POSTing `/api/train` (which spawns `arkor train` and dynamically imports user TS — RCE-grade). +The whole point: prevents another browser tab on the same machine from POSTing `/api/train` (which spawns `arkor train` and dynamically imports user TS, an RCE-grade exposure). -When touching the Studio server or SPA fetch layer, preserve: token via header for `fetch`, query param for `EventSource`, host-header guard, no CORS, timing-safe compare. The Vite plugin is dev-only (`apply: "serve"`) — running it during `vite build` would bake a stale per-launch token into the production `index.html` and shadow the runtime tag, causing every `/api/*` call to 403. 
+When touching the Studio server or SPA fetch layer, preserve: token via header for `fetch`, query param for `EventSource`, host-header guard, no CORS, timing-safe compare. The Vite plugin is dev-only (`apply: "serve"`): running it during `vite build` would bake a stale per-launch token into the production `index.html` and shadow the runtime tag, causing every `/api/*` call to 403. ### HMR + graceful early-stop + callback hot-swap @@ -78,10 +78,10 @@ When touching the Studio server or SPA fetch layer, preserve: token via header f When a rebuild lands while a `/api/train`-spawned subprocess is in flight, the server makes a per-child decision in [packages/arkor/src/studio/trainRegistry.ts](packages/arkor/src/studio/trainRegistry.ts): -- **`configHash` matches the spawn-time hash** → SIGUSR2. The child's `installCallbackReloadHandler` re-imports the artifact and rotates the trainer's callback cell via the internal `Symbol.for("arkor.trainer.replaceCallbacks")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts). The cloud-side run is untouched. Use this whenever a code change is contained inside the `callbacks: { ... }` object. Don't add a `replaceCallbacks()` method to the public `Trainer` interface — keeping the mutator behind a `Symbol.for` brand is what stops the dev-only HMR primitive from leaking into the SDK's published surface. -- **`configHash` differs (or is null because the new bundle didn't inspect)** → SIGTERM. `installShutdownHandlers` drives the trainer's internal early-stop entry point via the `Symbol.for("arkor.trainer.requestEarlyStop")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts), which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. 
A second SIGTERM bypasses the early-stop and exits 143 immediately — emergency escape hatch for a hung cancel. +- **`configHash` matches the spawn-time hash** → SIGUSR2. The child's `installCallbackReloadHandler` re-imports the artifact and rotates the trainer's callback cell via the internal `Symbol.for("arkor.trainer.replaceCallbacks")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts). The cloud-side run is untouched. Use this whenever a code change is contained inside the `callbacks: { ... }` object. Don't add a `replaceCallbacks()` method to the public `Trainer` interface: keeping the mutator behind a `Symbol.for` brand is what stops the dev-only HMR primitive from leaking into the SDK's published surface. +- **`configHash` differs (or is null because the new bundle didn't inspect)** → SIGTERM. `installShutdownHandlers` drives the trainer's internal early-stop entry point via the `Symbol.for("arkor.trainer.requestEarlyStop")` brand exposed by [packages/arkor/src/core/trainerInspection.ts](packages/arkor/src/core/trainerInspection.ts), which lets the next `checkpoint.saved` event finish (work preserved) before issuing `cancel()` and exiting cleanly. The SPA auto-restarts the run with the rebuilt artifact via the `restart: true` flag on the SSE event. A second SIGTERM bypasses the early-stop and exits 143 immediately, as an emergency escape hatch for a hung cancel. -Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. Don't widen the SIGUSR2 path to "always hot-swap, server-side": the `configHash` check is what guarantees a hot-swap can't silently leave a child running with a stale `JobConfig`. 
Don't surface `requestEarlyStop()` (or `replaceCallbacks()`) as a method on the public `Trainer` interface — both are dev-only HMR primitives, and keeping them behind `Symbol.for` brands is what stops them from leaking into the published SDK shape; user code that wants similar semantics should compose `abortSignal` + `cancel()` per the cookbook. +Don't replace the SIGTERM-and-let-the-child-handle-it pattern with a SIGKILL escalation in the server: that would orphan Cloud-side jobs (no `cancel()` POST goes out) and waste GPU budget. Don't widen the SIGUSR2 path to "always hot-swap, server-side": the `configHash` check is what guarantees a hot-swap can't silently leave a child running with a stale `JobConfig`. Don't surface `requestEarlyStop()` (or `replaceCallbacks()`) as a method on the public `Trainer` interface: both are dev-only HMR primitives, and keeping them behind `Symbol.for` brands is what stops them from leaking into the published SDK shape; user code that wants similar semantics should compose `abortSignal` + `cancel()` per the cookbook. ### Project entry-point discovery @@ -91,7 +91,7 @@ The CLI/Studio look at `src/arkor/index.ts` in user projects. Discovery in [pack ### E2E suite specifics -Both [e2e/cli](e2e/cli) and [e2e/studio](e2e/studio) declare `arkor` (and, for `e2e/cli`, `create-arkor`) as `workspace:*` `devDependencies`, so Turbo's `^build` produces `dist/bin.mjs` exactly once before `#test`/`#test:coverage` runs — no `pretest` hooks, no concurrent rebuilds racing on `dist/`. Standalone runs (`pnpm --filter @arkor/e2e-* test`) need a prior `pnpm build`. Every supported Node (≥22.22.0) is in rolldown's compatible range (^20.19 || >=22.12), so the previous "rolldown-incompatible" CI bypass path was removed. 
+Both [e2e/cli](e2e/cli) and [e2e/studio](e2e/studio) declare `arkor` (and, for `e2e/cli`, `create-arkor`) as `workspace:*` `devDependencies`, so Turbo's `^build` produces `dist/bin.mjs` exactly once before `#test`/`#test:coverage` runs (no `pretest` hooks, no concurrent rebuilds racing on `dist/`). Standalone runs (`pnpm --filter @arkor/e2e-* test`) need a prior `pnpm build`. Every supported Node (≥22.22.0) is in rolldown's compatible range (^20.19 || >=22.12), so the previous "rolldown-incompatible" CI bypass path was removed. Tests rely on `ARKOR_INTERNAL_SCAFFOLD_ARKOR_SPEC=file:.../packages/arkor` so the scaffolded fixtures install the workspace `arkor` instead of the npm-published one. Both this var and `SKIP_E2E_INSTALL` are declared in [turbo.json](turbo.json) so they pass through Turbo's hash. @@ -106,7 +106,7 @@ When implementing anything (new feature, SDK/CLI/Studio behaviour change, schema 1. **Docs in both languages.** This repo pairs English/Japanese docs: `README.md` ↔ `README.ja.md`, `CONTRIBUTING.md` ↔ `CONTRIBUTING.ja.md`, and `docs/` ↔ `docs/ja/`. If you edit the English side, update the Japanese side in the same PR. Don't leave Japanese docs to be retro-translated later. 2. **Tests.** Add vitest cases under `packages/*/src/**/*.test.ts` for SDK/CLI/scaffold logic changes. For CLI flow changes, consider an `e2e/cli` scenario. -Don't split these into "docs in a follow-up PR" or "tests later" — land them in the same PR. Skip only when the user explicitly says to. +Don't split these into "docs in a follow-up PR" or "tests later"; land them in the same PR. Skip only when the user explicitly says to. 
## Non-obvious gotchas diff --git a/docs/concepts/studio.mdx b/docs/concepts/studio.mdx index b91b831c..fbbd4b21 100644 --- a/docs/concepts/studio.mdx +++ b/docs/concepts/studio.mdx @@ -18,7 +18,7 @@ A note on the dev loop: Studio runs a [Rolldown](https://rolldown.rs) watcher ov - **Same hash (only callbacks changed).** The runner is signalled with SIGUSR2; it re-imports the rebuilt artifact and rotates the trainer's callback cell in place via an internal HMR brand. The cloud-side training run is untouched, no GPU time is wasted, and the SPA shows a brief "Callbacks hot-swapped" indicator. - **Different hash (model / dataset / hyperparameters changed).** The runner is signalled with SIGTERM; the trainer's internal early-stop entry point lets the next checkpoint upload finish before issuing `cancel()`, then the SPA re-spawns the run with the rebuilt artifact. The previous Cloud-side job reaches `cancelled` after the checkpoint is uploaded, so the partial work is preserved as an artifact. -If you want this "stop after the next checkpoint" behaviour from your own code (rather than from the dev loop), build it on top of the public [`abortSignal` + `cancel()`](/sdk/trainer-control#abortsignal) pair — the [Early stopping recipe](/cookbook/early-stopping) walks through it. +If you want this "stop after the next checkpoint" behaviour from your own code (rather than from the dev loop), build it on top of the public [`abortSignal` + `cancel()`](/sdk/trainer-control#abortsignal) pair. The [Early stopping recipe](/cookbook/early-stopping) walks through it. 
## Where Studio runs diff --git a/docs/ja/studio/jobs.mdx b/docs/ja/studio/jobs.mdx index 245a2a8d..d95c3137 100644 --- a/docs/ja/studio/jobs.mdx +++ b/docs/ja/studio/jobs.mdx @@ -55,8 +55,8 @@ Jobs リストはマウント時に 1 度、その後 5 秒ごとに `GET /api/j Loss チャートは `training.log` イベントから描画される SVG プロットです。Y 軸は最小値と最大値によるスケーリング、X 軸はステップ番号で、最大 2 系列を表示します: -- **Training loss** — 実線のティール色。数値 `loss` を含むイベントごとに 1 頂点。 -- **Eval loss** — 破線のピンク色(点マーカー付き)。数値 `evalLoss` を含むイベント(通常は `evalSteps` 刻み)から描画。系列はイベントから直接構築するため、`evalLoss` のみを持ち `loss` を含まない eval-only フレームも線・凡例・統計に反映されます。Eval ポイントが 1 つも来ていない間は凡例にも表示されません。 +- **Training loss**: 実線のティール色。数値 `loss` を含むイベントごとに 1 頂点。 +- **Eval loss**: 破線のピンク色(点マーカー付き)。数値 `evalLoss` を含むイベント(通常は `evalSteps` 刻み)から描画。系列はイベントから直接構築するため、`evalLoss` のみを持ち `loss` を含まない eval-only フレームも線・凡例・統計に反映されます。Eval ポイントが 1 つも来ていない間は凡例にも表示されません。 ホバーすると最寄りステップと、そのステップに含まれる `loss` / `evalLoss` のうち存在する値が表示されます(eval-only ステップでは `loss` 値は出ず、その逆も同様)。チャートは `loss` または `evalLoss` のいずれかが数値であるイベントが 1 件以上届くまで `Waiting for training.log events…`(`training.log` イベント待ち)プレースホルダーを表示します。両方とも null / 省略の `training.log` フレームはカウントされません。 @@ -64,9 +64,9 @@ Loss チャートは `training.log` イベントから描画される SVG プロ チャートヘッダーの **Advanced** トグルを ON にすると、系列ごとの統計パネルが現れます。各カードに表示される項目: -- **Mean loss ± 95% CI** — Loss 値の標本平均と 95% 信頼区間の半幅(Student の t 分布。n > 31 では z = 1.96 にフォールバック)。 -- **Std dev** と **Variance** — Bessel 補正済みの不偏推定量(`ddof=1`)。 -- **p90** と **p95** — numpy のデフォルトに合わせた線形補間パーセンタイル。 +- **Mean loss ± 95% CI**: Loss 値の標本平均と 95% 信頼区間の半幅(Student の t 分布。n > 31 では z = 1.96 にフォールバック)。 +- **Std dev** と **Variance**: Bessel 補正済みの不偏推定量(`ddof=1`)。 +- **p90** と **p95**: numpy のデフォルトに合わせた線形補間パーセンタイル。 Eval カードは数値 `evalLoss` を含む `training.log` イベントが届くまでは空のままです。 diff --git a/e2e/studio/src/specs/hmr.spec.ts b/e2e/studio/src/specs/hmr.spec.ts index c837794a..94346b38 100644 --- a/e2e/studio/src/specs/hmr.spec.ts +++ b/e2e/studio/src/specs/hmr.spec.ts @@ -13,13 +13,13 @@ import { expect, test } from "../harness/fixture"; * * 1. 
The trainer carries the `Symbol.for("arkor.trainer.inspect")` * brand so `findInspectableTrainer` (used by `studio/hmr.ts`'s - * `inspectBundle`) can read its name + config — without the + * `inspectBundle`) can read its name + config: without the * brand, every SSE rebuild frame gets `trainerName: null` and * the SSE-level test below can't distinguish the post-edit * rebuild from the cached initial-build replay. The seed * fixture skips the brand because its existing tests only * exercise the `/api/manifest` path (which uses - * `findTrainerInModule`, brand-less) — extending it would + * `findTrainerInModule`, brand-less). Extending it would * couple every test to inspection internals it doesn't care * about. * @@ -33,7 +33,7 @@ import { expect, test } from "../harness/fixture"; * * `Symbol.for` keys round-trip across the dev process / built * bundle realm boundary because they live in the global symbol - * registry — same mechanism `core/trainerInspection.ts` documents + * registry, the same mechanism `core/trainerInspection.ts` documents * for the runtime CLI / `.arkor/build/index.mjs` split. */ function rewriteManifest(projectDir: string, name: string): void { @@ -160,7 +160,7 @@ test.describe("Studio HMR", () => { await expect(hmrMeta).toHaveAttribute("content", "true"); // Endpoint sanity-check: a GET without the studio token must 403 - // (regression for the CSRF allow-list — `eventStreamPathPattern` + // (regression for the CSRF allow-list: `eventStreamPathPattern` // permits the query-token form, but a raw GET stays gated). const noToken = await fetch(`${studio.url}/api/dev/events`); expect(noToken.status).toBe(403); @@ -176,7 +176,7 @@ test.describe("Studio HMR", () => { // connect; subscribing first then editing would force a // drain step. 
Going edit → subscribe is simpler: the // predicate explicitly requires `trainerName === newName`, - // which only the post-edit BUNDLE_END can satisfy — any + // which only the post-edit BUNDLE_END can satisfy; any // cached or in-flight frame for the seed name fails the // predicate and `awaitSseFrame` keeps reading until the // matching one arrives. @@ -228,7 +228,7 @@ test.describe("Studio HMR", () => { // dynamic-imports the freshly-built artefact via // `summariseBuiltManifest`. The HMR rebuild must have completed // *and* the cache-bust URL must reflect the new bytes for this - // assertion to pass — exercises the rebuild → write artefact → + // assertion to pass: exercises the rebuild → write artefact → // re-import → return summary chain end-to-end. const newName = `studio-e2e-trainer-renamed-${Date.now()}`; rewriteManifest(fixturePaths.projectDir, newName); diff --git a/packages/arkor/src/cli/cleanupHooks.test.ts b/packages/arkor/src/cli/cleanupHooks.test.ts index ab90031e..864feb9f 100644 --- a/packages/arkor/src/cli/cleanupHooks.test.ts +++ b/packages/arkor/src/cli/cleanupHooks.test.ts @@ -80,7 +80,7 @@ describe("registerCleanupHook", () => { expect(order).toEqual(["sync-cleanup", "async-cleanup-finished"]); // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) so parent // shells / orchestrators can distinguish "user interrupted" - // from "ran to completion (0)" — see SIGNAL_EXIT_CODE in + // from "ran to completion (0)"; see SIGNAL_EXIT_CODE in // cleanupHooks.ts. expect(codes).toEqual([130]); }); @@ -139,7 +139,7 @@ describe("registerCleanupHook", () => { expect(order).toEqual(["sync-cleanup", "async-cleanup-finished"]); // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) so parent // shells / orchestrators can distinguish "user interrupted" - // from "ran to completion (0)" — see SIGNAL_EXIT_CODE in + // from "ran to completion (0)"; see SIGNAL_EXIT_CODE in // cleanupHooks.ts. 
expect(codes).toEqual([130]); }); @@ -149,7 +149,7 @@ describe("registerCleanupHook", () => { // `process.exit(0)`, regardless of which signal fired the // shutdown. Parent shells / orchestrators / CI runners that // gate on signal-style nonzero status would mis-classify a - // Ctrl-C (SIGINT) as a clean run — `arkor dev || cleanup` + // Ctrl-C (SIGINT) as a clean run: `arkor dev || cleanup` // would skip the cleanup branch and leave whatever it owned // unreaped. POSIX convention is 128 + signo (SIGINT=2 → 130, // SIGTERM=15 → 143, SIGHUP=1 → 129); SIGNAL_EXIT_CODE in @@ -163,7 +163,7 @@ describe("registerCleanupHook", () => { registerCleanupHook({ cleanup: () => {}, exitOnSignal: true }); const codes = mockExit(); process.emit(sig, sig); - // queueMicrotask + Promise.allSettled chain — two flushes + // queueMicrotask + Promise.allSettled chain: two flushes // mirror the existing tests. await flushMicrotasks(); await flushMicrotasks(); @@ -201,7 +201,7 @@ describe("registerCleanupHook", () => { expect(process.listeners("SIGHUP").length).toBe(sighupBefore + 1); // Firing one signal must detach BOTH that registration's signal - // listener AND its sibling exit listener — the registration is + // listener AND its sibling exit listener: the registration is // done after first fire regardless of which channel triggered it. process.emit("SIGINT", "SIGINT"); @@ -213,7 +213,7 @@ describe("registerCleanupHook", () => { it("__resetCleanupHooksForTests detaches every still-armed registration", () => { // Test-only escape hatch for registrations whose handler never - // fires inside the test (no signal emitted) — without it, those + // fires inside the test (no signal emitted); without it, those // listeners would persist across the vitest worker's test queue. 
const exitBefore = process.listeners("exit").length; registerCleanupHook({ cleanup: () => {}, exitOnSignal: false }); @@ -249,7 +249,7 @@ describe("registerCleanupHook", () => { // listener after auto-detach, so codes has exactly one entry. // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) so parent // shells / orchestrators can distinguish "user interrupted" - // from "ran to completion (0)" — see SIGNAL_EXIT_CODE in + // from "ran to completion (0)"; see SIGNAL_EXIT_CODE in // cleanupHooks.ts. expect(codes).toEqual([130]); }); diff --git a/packages/arkor/src/cli/cleanupHooks.ts b/packages/arkor/src/cli/cleanupHooks.ts index 76805c3e..6c560aa0 100644 --- a/packages/arkor/src/cli/cleanupHooks.ts +++ b/packages/arkor/src/cli/cleanupHooks.ts @@ -3,7 +3,7 @@ const TERMINATING_SIGNALS = ["SIGINT", "SIGTERM", "SIGHUP"] as const; /** * POSIX-style exit code for a signal-terminated process: `128 + signo`. * Parent shells / orchestrators rely on this to distinguish "user - * interrupted" (nonzero) from "ran to completion" (zero) — exiting 0 + * interrupted" (nonzero) from "ran to completion" (zero): exiting 0 * for a SIGINT'd `arkor dev` would make CI / shell loops / `&&` * chains misclassify the interruption as success. The numbers below * are the canonical signo values from POSIX (1=HUP, 2=INT, 15=TERM). @@ -36,10 +36,10 @@ export interface CleanupHookOptions { * own the exit. Default: `false`. * * The exit code is the POSIX `128 + signo` for the signal that - * triggered shutdown — 130 for SIGINT, 143 for SIGTERM, 129 for + * triggered shutdown: 130 for SIGINT, 143 for SIGTERM, 129 for * SIGHUP (see `SIGNAL_EXIT_CODE`). Parent shells / orchestrators / * CI runners distinguish "user interrupted" (nonzero) from "ran - * to completion" (zero) on this — exiting 0 for a Ctrl-C'd + * to completion" (zero) on this: exiting 0 for a Ctrl-C'd * `arkor dev` would let `arkor dev || cleanup_on_failure` skip * its cleanup branch. 
*/ @@ -50,9 +50,9 @@ export interface CleanupHookOptions { * Module-scoped tracker of cleanup promises that haven't settled yet. * The exit-owning hook waits on the union of (its own cleanup) + * (every other in-flight cleanup) before calling `process.exit(...)`, - * so a fire-and-forget async cleanup in a sibling registration — - * `hmr.dispose()` is the canonical example — isn't cut off by an - * eager exit. (Exit code is signal-specific — see `SIGNAL_EXIT_CODE`.) + * so a fire-and-forget async cleanup in a sibling registration + * (`hmr.dispose()` is the canonical example) isn't cut off by an + * eager exit. (Exit code is signal-specific; see `SIGNAL_EXIT_CODE`.) * * Auto-prunes via the `.finally(() => inFlightCleanups.delete(...))` * each `run()` attaches, so the set doesn't grow without bound across @@ -87,7 +87,7 @@ const attachedHandlers = new Set<() => void>(); * a process that goes through many register → fire cycles doesn't * accumulate stale listeners on `process`. * - * `process.on("exit", ...)` listeners cannot be async — Node fires + * `process.on("exit", ...)` listeners cannot be async: Node fires * them right before the process terminates and discards any returned * promise. We still register so sync cleanups (e.g. `unlinkSync`) run * on a normal `process.exit(0)` path that never reached a signal @@ -134,7 +134,7 @@ export function registerCleanupHook(options: CleanupHookOptions): void { // Capture which signal triggered shutdown so the exit code // below reflects "interrupted by SIG" (POSIX 128 + signo) // rather than "ran to completion" (0). Parent shells / - // orchestrators / CI runners distinguish these — a script + // orchestrators / CI runners distinguish these: a script // that runs `arkor dev || cleanup_on_failure` would otherwise // mis-classify a Ctrl-C as success and skip its cleanup. 
const exitCode = SIGNAL_EXIT_CODE[sig]; diff --git a/packages/arkor/src/cli/commands/dev.test.ts b/packages/arkor/src/cli/commands/dev.test.ts index bd00d5fd..e750e588 100644 --- a/packages/arkor/src/cli/commands/dev.test.ts +++ b/packages/arkor/src/cli/commands/dev.test.ts @@ -36,7 +36,7 @@ import { __resetCleanupHooksForTests } from "../cleanupHooks"; import { ensureCredentialsForStudio, runDev } from "./dev"; /** - * Yield one `setImmediate` tick — enough for the cleanupHooks + * Yield one `setImmediate` tick: enough for the cleanupHooks * coordinator's `Promise.allSettled(...).then(() => process.exit(0))` * chain to drain when there are no async cleanups in flight (the * common case in this file: signal handler → queueMicrotask → @@ -46,13 +46,13 @@ import { ensureCredentialsForStudio, runDev } from "./dev"; * * `setImmediate` is the right primitive (vs `Promise.resolve` / * `queueMicrotask`) because we need the event loop to actually - * turn — the `process.exit` mock fires inside a `.then` callback + * turn: the `process.exit` mock fires inside a `.then` callback * scheduled from a previous microtask checkpoint, and a microtask- * only flush would resume *before* that callback gets to run. * * Tests that drive a chain with extra microtask hops (e.g. async * sibling cleanups whose promises also pass through - * `Promise.allSettled`) await this helper twice in a row — see + * `Promise.allSettled`) await this helper twice in a row; see * the cleanupHooks tests. */ function flushMicrotasks(): Promise { @@ -109,7 +109,7 @@ describe("ensureCredentialsForStudio", () => { }); // When OAuth is advertised by the deployment, `arkor dev` no longer - // hands off to `runLogin` — that would block the Studio launch on a + // hands off to `runLogin`; that would block the Studio launch on a // browser flow. Instead we bootstrap anon and show a hint pointing at // `arkor login`, leaving the upgrade in the user's hands. 
it("bootstraps anonymous credentials even when OAuth is configured", async () => { @@ -184,7 +184,7 @@ describe("ensureCredentialsForStudio", () => { }); }); - // Regression for ENG-403 — when the cloud-api is unreachable, `arkor dev` + // Regression for ENG-403: when the cloud-api is unreachable, `arkor dev` // previously failed to start because the anonymous bootstrap's network // error wasn't caught. it("does not throw when the anonymous bootstrap fails after a successful config fetch", async () => { @@ -266,7 +266,7 @@ describe("ensureCredentialsForStudio", () => { // must surface at startup instead of being silently warned. it("re-throws when ARKOR_CLOUD_API_URL is malformed (config error)", async () => { process.env.ARKOR_CLOUD_API_URL = ""; - // No fetch mock — let real fetch raise the URL parse error so we + // No fetch mock: let real fetch raise the URL parse error so we // exercise the actual undici contract, not a synthetic TypeError. await expect(ensureCredentialsForStudio()).rejects.toThrow(TypeError); await expect(ensureCredentialsForStudio()).rejects.not.toThrow( @@ -307,7 +307,7 @@ describe("ensureCredentialsForStudio", () => { ); }); - // Codex P1 review on PR #65 — OAuth-only deployments advertise Auth0 in + // Codex P1 review on PR #65: OAuth-only deployments advertise Auth0 in // /v1/auth/cli/config but reject /v1/auth/anonymous. The new "always try // anon first" flow used to leave first-run users on those deployments // with a bare "Failed to acquire anonymous token (4xx)" error and no way @@ -346,7 +346,7 @@ describe("ensureCredentialsForStudio", () => { expect(await readCredentials()).toBeNull(); }); - // Codex P2 review on PR #65 — the OAuth-only wrap used to span the whole + // Codex P2 review on PR #65: the OAuth-only wrap used to span the whole // anon bootstrap, so fs errors from `writeCredentials` were also rewritten // as "deployment may require sign-in", hiding the actionable fs cause. 
// @@ -356,8 +356,8 @@ describe("ensureCredentialsForStudio", () => { // `writeFile` would raise EACCES under the bootstrap) only works on // POSIX as a non-root user: root bypasses chmod (Codex on PR #65), and // on Windows POSIX permission bits don't durably block writes inside a - // directory at all — Node maps `chmod` to the legacy read-only - // attribute, which NTFS only enforces on files. Both edges silently + // directory at all (Node maps `chmod` to the legacy read-only + // attribute, which NTFS only enforces on files). Both edges silently // turned the test green for the wrong reason. Mocking lifts the // "produce an EACCES" half of the test out of the host filesystem // entirely so every CI matrix entry exercises the wrap-narrowing @@ -421,7 +421,7 @@ describe("ensureCredentialsForStudio", () => { ); } if (url.endsWith("/v1/auth/anonymous")) { - // Missing `personalOrg` — anonymousTokenResponseSchema rejects. + // Missing `personalOrg`: anonymousTokenResponseSchema rejects. return new Response( JSON.stringify({ token: "t", anonymousId: "a", kind: "cli" }), { status: 200 }, @@ -439,7 +439,7 @@ describe("ensureCredentialsForStudio", () => { it("forwards a non-Error throwable from requestAnonymousToken (String() coercion)", async () => { // Defensive coverage of the `err instanceof Error ? err.message : String(err)` // helper inside the warn branch isn't exercised here because the - // helper is in the dev.ts catch — but the symmetrical path inside + // helper is in the dev.ts catch; but the symmetrical path inside // the schema-error case rethrows with the original value preserved. globalThis.fetch = vi.fn(async (input) => { const url = String(input); @@ -475,7 +475,7 @@ describe("ensureCredentialsForStudio", () => { ); } if (url.endsWith("/v1/auth/anonymous")) { - // Missing `personalOrg` — anonymousTokenResponseSchema rejects. + // Missing `personalOrg`: anonymousTokenResponseSchema rejects. 
return new Response( JSON.stringify({ token: "t", anonymousId: "a", kind: "cli" }), { status: 200 }, @@ -664,7 +664,7 @@ describe("runDev", () => { // ~/.arkor read-only after writeCredentials (so readCredentials still // works) so the per-launch token write hits EACCES. if (typeof process.getuid === "function" && process.getuid() === 0) { - // Root bypasses chmod permission checks — skip on root containers. + // Root bypasses chmod permission checks; skip on root containers. return; } chmodSync(join(fakeHome, ".arkor"), 0o555); @@ -710,10 +710,10 @@ describe("runDev", () => { // Sync side effect (token unlink) lands inside the synchronous // portion of the handler. expect(existsSync(studioTokenPath())).toBe(false); - // Exit fires after `Promise.allSettled(asyncCleanups)` resolves — + // Exit fires after `Promise.allSettled(asyncCleanups)` resolves; // a few microticks later. Flush to let the queued exit run. await flushMicrotasks(); - // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) — see + // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2): see // SIGNAL_EXIT_CODE in cleanupHooks.ts. Parent shells need // the nonzero code to distinguish interrupt from clean exit. expect(exitSpy).toHaveBeenCalledWith(130); @@ -724,14 +724,14 @@ describe("runDev", () => { it("keeps the SIGINT exit handler armed even when persisting the studio token fails", async () => { // Regression: if `persistStudioToken` threw, the previous code - // skipped `scheduleStudioTokenCleanup` — and that was the *only* + // skipped `scheduleStudioTokenCleanup`, and that was the *only* // hook that called `process.exit(0)` on SIGINT. The leftover HMR // hook overrides Node's default "exit on SIGINT" behaviour, so the // dev server would idle in the foreground forever. The fix // registers the token cleanup unconditionally; here we make // persist throw and verify SIGINT still terminates. 
if (typeof process.getuid === "function" && process.getuid() === 0) { - // Root bypasses chmod permission checks — skip on root containers. + // Root bypasses chmod permission checks; skip on root containers. return; } chmodSync(join(fakeHome, ".arkor"), 0o555); @@ -758,7 +758,7 @@ describe("runDev", () => { // ran (best-effort `unlinkSync` swallows ENOENT) and the // exit-on-signal arm fired (after async cleanup tails settle). await flushMicrotasks(); - // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2) — see + // SIGINT exits 130 (POSIX 128 + signo for SIGINT=2): see // SIGNAL_EXIT_CODE in cleanupHooks.ts. Parent shells need // the nonzero code to distinguish interrupt from clean exit. expect(exitSpy).toHaveBeenCalledWith(130); diff --git a/packages/arkor/src/cli/commands/dev.ts b/packages/arkor/src/cli/commands/dev.ts index d6b924a2..374a3c87 100644 --- a/packages/arkor/src/cli/commands/dev.ts +++ b/packages/arkor/src/cli/commands/dev.ts @@ -118,7 +118,7 @@ export async function ensureCredentialsForStudio(): Promise { // wrap fires only for genuine deployment rejection (401/403/404 et // al). 5xx is a transient cloud-api failure where retrying makes // sense, ZodErrors signal a malformed response (server bug), and fs - // failures are out of scope for the anon endpoint entirely — none of + // failures are out of scope for the anon endpoint entirely; none of // these should be mislabelled as a sign-in requirement. if ( err instanceof AnonymousTokenRejectedError && @@ -126,7 +126,7 @@ export async function ensureCredentialsForStudio(): Promise { err.status < 500 && oauthAvailable ) { - // Surface only the status code at the top level — the inner + // Surface only the status code at the top level: the inner // `err.message` already starts with "Failed to acquire…" and // includes the response-body snippet, which would double-prefix the // wrap and risk leaking noisy HTML/JSON error pages. 
The full @@ -205,7 +205,7 @@ function scheduleStudioTokenCleanup( function scheduleHmrCleanup(hmr: { dispose: () => Promise }): void { // Registered before the studio-token cleanup so it runs first on - // shutdown — Node fires signal handlers in registration order, and we + // shutdown: Node fires signal handlers in registration order, and we // want the watcher to release file handles before the outermost // process.exit. registerCleanupHook({ cleanup: () => hmr.dispose() }); @@ -224,7 +224,7 @@ export async function runDev(options: DevOptions = {}): Promise { // `src/arkor` graph. The coordinator itself is lazy (`subscribe()` // is what starts the watcher, not `createHmrCoordinator`), but // `buildStudioApp` registers its per-rebuild signal-dispatch - // subscriber unconditionally — that subscriber needs to run on + // subscriber unconditionally: that subscriber needs to run on // every BUNDLE_END regardless of whether any SSE client is // connected, so it can SIGUSR2/SIGTERM active `/api/train` // children and keep `lastSuccessConfigHash` warm for spawn-time @@ -278,7 +278,7 @@ export async function runDev(options: DevOptions = {}): Promise { // attempt above failed (e.g. cloud-api was unreachable at launch). const app = buildStudioApp({ studioToken, hmr }); // Bind to 127.0.0.1 (not "localhost") so the listener can't end up on `::1` - // only — `@hono/node-server` passes hostname to `net.Server.listen`, which + // only; `@hono/node-server` passes hostname to `net.Server.listen`, which // calls `dns.lookup`. 
On hosts where `/etc/hosts` orders `::1 localhost` // before `127.0.0.1 localhost`, a "localhost" bind would refuse IPv4 // connections, breaking the studio-app Vite proxy (hardcoded to diff --git a/packages/arkor/src/core/configHash.test.ts b/packages/arkor/src/core/configHash.test.ts index 4d7566f1..ec681124 100644 --- a/packages/arkor/src/core/configHash.test.ts +++ b/packages/arkor/src/core/configHash.test.ts @@ -52,7 +52,7 @@ describe("hashJobConfig", () => { it("treats `undefined` object properties identically to omitted ones (JSON parity)", () => { // Regression: the previous `stableStringify` delegated to // `JSON.stringify(undefined)` which returns `undefined` (not a - // string) — concatenated via template literal that became the + // string), concatenated via template literal that became the // substring `"undefined"` in the hash input. So `{ a: 1 }` and // `{ a: 1, b: undefined }` produced different hashes even though // they're indistinguishable on the wire (`JSON.stringify` drops @@ -122,7 +122,7 @@ describe("hashJobConfig", () => { // // The fixture's `toJSON(key)` returns `"key="`. Compare // against an explicit string field holding what JSON.stringify - // would produce — matching hashes prove the key reached toJSON. + // would produce; matching hashes prove the key reached toJSON. const ctx = { toJSON(key: string) { return `key=${key}`; @@ -143,13 +143,13 @@ describe("hashJobConfig", () => { it("omits an object property whose `toJSON(key)` returns undefined (JSON parity)", () => { // Regression: `JSON.stringify({ a: { toJSON: () => undefined } })` - // produces `"{}"` — `toJSON` returning `undefined` is the spec's + // produces `"{}"`: `toJSON` returning `undefined` is the spec's // "skip me" signal in object position. The previous // `stableStringify` collapsed every non-representable value to // the literal string `"null"` at recursion time, so the same // input hashed as `{"a":null}` instead of `{}`. 
That divergence // forced unnecessary SIGTERM restarts whenever a `JobConfig` - // field's serialiser opted out — `configHash` would diverge from + // field's serialiser opted out: `configHash` would diverge from // the wire-format payload (which DOES omit the field). const omitting = { toJSON() { diff --git a/packages/arkor/src/core/configHash.ts b/packages/arkor/src/core/configHash.ts index fb76d1f1..2e407094 100644 --- a/packages/arkor/src/core/configHash.ts +++ b/packages/arkor/src/core/configHash.ts @@ -8,7 +8,7 @@ import type { JobConfig } from "./types"; * `buildJobConfig` revisions or user-side spread-merge tricks. * * Returns `string | undefined`. `undefined` is the "omit me from my - * containing object" sentinel — it propagates from any value + * containing object" sentinel: it propagates from any value * `JSON.stringify` would silently drop in object position * (`undefined`, functions, symbols, *and* objects whose `toJSON(key)` * returns one of those). Callers sit at three boundaries: @@ -22,7 +22,7 @@ import type { JobConfig } from "./types"; * * The previous implementation collapsed every non-representable to * the literal string `"null"` at recursion time, which leaked into - * object slots as `{"a":null}` instead of the JSON-correct `{}` — + * object slots as `{"a":null}` instead of the JSON-correct `{}`, * making `configHash` diverge from the wire-format payload for * `JobConfig` fields whose `toJSON(key)` happened to return * `undefined` (the spec-defined "skip me" signal). That divergence @@ -44,7 +44,7 @@ function stableStringify(value: unknown, key: string = ""): string | undefined { // user-side `toJSON(key)` implementations that branch on the // hosting property/index see the same value JSON.stringify would. // If `toJSON` returns `undefined`, that propagates as the omit - // sentinel — the spec-defined "skip me" path. + // sentinel: the spec-defined "skip me" path. 
const maybeToJSON = (value as { toJSON?: unknown }).toJSON; if (typeof maybeToJSON === "function") { return stableStringify( diff --git a/packages/arkor/src/core/moduleCacheBust.test.ts b/packages/arkor/src/core/moduleCacheBust.test.ts index f4aaca00..40b8509a 100644 --- a/packages/arkor/src/core/moduleCacheBust.test.ts +++ b/packages/arkor/src/core/moduleCacheBust.test.ts @@ -49,8 +49,8 @@ describe("moduleCacheBustKey", () => { // The eventual `await import(url)` will throw on a missing // file; the helper itself should produce a value rather than // bubbling the stat error and turning every consumer into a - // try/catch site. Three zeros — one each for mtimeMs, ctimeMs, - // size — to keep the shape uniform with the success branch. + // try/catch site. Three zeros (one each for mtimeMs, ctimeMs, + // size) to keep the shape uniform with the success branch. expect(moduleCacheBustKey(join(dir, "does-not-exist.mjs"))).toBe("0-0-0"); }); }); diff --git a/packages/arkor/src/core/moduleCacheBust.ts b/packages/arkor/src/core/moduleCacheBust.ts index da238291..22f160a5 100644 --- a/packages/arkor/src/core/moduleCacheBust.ts +++ b/packages/arkor/src/core/moduleCacheBust.ts @@ -7,9 +7,9 @@ import { pathToFileURL } from "node:url"; * Why this matters: Node's ESM loader caches every dynamically-imported * URL for the lifetime of the process and exposes no API to evict a * record. A naive `?t=Date.now()` cache-bust produces a fresh URL on - * every call, so a long-running `arkor dev` session — where the SPA + * every call, so a long-running `arkor dev` session (where the SPA * polls `/api/manifest` every few seconds and every save fires - * `BUNDLE_END` + SIGUSR2 — accumulates one module record per call, + * `BUNDLE_END` + SIGUSR2) accumulates one module record per call, * unbounded. 
* * Keying on `mtimeMs + ctimeMs + size` collapses repeated reads of the @@ -24,7 +24,7 @@ import { pathToFileURL } from "node:url"; * the same key, which made Node's loader return the *stale* module * for the second edit (HMR/manifest staleness on fast filesystems). * `ctimeMs` is included as belt-and-braces against the (rare) case - * where mtime collides but ctime moves — `touch -m` and some build + * where mtime collides but ctime moves: `touch -m` and some build * tools update one without the other. * * Falls back to a stable literal on stat failure so the eventual diff --git a/packages/arkor/src/core/projectState.test.ts b/packages/arkor/src/core/projectState.test.ts index 9dc7c516..08dc7aac 100644 --- a/packages/arkor/src/core/projectState.test.ts +++ b/packages/arkor/src/core/projectState.test.ts @@ -37,7 +37,7 @@ function fakeClient( // Construct a real CloudApiClient (so type-compatibility holds), then // monkey-patch only the methods exercised by ensureProjectState. The // other methods would throw on first use because no fetcher is wired, - // which is fine — projectState should never reach them. + // which is fine; projectState should never reach them. const client = new CloudApiClient({ baseUrl: "http://mock", credentials: anonCreds, @@ -84,7 +84,7 @@ describe("ensureProjectState", () => { expect(createProject).not.toHaveBeenCalled(); }); - it("throws for auth0 callers without state — they must run `arkor init`", async () => { + it("throws for auth0 callers without state: they must run `arkor init`", async () => { const client = fakeClient(); await expect( ensureProjectState({ cwd, client, credentials: auth0Creds }), @@ -116,7 +116,7 @@ describe("ensureProjectState", () => { expect(createProject).toHaveBeenCalledWith({ orgSlug: "anon-abc", name: expect.stringMatching(/^my-app/), - // Sanitised slug — basename starts with "my-app-", and we + // Sanitised slug: basename starts with "my-app-", and we // expect the sanitiser to keep dashes. 
slug: expect.stringMatching(/^my-app/), }); diff --git a/packages/arkor/src/core/rolldownConfig.ts b/packages/arkor/src/core/rolldownConfig.ts index 035b754f..66e87c29 100644 --- a/packages/arkor/src/core/rolldownConfig.ts +++ b/packages/arkor/src/core/rolldownConfig.ts @@ -41,7 +41,7 @@ export function resolveBuildEntry(opts: BuildEntryOptions): ResolvedBuildEntry { * `process.execPath`), so the bundle can target precisely what will execute it. */ export function resolveNodeTarget(): string { - // Fallback aligns with the published `engines.node` floor — see + // Fallback aligns with the published `engines.node` floor; see // [packages/arkor/package.json] / `AGENTS.md`'s "Node version" note. const [major = "22", minor = "22"] = process.versions.node.split("."); return `node${major}.${minor}`; @@ -50,8 +50,8 @@ export function resolveNodeTarget(): string { /** * Build the shared rolldown options object used by both `runBuild` (one-shot) * and the HMR coordinator (`watch()`). Centralising the configuration here - * keeps the two pipelines aligned: anything that affects the bundle shape — - * external resolution, transform target, platform — is set in one place so + * keeps the two pipelines aligned: anything that affects the bundle shape + * (external resolution, transform target, platform) is set in one place so * the artifact a watcher writes is byte-equivalent to a one-shot rebuild. 
*/ export function rolldownInputOptions( diff --git a/packages/arkor/src/core/runner.test.ts b/packages/arkor/src/core/runner.test.ts index ae1cf53e..cdabfb15 100644 --- a/packages/arkor/src/core/runner.test.ts +++ b/packages/arkor/src/core/runner.test.ts @@ -49,7 +49,7 @@ afterEach(() => { rmSync(cwd, { recursive: true, force: true }); }); -describe("runTrainer — entry extraction", () => { +describe("runTrainer: entry extraction", () => { it("throws when the entry file does not exist", async () => { await expect(runTrainer("missing.ts")).rejects.toThrow( /Training entry not found/, @@ -124,7 +124,7 @@ describe("runTrainer — entry extraction", () => { }); it("throws when default export is a primitive (typeof !== 'object' branch)", async () => { - // The second half of `mod.default && typeof mod.default === "object"` — + // The second half of `mod.default && typeof mod.default === "object"`: // a primitive default like `42` or `"foo"` must short-circuit out of // the nested-trainer probe. const entry = join(cwd, "primitive-default.mjs"); @@ -135,7 +135,7 @@ describe("runTrainer — entry extraction", () => { }); it("accepts a default export wrapping a `trainer` field (legacy power-user shape)", async () => { - // Hits the `if (isTrainer(nested)) return nested` branch — the only + // Hits the `if (isTrainer(nested)) return nested` branch: the only // place line 38 is reachable. const entry = join(cwd, "default-with-trainer.mjs"); writeFileSync( @@ -154,7 +154,7 @@ describe("runTrainer — entry extraction", () => { it("falls back to DEFAULT_ENTRY (src/arkor/index.ts) when called with no argument", async () => { // Branch coverage for `file ?? DEFAULT_ENTRY`. Place the entry at - // `/src/arkor/index.ts` and invoke runTrainer() — the default + // `/src/arkor/index.ts` and invoke runTrainer(): the default // path is what `arkor start` and Studio's "Run training" button use. 
const arkorDir = join(cwd, "src", "arkor"); mkdirSync(arkorDir, { recursive: true }); @@ -174,8 +174,8 @@ describe("runTrainer — entry extraction", () => { join(arkorDir, "index.ts"), `export * from "./index.mjs";\n`, ); - // Pass undefined explicitly to exercise the `?? DEFAULT_ENTRY` branch - // — Node's built-in TypeScript stripping handles the .ts extension at + // Pass undefined explicitly to exercise the `?? DEFAULT_ENTRY` branch. + // Node's built-in TypeScript stripping handles the .ts extension at // runtime. (vitest also strips TS so this works under test too.) await expect(runTrainer()).resolves.toBeUndefined(); }); @@ -208,7 +208,7 @@ describe("runTrainer — entry extraction", () => { }); }); -describe("runTrainer — shutdown signal handling", () => { +describe("runTrainer: shutdown signal handling", () => { it("first SIGTERM calls trainer.requestEarlyStop and exits 0; second SIGTERM exits 143", async () => { // Fake trainer whose `wait()` hangs until the test manually resolves it // (via a global helper). This lets us hold the run in flight long @@ -265,7 +265,7 @@ describe("runTrainer — shutdown signal handling", () => { const runPromise = runTrainer("src/arkor/index.mjs"); // Wait for import + start() to settle so the handler is registered // before we synthesise SIGTERM. Poll for the probe rather than - // relying on a fixed timer — under load (e.g. running alongside + // relying on a fixed timer: under load (e.g. running alongside // sibling test files in turbo) the dynamic import + top-level // body can take longer than a hardcoded 25 ms window. 
type Probe = { earlyStopCalls: number; finishWait: () => void }; diff --git a/packages/arkor/src/core/runnerSignals.test.ts b/packages/arkor/src/core/runnerSignals.test.ts index a461255e..1a9dcb4d 100644 --- a/packages/arkor/src/core/runnerSignals.test.ts +++ b/packages/arkor/src/core/runnerSignals.test.ts @@ -48,7 +48,7 @@ function makeTrainer(): Trainer & { // Wire the internal callback-replacer + early-stop brands the same // way `createTrainer` does. SIGUSR2 looks them up via // `replaceTrainerCallbacks` and SIGTERM via `requestTrainerEarlyStop` - // — there are no public methods on `Trainer` for either any more. + // (there are no public methods on `Trainer` for either any more). attachTrainerCallbackReplacer(trainer, (cbs) => { replace.lastCallbacks = cbs; replace.calls += 1; @@ -226,7 +226,7 @@ describe("installCallbackReloadHandler", () => { // (notably Windows). Previously this would surface as a hard // crash at `arkor start` boot. The handler now wraps the // registration in try/catch and degrades to a no-op disposer so - // the rest of the runner stays up — the server's + // the rest of the runner stays up: the server's // `safeKill(child, "SIGUSR2")` already detects the same // condition and falls back to SIGTERM-restart there. const trainer = makeTrainer(); @@ -260,13 +260,13 @@ describe("installCallbackReloadHandler", () => { it("drops a stale reload's result when a newer SIGUSR2 starts before the import resolves", async () => { // Regression: each SIGUSR2 starts a fire-and-forget // `import()` + `replaceTrainerCallbacks`. Two same-`configHash` - // rebuilds firing back-to-back can race — the earlier import's + // rebuilds firing back-to-back can race: the earlier import's // bytes sometimes resolve *after* the newer one, and // `replaceTrainerCallbacks` overwrites the freshly-loaded // callbacks with the prior version. 
The fix version-gates each // reload via a monotonic `loadSeq`; this test pins the contract // by firing two signals back-to-back and asserting that - // `replaceTrainerCallbacks` was invoked exactly **once** — + // `replaceTrainerCallbacks` was invoked exactly **once**: // proving the older IIFE dropped its result at the // `seq !== loadSeq` check before reaching the replace call. const trainer = makeTrainer(); @@ -288,14 +288,14 @@ describe("installCallbackReloadHandler", () => { .mockImplementation((() => true) as typeof process.stderr.write); const dispose = installCallbackReloadHandler(trainer, file); try { - // First signal — captures seq=1 inside the IIFE. + // First signal: captures seq=1 inside the IIFE. process.emit("SIGUSR2", "SIGUSR2"); // Rewrite the bundle to v2 BEFORE letting either import // resolve. mtime+ctime+size change → distinct cache-bust URL. writeUserBundle("v2"); - // Second signal — captures seq=2, bumps loadSeq to 2. + // Second signal: captures seq=2, bumps loadSeq to 2. process.emit("SIGUSR2", "SIGUSR2"); - // Generous fixed wait so both imports definitely settle — + // Generous fixed wait so both imports definitely settle; // we can't poll on `lastCallbacks !== null` because the v1 // IIFE might land first and short-circuit our wait, hiding // the count assertion below. diff --git a/packages/arkor/src/core/runnerSignals.ts b/packages/arkor/src/core/runnerSignals.ts index aadcf5c0..8c3e6f41 100644 --- a/packages/arkor/src/core/runnerSignals.ts +++ b/packages/arkor/src/core/runnerSignals.ts @@ -33,13 +33,13 @@ const SECOND_SIGNAL_EXIT_CODE: Record< * - 1st signal → `trainer.requestEarlyStop()`. The trainer keeps * running, lets the next `checkpoint.saved` event land, then issues * `cancel()`. - * - 2nd signal → immediate `process.exit(POSIX 128+signo)` — + * - 2nd signal → immediate `process.exit(POSIX 128+signo)`: * 130 for SIGINT, 143 for SIGTERM, 129 for SIGHUP. Escape hatch * for an impatient operator or a hung early-stop. 
Per-signal * exit code so parent shells see the actual interruption type. * * The returned dispose function removes the handlers so a normal - * `wait()` completion doesn't leave stale listeners behind — important + * `wait()` completion doesn't leave stale listeners behind: important * because `runTrainer` can be called multiple times in tests within a * single Node process. */ @@ -108,7 +108,7 @@ export function installShutdownHandlers(trainer: Trainer): () => void { * SIGUSR2 handler: re-import the freshly-rebuilt artefact and rotate * the trainer's callback cell via the internal * `Symbol.for("arkor.trainer.replaceCallbacks")` brand. The cloud-side - * training run is untouched — only the in-process callbacks change. + * training run is untouched; only the in-process callbacks change. * * Studio sends SIGUSR2 from the `/api/dev/events` HMR pipeline when * (and only when) the rebuilt bundle's `JobConfig` hash matches the @@ -125,7 +125,7 @@ export function installCallbackReloadHandler( * dynamic-import await begins, so each in-flight reload knows its * arrival order. When the import resolves, the IIFE compares its * captured `seq` against `loadSeq` and silently drops the result - * if a newer signal already started a newer reload — without this, + * if a newer signal already started a newer reload. Without this, * two same-`configHash` rebuilds firing back-to-back can race on * the import: the earlier import's bytes (now stale on disk) * resolve *after* the newer one, and `replaceTrainerCallbacks` @@ -142,7 +142,7 @@ export function installCallbackReloadHandler( // session with frequent SIGUSR2 reloads would accumulate one // record per signal forever. Keying on the actual artefact bytes // (via `moduleCacheBustUrl`) collapses no-op signals onto the - // same URL — the leak is bounded to "one per real edit", which + // same URL; the leak is bounded to "one per real edit", which // is fundamentally what HMR has to retain. 
const url = moduleCacheBustUrl(entryPath); void (async () => { @@ -174,7 +174,7 @@ export function installCallbackReloadHandler( // escapes to userland on some Node versions). The server-side // `trainRegistry.safeKill(child, "SIGUSR2")` already detects this // ("unsupported" → falls back to SIGTERM-restart), so an unarmed - // listener here is the documented contract on those platforms — + // listener here is the documented contract on those platforms: // quietly degrade to a no-op disposer rather than crashing // `arkor start` at boot. try { diff --git a/packages/arkor/src/core/schemas.test.ts b/packages/arkor/src/core/schemas.test.ts index 8456b427..100101eb 100644 --- a/packages/arkor/src/core/schemas.test.ts +++ b/packages/arkor/src/core/schemas.test.ts @@ -57,7 +57,7 @@ describe("trainingJobSchema", () => { }); it("normalises non-null startedAt/completedAt strings via the truthy branch", () => { - // Branch coverage for the `v ? String(v) : null` transforms — the + // Branch coverage for the `v ? String(v) : null` transforms: the // `null` branch is exercised by every other test in this file // (the `valid` fixture has both fields null), but the `String(v)` // branch only fires when the field carries an actual timestamp. diff --git a/packages/arkor/src/core/trainer.test.ts b/packages/arkor/src/core/trainer.test.ts index 0799c07a..2398538a 100644 --- a/packages/arkor/src/core/trainer.test.ts +++ b/packages/arkor/src/core/trainer.test.ts @@ -270,7 +270,7 @@ describe("createTrainer (credentials defaulting)", () => { model: "m", dataset: { type: "huggingface", name: "x" }, }, - // Note: NO `credentials` here — trainer must call ensureCredentials. + // Note: NO `credentials` here, so trainer must call ensureCredentials. 
{ baseUrl: "http://mock", cwd: localCwd, @@ -562,7 +562,7 @@ describe("createTrainer (SSE event stream)", () => { }); }); -// Regression for ENG-406 — the previous reconnect loop had no upper bound +// Regression for ENG-406: the previous reconnect loop had no upper bound // and no jitter, so a permanently-down cloud-api would keep retrying every // `reconnectDelayMs` forever (and on recovery several SDK clients would // reconnect at exactly the same instant). @@ -690,7 +690,7 @@ describe("createTrainer (reconnect backoff + max attempts)", () => { step: 1, loss: 1, })}\n\n`, - // No terminal event — stream closes cleanly, outer loop reconnects. + // No terminal event: stream closes cleanly, outer loop reconnects. ], }, { kind: "throw", error: new TypeError("fetch failed") }, @@ -728,8 +728,8 @@ describe("createTrainer (reconnect backoff + max attempts)", () => { // when `Math.random()` lands near 1. // Codex review on PR #13 (round 3) flagged that a 200-OK stream that // EOFs without emitting any frame would loop forever at the base delay - // — `maxReconnectAttempts` was bypassed because clean closes never - // touched the failure counter. Misconfigured proxies / load-balancers + // because `maxReconnectAttempts` was bypassed (clean closes never + // touched the failure counter). Misconfigured proxies / load-balancers // that accept the connection and immediately drop it would hang // `wait()` indefinitely. it("counts clean closes with no frames toward maxReconnectAttempts", async () => { @@ -850,7 +850,7 @@ describe("createTrainer (reconnect backoff + max attempts)", () => { }; // The trainer fires `POST /v1/jobs` synchronously inside the start() // path, so cancel() needs the job row to be assigned. We never open the - // event stream — cancel() should not depend on it. + // event stream; cancel() should not depend on it. 
const sse = [ `id: 1\nevent: training.completed\ndata: ${JSON.stringify({ type: "training.completed", @@ -907,7 +907,7 @@ describe("createTrainer (reconnect backoff + max attempts)", () => { const original = globalThis.fetch; globalThis.fetch = fetcher; try { - // Start the run by awaiting wait() — the streamed completion event + // Start the run by awaiting wait(): the streamed completion event // closes the loop quickly so cancel() runs against a fully-resolved // startedJob/scope pair. await trainer.wait(); @@ -981,7 +981,7 @@ describe("createTrainer (reconnect backoff + max attempts)", () => { }); it("skips malformed event payloads without aborting the stream", async () => { - // Branch coverage for the `try/catch` around JSON.parse — a single + // Branch coverage for the `try/catch` around JSON.parse: a single // malformed `data:` line shouldn't tear down the whole training run. // Send one garbage frame followed by a real terminal event. await writeState( @@ -1048,7 +1048,7 @@ describe("createTrainer (reconnect backoff + max attempts)", () => { }); it("recovers when the SSE body itself errors mid-stream", async () => { - // Branch coverage for the catch around the for-await iterator — + // Branch coverage for the catch around the for-await iterator: // covers the case where the stream's underlying body emits an error // (e.g. a network disconnect partway through). The reconnect loop // should treat it as a failure, count it toward the limit, then @@ -1158,7 +1158,7 @@ describe("createTrainer (reconnect backoff + max attempts)", () => { { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, cwd, ); - // No fetch mock at all — if cancel() reached the API we'd see a real + // No fetch mock at all: if cancel() reached the API we'd see a real // network error. Safety net for callers that wire up cancel() to // SIGINT before kicking off the run. 
const trainer = createTrainer( @@ -1392,8 +1392,8 @@ describe("createTrainer (early stop)", () => { dataset: { type: "huggingface", name: "x" }, callbacks: { // Arm the early-stop latch from inside the on-log callback so it - // fires before the checkpoint dispatch — mirrors the real CLI - // path where SIGTERM arrives mid-run. Fire-and-forget so the + // fires before the checkpoint dispatch (mirrors the real CLI + // path where SIGTERM arrives mid-run). Fire-and-forget so the // dispatch loop isn't blocked waiting for the latch's own // checkpoint trigger to arrive. onLog: () => { @@ -1416,7 +1416,7 @@ describe("createTrainer (early stop)", () => { // `{ terminal: true }` to break out of `wait()`'s loop without // waiting for a cloud-side terminal event. The `TrainingResult` // it resolves with must therefore reflect a terminal status - // locally — otherwise `wait()` violates its documented contract + // locally; otherwise `wait()` violates its documented contract // ("Resolve when the job reaches a terminal status") and a // subsequent `requestEarlyStop` wouldn't see the // `TERMINAL_STATUSES` short-circuit. @@ -1512,7 +1512,7 @@ describe("createTrainer (early stop)", () => { globalThis.fetch = original; } // The artefacts the checkpoint event carried must travel - // through to the wait() result — that's the whole point of + // through to the wait() result; that's the whole point of // graceful-stop-at-next-checkpoint preserving the in-flight // work. expect(result.artifacts).toEqual(checkpointArtifacts); @@ -1636,7 +1636,7 @@ describe("createTrainer (early stop)", () => { const original = globalThis.fetch; globalThis.fetch = fetcher; try { - // wait() rejects — handleFailure wraps the user callback + // wait() rejects: handleFailure wraps the user callback // throw because maxReconnectAttempts is 0. 
await expect(trainer.wait()).rejects.toThrow(); // Critical: the latch SETTLED via the early-stop branch @@ -1655,7 +1655,7 @@ describe("createTrainer (early stop)", () => { // Regression: previously, an `await trainer.cancel()` that threw // (network failure / cloud-api 5xx during the cancel POST) was // *swallowed*, the deferred resolved cleanly, and the runner - // exited 0 — the UI declared the run cancelled while the cloud + // exited 0: the UI declared the run cancelled while the cloud // job kept running, orphaning GPU spend with no visible error. // The fix REJECTS the deferred so the runner's // `installShutdownHandlers` `.catch()` writes the failure to @@ -1730,7 +1730,7 @@ describe("createTrainer (early stop)", () => { onLog: () => { // Arm exactly once and capture the returned promise. // requestTrainerEarlyStop is idempotent across repeat - // calls, but we only need the FIRST armed deferred — + // calls, but we only need the FIRST armed deferred: // the cancel-throw rejects exactly that promise. if (armedPromise === null) { armedPromise = requestTrainerEarlyStop(trainer, { @@ -1763,7 +1763,7 @@ describe("createTrainer (early stop)", () => { } // cancel() was attempted (and threw). expect(cancelAttempts).toBe(1); - // The armed deferred REJECTED — the runner's `.catch()` would + // The armed deferred REJECTED: the runner's `.catch()` would // see this error and log it to stderr instead of silently // exiting 0. Critically: it didn't hang on "pending"; the // failure case still settles, just via reject not resolve. 
@@ -1780,7 +1780,7 @@ describe("createTrainer (early stop)", () => { // checkpoint landed (a common case for short jobs or runs that // had already saved their last checkpoint when SIGTERM arrived), // the deferred stayed pending until the (default 5-min) timeout - // fired — the SIGTERM handler in `installShutdownHandlers` + // fired; the SIGTERM handler in `installShutdownHandlers` // awaits that promise before exit, so shutdown was delayed up to // `timeoutMs`. Both terminal branches now settle the latch // explicitly so the signal path completes immediately when the @@ -1791,7 +1791,7 @@ describe("createTrainer (early stop)", () => { ); // started → log (arms early-stop) → completed; no checkpoint.saved // in between, so the checkpoint-triggered resolution path is *not* - // exercised — only the new terminal-branch settlement is. + // exercised; only the new terminal-branch settlement is. const sse = [ `id: 1\nevent: training.started\ndata: ${JSON.stringify({ type: "training.started", @@ -1873,7 +1873,7 @@ describe("createTrainer (early stop)", () => { // observes the resolution before we assert. await new Promise((r) => setImmediate(r)); expect(result.job.status).toBe("completed"); - // No cancel POST was issued — the terminal branch just + // No cancel POST was issued: the terminal branch just // releases the latch; it doesn't cancel a run that already // completed on its own. expect(cancelCalls).toBe(0); @@ -1890,7 +1890,7 @@ describe("createTrainer (early stop)", () => { // Regression: previously `settleEarlyStopLatch()` was called // *after* awaiting `callbacks.onCompleted` / `onFailed`. A // thrown user callback propagated out of `dispatch()` before - // the settle ran, leaving `earlyStopDeferred` pending — the + // the settle ran, leaving `earlyStopDeferred` pending; the // SIGTERM handler in `installShutdownHandlers` would block on // that promise until the (default 5-min) timeout fired, // delaying shutdown for a user-code bug. 
Wrapping in @@ -1952,7 +1952,7 @@ describe("createTrainer (early stop)", () => { dataset: { type: "huggingface", name: "x" }, callbacks: { onLog: () => { - // Arm early-stop with a long timeout — if the latch + // Arm early-stop with a long timeout; if the latch // isn't released by `finally`, this would hang for the // full 60 seconds. void requestTrainerEarlyStop(trainer, { @@ -1980,7 +1980,7 @@ describe("createTrainer (early stop)", () => { // its reconnect loop; with the default unbounded retry the // user-callback throw above would loop forever and the test // would just time out. Cap retries at 0 so the first thrown - // dispatch surfaces as a `wait()` rejection — that lets us + // dispatch surfaces as a `wait()` rejection; that lets us // observe the *latch* settlement (the actual contract under // test) cleanly. maxReconnectAttempts: 0, @@ -1993,7 +1993,7 @@ describe("createTrainer (early stop)", () => { // The user-callback throw is wrapped by `handleFailure` after // `maxReconnectAttempts: 0` exhausts; the original error is // preserved as `cause`. We just need wait() to settle so the - // test doesn't hang — the *body* of the assertion is the + // test doesn't hang. The *body* of the assertion is the // latch state below. await expect(trainer.wait()).rejects.toThrow(); // The latch must have settled (via `finally`) BEFORE wait() @@ -2014,7 +2014,7 @@ describe("createTrainer (early stop)", () => { { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, cwd, ); - // No checkpoint in the stream — only training.completed, which would + // No checkpoint in the stream, only training.completed, which would // normally finish the run. We hand-roll a stream that never ends so // the timeout fallback is what actually triggers cancel. 
let streamController: ReadableStreamDefaultController | null = @@ -2102,7 +2102,7 @@ describe("createTrainer (early stop)", () => { // Companion to the checkpoint-branch reject test: when no // checkpoint arrives within `timeoutMs`, the timeout fallback // does its own `trainer.cancel()`. Old code swallowed cancel - // errors and ALWAYS resolved the deferred — same false-success + // errors and ALWAYS resolved the deferred, the same false-success // failure mode as the checkpoint branch had: local runner // exits cleanly while the cloud job keeps consuming GPU // budget. The fix mirrors the checkpoint reject path: capture @@ -2173,7 +2173,7 @@ describe("createTrainer (early stop)", () => { try { await trainer.start(); // Tiny timeout so the timeout fallback fires fast (no - // checkpoint will land — stream only carries + // checkpoint will land; stream only carries // training.started). The returned promise should REJECT // because the cancel POST throws. await expect( @@ -2204,7 +2204,7 @@ describe("createTrainer (early stop)", () => { // job is being created but `startedJob` is still null. If a // runner-side SIGTERM lands in that window, an immediate // "no-op" early-stop would let `installShutdownHandlers` exit - // the process — leaving the just-created cloud job running + // the process, leaving the just-created cloud job running // with no cancel POST. The fix is to await the in-flight // `start()` promise inside `requestEarlyStop()` so the cancel // path sees a definite job id (or a definite start failure). @@ -2226,7 +2226,7 @@ describe("createTrainer (early stop)", () => { if (method === "POST" && url.includes("/v1/jobs?")) { // Hold createJob open so we can fire `requestEarlyStop` // mid-flight. Once the test releases the gate, return a - // valid job — that establishes the post-create state + // valid job: that establishes the post-create state // requestEarlyStop should then act on (cancel POST).
await createJobReleased; return new Response(JSON.stringify({ job: minimalJobRow }), { @@ -2256,7 +2256,7 @@ describe("createTrainer (early stop)", () => { const original = globalThis.fetch; globalThis.fetch = fetcher; try { - // Fire start() but DON'T await — its createJob is gated. + // Fire start() but DON'T await; its createJob is gated. const startPromise = trainer.start(); // Yield once so the start microtasks queue up to the // `await client.createJob`. @@ -2264,14 +2264,14 @@ describe("createTrainer (early stop)", () => { // requestEarlyStop fires while start() is mid-flight. With // the fix it awaits start() rather than no-op'ing immediately. // Tiny `timeoutMs` so once `start()` resolves the latch's - // timeout-fallback fires the cancel POST quickly — there's no + // timeout-fallback fires the cancel POST quickly. There's no // SSE stream in this test, so the checkpoint-driven path // never arrives. We're testing the "stop awaited start()" leg // of the contract, not the checkpoint plumbing. const stopPromise = requestTrainerEarlyStop(trainer, { timeoutMs: 50, }); - // Sanity: stop hasn't resolved yet — it's blocked on + // Sanity: stop hasn't resolved yet; it's blocked on // start() which is blocked on createJob. let stopSettled = false; void stopPromise.then(() => { @@ -2356,8 +2356,8 @@ describe("createTrainer (early stop)", () => { calls.push(`v1:onLog(${step})`); // After the first onLog call, swap to v2 callbacks via the // internal `Symbol.for("arkor.trainer.replaceCallbacks")` - // brand — the same brand `arkor dev`'s SIGUSR2 handler - // uses. The next event must dispatch via the new object. + // brand (the same brand `arkor dev`'s SIGUSR2 handler + // uses). The next event must dispatch via the new object. 
if (step === 1) { replaceTrainerCallbacks(trainer, { onLog: ({ step: s }) => void calls.push(`v2:onLog(${s})`), @@ -2378,7 +2378,7 @@ describe("createTrainer (early stop)", () => { expect(calls).toEqual(["v1:onLog(1)", "v2:onLog(2)"]); }); - it("is idempotent — repeated calls share the same in-flight promise", async () => { + it("is idempotent: repeated calls share the same in-flight promise", async () => { await writeState( { orgSlug: "anon-org", projectSlug: "proj", projectId: "p1" }, cwd, diff --git a/packages/arkor/src/core/trainer.ts b/packages/arkor/src/core/trainer.ts index b23d9896..df9dffa0 100644 --- a/packages/arkor/src/core/trainer.ts +++ b/packages/arkor/src/core/trainer.ts @@ -27,7 +27,7 @@ import type { const TERMINAL_STATUSES = new Set(["completed", "failed", "cancelled"]); /** - * Internal runtime context. Not part of the public API surface — exposed only + * Internal runtime context. Not part of the public API surface; exposed only * for tests and advanced power-user scenarios that need to inject a mock * `fetch` or override the working directory. * @@ -120,7 +120,7 @@ function buildJobConfig(input: TrainerInput): JobConfig { /** * Build a `Trainer` bound to the user's configuration. * - * Public signature: `createTrainer(input)` — runtime options like + * Public signature: `createTrainer(input)`. Runtime options like * `baseUrl` / `credentials` / `cwd` come from the environment and `.arkor/` * state, never from user code. The optional second argument is reserved for * tests and advanced overrides. @@ -163,14 +163,14 @@ export function createTrainer( // effect on the next event. Events already mid-await keep their // old reference until they resolve, which matches the "replace, // don't interrupt" contract. Public `Trainer` deliberately doesn't - // expose this — it's a dev-only HMR primitive driven by the + // expose this; it's a dev-only HMR primitive driven by the // SIGUSR2 path in `core/runnerSignals.ts`. 
let currentCallbacks: Partial = input.callbacks ?? {}; // Early-stop state. `requestEarlyStop()` arms the latch; the next // `checkpoint.saved` dispatch (or the timeout, whichever fires first) // calls cancel() and resolves the deferred. Idempotent across repeat - // calls — they share the same deferred. + // calls (they share the same deferred). const DEFAULT_EARLY_STOP_TIMEOUT_MS = 5 * 60 * 1000; let earlyStopDeferred: { promise: Promise; @@ -184,13 +184,13 @@ export function createTrainer( * Drop the early-stop latch (clear timer + resolve deferred + reset * the request flag). Called from any path that means "wait()'s * cancel-after-checkpoint promise is no longer waiting on anything" - * — the checkpoint-driven cancel branch, the terminal `completed` + * (the checkpoint-driven cancel branch, the terminal `completed` * / `failed` branches, and the up-front guard in - * `requestEarlyStop()` when the job is already terminal. Without + * `requestEarlyStop()` when the job is already terminal). Without * this called from terminal branches, a `requestEarlyStop()` armed * mid-run that races a `training.completed` / `training.failed` * before the next `checkpoint.saved` would leave the deferred - * pending until the (default 5-min) timeout fires — the SIGTERM + * pending until the (default 5-min) timeout fires; the SIGTERM * handler in `installShutdownHandlers` would block on that promise * and delay shutdown for up to `timeoutMs`. */ @@ -224,7 +224,7 @@ export function createTrainer( * many SDK clients retry at once. * * The final value is clamped at `maxReconnectDelayMs` because jitter - * sits *outside* the exponential clamp — without the outer clamp, a + * sits *outside* the exponential clamp; without the outer clamp, a * long outage where `exp` already hit the cap could wait up to 1.25 × * the documented cap when `Math.random()` lands near 1. 
*/ @@ -312,7 +312,7 @@ export function createTrainer( }; // Capture (don't propagate yet) any throw from the user's // `onCheckpoint`. The early-stop branch below MUST run - // even on a callback throw — without this wrap a thrown + // even on a callback throw; without this wrap a thrown // `onCheckpoint` would skip the cancel + latch settlement, // leaving the SIGTERM handler waiting on the deferred // until the (default 5-min) timeout fires. Surface the @@ -328,7 +328,7 @@ export function createTrainer( // is durable. Cancel the cloud job and end `wait()` cleanly. if (earlyStopRequested && earlyStopDeferred) { // Capture the cancel error (if any) but DON'T swallow - // silently — propagate via the deferred's reject path so + // silently; propagate via the deferred's reject path so // the runner's `installShutdownHandlers` `.catch()` writes // the failure to stderr. The previous swallow let a // transient cloud-api failure during early-stop appear @@ -339,7 +339,7 @@ export function createTrainer( // it and intervene. // // We still mark `startedJob.status` terminal locally - // either way — from the runner's perspective the run is + // either way: from the runner's perspective the run is // over, and a subsequent `requestEarlyStop()` call must // hit the `TERMINAL_STATUSES.has(...)` short-circuit // (re-arming a fresh latch on a dead run would hang @@ -358,7 +358,7 @@ export function createTrainer( // `TERMINAL_STATUSES.has(...)` short-circuit it relies on. // // Status is `"failed"` when the cancel POST itself threw - // (cloud-api transient failure mid-cancel) — labelling + // (cloud-api transient failure mid-cancel): labelling // such runs `"cancelled"` would lie about the cloud-side // state, which may still be running. 
`"failed"` is // terminal too, so the latch / TERMINAL_STATUSES short- @@ -393,7 +393,7 @@ export function createTrainer( settleEarlyStopLatch(); } // Return the *checkpoint's* artifacts (the ones the user - // just saved) — that's the work HMR went out of its way + // just saved): that's the work HMR went out of its way // to preserve before issuing cancel(). The previous // `terminalResult?.artifacts ?? []` always resolved to // `[]` because `wait()` calls `dispatch(parsed, null)` so @@ -403,7 +403,7 @@ export function createTrainer( // the very artifacts the early-stop existed to keep. // Surface the user's `onCheckpoint` throw (if any) so // `wait()`'s reconnect / failure path keeps the same - // semantics it had before the wrap — the checkpoint + // semantics it had before the wrap: the checkpoint // workload is preserved, but the user still sees their // callback error. if (onCheckpointError !== null) throw onCheckpointError; @@ -431,7 +431,7 @@ export function createTrainer( // SIGTERM handler awaiting `requestEarlyStop()` would block // until the timeout (default 5 min). The throw still // propagates through `dispatch()` → `wait()` so callers see - // the original error — we just don't strand the shutdown + // the original error; we just don't strand the shutdown // path along with it. try { await callbacks.onCompleted?.({ job: startedJob, artifacts }); @@ -447,7 +447,7 @@ export function createTrainer( error: event.error, completedAt: event.timestamp, }; - // Symmetric to the `completed` branch above — terminal + // Symmetric to the `completed` branch above: terminal // status settles the latch even when the run failed *and* // the user's `onFailed` callback itself throws. try { @@ -479,7 +479,7 @@ export function createTrainer( // out the `client.createJob` POST. 
We set `scope` *before* // the await (it's needed by the await itself), so a SIGTERM // landing during the await would otherwise see - // `!startedJob && scope` and exit immediately — leaving the + // `!startedJob && scope` and exit immediately, leaving the // newly created cloud job uncancelled. const startPromise = (async () => { const client = await getClient(); @@ -554,7 +554,7 @@ export function createTrainer( try { for await (const sse of iterateEvents(response)) { // Any frame from the server (including pings) means we're - // connected and making progress — reset the failure counter + // connected and making progress; reset the failure counter // so subsequent transient blips get the full retry budget. receivedAny = true; attempt = 0; @@ -585,7 +585,7 @@ export function createTrainer( if (terminal) break; if (receivedAny) { - // Stream had real activity then closed cleanly. Not a failure — + // Stream had real activity then closed cleanly. Not a failure; // reconnect with Last-Event-ID at the base delay (no exponential // backoff, no counter increment). await delay(initialReconnectDelayMs, abortSignal); @@ -630,14 +630,14 @@ export function createTrainer( // after this returns, leaving the newly created cloud job // running with no cancel POST. Awaiting `startInFlight` collapses // the race onto a definite startedJob (success) or a definite - // start failure (rejection) — either way the branches below + // start failure (rejection); either way the branches below // can decide on real state. Swallow the rejection: if `start()` // failed there's nothing to cancel anyway. if (startInFlight) { try { await startInFlight; } catch { - // intentionally ignored — failed start has no job to cancel + // intentionally ignored: failed start has no job to cancel } } // Nothing in flight: cleanup any prior latch and resolve. @@ -657,14 +657,14 @@ export function createTrainer( }); const timeoutMs = opts.timeoutMs ?? 
DEFAULT_EARLY_STOP_TIMEOUT_MS; const timer = setTimeout(() => { - // Timed out waiting for a checkpoint — fall back to immediate cancel. + // Timed out waiting for a checkpoint; fall back to immediate cancel. // Capture the active deferred reference: by the time the cancel POST // resolves, the checkpoint branch may have nulled out the shared // slot, but this fallback path still owns the deferred it created. const active = earlyStopDeferred; // Capture (don't swallow) any cancel error so we can surface it // through the deferred's reject path. Mirrors the checkpoint - // branch — a swallow here lets the runner's + // branch: a swallow here lets the runner's // `installShutdownHandlers` exit "successfully" while the cloud // job lives on (orphaned GPU spend with zero diagnostic), the // exact failure mode that a "stop-after-checkpoint" deadline @@ -710,7 +710,7 @@ export function createTrainer( // SIGTERM handler's `.catch()` writes the error to // stderr and the operator can see that the cloud job // may still be live. The latch always settles either - // way — shutdown won't hang. + // way; shutdown won't hang. if (cancelError !== null) active.reject(cancelError); else active.resolve(); } @@ -735,7 +735,7 @@ export function createTrainer( // subprocess on SIGUSR2, and (c) drive a graceful "stop after the // next checkpoint" on SIGTERM. All three brands live behind // `Symbol.for` keys so they don't appear on the public `Trainer` - // interface — see `trainerInspection.ts` for the rationale. + // interface (see `trainerInspection.ts` for the rationale). 
attachTrainerInspection(trainer, () => ({ name: input.name, config, diff --git a/packages/arkor/src/core/trainerInspection.test.ts b/packages/arkor/src/core/trainerInspection.test.ts index ae830017..7c64613a 100644 --- a/packages/arkor/src/core/trainerInspection.test.ts +++ b/packages/arkor/src/core/trainerInspection.test.ts @@ -12,7 +12,7 @@ import type { Trainer } from "./types"; function brandedTrainer(name: string) { // Real `createTrainer` attaches the inspection brand. We only need - // a no-op trainer for these shape tests — `start`/`wait` etc. are + // a no-op trainer for these shape tests; `start`/`wait` etc. are // never invoked. return createTrainer({ name, @@ -22,7 +22,7 @@ function brandedTrainer(name: string) { } function unbrandedTrainer(name: string) { - // Hand-rolled trainer — passes the `start`/`wait`/`cancel` shape + // Hand-rolled trainer: passes the `start`/`wait`/`cancel` shape // check `findTrainerInModule` requires but DOESN'T carry the SDK // inspection brand. Mirrors a user who wraps or re-exports a // trainer outside the SDK helpers. @@ -93,7 +93,7 @@ describe("findInspectableTrainer (brand-required path)", () => { // `mod.arkor ?? mod.default`, missing shapes #2 and #4. As a // result, projects bare-exporting `trainer` always produced // `configHash: null` and HMR conservatively SIGTERM-restarted on - // every rebuild — never hot-swapping callbacks. The fix routes + // every rebuild, never hot-swapping callbacks. The fix routes // through `findInspectableTrainer` which walks every supported // shape via `findTrainerInModule` and pulls inspection off the // discovered trainer. @@ -125,7 +125,7 @@ describe("findInspectableTrainer (brand-required path)", () => { // HMR can't compute their `configHash`. 
The Studio still shows // the trainer name (via `findTrainerInModule` in // `summariseBuiltManifest`), but HMR routing falls back to the - // SIGTERM-restart-everything path — which is the documented + // SIGTERM-restart-everything path, which is the documented // safe behaviour when configs can't be diffed. const trainer = unbrandedTrainer("plain"); expect(findInspectableTrainer({ trainer })).toBeNull(); @@ -137,7 +137,7 @@ describe("requestTrainerEarlyStop / replaceTrainerCallbacks brand-missing fallba // Regression: previously these helpers asserted the brand was // present and threw a synchronous TypeError on hand-rolled trainers. // `runner.ts`'s `extractTrainer` accepts ANY `{start, wait, cancel}` - // shape — that's a documented public path for unbranded trainers — + // shape (a documented public path for unbranded trainers), // so the SIGTERM handler crashed instead of stopping the run. it("requestTrainerEarlyStop falls back to trainer.cancel() for unbranded trainers", async () => { diff --git a/packages/arkor/src/core/trainerInspection.ts b/packages/arkor/src/core/trainerInspection.ts index de3a7497..88b3e797 100644 --- a/packages/arkor/src/core/trainerInspection.ts +++ b/packages/arkor/src/core/trainerInspection.ts @@ -7,7 +7,7 @@ import type { Arkor, JobConfig, Trainer, TrainerCallbacks } from "./types"; * "callbacks-only vs full restart" decision and (b) extract the new * callbacks reference when hot-swapping. * - * **Internal API — not part of the user-facing SDK surface.** Both this + * **Internal API (not part of the user-facing SDK surface).** Both this * snapshot and the companion `replaceTrainerCallbacks` mutator are * exposed only via `Symbol.for(...)`-keyed properties on the trainer * object so they don't appear on the public `Trainer` type. 
They exist @@ -27,7 +27,7 @@ export interface TrainerInspection { /** * The CLI runtime (`dist/bin.mjs`) and the user's compiled bundle * (`.arkor/build/index.mjs`, which keeps `arkor` external) end up loading - * two separate copies of this SDK as distinct ESM module records — so a + * two separate copies of this SDK as distinct ESM module records, so a * module-local `WeakMap` would split into two halves that * can't see each other. * @@ -69,7 +69,7 @@ export function attachTrainerInspection( /** * Pull the snapshot off a Trainer-like value. Returns `null` for plain - * objects that don't carry the brand — used by the Studio server to + * objects that don't carry the brand; used by the Studio server to * gracefully ignore third-party wrappers or pre-SDK shapes. */ export function getTrainerInspection( @@ -116,7 +116,7 @@ export function attachTrainerCallbackReplacer( * Replace the trainer's lifecycle callbacks atomically. The brand is * attached by `createTrainer`, but `runTrainer`'s `extractTrainer` * also accepts hand-rolled trainers (any `{ start, wait, cancel }` - * shape) — those don't carry the brand. The HMR pipeline never + * shape), and those don't carry the brand. The HMR pipeline never * routes SIGUSR2 to such trainers in practice (they always produce * `configHash: null` upstream, which forces the SIGTERM-restart * path), so this helper is a no-op for them rather than throwing. @@ -137,8 +137,8 @@ export function replaceTrainerCallbacks( * in the runner subprocess can request a graceful "stop after the next * checkpoint" without us exposing the operation on the public `Trainer` * interface. User code that wants the same semantics should compose - * the cookbook's `abortSignal` + `cancel()` recipe instead — see - * `docs/cookbook/early-stopping.mdx`. + * the cookbook's `abortSignal` + `cancel()` recipe instead (see + * `docs/cookbook/early-stopping.mdx`). 
*/ export function attachTrainerEarlyStopper( trainer: object, @@ -159,7 +159,7 @@ export function attachTrainerEarlyStopper( * * `createTrainer` attaches the brand unconditionally, but * `runTrainer`'s `extractTrainer` also accepts hand-rolled trainers - * — any `{ start, wait, cancel }` shape — which legitimately don't + * (any `{ start, wait, cancel }` shape), which legitimately don't * carry the brand. Falling back to the public `Trainer.cancel()` for * those is the closest semantic match available without the SDK's * checkpoint-aware machinery; it's also what the runner's SIGTERM @@ -170,7 +170,7 @@ export function attachTrainerEarlyStopper( */ // async wrapper (rather than a bare function returning Promise) so // any *synchronous* throw inside the brand call (or its arguments) -// becomes a rejected promise — the SIGTERM handler's `.catch()` then +// becomes a rejected promise; the SIGTERM handler's `.catch()` then // catches it instead of the throw escaping past the `.finally()` // chain and taking the runner down. export async function requestTrainerEarlyStop( @@ -189,7 +189,7 @@ export async function requestTrainerEarlyStop( try { await trainer.cancel(); } catch { - // intentionally ignored — see comment above. + // intentionally ignored; see comment above. } return; } @@ -200,7 +200,7 @@ export async function requestTrainerEarlyStop( * Trainer-shaped value pulled from a re-imported bundle. We don't * import the public `Trainer` type here because consumers of this * helper want to read minimal fields (`name` for display) without - * type-narrowing on the full SDK interface — many tests fabricate + * type-narrowing on the full SDK interface. Many tests fabricate * hand-rolled trainer literals that don't structurally match * `Trainer` (no `requestEarlyStop` etc.) but are still legitimate * user shapes the runner accepts. 
@@ -221,7 +221,7 @@ function isTrainerLike(value: unknown): value is TrainerLike { * Walk a freshly-imported user bundle in the same precedence order * as `runner.ts`'s `extractTrainer` and return the first * trainer-shaped value (anything that has `start`/`wait`/`cancel` - * functions). Doesn't require the SDK inspection brand — the + * functions). Doesn't require the SDK inspection brand: the * manifest UI displays the trainer's `name` for hand-rolled trainers * too, even when HMR can't compute a `configHash` for them. * @@ -279,7 +279,7 @@ export function findTrainerInModule( * export shapes count as "a trainer is exported here". * * Returns `null` when none of the candidates carry the inspection - * brand — typically because the bundle has no SDK-built trainer + * brand: typically because the bundle has no SDK-built trainer * (hand-rolled trainer, fresh scaffold, syntax error, or a * third-party shape). */ diff --git a/packages/arkor/src/studio/hmr.test.ts b/packages/arkor/src/studio/hmr.test.ts index 22973b29..b892c68c 100644 --- a/packages/arkor/src/studio/hmr.test.ts +++ b/packages/arkor/src/studio/hmr.test.ts @@ -129,7 +129,7 @@ describe("createHmrCoordinator", () => { // Regression: previously `startWatcher` bailed out and never // retried, so an SPA already connected to `/api/dev/events` against // a fresh scaffold would be stuck on the initial `error` event - // forever — EventSource doesn't reconnect on application-level + // forever: EventSource doesn't reconnect on application-level // errors. The coordinator now polls for the entry file in the // background and starts the watcher the moment it appears. const events: HmrEvent[] = []; @@ -137,7 +137,7 @@ describe("createHmrCoordinator", () => { hmr.subscribe((e) => events.push(e)); try { await nextEvent(events, (e) => e.type === "error", 1000); - // Same subscriber — no reconnect, no second `subscribe` call. + // Same subscriber: no reconnect, no second `subscribe` call. 
mkdirSync(join(cwd, "src/arkor"), { recursive: true }); writeFileSync(join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST); const ready = await nextEvent( @@ -166,7 +166,7 @@ describe("createHmrCoordinator", () => { // We assert "the late subscriber sees the same event the prior one // saw last" rather than literally "ready" because rolldown@1.0.0-rc.17 // on macOS occasionally fires a spurious second BUNDLE_END (FSEvents - // coalescing inside the watcher) — there, `firstEvents` already + // coalescing inside the watcher): there, `firstEvents` already // contains the spurious `rebuild` by the time we late-subscribe, and // the contract under test (replay of the cached state) holds either // way. @@ -187,7 +187,7 @@ describe("createHmrCoordinator", () => { // a fresh subscriber for the late-mount-cached-state contract. // Previously the replay had no try/catch, so a subscriber that // threw during that one call (typical case: an SSE controller - // that closed mid-replay — `controller.enqueue` on a closed + // that closed mid-replay; `controller.enqueue` on a closed // stream throws) propagated out of `subscribe()` and broke // whoever just registered. `broadcast()` already swallowed // subscriber throws defensively; this test pins the symmetric @@ -246,7 +246,7 @@ describe("createHmrCoordinator", () => { // Regression: the BUNDLE_END handler used to fire // `emitBuildSucceeded` without awaiting, so two quick rebuilds // could run `inspectBundle` concurrently and broadcast out of - // order — leaving `lastEvent` pointing at the older snapshot. + // order, leaving `lastEvent` pointing at the older snapshot.
// We can't deterministically synthesise a race against rolldown's // real watcher, but we *can* assert the user-visible invariant: // after a sequence of edits, the cached state must match the @@ -270,12 +270,12 @@ describe("createHmrCoordinator", () => { join(cwd, "src/arkor/index.ts"), FAKE_MANIFEST.replace(`"alpha"`, `"gamma"`), ); - // Wait for the watcher to settle — any rebuild that's going to + // Wait for the watcher to settle; any rebuild that's going to // fire (including spurious extras from FSEvents on macOS or // chokidar polling on Windows) lands within this window. The // assertion then compares the cached `lastEvent.hash` against // the *actual* fingerprint of the on-disk artefact, not a - // captured "last expected" hash from earlier in the test — + // captured "last expected" hash from earlier in the test: // that earlier capture was brittle on Windows where rolldown // routinely emits a 4th BUNDLE_END after the explicit edits // settle, producing a slightly different output byte (a @@ -308,14 +308,14 @@ describe("createHmrCoordinator", () => { const events: HmrEvent[] = []; const hmr = createHmrCoordinator({ cwd }); // Before any subscriber attaches, no watcher is running and no - // event has been broadcast — getter must return null without + // event has been broadcast: getter must return null without // throwing. expect(hmr.getCurrentConfigHash()).toBeNull(); hmr.subscribe((e) => events.push(e)); try { const ready = await nextEvent(events, (e) => e.type === "ready"); // FAKE_MANIFEST is hand-rolled (no SDK brand) so the cached - // hash is null — but the *getter* must still return whatever + // hash is null, but the *getter* must still return whatever // the cached event carries, not throw. expect(hmr.getCurrentConfigHash()).toBe(ready.configHash ?? null); } finally { @@ -333,7 +333,7 @@ describe("createHmrCoordinator", () => { // returns a non-null, non-artefact-derived hash. 
That // silently breaks `dispatchRebuild`'s pre-ready-spawn gate // which relies on null === "no artefact, force restart". - // The fix uses `fingerprintOrNull` — single statSync, true + // The fix uses `fingerprintOrNull`: single statSync, true // null on failure. // // We assert the getter on a project that has NEVER built @@ -345,7 +345,7 @@ describe("createHmrCoordinator", () => { const hmr = createHmrCoordinator({ cwd }); try { - // No subscribe() yet — watcher hasn't started, so no + // No subscribe() yet: watcher hasn't started, so no // BUNDLE_END has written the artefact. The on-disk // `.arkor/build/index.mjs` doesn't exist. expect(hmr.getCurrentArtifactHash()).toBeNull(); @@ -372,7 +372,7 @@ describe("createHmrCoordinator", () => { try { const ready = await nextEvent(events, (e) => e.type === "ready"); const artifactHash = hmr.getCurrentArtifactHash(); - // Same shape as the SSE event's `hash` field — both feed + // Same shape as the SSE event's `hash` field: both feed // through the same `mtimeMs-ctimeMs-size` formula. expect(artifactHash).toBe(ready.hash ?? null); expect(artifactHash).toMatch(/^[\d.]+-[\d.]+-\d+$/); @@ -385,7 +385,7 @@ describe("createHmrCoordinator", () => { // Regression: previously `getCurrentConfigHash()` returned // `lastEvent?.configHash ?? null`. After an ERROR landed, // `lastEvent` was the error event (no `configHash`) so the - // getter went null — even though `.arkor/build/index.mjs` still + // getter went null even though `.arkor/build/index.mjs` still // held the previous *successful* bundle bytes (ERROR doesn't // overwrite the output). 
A child spawned via `/api/train` in // that window would register `configHash: null`, and the next @@ -411,7 +411,7 @@ describe("createHmrCoordinator", () => { ); await nextEvent(events, (e) => e.type === "error", 4000); // After the error, the cached `lastEvent` is the error frame - // — but the on-disk artifact still holds the previous + // but the on-disk artifact still holds the previous // success. The getter must return that previous-success hash // so any `/api/train` spawn during this window still gets a // useful spawn-time hash for the *next* rebuild's routing. diff --git a/packages/arkor/src/studio/hmr.ts b/packages/arkor/src/studio/hmr.ts index 47f89148..f5830044 100644 --- a/packages/arkor/src/studio/hmr.ts +++ b/packages/arkor/src/studio/hmr.ts @@ -70,7 +70,7 @@ export interface HmrCoordinator { * the child read the same bytes. Without this gate, an edit * landing between spawn and the watcher's first BUNDLE_END would * silently teach the registry to use the post-edit `configHash` - * as the child's baseline — later same-hash rebuilds would then + * as the child's baseline; later same-hash rebuilds would then * hot-swap callbacks into a child whose cloud-side `JobConfig` * was actually spawned against an older version, leaving the * cloud run on a stale config. `null` when stat fails (artefact @@ -81,7 +81,7 @@ export interface HmrCoordinator { * Content-derived hash (sha256, truncated) of the on-disk * artefact RIGHT NOW. 
Used by `/api/train` to capture a * spawn-time content-hash for the registry's pre-ready-spawn - * equality gate — paired with the rebuild's `event.contentHash`, + * equality gate; paired with the rebuild's `event.contentHash`, * a mismatch unambiguously means the bytes changed (not just * timestamps), so `dispatchRebuild` only SIGTERM-restarts when * the child genuinely loaded different bytes than the new @@ -93,7 +93,7 @@ export interface HmrCoordinator { * Last broadcast event's `type`, or `null` if nothing has been * broadcast yet. `/api/manifest`'s HMR fast path consults this to * suppress its "serve last good artefact" behaviour while the - * watcher is in an `error` state — without that gate, the SPA's + * watcher is in an `error` state; without that gate, the SPA's * 5 s `/api/manifest` poll would keep getting a 200 stale * manifest and silently overwrite the SSE-driven build-error UI, * letting users run with stale code/config while the latest @@ -128,7 +128,7 @@ function contentHashOrNull(outFile: string): string | null { } /** - * Single-stat fingerprint with a clean `null` on failure — used by + * Single-stat fingerprint with a clean `null` on failure: used by * `getCurrentArtifactHash()` whose contract is "return a fingerprint * derived from the artefact bytes, or `null` if no artefact". A * separate exists-check + `fingerprint()` here would race: the file @@ -139,7 +139,7 @@ function contentHashOrNull(outFile: string): string | null { function fingerprintOrNull(outFile: string): string | null { try { const s = statSync(outFile); - // Same shape as `fingerprint()`'s success branch — `ctimeMs` is + // Same shape as `fingerprint()`'s success branch; `ctimeMs` is // the belt-and-braces guard for `touch -m`-style edits where // mtime stays put. 
return `${s.mtimeMs}-${s.ctimeMs}-${s.size}`; @@ -169,7 +169,7 @@ type InspectionResult = { * * Walks every entry shape `runner.ts` accepts (named `arkor`, named * `trainer`, `default` Arkor manifest, `default.trainer`) via the - * shared `findInspectableTrainer` helper — keeping inspection in sync + * shared `findInspectableTrainer` helper, keeping inspection in sync * with execution. Without this, projects that only `export const * trainer` (a documented shortcut) would always produce `configHash: * null` and the SPA would unnecessarily SIGTERM-restart on every @@ -181,7 +181,7 @@ type InspectionResult = { * - Node's ESM loader caches every dynamically-imported URL for the * lifetime of the process and never evicts. A `?t=Date.now()` * suffix produces a unique URL per call, so a long `arkor dev` - * session would accumulate one module record per BUNDLE_END — + * session would accumulate one module record per BUNDLE_END: * unbounded memory growth. * - The composite key (`mtimeMs-ctimeMs-size`) keys the cache to * "the actual bytes in this file", so spurious watcher events @@ -189,7 +189,7 @@ type InspectionResult = { * leak shrinks from "one entry per keystroke" to "one entry per * actual rebuild", which for a realistic dev session (hundreds * of saves over hours) is bounded by the number of distinct file - * states the user produces — and that's fundamentally what HMR + * states the user produces, and that's fundamentally what HMR * has to track to surface up-to-date trainer state. There's no * public Node API for evicting an ESM module record, so this is * the tightest bound we can offer without spawning a child @@ -222,8 +222,8 @@ async function inspectBundle(outFile: string): Promise { * `/api/dev/events` SSE notifications to the SPA. 
* * Lazy: the watcher only starts on the first `subscribe` call so a Studio - * launch in a project without `src/arkor/index.ts` doesn't immediately fail - * — the watcher kicks in once the user creates the file and the SPA opens + * launch in a project without `src/arkor/index.ts` doesn't immediately fail. + * The watcher kicks in once the user creates the file and the SPA opens * an EventSource. After every successful build the watcher caches the * latest state and replays it to new subscribers so a late-mounting * component still sees the trainer. @@ -240,8 +240,8 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { * entry file yet, a poll timer takes over and waits for the file to * appear. Without this, an SPA that opened `/api/dev/events` against * a fresh scaffold would hang on the initial `error` event forever - * — `startWatcher` is only re-entered on `subscribe()`, but EventSource - * doesn't reconnect on application-level errors. + * (`startWatcher` is only re-entered on `subscribe()`, but EventSource + * doesn't reconnect on application-level errors). */ let entryWaitTimer: ReturnType | null = null; /** @@ -254,7 +254,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { * This matters because `inspectBundle` does an asynchronous * dynamic-import of the just-written artifact. Two rebuilds A → B * landing within the import window can race, with A's inspection - * resolving *after* B's — the previous "fire-and-forget" code + * resolving *after* B's. The previous "fire-and-forget" code * would then publish A on top of B and leave `lastEvent` pointing * at the older `configHash`/`trainerName`. That in turn drove * `/api/dev/events` to make hot-swap-vs-restart decisions against @@ -266,7 +266,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { * Whether a `ready` event has actually broadcast yet. 
Tracked * separately from `firstBuild` because the inspection await means * the first BUNDLE_END's broadcast can land *after* a second - * BUNDLE_END schedules its own — pinning the type to + * BUNDLE_END schedules its own. Pinning the type to * "broadcast-time" rather than "schedule-time" guarantees the SPA * still sees `ready` first even when the initial inspection loses * the race. @@ -280,7 +280,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { * `/api/train` reads via `getCurrentConfigHash()`. The on-disk * `.arkor/build/index.mjs` doesn't change on ERROR, so a child * spawned during an error state is running the *previous* successful - * bundle — and the next BUNDLE_END's hash should be compared + * bundle, and the next BUNDLE_END's hash should be compared * against THAT. Without this separate cache, the whole rebuild gets * routed through SIGTERM-restart and SIGUSR2 hot-swap stops working * for the rest of the session whenever the user briefly broke their @@ -294,7 +294,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { try { fn(event); } catch { - // Subscribers are SSE controllers — a thrown error usually means + // Subscribers are SSE controllers; a thrown error usually means // the connection closed mid-flight. Drop it so one bad subscriber // can't poison the broadcast for the rest. } @@ -314,7 +314,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { const type: HmrEventType = firstBroadcast ? "ready" : "rebuild"; firstBroadcast = false; const configHash = inspection?.configHash ?? null; - // BUNDLE_END always reflects what's now on disk — even when the + // BUNDLE_END always reflects what's now on disk: even when the // bundle is unbranded (`configHash === null`), that's the // current truth. 
Capture it so `/api/train` spawning during a // *subsequent* transient error still has the right spawn-time @@ -343,7 +343,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { }); // Hand off to a low-frequency poll so an SPA already connected to // `/api/dev/events` transitions from "error" to "ready" the moment - // the user creates the entry file — no manual reconnect required. + // the user creates the entry file (no manual reconnect required). // The poll is `unref()`'d so it never blocks process exit, and // `dispose()` clears it. if (!entryWaitTimer) { @@ -363,7 +363,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { } return; } - // The entry exists now — clear any leftover poll timer from a prior + // The entry exists now: clear any leftover poll timer from a prior // failed startWatcher invocation. if (entryWaitTimer) { clearInterval(entryWaitTimer); @@ -379,12 +379,12 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { event.result.close().catch(() => {}); // The event type ("ready" vs "rebuild") is decided inside // `emitBuildSucceeded` *after* the inspection await, based on - // whether any prior broadcast actually landed — see the + // whether any prior broadcast actually landed (see the // `firstBroadcast` comment for why pinning the type at this - // schedule point would be wrong under inspection races. + // schedule point would be wrong under inspection races). void emitBuildSucceeded(); } else if (event.code === "ERROR") { - // Rolldown's ERROR events don't always carry a `result` — + // Rolldown's ERROR events don't always carry a `result`: // when the failure is in the parse/resolve phase there's // no per-build output to close, so `event.result` is // `undefined`. 
Calling `.close()` then would throw @@ -426,7 +426,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { // // Wrapped in the same defensive try/catch as `broadcast` so a // throw inside the subscriber (typically an SSE controller that - // closed mid-replay — `controller.enqueue` on a closed stream + // closed mid-replay: `controller.enqueue` on a closed stream // throws) doesn't propagate out of `subscribe()` and crash // whoever just registered. One bad subscriber must not be able // to break HMR initialisation for the rest of the process. @@ -434,7 +434,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { try { fn(lastEvent); } catch { - // Swallow — subscribers own their own teardown; we just + // Swallow: subscribers own their own teardown; we just // shouldn't poison their `subscribe()` call site. } } @@ -448,7 +448,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { // `lastEvent.configHash`. The two diverge after an ERROR: // `lastEvent` becomes the error event (no `configHash`), but // `.arkor/build/index.mjs` still holds the previous successful - // bundle bytes — and a child spawned in that window is running + // bundle bytes, and a child spawned in that window is running // those bytes. Returning the cached success hash keeps // `/api/train` registering accurate spawn-time hashes so the // next successful BUNDLE_END can route hot-swap vs restart @@ -457,7 +457,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { return lastSuccessConfigHash; }, getCurrentArtifactHash() { - // Fresh stat — not the cached `lastEvent.hash`. The cached + // Fresh stat (not the cached `lastEvent.hash`). 
The cached // hash describes the bytes the watcher last broadcast about, // but the on-disk artefact may be newer (a BUNDLE_END is // queued, file already written, inspection still pending) or @@ -467,7 +467,7 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { // RIGHT NOW". // // `fingerprintOrNull` does ONE statSync and returns null on - // failure — preserving the documented contract. A previous + // failure, preserving the documented contract. A previous // implementation here did `statSync(...)` first and then // called `fingerprint()` (which has a `Date.now()` fallback // baked in for SSE dedup uniqueness). That double-stat @@ -483,12 +483,12 @@ export function createHmrCoordinator(opts: HmrOptions): HmrCoordinator { // pre-ready-spawn equality gate. Reads + sha256s the file // at call time so the result describes the exact bytes the // just-spawned child will see in its `await import()`. - // Same null-on-failure contract — caller treats null as + // Same null-on-failure contract: caller treats null as // "force restart" (the conservative default). return contentHashOrNull(resolved.outFile); }, getLastEventType() { - // `lastEvent` is the latest broadcast — `ready` / `rebuild` / + // `lastEvent` is the latest broadcast: `ready` / `rebuild` / // `error`. Returning the type lets `/api/manifest`'s HMR // fast path skip serving the stale built artefact when the // watcher is currently in `error` (current source fails to diff --git a/packages/arkor/src/studio/manifest.ts b/packages/arkor/src/studio/manifest.ts index 2ed73ccf..4f17bb6b 100644 --- a/packages/arkor/src/studio/manifest.ts +++ b/packages/arkor/src/studio/manifest.ts @@ -44,7 +44,7 @@ export async function summariseBuiltManifest( // mtime+ctime+size cache-bust (vs `Date.now()`): the SPA polls // `/api/manifest` every ~5 s, so a `Date.now()` suffix would // accumulate one ESM module record per poll across a long - // `arkor dev` session — Node's loader has no eviction. 
Keying on + // `arkor dev` session: Node's loader has no eviction. Keying on // the artefact bytes (via `moduleCacheBustUrl`) collapses // unchanged-poll reads onto the existing record. const mod = (await import(moduleCacheBustUrl(outFile))) as Record< @@ -61,7 +61,7 @@ export async function summariseBuiltManifest( // Trainer name renders in the UI even for hand-rolled trainers // that bypass `createTrainer` and therefore don't carry the SDK // inspection brand. The brand is required only for the - // `configHash` used by HMR routing — without it, HMR conservatively + // `configHash` used by HMR routing; without it, HMR conservatively // SIGTERM-restarts on every rebuild (correct fallback). const name = typeof trainer.name === "string" ? trainer.name : "(unnamed trainer)"; @@ -103,7 +103,7 @@ export interface ReadManifestOptions { * Each call rebuilds and re-imports so edits to the user's source * surface without restarting Studio. When `prebuiltOutFile` is * supplied (HMR-enabled servers), the `runBuild()` step is bypassed - * — see `ReadManifestOptions.prebuiltOutFile` for the rationale. + * (see `ReadManifestOptions.prebuiltOutFile` for the rationale). 
*/ export async function readManifestSummary( cwd: string, diff --git a/packages/arkor/src/studio/server.test.ts b/packages/arkor/src/studio/server.test.ts index c587332a..a53b4d46 100644 --- a/packages/arkor/src/studio/server.test.ts +++ b/packages/arkor/src/studio/server.test.ts @@ -83,14 +83,14 @@ describe("Studio server", () => { baseUrl: "http://mock", assetsDir, autoAnonymous: false, - // @ts-expect-error — intentionally omitted to assert the runtime guard + // @ts-expect-error: intentionally omitted to assert the runtime guard studioToken: undefined, }), ).toThrow(/studioToken/); }); it("HTML-escapes special characters in the studio token before injecting", async () => { - // Branch coverage for `htmlAttrEscape` — a defensive guard against + // Branch coverage for `htmlAttrEscape`: a defensive guard against // a token that contains `<`, `>`, `&`, `"`, `'`. randomBytes/base64url // never produces these, but the helper must still escape them so a // future token strategy can't break index.html parsing or open a @@ -112,7 +112,7 @@ describe("Studio server", () => { expect(html).toContain( '', ); - // The raw exotic token must not leak into HTML — an attacker who + // The raw exotic token must not leak into HTML: an attacker who // could influence the token (hypothetical) shouldn't be able to // inject markup. expect(html).not.toMatch(/content="<>/); @@ -143,7 +143,7 @@ describe("Studio server", () => { it("injects when an HMR coordinator is supplied", async () => { // Regression: the SPA can't tell dev-mode usage from prod-mode - // usage at runtime — `vite build` ships with + // usage at runtime: `vite build` ships with // `import.meta.env.DEV === false`, so a build-time DEV gate inside // the SPA bundle would (wrongly) suppress HMR even in real // `arkor dev` sessions. 
The server-side flag is `true` exactly @@ -399,7 +399,7 @@ describe("Studio server", () => { expect(res.status).toBe(403); }); - // Regression for ENG-404 — `path.resolve` doesn't follow symlinks, so a + // Regression for ENG-404: `path.resolve` doesn't follow symlinks, so a // link inside the project directory pointing outside it would previously // pass the containment check and be handed to `arkor start` (which would // then dlopen the link's target). @@ -476,7 +476,7 @@ describe("Studio server", () => { expect(body.error).toMatch(/does not exist/); }); - // Regression for ENG-356 — `/api/train` previously resolved the bundled + // Regression for ENG-356: `/api/train` previously resolved the bundled // bin at `/bin.mjs` (one level above `dist/`), which never existed. // The DI'd `binPath` lets us assert (a) a working bin streams its stdout // through the response, and (b) a missing bin surfaces ENOENT-grade errors @@ -600,7 +600,7 @@ process.exit(0); it("captures the spawn-time configHash from the HMR coordinator (no extra rebuild)", async () => { // Regression: `/api/train` previously called `readManifestSummary` - // which ran a full `runBuild()` per spawn — wasteful and racy + // which ran a full `runBuild()` per spawn: wasteful and racy // against the HMR watcher writing the same `.arkor/build/index.mjs`. // The new server reads the cached hash from // `coordinator.getCurrentConfigHash()` instead. We assert the @@ -654,7 +654,7 @@ process.exit(0); // stdout marker arrives later but our `getJobId(...) === null` // gate has already short-circuited subsequent scans, so // Stop-training POSTs cancel for the wrong (decoy) job and - // the real one keeps running — silent cloud orphan. + // the real one keeps running: silent cloud orphan. // Splitting into a stdout-only `onStdoutChunk` parser and a // forward-only `onStderrChunk` makes stderr unable to // populate `jobId` regardless of what the user logs there. 
@@ -768,7 +768,7 @@ process.exit(0); // *before* SIGKILLing. await writeCredentials(ANON_CREDS); // The cancel POST reads scope from `.arkor/state.json` (not - // from the anon creds' orgSlug — that's a different code + // from the anon creds' orgSlug; that's a different code // path). Pre-seed so the POST can address the cloud job. await writeState( { @@ -781,7 +781,7 @@ process.exit(0); // Bin prints the canonical "Started job " line then // hangs (just like the real runner after `start()` resolves). // The id is the same kind of identifier cloud-api would - // mint — opaque string we'll verify shows up in the cancel + // mint: an opaque string we'll verify shows up in the cancel // POST URL below. const FAKE_JOB_ID = "j-cancel-test"; const fakeBin = join(trainCwd, "started-job-bin.mjs"); @@ -820,7 +820,7 @@ process.exit(0); headers: { "content-type": "application/json" }, }); } - // Pass-through default: anything else 404s — which would + // Pass-through default: anything else 404s, which would // surface as a test-side failure if our cancel POST // doesn't match the expected URL shape. return new Response("not found", { status: 404 }); @@ -858,7 +858,7 @@ process.exit(0); if (done) break; buf += decoder.decode(value, { stream: true }); } - // Trigger cancel — should fire the cloud POST + SIGKILL. + // Trigger cancel: should fire the cloud POST + SIGKILL. await reader.cancel(); // Fire-and-forget: give the void IIFE a tick to actually // dispatch the fetch + receive the 200 response. @@ -866,7 +866,7 @@ process.exit(0); expect(cancelHits).toHaveLength(1); expect(cancelHits[0]?.url).toContain(`/v1/jobs/${FAKE_JOB_ID}/cancel`); - // Scope is required by the cloud-api contract — comes from + // Scope is required by the cloud-api contract: comes from // `.arkor/state.json` (seeded above), not the anon creds. 
expect(cancelHits[0]?.url).toContain("orgSlug=cancel-test-org"); expect(cancelHits[0]?.url).toContain("projectSlug=cancel-test-project"); @@ -1077,7 +1077,7 @@ process.exit(0); // graceful early-stop request (wait for the next checkpoint, // up to ~5 min). For HMR-driven cancels that's correct, but // for a Stop-training click the user wants the run STOPPED - // immediately — leaving it running in the background for + // immediately. Leaving it running in the background for // minutes consuming GPU spend silently is a regression // introduced by this PR's graceful-shutdown work. We assert // SIGKILL by giving the bin a SIGTERM no-op handler: SIGTERM @@ -1122,7 +1122,7 @@ process.exit(0); await new Promise((r) => setTimeout(r, 300)); // `process.kill(pid, 0)` is the standard "is this pid alive?" - // probe — sends signal 0 (no-op) but the syscall still + // probe: sends signal 0 (no-op) but the syscall still // surfaces ESRCH for non-existent pids. SIGKILL → reaped → // ESRCH. SIGTERM (with the bin's no-op handler) → still // alive → no throw → test fails. @@ -1184,13 +1184,13 @@ process.exit(0); // `controller.enqueue(...)` listeners on `child.stdout` / // `child.stderr` and an unguarded `controller.close()` in // `child.on("close")`. After the client cancelled the - // ReadableStream, those handlers kept firing — and calling + // ReadableStream, those handlers kept firing, and calling // `enqueue` / `close` on a closed controller throws "Invalid // state". The throw escaped the request pipeline as an // unhandled exception. The fix flips a `closed` flag in // `cancelTeardown` and try/catches the post-cancel enqueue // paths defensively. NOTE: cancel intentionally does NOT - // detach the `data` listeners — leaving them attached keeps + // detach the `data` listeners; leaving them attached keeps // the OS pipe draining while the child checkpoints / exits // gracefully (otherwise a full pipe back-pressures and // deadlocks the very graceful exit we're preserving). 
@@ -1201,7 +1201,7 @@ process.exit(0); const fakeBin = join(trainCwd, "fake-bin.mjs"); // Bin spits a chunk every ~5 ms forever. We cancel while it's // mid-stream so the child is *still alive* when listeners are - // removed — the previous bug only surfaced in this window. + // removed: the previous bug only surfaced in this window. writeFileSync( fakeBin, `setInterval(() => process.stdout.write("tick\\n"), 5);\nsetInterval(() => {}, 60_000);\n`, @@ -1226,7 +1226,7 @@ process.exit(0); expect(res.status).toBe(200); const reader = res.body!.getReader(); // Read at least one chunk so the child is definitely streaming - // before we cancel — that's the race window the previous code + // before we cancel: that's the race window the previous code // crashed in. const decoder = new TextDecoder(); let received = ""; @@ -1236,7 +1236,7 @@ process.exit(0); received += decoder.decode(value, { stream: true }); } // Listen for unhandled rejections / uncaught exceptions during - // and shortly after the cancel — before the fix, the child's + // and shortly after the cancel: before the fix, the child's // next `data` chunk would synchronously throw inside the // enqueue callback. const errors: unknown[] = []; @@ -1264,7 +1264,7 @@ process.exit(0); }); it("acquires + persists an anonymous token on the first /api/credentials hit when autoAnonymous=true", async () => { - // No credentials on disk — buildStudioApp's autoAnonymous default + // No credentials on disk: buildStudioApp's autoAnonymous default // (true) lets the server bootstrap on first hit so a fresh `arkor // dev` works even when the up-front bootstrap in dev.ts skipped due // to a transient network blip. @@ -1306,7 +1306,7 @@ process.exit(0); expect(body).toMatchObject({ token: "lazy-anon", mode: "anon" }); expect(calls).toBe(1); - // Subsequent calls use the persisted credentials — no re-bootstrap. + // Subsequent calls use the persisted credentials (no re-bootstrap). 
const res2 = await app.request("/api/credentials", { headers: { host: "127.0.0.1:4000", @@ -1381,7 +1381,7 @@ process.exit(0); // The cloud-api-client wrapper around `onDeprecation` synchronously // checks `typeof result.then` on the callback's return value; a plain // `void` return throws and gets swallowed with a stderr log. The - // wrapper in `createRpc` returns null to short-circuit that check — + // wrapper in `createRpc` returns null to short-circuit that check; // assert that no such log fires here. const errorSpy = vi .spyOn(console, "error") @@ -1845,7 +1845,7 @@ process.exit(0); // pre-existing artefact directly when HMR's coordinator is // wired in. We assert by pre-writing a hand-rolled artefact // bundle and verifying `/api/manifest` returns its trainer - // *without* the source file existing — `runBuild()` would + // *without* the source file existing: `runBuild()` would // throw on the missing entry, so a 200 here proves we never // called it. await writeCredentials(ANON_CREDS); @@ -1866,7 +1866,7 @@ process.exit(0); `, ); // Notice: NO `src/arkor/index.ts`. `runBuild()` would fail with - // "Build entry not found" — the test fails if the fast path + // "Build entry not found"; the test fails if the fast path // regresses and falls through to it. const fakeHmr = { subscribe: () => () => undefined, @@ -1919,7 +1919,7 @@ process.exit(0); }, });`, ); - // No pre-existing `.arkor/build/index.mjs` — the artefact + // No pre-existing `.arkor/build/index.mjs`: the artefact // doesn't exist. `existsSync` is false → `runBuild()` runs. const fakeHmr = { subscribe: () => () => undefined, @@ -1955,7 +1955,7 @@ process.exit(0); // even when the watcher's most recent event was `error`. 
The // SPA's `/api/manifest` poll runs every ~5s, so a successful // 200 with stale data would silently overwrite the SSE-driven - // build-error UI within 5s of the user breaking their source — + // build-error UI within 5s of the user breaking their source: // they'd then unknowingly run stale code/config while the // latest edit is still failing to compile. Gating the fast // path on `getLastEventType() === "error"` keeps both @@ -1976,7 +1976,7 @@ process.exit(0); export default arkor; `, ); - // Coordinator is currently in error state — the latest + // Coordinator is currently in error state: the latest // broadcast was a compile failure. const fakeHmr = { subscribe: () => () => undefined, @@ -2000,7 +2000,7 @@ process.exit(0); "x-arkor-studio-token": STUDIO_TOKEN, }, }); - // 400 — the SPA's existing 4xx-handling path renders the + // 400: the SPA's existing 4xx-handling path renders the // build-error hint instead of a fake-healthy manifest. expect(res.status).toBe(400); const body = (await res.json()) as { error?: string }; @@ -2019,7 +2019,7 @@ process.exit(0); it("auto-bootstraps project state and proxies base-model inference", async () => { await writeCredentials(ANON_CREDS); - // No state.json — server should derive a slug from cwd, create the + // No state.json: server should derive a slug from cwd, create the // project on cloud-api, persist state, and forward the inference call. const calls: Array<{ @@ -2148,7 +2148,7 @@ process.exit(0); }); expect(res.status).toBe(200); - // Only the inference call should have hit the network — no project + // Only the inference call should have hit the network: no project // create/list when state is already present. 
expect(calls.filter((c) => c.url.includes("/v1/projects"))).toHaveLength(0); const chat = calls.find((c) => c.url.includes("/v1/inference/chat")); @@ -2157,7 +2157,7 @@ process.exit(0); it("propagates the cloud-api status when project bootstrap fails", async () => { await writeCredentials(ANON_CREDS); - // No state.json — bootstrap will hit cloud-api, which returns 503. + // No state.json: bootstrap will hit cloud-api, which returns 503. // We expect that 503 to be passed through, not collapsed to 400. globalThis.fetch = (async ( @@ -2263,7 +2263,7 @@ process.exit(0); coordinator, emit(event: HmrEvent) { // Track the latest event type so `getLastEventType()` - // mirrors the real coordinator's `lastEvent?.type` — + // mirrors the real coordinator's `lastEvent?.type`; // the `/api/manifest` HMR-error gate consults this. lastEventType = event.type; for (const fn of subs) fn(event); @@ -2326,7 +2326,7 @@ process.exit(0); // The server subscribes to the HMR coordinator exactly once at // build time (so multiple SSE clients don't fan signal dispatch // out to the same child N times). Per-client cleanup happens on - // the SSE listener set, not against the coordinator — so + // the SSE listener set, not against the coordinator, so // `fake.subscriberCount` stays at 1 across the connection // lifecycle. We assert that here rather than expect the // pre-refactor "0 after cancel" behaviour. @@ -2420,7 +2420,7 @@ process.exit(0); // (which would have triggered the runner's `exit(143)` // emergency path and broken cloud cancel POSTing). With // SIGKILL replacing the user-stop SIGTERM, the - // double-signal worry no longer applies — and the gate + // double-signal worry no longer applies, and the gate // turned a Stop click during HMR's graceful window into a // total no-op, leaving the run alive until checkpoint / // 5-min timeout. 
Manual stop now overrides HMR's graceful @@ -2438,7 +2438,7 @@ process.exit(0); const FAKE_JOB_ID = "manual-stop-during-hmr"; const fakeBin = join(trainCwd, "manual-during-hmr-bin.mjs"); // SIGTERM no-op so HMR's graceful SIGTERM doesn't terminate - // the bin — we need it alive so the subsequent manual + // the bin; we need it alive so the subsequent manual // cancel actually has something to SIGKILL. Marker uses the // server-injected nonce prefix so the parser accepts it. writeFileSync( @@ -2498,7 +2498,7 @@ process.exit(0); if (done) break; buf += decoder.decode(value, { stream: true }); } - // Emit an HMR mismatch — server's dispatch SIGTERMs the + // Emit an HMR mismatch: server's dispatch SIGTERMs the // bin and sets `earlyStopRequested = true` on the entry. // The bin's SIGTERM no-op keeps it alive so the manual // cancel below has a target. @@ -2512,7 +2512,7 @@ process.exit(0); // Let the dispatch run + signal land. await new Promise((r) => setTimeout(r, 80)); - // Manual cancel — old code would have early-returned; new + // Manual cancel: old code would have early-returned; new // code POSTs cloud cancel + SIGKILLs. await reader.cancel(); await new Promise((r) => setTimeout(r, 250)); @@ -2520,7 +2520,7 @@ process.exit(0); // Cloud cancel POST landed for the right job. expect(cancelHits).toHaveLength(1); expect(cancelHits[0]?.url).toContain(`/v1/jobs/${FAKE_JOB_ID}/cancel`); - // And the bin is dead — SIGKILL bypassed its SIGTERM + // And the bin is dead: SIGKILL bypassed its SIGTERM // no-op (which had been masking HMR's earlier SIGTERM). 
let probeError: NodeJS.ErrnoException | null = null; try { @@ -2540,7 +2540,7 @@ process.exit(0); // the watcher's first successful BUNDLE_END (the very first // success is broadcast as `ready`, and the entry-wait recovery // path also emits `ready`) would never get SIGUSR2/SIGTERM- - // routed when that build eventually landed — leaving it + // routed when that build eventually landed, leaving it // running a stale or empty artifact. Exercise the contract // here by spawning a hanging child, then emitting `ready` // with a different `configHash`; dispatch should pick up the @@ -2599,7 +2599,7 @@ process.exit(0); received += decoder.decode(value, { stream: true }); } expect(received).toContain("event: ready"); - // The dispatch augmentation marker — would be absent if the + // The dispatch augmentation marker: would be absent if the // `event.type !== "error"` filter regressed back to gating on // `=== "rebuild"`, and `restart`/`restartTargets` would never // appear on a `ready` frame. diff --git a/packages/arkor/src/studio/server.ts b/packages/arkor/src/studio/server.ts index 2506935c..9c8e15af 100644 --- a/packages/arkor/src/studio/server.ts +++ b/packages/arkor/src/studio/server.ts @@ -130,7 +130,7 @@ function htmlAttrEscape(s: string): string { /** * Inject the per-launch studio token (always) and an optional HMR * feature flag into ``. Both are read by the SPA via - * `` lookups — the token gates `/api/*` requests and + * `` lookups: the token gates `/api/*` requests and * the HMR flag tells `RunTraining` whether to open * `/api/dev/events` (which only exists when `arkor dev` wired in an * HMR coordinator). Without the server-side flag the SPA can't tell @@ -163,7 +163,7 @@ export function buildStudioApp(options: StudioServerOptions) { // `studio/server.ts` is bundled into `dist/bin.mjs` (it isn't reachable // from `src/index.ts`, so tsdown doesn't extract it as a shared chunk). 
// The bin therefore sits *next* to this code at runtime, not one - // directory up — `../bin.mjs` would resolve to the package root. + // directory up: `../bin.mjs` would resolve to the package root. const trainBinPath = options.binPath ?? fileURLToPath(new URL("./bin.mjs", import.meta.url)); @@ -201,7 +201,7 @@ export function buildStudioApp(options: StudioServerOptions) { // 1. Per-launch token. CORS is intentionally not configured: the SPA // is same-origin so CORS adds no value, and reflecting `*` would let // "simple" cross-origin POSTs (text/plain, urlencoded) skip preflight - // and reach the handler. The token check rejects those — an attacker + // and reach the handler. The token check rejects those: an attacker // page can't read the SPA's from another origin. // 2. `?studioToken=` is accepted only on the job-event stream route // because `EventSource` cannot send custom headers. Mutation routes @@ -288,7 +288,7 @@ export function buildStudioApp(options: StudioServerOptions) { // Pre-resolved outFile for the HMR fast path. The path is // deterministic per cwd (defaults from `BUILD_DEFAULTS`), so we // compute it once at app build time rather than on every request. - // Only used when HMR is enabled — `readManifestSummary` falls + // Only used when HMR is enabled; `readManifestSummary` falls // back to `runBuild()` when this is undefined or the file doesn't // exist yet (fresh scaffold pre-watcher-bootstrap). const hmrOutFile = options.hmr @@ -299,7 +299,7 @@ export function buildStudioApp(options: StudioServerOptions) { // Surface watcher build errors directly. Without this gate the // HMR fast path below would happily serve the LAST GOOD // artefact even when the user's current source fails to - // compile — `RunTraining` polls `/api/manifest` every ~5 s, so + // compile: `RunTraining` polls `/api/manifest` every ~5 s, so // the next poll after a compile error would 200 with stale // data and silently overwrite the SSE-surfaced error UI. 
// Users would then see a "healthy" trainer in the manifest @@ -380,14 +380,14 @@ export function buildStudioApp(options: StudioServerOptions) { }); // Active `/api/train` subprocesses. The registry encapsulates the - // signal-dispatch policy — see `studio/trainRegistry.ts`. + // signal-dispatch policy (see `studio/trainRegistry.ts`). const activeTrains = new TrainRegistry(); app.post("/api/train", async (c) => { const body = (await c.req.json().catch(() => ({}))) as { file?: string }; let trainFile: string | undefined; if (body.file) { - // Resolve symlinks before the containment check — `path.resolve` is purely + // Resolve symlinks before the containment check: `path.resolve` is purely // lexical, so a symlink under the project directory pointing at e.g. // `/etc/passwd` would otherwise pass `startsWith(baseAbs + sep)`. The // bin spawned below would then dlopen the link's target. @@ -416,7 +416,7 @@ export function buildStudioApp(options: StudioServerOptions) { // When HMR is enabled, read it synchronously from the coordinator // (which already maintains `lastEvent.configHash` for its watcher). // Reading from the cache avoids triggering an extra `runBuild()` - // per train request — the previous implementation called + // per train request: the previous implementation called // `readManifestSummary(trainCwd)` here, which both wasted CPU and // raced the watcher writing the same `.arkor/build/index.mjs`. // @@ -473,11 +473,11 @@ export function buildStudioApp(options: StudioServerOptions) { // child's `error` event), but Node can still throw synchronously // for argument-shape problems (e.g. invalid stdio descriptor on // unusual platforms). Catch both paths so an `/api/train` POST - // can never hang the SPA — sync throws return a clean 500, async + // can never hang the SPA: sync throws return a clean 500, async // 'error' events forward into the stream and close it (handled // inside the ReadableStream `start()` below). 
// `ChildProcessByStdio` is the - // specific overload return for `stdio: "pipe"` — narrows + // specific overload return for `stdio: "pipe"`; narrows // `child.stdout` / `child.stderr` away from the nullable // `Readable | null` of the general `ChildProcess` type. // `ReturnType` would land on the union and force @@ -533,26 +533,26 @@ export function buildStudioApp(options: StudioServerOptions) { // event loop also stops dispatching once we've torn down. let closed = false; // `child.stdout` is in default (binary) mode, so each `data` - // chunk is a Buffer — and `Buffer extends Uint8Array`, so we + // chunk is a Buffer, and `Buffer extends Uint8Array`, so we // can pass it straight to `controller.enqueue` without a // round-trip through `TextEncoder`. The previous code did // `enc.encode(d)` which implicitly coerced the buffer via - // `String()` — same byte content, but allocates a new array. + // `String()`: same byte content, but allocates a new array. // Forward a chunk to the SPA stream. Shared between the - // stdout and stderr listeners — both paths surface as + // stdout and stderr listeners; both paths surface as // request body bytes for the SPA's log view. const forward = (d: Buffer): void => { if (closed) return; try { controller.enqueue(d); } catch { - // Controller raced us into the closed state — flip the + // Controller raced us into the closed state; flip the // flag so subsequent chunks short-circuit. closed = true; } }; // Carry-over buffer for line-oriented job-id extraction. - // Stream chunk boundaries are arbitrary — the runner's + // Stream chunk boundaries are arbitrary: the runner's // single-line `Started job ` write can land split // across two `data` events, in which case a per-chunk // regex would never match and the cancel POST chain @@ -567,8 +567,8 @@ export function buildStudioApp(options: StudioServerOptions) { let stdoutLineBuf = ""; const STARTED_JOB_BUFFER_CAP = 4096; // STDOUT-ONLY job-id parser. 
The runner writes the canonical - // `Started job ` line via `process.stdout.write` — never - // stderr — so a single shared buffer across both pipes + // `Started job ` line via `process.stdout.write` (never + // stderr), so a single shared buffer across both pipes // would mis-match in two ways: // 1. A user `console.error("Started job ")` would // poison the buffer first; the real stdout marker @@ -613,7 +613,7 @@ export function buildStudioApp(options: StudioServerOptions) { forward(d); }; const onStderrChunk = (d: Buffer): void => { - // Forward only — never scan for `Started job`. See + // Forward only; never scan for `Started job`. See // `onStdoutChunk` comment for the cross-stream poisoning // hazards this split prevents. forward(d); @@ -621,13 +621,13 @@ export function buildStudioApp(options: StudioServerOptions) { const enc = new TextEncoder(); // Detach every listener this stream wired onto `child`. Called // from `onClose` / `onError` themselves (so once one fires the - // closure references — controller, TextEncoder — drop and the + // closure references (controller, TextEncoder) drop and the // subprocess record can be GC'd promptly even if the other // event also queues), and from `cancelTeardown` for the // client-side cancel path. Removing only the `data` listeners // (as the previous code did) left `close` / `error` attached // to the dead ChildProcess, which kept their closures pinned - // until the process object itself was reaped — meaningful + // until the process object itself was reaped: meaningful // memory pressure for an `arkor dev` session that spawns many // children over hours. const detachListeners = (): void => { @@ -651,11 +651,11 @@ export function buildStudioApp(options: StudioServerOptions) { // `error` event fires when async spawn machinery surfaces a // failure (ENOENT for the executable, EACCES, EAGAIN under // resource exhaustion, etc.). 
Without this listener the - // ReadableStream would never close — the SPA would hang + // ReadableStream would never close; the SPA would hang // waiting for output that never arrives. Forward the error // text into the stream body, close, and unregister the // child. Node's contract is: if 'error' fires, 'close' may - // or may not follow — both paths are guarded by the `closed` + // or may not follow; both paths are guarded by the `closed` // flag and the `unregister` call is idempotent. const onError = (err: Error): void => { activeTrains.unregister(child.pid); @@ -677,7 +677,7 @@ export function buildStudioApp(options: StudioServerOptions) { child.on("error", onError); cancelTeardown = () => { // Don't detach data listeners here: the child stays alive - // for some time after the SPA cancels — either because + // for some time after the SPA cancels, either because // we're skipping `child.kill()` for an in-progress // HMR early-stop, or because `child.kill()`'s SIGTERM // triggers a graceful checkpoint+exit that takes @@ -685,19 +685,19 @@ export function buildStudioApp(options: StudioServerOptions) { // logs to its stdout/stderr pipes; if our `data` // listeners are gone, Node stops draining the OS pipe, // the buffer fills, and the child's next `write()` - // blocks indefinitely — deadlocking the very graceful + // blocks indefinitely, deadlocking the very graceful // exit we're trying to preserve. The `closed` flag // already makes `enqueue`/`close` a no-op so the // controller-closed race stays safe; the eventual // `onClose` / `onError` listeners detach everything // (via `detachListeners()`) when the child finally - // exits. That timing — at-exit, not at-cancel — is the + // exits. That timing (at-exit, not at-cancel) is the // correct moment to break the closure refs for GC. 
closed = true; }; }, cancel() { - // The SPA-side cancel is always *user-initiated* — either an + // The SPA-side cancel is always *user-initiated*: either an // explicit Stop click or tab-close/navigation, which the // user just as explicitly chose. HMR-driven SIGTERMs go // straight from the server to the runner via @@ -712,7 +712,7 @@ export function buildStudioApp(options: StudioServerOptions) { // (which used to gate this branch on // `isEarlyStopRequested`) doesn't apply. The runner's // graceful early-stop chain may have been trying to - // preserve a checkpoint, but the user just said no — keep + // preserve a checkpoint, but the user just said no; keep // the local subprocess teardown snappy and let the // server-side cancel POST handle the cloud-side release. // @@ -788,7 +788,7 @@ export function buildStudioApp(options: StudioServerOptions) { // SIGKILL (not the default SIGTERM) for user-initiated // aborts. The runner's `installShutdownHandlers` now treats // a single SIGTERM as the HMR-driven "graceful early-stop" - // signal — wait for the next checkpoint (up to ~5 min + // signal: wait for the next checkpoint (up to ~5 min // timeout) before exiting. That semantics is right for the // HMR path but wrong for a Stop-training click: the user // wants the run STOPPED, not left running in the background @@ -803,15 +803,15 @@ export function buildStudioApp(options: StudioServerOptions) { // line on the registry; the IIFE looks it up here). SIGKILL // alone would have left the cloud job orphaned until // TTL/reaper because the runner can't POST cancel itself - // when the kernel reaps it without warning. Together — - // server-side cancel POST + SIGKILL — give snappy local + // when the kernel reaps it without warning. Together, + // server-side cancel POST + SIGKILL give snappy local // teardown AND eventual cloud-side release. 
// // `ChildProcess.kill()` can throw (ESRCH if the process has // already exited between this handler's invocation and the // signal delivery). A throw here would surface as an unhandled // exception in the request pipeline and crash the server - // handler — swallow it; the close handler above has already + // handler. Swallow it; the close handler above has already // taken the entry out of the registry. try { child.kill("SIGKILL"); @@ -827,7 +827,7 @@ export function buildStudioApp(options: StudioServerOptions) { // misread a sibling tab's restart event as its own. // // Header is OMITTED entirely (rather than sent as an empty - // string) when `child.pid` isn't a number — that case happens + // string) when `child.pid` isn't a number; that case happens // when the OS hasn't assigned a pid by the time `spawn()` // returns and the child's async `error` event will fire shortly // (per-Node-docs `subprocess.pid` is `undefined` for @@ -846,7 +846,7 @@ export function buildStudioApp(options: StudioServerOptions) { return new Response(stream, { status: 200, headers }); }); - // `/api/dev/events` — SSE stream of HMR rebuild / error notifications. + // `/api/dev/events`: SSE stream of HMR rebuild / error notifications. // Only active when `arkor dev` passed an HMR coordinator. The CSRF model // accepts `?studioToken=` here (whitelisted in `eventStreamPathPattern`) // because `EventSource` cannot send headers. When HMR is not configured @@ -888,7 +888,7 @@ export function buildStudioApp(options: StudioServerOptions) { // before the first `ready` (e.g. the SPA fired Run Training // immediately after `arkor dev` booted, while the watcher's // initial BUNDLE_END was still in flight) would otherwise - // never get SIGUSR2/SIGTERM-routed when that build lands — + // never get SIGUSR2/SIGTERM-routed when that build lands, // leaving it stuck on a stale or empty artifact until the // next edit triggers a `rebuild`. 
Filtering by "not error" // is forward-compatible with any new successful event types. @@ -902,9 +902,9 @@ export function buildStudioApp(options: StudioServerOptions) { // Content-hash for the pre-ready-spawn equality gate (the // timestamp `event.hash` would over-trigger SIGTERM-restart // on identical-bytes rebuilds). Both sides of the - // comparison — `entry.spawnArtifactContentHash` (captured - // via `getCurrentArtifactContentHash()`) and this - // `event.contentHash` — are derived the same way, so a + // comparison (`entry.spawnArtifactContentHash` captured + // via `getCurrentArtifactContentHash()`, and this + // `event.contentHash`) are derived the same way, so a // match means the child's loaded bytes ARE what the new // configHash describes. const nextArtifactContentHash = event.contentHash ?? null; @@ -925,7 +925,7 @@ export function buildStudioApp(options: StudioServerOptions) { try { fn(augmented); } catch { - // listener controller closed mid-write — the cancel hook + // listener controller closed mid-write; the cancel hook // below takes care of removing it from the set. } } @@ -978,7 +978,7 @@ export function buildStudioApp(options: StudioServerOptions) { state = await ensureProjectState({ cwd: trainCwd, client, credentials }); } catch (err) { // Propagate cloud-api's status verbatim (e.g. 401 / 403 / 5xx) so the - // SPA / clients can react appropriately — collapsing everything to 400 + // SPA / clients can react appropriately; collapsing everything to 400 // would mis-report upstream outages and auth failures. Anything else // (local writeState failures, missing-credentials guard) is treated as // a server-side error. 
diff --git a/packages/arkor/src/studio/trainRegistry.test.ts b/packages/arkor/src/studio/trainRegistry.test.ts index 324797bb..1278f7f0 100644 --- a/packages/arkor/src/studio/trainRegistry.test.ts +++ b/packages/arkor/src/studio/trainRegistry.test.ts @@ -48,7 +48,7 @@ describe("TrainRegistry", () => { }); it("dispatchRebuild SIGTERMs everything when nextConfigHash is null", () => { - // null nextHash means "we couldn't inspect the new bundle" — be + // null nextHash means "we couldn't inspect the new bundle": be // conservative and SIGTERM every active child since we can't // prove their configs are unaffected. const reg = new TrainRegistry(); @@ -72,7 +72,7 @@ describe("TrainRegistry", () => { // and now: if `spawnArtifactContentHash === nextArtifactContentHash`, the // child read exactly the bytes the new hash describes → // backfill + skip dispatch (no spurious cancel+restart cycle). - // Otherwise — see the next test — SIGTERM-restart so cloud + // Otherwise (see the next test) SIGTERM-restart so cloud // and child stay aligned. const reg = new TrainRegistry(); const c = fakeChild(401); @@ -82,7 +82,7 @@ describe("TrainRegistry", () => { spawnArtifactContentHash: "art-v1", }); const result = reg.dispatchRebuild("first-real-hash", "art-v1"); - // Neither bucket — no signal sent, nothing for the SPA to react to. + // Neither bucket: no signal sent, nothing for the SPA to react to. expect(result.hotSwapTargets).toEqual([]); expect(result.restartTargets).toEqual([]); expect(c.kill).not.toHaveBeenCalled(); @@ -110,7 +110,7 @@ describe("TrainRegistry", () => { // watcher's first BUNDLE_END means the bytes the child loaded // differ from what the new `configHash` describes. 
Backfilling // unconditionally would silently teach the registry to use the - // post-edit hash as the child's baseline — later same-hash + // post-edit hash as the child's baseline; later same-hash // rebuilds would then hot-swap callbacks into a child whose // cloud-side `JobConfig` was actually spawned against an older // version, leaving the cloud run on a stale config. The artefact @@ -180,7 +180,7 @@ describe("TrainRegistry", () => { expect(reg.isEarlyStopRequested(undefined)).toBe(false); expect(reg.isEarlyStopRequested(99999)).toBe(false); // Once the child unregisters (close handler) the flag effectively - // resets — subsequent queries return false rather than retaining + // resets: subsequent queries return false rather than retaining // stale state. reg.unregister(901); expect(reg.isEarlyStopRequested(901)).toBe(false); @@ -215,7 +215,7 @@ describe("TrainRegistry", () => { // Regression: previously the implementation always pushed onto // `targets` even when `kill()` threw, so a child that had already // exited would still be reported back to the SPA as a restart - // target — the SPA would then wait forever for the (already- + // target: the SPA would then wait forever for the (already- // delivered) `exit=...` line and never re-spawn. const reg = new TrainRegistry(); const dead = fakeChild(601); @@ -237,7 +237,7 @@ describe("TrainRegistry", () => { // Regression: `safeKill` previously treated any thrown error as // `"unsupported"`, which on the hash-match branch triggers a // SIGTERM fallback (intended for Windows + SIGUSR2 unsupported). - // POSIX `kill(2)` raises `ESRCH` for an already-exited child — + // POSIX `kill(2)` raises `ESRCH` for an already-exited child: // classifying that as "unsupported" caused a needless SIGTERM // attempt against a dead PID. 
Now ESRCH routes through the // "gone" branch (no fallback, no restart-target push) so the @@ -258,7 +258,7 @@ describe("TrainRegistry", () => { // as gone, NOT routed into the SIGTERM fallback path). expect(result.hotSwapTargets).toEqual([]); expect(result.restartTargets).toEqual([]); - // Single SIGUSR2 attempt — no SIGTERM fallback was issued. + // Single SIGUSR2 attempt: no SIGTERM fallback was issued. expect(goneOnSigusr2.kill).toHaveBeenCalledTimes(1); expect(goneOnSigusr2.kill).toHaveBeenCalledWith("SIGUSR2"); }); @@ -267,7 +267,7 @@ describe("TrainRegistry", () => { // Regression: `ChildProcess.kill()` returns `false` (without // throwing) when the target process is already gone. The previous // implementation treated any non-throw as success and reported the - // child as a restart target — the SPA would then wait forever for + // child as a restart target; the SPA would then wait forever for // an exit line that already arrived. const reg = new TrainRegistry(); const gone = fakeChild(701); @@ -278,7 +278,7 @@ describe("TrainRegistry", () => { }); const result = reg.dispatchRebuild("fresh"); expect(result.restartTargets).toEqual([]); - // We still attempted the kill — only the bookkeeping is skipped. + // We still attempted the kill; only the bookkeeping is skipped. expect(gone.kill).toHaveBeenCalledWith("SIGTERM"); }); @@ -353,7 +353,7 @@ describe("TrainRegistry", () => { trainFile: "/tmp/win.ts", }); const result = reg.dispatchRebuild("match"); - // Restart bucket only — hot-swap is unsafe on win32 even + // Restart bucket only: hot-swap is unsafe on win32 even // when kill() reported "ok". expect(result.hotSwapTargets).toEqual([]); expect(result.restartTargets).toEqual([ @@ -378,7 +378,7 @@ describe("TrainRegistry", () => { // implementation silently swallowed that throw, so on Windows a // hash-match rebuild produced neither hot-swap nor restart and // callback edits never landed. 
Now we degrade to a SIGTERM-driven - // restart so the new code does take effect — at the cost of a + // restart so the new code does take effect, at the cost of a // brief gap rather than an in-place swap. const reg = new TrainRegistry(); const a = fakeChild(901); diff --git a/packages/arkor/src/studio/trainRegistry.ts b/packages/arkor/src/studio/trainRegistry.ts index baad7386..9286e98b 100644 --- a/packages/arkor/src/studio/trainRegistry.ts +++ b/packages/arkor/src/studio/trainRegistry.ts @@ -6,10 +6,10 @@ import type { ChildProcess } from "node:child_process"; * rebuilds can decide, per child, between: * * - **SIGUSR2** (callback hot-swap) when the new bundle's `configHash` - * matches the one captured at spawn time — the cloud-side run is + * matches the one captured at spawn time: the cloud-side run is * unaffected, only in-process callbacks need to update. * - **SIGTERM** (graceful early-stop + restart) when the configs - * diverge — the runner's internal early-stop entry point lets the + * diverge: the runner's internal early-stop entry point lets the * next checkpoint finish, the subprocess exits, and the SPA * re-spawns with the rebuilt artefact. */ @@ -17,18 +17,18 @@ export interface ActiveTrain { child: ChildProcess; trainFile?: string; /** Cloud-side config hash captured at spawn time (may be null if the - * manifest wasn't inspectable yet — e.g. spawn raced an in-flight + * manifest wasn't inspectable yet, e.g. spawn raced an in-flight * build). A null entry forces SIGTERM on the next rebuild because we * can't prove the configs match. */ configHash: string | null; /** - * Content hash (sha256, truncated — see `studio/hmr.ts`'s + * Content hash (sha256, truncated; see `studio/hmr.ts`'s * `contentHashOrNull`) of the on-disk `.arkor/build/index.mjs` * at spawn time. 
Used **only** to gate the pre-ready-spawn * backfill: if a rebuild eventually fires while `configHash` is * still null and this content hash equals the rebuild's * `event.contentHash`, the child is provably reading the same - * bundle bytes the new hash describes — safe to backfill + * bundle bytes the new hash describes: safe to backfill * `configHash` and skip dispatch. A mismatch (or null here) * means the on-disk artefact has changed between spawn and * rebuild (user edited mid-spawn, fresh project never built, …) @@ -38,7 +38,7 @@ export interface ActiveTrain { * * Content-hash (vs the timestamp `mtime+ctime+size` shape used * by `event.hash` for SSE dedup) avoids a false-positive - * mismatch when a watcher rebuild produces identical bytes — + * mismatch when a watcher rebuild produces identical bytes: * timestamps still bump, but content is the same and we * shouldn't force a spurious cancel+restart cycle. Null when * HMR isn't enabled or read failed. @@ -47,7 +47,7 @@ export interface ActiveTrain { /** * `true` once we've already SIGTERM'd this child for an HMR-driven * early-stop. Subsequent rebuilds (which can land before the child - * has reached its next checkpoint) must NOT re-send SIGTERM — + * has reached its next checkpoint) must NOT re-send SIGTERM: * the runner's shutdown handler treats a second SIGTERM as the * emergency `process.exit(143)` escape hatch, which would defeat * the whole point of preserving the in-flight checkpoint. Kept @@ -61,7 +61,7 @@ export interface ActiveTrain { * before that or for runs whose stdout we never saw the line on * (early spawn failure, custom user bins, etc.). The * `/api/train` cancel handler reads this to fire a fire-and-forget - * `POST /v1/jobs/:id/cancel` before SIGKILLing the subprocess — + * `POST /v1/jobs/:id/cancel` before SIGKILLing the subprocess. 
* SIGKILL bypasses the runner's `installShutdownHandlers`, so * without this server-side cancel the cloud-side job would live * until the cloud reaper / TTL fires (continued GPU spend). @@ -108,9 +108,9 @@ export interface DispatchResult { * - `"gone"`: process was already exited. Surfaces both as `kill` * returning `false` (Node's mapped form) and as a thrown `ESRCH` * (a race where the child exits between the `entries` lookup and - * the `kill` call — POSIX `kill(2)` raises `ESRCH` for + * the `kill` call: POSIX `kill(2)` raises `ESRCH` for * non-existent PIDs and Node propagates it on some versions). - * - `"unsupported"`: any *other* `kill` throw — i.e. the signal + * - `"unsupported"`: any *other* `kill` throw, i.e. the signal * couldn't be delivered for a reason that isn't "process is gone". * The motivating case is the platform not supporting this signal * kind (Windows + `SIGUSR2` → `ENOSYS`; bad signal name → @@ -118,8 +118,8 @@ export interface DispatchResult { * for. The bucket is intentionally a catch-all rather than a * whitelist of error codes: rare cases like `EPERM` (lost the * right to signal a re-parented child) and platform-specific - * surprises take the same conservative fallback — try the next - * signal, otherwise drop the entry — which is what callers want + * surprises take the same conservative fallback (try the next + * signal, otherwise drop the entry), which is what callers want * from "kill failed for some non-recoverable reason". */ type KillResult = "ok" | "gone" | "unsupported"; @@ -128,7 +128,7 @@ function safeKill(child: ChildProcess, signal: NodeJS.Signals): KillResult { try { return child.kill(signal) ? "ok" : "gone"; } catch (err) { - // `ESRCH` ("no such process") means the child already exited — + // `ESRCH` ("no such process") means the child already exited: // semantically identical to `kill returning false`. 
Mis-classifying // it as `"unsupported"` would route a hash-match hot-swap candidate // into the SIGTERM fallback, which then also no-ops (also gone) but @@ -165,8 +165,8 @@ export class TrainRegistry { // HMR-disabled server, a hand-rolled fake) can omit it. // Defaults to `null`, which forces the pre-ready-spawn // branch to fall through to SIGTERM-restart on the next - // non-null rebuild — the safe choice when we genuinely - // don't know what bytes the child loaded. Real `/api/train` + // non-null rebuild (the safe choice when we genuinely + // don't know what bytes the child loaded). Real `/api/train` // calls in HMR mode capture this from // `coordinator.getCurrentArtifactContentHash()`. spawnArtifactContentHash?: string | null; @@ -185,7 +185,7 @@ export class TrainRegistry { spawnArtifactContentHash: init.spawnArtifactContentHash ?? null, scope: init.scope ?? null, earlyStopRequested: false, - // `jobId` starts null — populated later by `recordJobId(pid, + // `jobId` starts null; populated later by `recordJobId(pid, // id)` when the server's stdout parser sees the runner's // `Started job ` line. Tests that don't exercise the // cancel-POST path can leave it null. @@ -215,7 +215,7 @@ export class TrainRegistry { /** * Read the recorded cloud-side job id for a pid. `/api/train`'s * cancel handler consults this to POST `/v1/jobs/:id/cancel` - * before SIGKILLing the local subprocess — without that POST, + * before SIGKILLing the local subprocess; without that POST, * a user-initiated stop would leave the cloud job running * until TTL (the SIGKILL bypasses the runner's `installShutdownHandlers` * so the runner can't issue cancel itself). Returns null when @@ -249,7 +249,7 @@ export class TrainRegistry { * SIGTERM to this child as part of an HMR cycle. 
Consulted by * `/api/train`'s ReadableStream `cancel()` handler so a client- * driven cancel (tab close, navigation, aborted fetch) doesn't - * pile a second SIGTERM on top of an in-progress early-stop — + * pile a second SIGTERM on top of an in-progress early-stop: * the runner's `installShutdownHandlers` interprets a second * SIGTERM as the emergency `exit(143)` fast-path, which bypasses * the checkpoint-preserving early-stop + `cancel()` flow and @@ -279,7 +279,7 @@ export class TrainRegistry { * * Combines what was previously `notifyCallbackReload` + * `requestEarlyStopOnMismatch` into one pass so the per-child - * decision is atomic — important because the hot-swap path can + * decision is atomic: important because the hot-swap path can * gracefully degrade into the restart path on platforms (Windows) * where SIGUSR2 isn't supported, which is hard to express across * two separate iterations of the registry. @@ -337,7 +337,7 @@ export class TrainRegistry { // config are guaranteed to align. Without this gate, an // edit landing between spawn and the first BUNDLE_END would // silently teach the registry to use the post-edit hash as - // the child's baseline — later same-hash rebuilds would + // the child's baseline; later same-hash rebuilds would // then hot-swap callbacks into a child whose cloud-side // `JobConfig` was *actually* spawned against an older // version, leaving the cloud run on a stale config. @@ -362,8 +362,8 @@ export class TrainRegistry { if (matches) { // On Windows, Node's `child.kill(signal)` for any unknown // POSIX signal (including SIGUSR2) is documented to - // **forcefully terminate** the process — same effect as - // SIGKILL — and `kill()` returns `true` like a successful + // **forcefully terminate** the process (same effect as + // SIGKILL), and `kill()` returns `true` like a successful // delivery. 
`safeKill` would then report `"ok"`, the entry // would land in `hotSwapTargets`, and the SPA would never // schedule a restart even though the child is *dead*. Skip @@ -384,7 +384,7 @@ export class TrainRegistry { continue; } // Cross-platform safety net: SIGUSR2 reported `"unsupported"` - // on a non-win32 platform (rare — `ENOSYS` from libuv signal + // on a non-win32 platform (rare: `ENOSYS` from libuv signal // wrap on exotic builds, future Node versions removing the // signal, etc.). Same fallback as the win32 skip above: // route to SIGTERM-restart so callback edits still take diff --git a/packages/cli-internal/src/templates.ts b/packages/cli-internal/src/templates.ts index c3e68f7f..aa9ce5f4 100644 --- a/packages/cli-internal/src/templates.ts +++ b/packages/cli-internal/src/templates.ts @@ -1,13 +1,13 @@ /** * Starter templates written out by `create-arkor` / `arkor init`. - * Single source of truth — both consumers bundle this module at build time. + * Single source of truth: both consumers bundle this module at build time. * * Layout written to disk: * * src/arkor/index.ts ← entry-point manifest (`createArkor({ trainer })`) * src/arkor/trainer.ts ← per-template trainer (`createTrainer({...})`) * - * `index.ts` is identical across templates — only the trainer body differs. + * `index.ts` is identical across templates; only the trainer body differs. */ export type TemplateId = "redaction" | "translate" | "triage"; @@ -76,7 +76,7 @@ export const trainer = createTrainer({ }); `; -// Order is significant — `templateChoices()` preserves insertion order so the +// Order is significant: `templateChoices()` preserves insertion order so the // CLI prompt lists demos first (sorted by estimated training time). // // Estimated training times assume A100 80GB on Runpod Serverless with the @@ -101,7 +101,7 @@ export const TEMPLATES: Record = { }; /** - * Body of `src/arkor/index.ts` — identical across templates. 
The `createArkor` + * Body of `src/arkor/index.ts`: identical across templates. The `createArkor` * factory is what `arkor build` / Studio discovers; per-role primitives * (`trainer`, future `deploy`, `eval`) live in sibling files and get gathered * here. @@ -113,7 +113,7 @@ export const arkor = createArkor({ trainer }); `; export const STARTER_CONFIG = `// Training defaults. Project routing (orgSlug / projectSlug) is tracked -// automatically in .arkor/state.json — do not put it here. +// automatically in .arkor/state.json; do not put it here. export default {}; `; @@ -125,7 +125,7 @@ An arkor training project scaffolded by \`create-arkor\`. The \`dev\` / \`build\` / \`start\` package scripts forward to the matching \`arkor\` subcommands, so the script form works across every package -manager (\`npm\` does not run package binaries via \`npm \` — use +manager (\`npm\` does not run package binaries via \`npm \`; use \`npm run