diff --git a/CHANGELOG.md b/CHANGELOG.md index 54bf686..34175fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,27 @@ ## Unreleased +- **fix(launcher): `POST /api/control/start` no longer reports success on a + hung backend; returns `202 Accepted` with `started_pending=true`.** PR #68 + made `supervisor.start()` non-fatal on `/stats` timeout (proc left + running so the tray's next poll picks it up). The REST handler still + treated this as success and returned `{ok: true, pid}`, so external + automation hitting `/api/control/start` directly couldn't distinguish + ready from alive-but-not-ready. New `supervisor.last_start_pending` + flag flips on the timeout path; REST surface returns 202 with a + `started_pending: true` field and a hint to poll `/api/state` or + `GET /stats`. Same treatment on `/api/control/restart`. Closes #72. +- **fix(hardware): summary `WARNING` line when explicit-device probe + falls back to CPU.** The tray fires a balloon, but headless deployments + (server, supervisor-managed, agents) miss that signal. `_detect()` now + emits one `log.warning("Hardware fallback: requested=X active=cpu — ...")` + alongside the per-candidate probe failures so operators tailing logs + see the cause in line-of-sight. `auto`→cpu is unchanged (not noteworthy). + Closes #65 SF2 — SF1, SF3, SF4 were already addressed on master + (per-rewrite log.info, `cost_class` in `/health` + Prometheus info + metric + startup WARN, and the WAL-bloat section in + `docs/TROUBLESHOOTING.md` + `/admin/checkpoint` admin endpoint). + - **fix(api): `open_session()` now honors `HELIX_CONFIG` / `HELIX_GENOME_PATH`.** Pre-fix, every cold-start CLI subcommand (query, diag corpus, packet, gene, neighbors, refresh-targets) called `HelixConfig()` (defaults) and diff --git a/helix_context/hardware.py b/helix_context/hardware.py index 529df8a..6bded0e 100644 --- a/helix_context/hardware.py +++ b/helix_context/hardware.py @@ -300,6 +300,14 @@ def _detect() -> HardwareInfo: f"requested {requested!r} but probe/availability failed: " f"{last_failure_reason or 'no usable device found'}" ) + # SF2 (#65): headless deployments miss the tray balloon. A summary + # WARNING here means operators tailing logs see one clear line + # connecting the per-candidate probe failures above to the + # final "we ended up on CPU" outcome. + log.warning( + "Hardware fallback: requested=%s active=cpu — %s", + requested, fallback_reason, + ) else: fallback_reason = None diff --git a/helix_context/launcher/app.py b/helix_context/launcher/app.py index cdb3f44..c011aa8 100644 --- a/helix_context/launcher/app.py +++ b/helix_context/launcher/app.py @@ -29,7 +29,6 @@ HelixSupervisor, NotRunning, ShutdownTimeout, - StartupTimeout, SupervisorError, ) from .update_check import UpdateChecker @@ -141,11 +140,32 @@ async def api_state(): async def api_control_start(): try: pid = supervisor.start() - return {"ok": True, "pid": pid} except AlreadyRunning as exc: return JSONResponse({"error": str(exc)}, status_code=409) - except (StartupTimeout, SupervisorError) as exc: + except SupervisorError as exc: + # NB: StartupTimeout is no longer raised by start() (PR #68); it is + # surfaced via ``supervisor.last_start_pending`` below. SupervisorError + # still covers port-collisions and non-timeout failures. return JSONResponse({"error": str(exc)}, status_code=500) + # PR #68 made cold-start /stats timeout non-fatal — proc is left running + # for the tray's next poll. REST callers need to distinguish "ready" + # from "alive-but-not-ready" so an external automation doesn't treat + # a hung backend as success (closes #72). + if supervisor.last_start_pending: + return JSONResponse( + { + "ok": True, + "pid": pid, + "started_pending": True, + "message": ( + "Backend pid was spawned but /stats did not answer " + "within the wait timeout. The process is still alive; " + "poll /api/state or GET /stats to confirm readiness." + ), + }, + status_code=202, + ) + return {"ok": True, "pid": pid, "started_pending": False} @app.post("/api/control/stop") async def api_control_stop(): @@ -163,9 +183,24 @@ async def api_control_stop(): async def api_control_restart(): try: pid = supervisor.restart(reason="manual restart from launcher UI") - return {"ok": True, "pid": pid} - except (StartupTimeout, ShutdownTimeout, SupervisorError) as exc: + except (ShutdownTimeout, SupervisorError) as exc: return JSONResponse({"error": str(exc)}, status_code=500) + # Same alive-but-not-ready surface as /api/control/start (closes #72). + if supervisor.last_start_pending: + return JSONResponse( + { + "ok": True, + "pid": pid, + "started_pending": True, + "message": ( + "Backend pid was respawned but /stats did not answer " + "within the wait timeout. The process is still alive; " + "poll /api/state or GET /stats to confirm readiness." + ), + }, + status_code=202, + ) + return {"ok": True, "pid": pid, "started_pending": False} return app diff --git a/helix_context/launcher/supervisor.py b/helix_context/launcher/supervisor.py index 2884513..63141f1 100644 --- a/helix_context/launcher/supervisor.py +++ b/helix_context/launcher/supervisor.py @@ -91,6 +91,10 @@ def __init__( self._last_error: Optional[str] = None self._last_error_at: Optional[float] = None self._last_error_operation: Optional[str] = None + # True iff the most recent start() returned a pid for a process that + # did not answer /stats within the wait timeout. Read by REST callers + # (closes #72) so a hung backend doesn't look like "ok pid=42". + self._last_start_pending: bool = False self._owns_helix_process = False # Tracked Popen handle — used for POSIX zombie reap via poll(). # Windows reaps through taskkill; POSIX needs a wait() sibling call. @@ -188,6 +192,17 @@ def get_last_error(self) -> Optional[dict]: "at": self._last_error_at, } + @property + def last_start_pending(self) -> bool: + """True iff the most recent ``start()`` returned a pid for a process + that did not answer ``/stats`` within the wait timeout. PR #68 made + cold-start /stats timeout non-fatal (the spawned proc is left running + for the tray's next poll); this flag lets REST callers detect the + "alive but not ready" state so they don't report success on a hung + backend (closes #72). Reset to False on the next start() success or + adoption path.""" + return self._last_start_pending + def _record_error(self, operation: str, message: str) -> None: self._last_error = message self._last_error_at = time.time() @@ -308,6 +323,7 @@ def start(self, wait_ready: bool = True, timeout: float = 90.0) -> int: port=self.helix_port, ) self._owns_helix_process = False + self._last_start_pending = False self._clear_error() return orphan_pid @@ -362,9 +378,11 @@ def start(self, wait_ready: bool = True, timeout: float = 90.0) -> int: timeout, proc.pid, exc, ) self._record_error("start", str(exc)) + self._last_start_pending = True return proc.pid log.info("Helix started (pid=%d)", proc.pid) + self._last_start_pending = False self._clear_error() return proc.pid diff --git a/tests/test_hardware.py b/tests/test_hardware.py index 83426a2..da72f77 100644 --- a/tests/test_hardware.py +++ b/tests/test_hardware.py @@ -320,6 +320,54 @@ def test_explicit_cuda_falls_back_to_cpu_on_probe_failure(mock_torch, monkeypatc assert "cuda" in info.fallback_reason.lower() +def test_fallback_emits_summary_warning_for_headless_operators( + mock_torch, monkeypatch, caplog +): + """SF2 (#65): operators tailing logs on a headless deployment miss + the tray balloon. _detect() must emit a single WARNING line tying the + requested/active devices to the fallback_reason so the cause is in + line-of-sight, not just a downstream /health field.""" + import logging + monkeypatch.setenv("HELIX_DEVICE", "cuda") + mock_torch["cuda_available"] = False + with caplog.at_level(logging.WARNING, logger="helix.hardware"): + info = hardware._detect() + assert info.device_type == "cpu" + assert info.fallback_reason is not None + summary_lines = [ + rec.message + for rec in caplog.records + if "hardware fallback" in rec.message.lower() + ] + assert summary_lines, ( + "expected a 'Hardware fallback: ...' summary WARNING; got: " + f"{[r.message for r in caplog.records]}" + ) + msg = summary_lines[0].lower() + assert "requested=cuda" in msg + assert "active=cpu" in msg + + +def test_auto_picker_does_not_emit_fallback_warning(mock_torch, caplog): + """When the user requested 'auto' and we landed on CPU, that's a + normal outcome (no GPU on this box), not a fallback. The summary + WARNING from SF2 must not fire — auto-on-CPU is not noteworthy.""" + import logging + # mock_torch fixture defaults all backends to unavailable -> auto -> cpu + with caplog.at_level(logging.WARNING, logger="helix.hardware"): + info = hardware._detect() + assert info.device_type == "cpu" + assert info.fallback_reason is None + summary_lines = [ + rec.message + for rec in caplog.records + if "hardware fallback" in rec.message.lower() + ] + assert not summary_lines, ( + f"auto-on-cpu should not fire the fallback WARNING; got: {summary_lines}" + ) + + def test_helix_device_env_var_case_insensitive(mock_torch, monkeypatch): monkeypatch.setenv("HELIX_DEVICE", "CPU") info = hardware._detect() diff --git a/tests/test_launcher_app.py b/tests/test_launcher_app.py index 26397f6..e26256b 100644 --- a/tests/test_launcher_app.py +++ b/tests/test_launcher_app.py @@ -17,7 +17,7 @@ AlreadyRunning, NotRunning, ShutdownTimeout, - StartupTimeout, + SupervisorError, ) @@ -187,17 +187,37 @@ def test_panels_partial_renders_disconnected_agents(self, client, fake_collector class TestControlStart: def test_start_success(self, client, fake_supervisor): fake_supervisor.start.return_value = 99999 + fake_supervisor.last_start_pending = False resp = client.post("/api/control/start") assert resp.status_code == 200 - assert resp.json() == {"ok": True, "pid": 99999} + assert resp.json() == {"ok": True, "pid": 99999, "started_pending": False} def test_start_already_running_returns_409(self, client, fake_supervisor): fake_supervisor.start.side_effect = AlreadyRunning("already") resp = client.post("/api/control/start") assert resp.status_code == 409 - def test_start_timeout_returns_500(self, client, fake_supervisor): - fake_supervisor.start.side_effect = StartupTimeout("did not start") + def test_start_pending_returns_202_with_started_pending_field( + self, client, fake_supervisor + ): + """Regression for #72: PR #68 made supervisor.start() return pid + on /stats timeout (proc left running for tray's next poll). REST + callers must see a distinct alive-but-not-ready signal instead of + the success-shaped 200.""" + fake_supervisor.start.return_value = 99999 + fake_supervisor.last_start_pending = True + resp = client.post("/api/control/start") + assert resp.status_code == 202 + body = resp.json() + assert body["ok"] is True + assert body["pid"] == 99999 + assert body["started_pending"] is True + assert "did not answer" in body["message"] + + def test_start_supervisor_error_returns_500(self, client, fake_supervisor): + """Non-timeout supervisor failures (port collision, psutil missing, + etc) still surface as 500 — only /stats timeouts are 202.""" + fake_supervisor.start.side_effect = SupervisorError("port collision") resp = client.post("/api/control/start") assert resp.status_code == 500 @@ -223,9 +243,24 @@ def test_stop_shutdown_timeout_returns_408(self, client, fake_supervisor): class TestControlRestart: def test_restart_success(self, client, fake_supervisor): fake_supervisor.restart.return_value = 88888 + fake_supervisor.last_start_pending = False resp = client.post("/api/control/restart") assert resp.status_code == 200 - assert resp.json() == {"ok": True, "pid": 88888} + assert resp.json() == {"ok": True, "pid": 88888, "started_pending": False} + + def test_restart_pending_returns_202_with_started_pending_field( + self, client, fake_supervisor + ): + """Same alive-but-not-ready surface as /api/control/start (#72).""" + fake_supervisor.restart.return_value = 88888 + fake_supervisor.last_start_pending = True + resp = client.post("/api/control/restart") + assert resp.status_code == 202 + body = resp.json() + assert body["ok"] is True + assert body["pid"] == 88888 + assert body["started_pending"] is True + assert "did not answer" in body["message"] class TestNativeFailFast: diff --git a/tests/test_launcher_supervisor.py b/tests/test_launcher_supervisor.py index d95aea2..1d87e85 100644 --- a/tests/test_launcher_supervisor.py +++ b/tests/test_launcher_supervisor.py @@ -119,6 +119,8 @@ def test_start_spawns_subprocess_and_writes_state(self, supervisor, store): assert store.state.helix_pid == 54321 assert store.state.helix_port == 11999 assert popen_mock.called + # Success path: pending flag is cleared (closes #72). + assert supervisor.last_start_pending is False # Verify CREATE_NO_WINDOW was passed on Windows, 0 elsewhere popen_kwargs = popen_mock.call_args.kwargs @@ -153,6 +155,9 @@ def test_start_keeps_process_running_on_startup_timeout(self, supervisor, store) # State remains set; the tray will probe /stats on the next refresh. assert store.state.helix_pid == 54321 + # Pending flag is set so REST callers can distinguish ready vs + # alive-but-not-ready (closes #72). + assert supervisor.last_start_pending is True class TestStop: