mbachaud · mbachaud · May 12, 2026 · May 12, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,27 @@
 
 ## Unreleased
 
+- **fix(launcher): `POST /api/control/start` no longer reports success on a
+  hung backend; returns `202 Accepted` with `started_pending=true`.** PR #68
+  made `supervisor.start()` non-fatal on `/stats` timeout (proc left
+  running so the tray's next poll picks it up). The REST handler still
+  treated this as success and returned `{ok: true, pid}`, so external
+  automation hitting `/api/control/start` directly couldn't distinguish
+  ready from alive-but-not-ready. New `supervisor.last_start_pending`
+  flag flips on the timeout path; REST surface returns 202 with a
+  `started_pending: true` field and a hint to poll `/api/state` or
+  `GET /stats`. Same treatment on `/api/control/restart`. Closes #72.
+- **fix(hardware): summary `WARNING` line when explicit-device probe
+  falls back to CPU.** The tray fires a balloon, but headless deployments
+  (server, supervisor-managed, agents) miss that signal. `_detect()` now
+  emits one `log.warning("Hardware fallback: requested=X active=cpu — ...")`
+  alongside the per-candidate probe failures so operators tailing logs
+  see the cause in line-of-sight. `auto`→cpu is unchanged (not noteworthy).
+  Closes #65 SF2 — SF1, SF3, SF4 were already addressed on master
+  (per-rewrite log.info, `cost_class` in `/health` + Prometheus info
+  metric + startup WARN, and the WAL-bloat section in
+  `docs/TROUBLESHOOTING.md` + `/admin/checkpoint` admin endpoint).
+
 - **fix(api): `open_session()` now honors `HELIX_CONFIG` / `HELIX_GENOME_PATH`.**
   Pre-fix, every cold-start CLI subcommand (query, diag corpus, packet,
   gene, neighbors, refresh-targets) called `HelixConfig()` (defaults) and

diff --git a/helix_context/hardware.py b/helix_context/hardware.py
@@ -300,6 +300,14 @@ def _detect() -> HardwareInfo:
             f"requested {requested!r} but probe/availability failed: "
             f"{last_failure_reason or 'no usable device found'}"
         )
+        # SF2 (#65): headless deployments miss the tray balloon. A summary
+        # WARNING here means operators tailing logs see one clear line
+        # connecting the per-candidate probe failures above to the
+        # final "we ended up on CPU" outcome.
+        log.warning(
+            "Hardware fallback: requested=%s active=cpu — %s",
+            requested, fallback_reason,
+        )
     else:
         fallback_reason = None
 

diff --git a/helix_context/launcher/app.py b/helix_context/launcher/app.py
@@ -29,7 +29,6 @@
     HelixSupervisor,
     NotRunning,
     ShutdownTimeout,
-    StartupTimeout,
     SupervisorError,
 )
 from .update_check import UpdateChecker
@@ -141,11 +140,32 @@ async def api_state():
     async def api_control_start():
         try:
             pid = supervisor.start()
-            return {"ok": True, "pid": pid}
         except AlreadyRunning as exc:
             return JSONResponse({"error": str(exc)}, status_code=409)
-        except (StartupTimeout, SupervisorError) as exc:
+        except SupervisorError as exc:
+            # NB: StartupTimeout is no longer raised by start() (PR #68); it is
+            # surfaced via ``supervisor.last_start_pending`` below. SupervisorError
+            # still covers port-collisions and non-timeout failures.
             return JSONResponse({"error": str(exc)}, status_code=500)
+        # PR #68 made cold-start /stats timeout non-fatal — proc is left running
+        # for the tray's next poll. REST callers need to distinguish "ready"
+        # from "alive-but-not-ready" so an external automation doesn't treat
+        # a hung backend as success (closes #72).
+        if supervisor.last_start_pending:
+            return JSONResponse(
+                {
+                    "ok": True,
+                    "pid": pid,
+                    "started_pending": True,
+                    "message": (
+                        "Backend pid was spawned but /stats did not answer "
+                        "within the wait timeout. The process is still alive; "
+                        "poll /api/state or GET /stats to confirm readiness."
+                    ),
+                },
+                status_code=202,
+            )
+        return {"ok": True, "pid": pid, "started_pending": False}
 
     @app.post("/api/control/stop")
     async def api_control_stop():
@@ -163,9 +183,24 @@ async def api_control_stop():
     async def api_control_restart():
         try:
             pid = supervisor.restart(reason="manual restart from launcher UI")
-            return {"ok": True, "pid": pid}
-        except (StartupTimeout, ShutdownTimeout, SupervisorError) as exc:
+        except (ShutdownTimeout, SupervisorError) as exc:
             return JSONResponse({"error": str(exc)}, status_code=500)
+        # Same alive-but-not-ready surface as /api/control/start (closes #72).
+        if supervisor.last_start_pending:
+            return JSONResponse(
+                {
+                    "ok": True,
+                    "pid": pid,
+                    "started_pending": True,
+                    "message": (
+                        "Backend pid was respawned but /stats did not answer "
+                        "within the wait timeout. The process is still alive; "
+                        "poll /api/state or GET /stats to confirm readiness."
+                    ),
+                },
+                status_code=202,
+            )
+        return {"ok": True, "pid": pid, "started_pending": False}
 
     return app
 

diff --git a/helix_context/launcher/supervisor.py b/helix_context/launcher/supervisor.py
@@ -91,6 +91,10 @@ def __init__(
         self._last_error: Optional[str] = None
         self._last_error_at: Optional[float] = None
         self._last_error_operation: Optional[str] = None
+        # True iff the most recent start() returned a pid for a process that
+        # did not answer /stats within the wait timeout. Read by REST callers
+        # (closes #72) so a hung backend doesn't look like "ok pid=42".
+        self._last_start_pending: bool = False
         self._owns_helix_process = False
         # Tracked Popen handle — used for POSIX zombie reap via poll().
         # Windows reaps through taskkill; POSIX needs a wait() sibling call.
@@ -188,6 +192,17 @@ def get_last_error(self) -> Optional[dict]:
             "at": self._last_error_at,
         }
 
+    @property
+    def last_start_pending(self) -> bool:
+        """True iff the most recent ``start()`` returned a pid for a process
+        that did not answer ``/stats`` within the wait timeout. PR #68 made
+        cold-start /stats timeout non-fatal (the spawned proc is left running
+        for the tray's next poll); this flag lets REST callers detect the
+        "alive but not ready" state so they don't report success on a hung
+        backend (closes #72). Reset to False on the next start() success or
+        adoption path."""
+        return self._last_start_pending
+
     def _record_error(self, operation: str, message: str) -> None:
         self._last_error = message
         self._last_error_at = time.time()
@@ -308,6 +323,7 @@ def start(self, wait_ready: bool = True, timeout: float = 90.0) -> int:
                     port=self.helix_port,
                 )
                 self._owns_helix_process = False
+                self._last_start_pending = False
                 self._clear_error()
                 return orphan_pid
 
@@ -362,9 +378,11 @@ def start(self, wait_ready: bool = True, timeout: float = 90.0) -> int:
                     timeout, proc.pid, exc,
                 )
                 self._record_error("start", str(exc))
+                self._last_start_pending = True
                 return proc.pid
 
         log.info("Helix started (pid=%d)", proc.pid)
+        self._last_start_pending = False
         self._clear_error()
         return proc.pid
 

diff --git a/tests/test_hardware.py b/tests/test_hardware.py
@@ -320,6 +320,54 @@ def test_explicit_cuda_falls_back_to_cpu_on_probe_failure(mock_torch, monkeypatc
     assert "cuda" in info.fallback_reason.lower()
 
 
+def test_fallback_emits_summary_warning_for_headless_operators(
+    mock_torch, monkeypatch, caplog
+):
+    """SF2 (#65): operators tailing logs on a headless deployment miss
+    the tray balloon. _detect() must emit a single WARNING line tying the
+    requested/active devices to the fallback_reason so the cause is in
+    line-of-sight, not just a downstream /health field."""
+    import logging
+    monkeypatch.setenv("HELIX_DEVICE", "cuda")
+    mock_torch["cuda_available"] = False
+    with caplog.at_level(logging.WARNING, logger="helix.hardware"):
+        info = hardware._detect()
+    assert info.device_type == "cpu"
+    assert info.fallback_reason is not None
+    summary_lines = [
+        rec.message
+        for rec in caplog.records
+        if "hardware fallback" in rec.message.lower()
+    ]
+    assert summary_lines, (
+        "expected a 'Hardware fallback: ...' summary WARNING; got: "
+        f"{[r.message for r in caplog.records]}"
+    )
+    msg = summary_lines[0].lower()
+    assert "requested=cuda" in msg
+    assert "active=cpu" in msg
+
+
+def test_auto_picker_does_not_emit_fallback_warning(mock_torch, caplog):
+    """When the user requested 'auto' and we landed on CPU, that's a
+    normal outcome (no GPU on this box), not a fallback. The summary
+    WARNING from SF2 must not fire — auto-on-CPU is not noteworthy."""
+    import logging
+    # mock_torch fixture defaults all backends to unavailable -> auto -> cpu
+    with caplog.at_level(logging.WARNING, logger="helix.hardware"):
+        info = hardware._detect()
+    assert info.device_type == "cpu"
+    assert info.fallback_reason is None
+    summary_lines = [
+        rec.message
+        for rec in caplog.records
+        if "hardware fallback" in rec.message.lower()
+    ]
+    assert not summary_lines, (
+        f"auto-on-cpu should not fire the fallback WARNING; got: {summary_lines}"
+    )
+
+
 def test_helix_device_env_var_case_insensitive(mock_torch, monkeypatch):
     monkeypatch.setenv("HELIX_DEVICE", "CPU")
     info = hardware._detect()

diff --git a/tests/test_launcher_app.py b/tests/test_launcher_app.py
@@ -17,7 +17,7 @@
     AlreadyRunning,
     NotRunning,
     ShutdownTimeout,
-    StartupTimeout,
+    SupervisorError,
 )
 
 
@@ -187,17 +187,37 @@ def test_panels_partial_renders_disconnected_agents(self, client, fake_collector
 class TestControlStart:
     def test_start_success(self, client, fake_supervisor):
         fake_supervisor.start.return_value = 99999
+        fake_supervisor.last_start_pending = False
         resp = client.post("/api/control/start")
         assert resp.status_code == 200
-        assert resp.json() == {"ok": True, "pid": 99999}
+        assert resp.json() == {"ok": True, "pid": 99999, "started_pending": False}
 
     def test_start_already_running_returns_409(self, client, fake_supervisor):
         fake_supervisor.start.side_effect = AlreadyRunning("already")
         resp = client.post("/api/control/start")
         assert resp.status_code == 409
 
-    def test_start_timeout_returns_500(self, client, fake_supervisor):
-        fake_supervisor.start.side_effect = StartupTimeout("did not start")
+    def test_start_pending_returns_202_with_started_pending_field(
+        self, client, fake_supervisor
+    ):
+        """Regression for #72: PR #68 made supervisor.start() return pid
+        on /stats timeout (proc left running for tray's next poll). REST
+        callers must see a distinct alive-but-not-ready signal instead of
+        the success-shaped 200."""
+        fake_supervisor.start.return_value = 99999
+        fake_supervisor.last_start_pending = True
+        resp = client.post("/api/control/start")
+        assert resp.status_code == 202
+        body = resp.json()
+        assert body["ok"] is True
+        assert body["pid"] == 99999
+        assert body["started_pending"] is True
+        assert "did not answer" in body["message"]
+
+    def test_start_supervisor_error_returns_500(self, client, fake_supervisor):
+        """Non-timeout supervisor failures (port collision, psutil missing,
+        etc) still surface as 500 — only /stats timeouts are 202."""
+        fake_supervisor.start.side_effect = SupervisorError("port collision")
         resp = client.post("/api/control/start")
         assert resp.status_code == 500
 
@@ -223,9 +243,24 @@ def test_stop_shutdown_timeout_returns_408(self, client, fake_supervisor):
 class TestControlRestart:
     def test_restart_success(self, client, fake_supervisor):
         fake_supervisor.restart.return_value = 88888
+        fake_supervisor.last_start_pending = False
         resp = client.post("/api/control/restart")
         assert resp.status_code == 200
-        assert resp.json() == {"ok": True, "pid": 88888}
+        assert resp.json() == {"ok": True, "pid": 88888, "started_pending": False}
+
+    def test_restart_pending_returns_202_with_started_pending_field(
+        self, client, fake_supervisor
+    ):
+        """Same alive-but-not-ready surface as /api/control/start (#72)."""
+        fake_supervisor.restart.return_value = 88888
+        fake_supervisor.last_start_pending = True
+        resp = client.post("/api/control/restart")
+        assert resp.status_code == 202
+        body = resp.json()
+        assert body["ok"] is True
+        assert body["pid"] == 88888
+        assert body["started_pending"] is True
+        assert "did not answer" in body["message"]
 
 
 class TestNativeFailFast:

diff --git a/tests/test_launcher_supervisor.py b/tests/test_launcher_supervisor.py
@@ -119,6 +119,8 @@ def test_start_spawns_subprocess_and_writes_state(self, supervisor, store):
         assert store.state.helix_pid == 54321
         assert store.state.helix_port == 11999
         assert popen_mock.called
+        # Success path: pending flag is cleared (closes #72).
+        assert supervisor.last_start_pending is False
 
         # Verify CREATE_NO_WINDOW was passed on Windows, 0 elsewhere
         popen_kwargs = popen_mock.call_args.kwargs
@@ -153,6 +155,9 @@ def test_start_keeps_process_running_on_startup_timeout(self, supervisor, store)
 
         # State remains set; the tray will probe /stats on the next refresh.
         assert store.state.helix_pid == 54321
+        # Pending flag is set so REST callers can distinguish ready vs
+        # alive-but-not-ready (closes #72).
+        assert supervisor.last_start_pending is True
 
 
 class TestStop: