Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,27 @@

## Unreleased

- **fix(launcher): `POST /api/control/start` no longer reports success on a
hung backend; returns `202 Accepted` with `started_pending=true`.** PR #68
made `supervisor.start()` non-fatal on `/stats` timeout (proc left
running so the tray's next poll picks it up). The REST handler still
treated this as success and returned `{ok: true, pid}`, so external
automation hitting `/api/control/start` directly couldn't distinguish
ready from alive-but-not-ready. New `supervisor.last_start_pending`
flag flips on the timeout path; REST surface returns 202 with a
`started_pending: true` field and a hint to poll `/api/state` or
`GET /stats`. Same treatment on `/api/control/restart`. Closes #72.
- **fix(hardware): summary `WARNING` line when explicit-device probe
falls back to CPU.** The tray fires a balloon, but headless deployments
(server, supervisor-managed, agents) miss that signal. `_detect()` now
emits one `log.warning("Hardware fallback: requested=X active=cpu — ...")`
alongside the per-candidate probe failures so operators tailing logs
see the cause in line-of-sight. `auto`→cpu is unchanged (not noteworthy).
Closes #65 SF2 — SF1, SF3, SF4 were already addressed on master
(per-rewrite log.info, `cost_class` in `/health` + Prometheus info
metric + startup WARN, and the WAL-bloat section in
`docs/TROUBLESHOOTING.md` + `/admin/checkpoint` admin endpoint).

- **fix(api): `open_session()` now honors `HELIX_CONFIG` / `HELIX_GENOME_PATH`.**
Pre-fix, every cold-start CLI subcommand (query, diag corpus, packet,
gene, neighbors, refresh-targets) called `HelixConfig()` (defaults) and
Expand Down
8 changes: 8 additions & 0 deletions helix_context/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,14 @@ def _detect() -> HardwareInfo:
f"requested {requested!r} but probe/availability failed: "
f"{last_failure_reason or 'no usable device found'}"
)
# SF2 (#65): headless deployments miss the tray balloon. A summary
# WARNING here means operators tailing logs see one clear line
# connecting the per-candidate probe failures above to the
# final "we ended up on CPU" outcome.
log.warning(
"Hardware fallback: requested=%s active=cpu — %s",
requested, fallback_reason,
)
else:
fallback_reason = None

Expand Down
45 changes: 40 additions & 5 deletions helix_context/launcher/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
HelixSupervisor,
NotRunning,
ShutdownTimeout,
StartupTimeout,
SupervisorError,
)
from .update_check import UpdateChecker
Expand Down Expand Up @@ -141,11 +140,32 @@ async def api_state():
async def api_control_start():
try:
pid = supervisor.start()
return {"ok": True, "pid": pid}
except AlreadyRunning as exc:
return JSONResponse({"error": str(exc)}, status_code=409)
except (StartupTimeout, SupervisorError) as exc:
except SupervisorError as exc:
# NB: StartupTimeout is no longer raised by start() (PR #68); it is
# surfaced via ``supervisor.last_start_pending`` below. SupervisorError
# still covers port-collisions and non-timeout failures.
return JSONResponse({"error": str(exc)}, status_code=500)
# PR #68 made cold-start /stats timeout non-fatal — proc is left running
# for the tray's next poll. REST callers need to distinguish "ready"
# from "alive-but-not-ready" so an external automation doesn't treat
# a hung backend as success (closes #72).
if supervisor.last_start_pending:
return JSONResponse(
{
"ok": True,
"pid": pid,
"started_pending": True,
"message": (
"Backend pid was spawned but /stats did not answer "
"within the wait timeout. The process is still alive; "
"poll /api/state or GET /stats to confirm readiness."
),
},
status_code=202,
)
return {"ok": True, "pid": pid, "started_pending": False}

@app.post("/api/control/stop")
async def api_control_stop():
Expand All @@ -163,9 +183,24 @@ async def api_control_stop():
async def api_control_restart():
try:
pid = supervisor.restart(reason="manual restart from launcher UI")
return {"ok": True, "pid": pid}
except (StartupTimeout, ShutdownTimeout, SupervisorError) as exc:
except (ShutdownTimeout, SupervisorError) as exc:
return JSONResponse({"error": str(exc)}, status_code=500)
# Same alive-but-not-ready surface as /api/control/start (closes #72).
if supervisor.last_start_pending:
return JSONResponse(
{
"ok": True,
"pid": pid,
"started_pending": True,
"message": (
"Backend pid was respawned but /stats did not answer "
"within the wait timeout. The process is still alive; "
"poll /api/state or GET /stats to confirm readiness."
),
},
status_code=202,
)
return {"ok": True, "pid": pid, "started_pending": False}

return app

Expand Down
18 changes: 18 additions & 0 deletions helix_context/launcher/supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ def __init__(
self._last_error: Optional[str] = None
self._last_error_at: Optional[float] = None
self._last_error_operation: Optional[str] = None
# True iff the most recent start() returned a pid for a process that
# did not answer /stats within the wait timeout. Read by REST callers
# (closes #72) so a hung backend doesn't look like "ok pid=42".
self._last_start_pending: bool = False
self._owns_helix_process = False
# Tracked Popen handle — used for POSIX zombie reap via poll().
# Windows reaps through taskkill; POSIX needs a wait() sibling call.
Expand Down Expand Up @@ -188,6 +192,17 @@ def get_last_error(self) -> Optional[dict]:
"at": self._last_error_at,
}

@property
def last_start_pending(self) -> bool:
"""True iff the most recent ``start()`` returned a pid for a process
that did not answer ``/stats`` within the wait timeout. PR #68 made
cold-start /stats timeout non-fatal (the spawned proc is left running
for the tray's next poll); this flag lets REST callers detect the
"alive but not ready" state so they don't report success on a hung
backend (closes #72). Reset to False on the next start() success or
adoption path."""
return self._last_start_pending

def _record_error(self, operation: str, message: str) -> None:
self._last_error = message
self._last_error_at = time.time()
Expand Down Expand Up @@ -308,6 +323,7 @@ def start(self, wait_ready: bool = True, timeout: float = 90.0) -> int:
port=self.helix_port,
)
self._owns_helix_process = False
self._last_start_pending = False
self._clear_error()
return orphan_pid

Expand Down Expand Up @@ -362,9 +378,11 @@ def start(self, wait_ready: bool = True, timeout: float = 90.0) -> int:
timeout, proc.pid, exc,
)
self._record_error("start", str(exc))
self._last_start_pending = True
return proc.pid

log.info("Helix started (pid=%d)", proc.pid)
self._last_start_pending = False
self._clear_error()
return proc.pid

Expand Down
48 changes: 48 additions & 0 deletions tests/test_hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,54 @@ def test_explicit_cuda_falls_back_to_cpu_on_probe_failure(mock_torch, monkeypatc
assert "cuda" in info.fallback_reason.lower()


def test_fallback_emits_summary_warning_for_headless_operators(
mock_torch, monkeypatch, caplog
):
"""SF2 (#65): operators tailing logs on a headless deployment miss
the tray balloon. _detect() must emit a single WARNING line tying the
requested/active devices to the fallback_reason so the cause is in
line-of-sight, not just a downstream /health field."""
import logging
monkeypatch.setenv("HELIX_DEVICE", "cuda")
mock_torch["cuda_available"] = False
with caplog.at_level(logging.WARNING, logger="helix.hardware"):
info = hardware._detect()
assert info.device_type == "cpu"
assert info.fallback_reason is not None
summary_lines = [
rec.message
for rec in caplog.records
if "hardware fallback" in rec.message.lower()
]
assert summary_lines, (
"expected a 'Hardware fallback: ...' summary WARNING; got: "
f"{[r.message for r in caplog.records]}"
)
msg = summary_lines[0].lower()
assert "requested=cuda" in msg
assert "active=cpu" in msg


def test_auto_picker_does_not_emit_fallback_warning(mock_torch, caplog):
"""When the user requested 'auto' and we landed on CPU, that's a
normal outcome (no GPU on this box), not a fallback. The summary
WARNING from SF2 must not fire — auto-on-CPU is not noteworthy."""
import logging
# mock_torch fixture defaults all backends to unavailable -> auto -> cpu
with caplog.at_level(logging.WARNING, logger="helix.hardware"):
info = hardware._detect()
assert info.device_type == "cpu"
assert info.fallback_reason is None
summary_lines = [
rec.message
for rec in caplog.records
if "hardware fallback" in rec.message.lower()
]
assert not summary_lines, (
f"auto-on-cpu should not fire the fallback WARNING; got: {summary_lines}"
)


def test_helix_device_env_var_case_insensitive(mock_torch, monkeypatch):
monkeypatch.setenv("HELIX_DEVICE", "CPU")
info = hardware._detect()
Expand Down
45 changes: 40 additions & 5 deletions tests/test_launcher_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
AlreadyRunning,
NotRunning,
ShutdownTimeout,
StartupTimeout,
SupervisorError,
)


Expand Down Expand Up @@ -187,17 +187,37 @@ def test_panels_partial_renders_disconnected_agents(self, client, fake_collector
class TestControlStart:
def test_start_success(self, client, fake_supervisor):
fake_supervisor.start.return_value = 99999
fake_supervisor.last_start_pending = False
resp = client.post("/api/control/start")
assert resp.status_code == 200
assert resp.json() == {"ok": True, "pid": 99999}
assert resp.json() == {"ok": True, "pid": 99999, "started_pending": False}

def test_start_already_running_returns_409(self, client, fake_supervisor):
fake_supervisor.start.side_effect = AlreadyRunning("already")
resp = client.post("/api/control/start")
assert resp.status_code == 409

def test_start_timeout_returns_500(self, client, fake_supervisor):
fake_supervisor.start.side_effect = StartupTimeout("did not start")
def test_start_pending_returns_202_with_started_pending_field(
self, client, fake_supervisor
):
"""Regression for #72: PR #68 made supervisor.start() return pid
on /stats timeout (proc left running for tray's next poll). REST
callers must see a distinct alive-but-not-ready signal instead of
the success-shaped 200."""
fake_supervisor.start.return_value = 99999
fake_supervisor.last_start_pending = True
resp = client.post("/api/control/start")
assert resp.status_code == 202
body = resp.json()
assert body["ok"] is True
assert body["pid"] == 99999
assert body["started_pending"] is True
assert "did not answer" in body["message"]

def test_start_supervisor_error_returns_500(self, client, fake_supervisor):
"""Non-timeout supervisor failures (port collision, psutil missing,
etc) still surface as 500 — only /stats timeouts are 202."""
fake_supervisor.start.side_effect = SupervisorError("port collision")
resp = client.post("/api/control/start")
assert resp.status_code == 500

Expand All @@ -223,9 +243,24 @@ def test_stop_shutdown_timeout_returns_408(self, client, fake_supervisor):
class TestControlRestart:
def test_restart_success(self, client, fake_supervisor):
fake_supervisor.restart.return_value = 88888
fake_supervisor.last_start_pending = False
resp = client.post("/api/control/restart")
assert resp.status_code == 200
assert resp.json() == {"ok": True, "pid": 88888}
assert resp.json() == {"ok": True, "pid": 88888, "started_pending": False}

def test_restart_pending_returns_202_with_started_pending_field(
self, client, fake_supervisor
):
"""Same alive-but-not-ready surface as /api/control/start (#72)."""
fake_supervisor.restart.return_value = 88888
fake_supervisor.last_start_pending = True
resp = client.post("/api/control/restart")
assert resp.status_code == 202
body = resp.json()
assert body["ok"] is True
assert body["pid"] == 88888
assert body["started_pending"] is True
assert "did not answer" in body["message"]


class TestNativeFailFast:
Expand Down
5 changes: 5 additions & 0 deletions tests/test_launcher_supervisor.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ def test_start_spawns_subprocess_and_writes_state(self, supervisor, store):
assert store.state.helix_pid == 54321
assert store.state.helix_port == 11999
assert popen_mock.called
# Success path: pending flag is cleared (closes #72).
assert supervisor.last_start_pending is False

# Verify CREATE_NO_WINDOW was passed on Windows, 0 elsewhere
popen_kwargs = popen_mock.call_args.kwargs
Expand Down Expand Up @@ -153,6 +155,9 @@ def test_start_keeps_process_running_on_startup_timeout(self, supervisor, store)

# State remains set; the tray will probe /stats on the next refresh.
assert store.state.helix_pid == 54321
# Pending flag is set so REST callers can distinguish ready vs
# alive-but-not-ready (closes #72).
assert supervisor.last_start_pending is True


class TestStop:
Expand Down
Loading