From 768ff040557eeabee1464bbeca7e92057eba9e04 Mon Sep 17 00:00:00 2001
From: Hermes Bot
Date: Thu, 2 Jul 2026 10:47:03 -0400
Subject: [PATCH] ops-controller: first-class guardian GPU lease (/guardian/hold + /guardian/release)

The reactive ComfyUI guardian can't reliably free the 32GB card for a
VibeVoice-Large render (~30GB): its 30s graceful stop races the model load
and its 20s drain resumes llamacpp mid-render, so a multi-line render flaps
and OOMs.

Add an explicit GPU lease:
  - POST /guardian/hold -> stop the target (llamacpp), wait until VRAM is
    actually free, and set a 'held' flag so the guardian loop keeps it
    stopped and never resumes it.
  - POST /guardian/release -> clear the hold and restart the target.
  - guardian loop honours 'held' (keeps target down, skips queue logic) and
    resets a stuck 'held' state back to idle once released.

Callers (the dialogue-reel render) bracket the render with hold/release for a
deterministic free card: 0 OOMs, no flapping. Additive; normal guardian
behaviour and existing endpoints unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
Review notes (everything between the "---" above and the diffstat is ignored
by `git am`):

  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy of this patch. The indentation of the unchanged context lines is
    inferred, not recovered -- confirm it against ops-controller/main.py, or
    apply with `git apply --ignore-whitespace`. The blob ids on the `index`
    line are those of the original revision of this patch.
  * /guardian/hold and /guardian/release are now plain `def` handlers. Both
    block (container stop/start, up to 90s of polling); as `async def` they
    ran on the event loop and stalled every other request meanwhile.
  * /guardian/hold answers 400 for a non-numeric, negative or non-finite
    `min_free_gb` instead of failing with an unhandled ValueError, and does so
    before the hold flag is set.
  * /guardian/hold measures its 90s cap with time.monotonic().
  * Container stop/start failures are no longer dropped: hold appends them to
    the audit detail; release reports `ok: false` plus `errors` and audits
    "error".
  * Not addressed here: the guardian loop reads `held` and then stops the
    target without re-checking, so a release landing in between can leave the
    target stopped with state "idle". Serialising stop/start behind a
    dedicated lock would close that window.

 ops-controller/main.py | 114 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

diff --git a/ops-controller/main.py b/ops-controller/main.py
index 512afd8..0241c68 100644
--- a/ops-controller/main.py
+++ b/ops-controller/main.py
@@ -394,6 +394,15 @@ def _live_gpus() -> dict:
         return {}
 
 
+def _compute_gpu_free_gb() -> float:
+    """Free VRAM (GB) on the largest GPU — the compute card. 0.0 if unknown."""
+    gpus = _live_gpus()
+    if not gpus:
+        return 0.0
+    big = max(gpus.values(), key=lambda g: g.get("total_gb", 0.0))
+    return round(big.get("total_gb", 0.0) - big.get("used_gb", 0.0), 1)
+
+
 # Model download (ComfyUI files)
 COMFYUI_MODELS_DIR = Path(os.environ.get("COMFYUI_MODELS_DIR", "/models/comfyui"))
 # Same layout as docker-compose: ${BASE_PATH}/data/comfyui-storage → comfyui /root
@@ -489,6 +498,8 @@ def _live_gpus() -> dict:
     "last_transition": None,
     "last_error": "",
     "paused_by_us": False,
+    "held": False,  # external GPU lease — see /guardian/hold + /guardian/release
+    "held_reason": "",
 }
 
 
@@ -2079,6 +2090,29 @@ def _guardian_loop() -> None:
 
     while True:
         try:
+            with _guardian_lock:
+                held = _guardian_status["held"]
+                held_state = _guardian_status["state"]
+            if held:
+                # External GPU lease (/guardian/hold): keep the target STOPPED and
+                # never resume it until /guardian/release. Ignore queue state while
+                # held, so a VRAM-heavy ComfyUI job owns the whole card.
+                for c in _containers_for_service(target):
+                    if c.status == "running":
+                        try:
+                            c.stop(timeout=30)
+                        except Exception:
+                            pass
+                with _guardian_lock:
+                    _guardian_status["state"] = "held"
+                    _guardian_status["paused_by_us"] = False
+                time.sleep(COMFYUI_QUEUE_POLL_SECONDS)
+                continue
+            if held_state == "held":
+                # Lease was just released (held flipped False while state lagged at
+                # "held"). Reset to idle so the normal pause/resume machine resumes.
+                _guardian_transition("idle")
+
             depth = _comfyui_queue_depth()
             if depth is None:
                 with _guardian_lock:
@@ -2191,6 +2225,86 @@ async def guardian_status(_: None = Depends(verify_token)):
     return dict(_guardian_status)
 
 
+@app.post("/guardian/hold")
+def guardian_hold(body: ConfirmBody, request: Request, _: None = Depends(verify_token)):
+    """Acquire a GPU lease: stop the guardian target (llamacpp) and HOLD it down —
+    the guardian will not resume it — until /guardian/release, so a VRAM-heavy
+    ComfyUI job (e.g. the VibeVoice dialogue render) owns the whole card. Blocks
+    until VRAM has actually been released (or a 90s cap). Auth required.
+
+    Query params: ``min_free_gb`` (default 20) is the free-VRAM threshold that
+    counts as ready; ``reason`` (default "gpu-lease") is recorded in the status.
+    Returns ``ready: false`` if the cap expired first — the hold stays in place
+    either way, so callers must always pair this with /guardian/release.
+
+    Declared as a plain ``def`` on purpose: the body blocks (container stop plus
+    up to 90s of polling), so FastAPI must run it in its threadpool rather than
+    on the event loop, where it would stall every other request."""
+    from fastapi import HTTPException  # function-scope: not known to be imported at module level
+
+    target = COMFYUI_GUARDIAN_TARGET
+    if body.dry_run:
+        return {"would": "hold", "target": target}
+    try:
+        min_free_gb = float(request.query_params.get("min_free_gb", "20"))
+    except ValueError:
+        min_free_gb = float("nan")  # falls through to the range check below
+    # Rejects NaN (every comparison is False), negatives and inf before any
+    # state is touched, so a bad request can never leave a dangling hold.
+    if not 0 <= min_free_gb < float("inf"):
+        raise HTTPException(status_code=400, detail="min_free_gb must be a finite number >= 0")
+    with _guardian_lock:
+        _guardian_status["held"] = True
+        _guardian_status["held_reason"] = request.query_params.get("reason", "gpu-lease")
+    stop_errors = []
+    for c in _containers_for_service(target):
+        try:
+            c.stop(timeout=30)
+        except Exception as exc:
+            # Best effort: the guardian loop retries the stop while held; keep
+            # the failure for the audit trail instead of dropping it.
+            stop_errors.append(str(exc))
+    deadline = time.monotonic() + 90  # monotonic: immune to wall-clock jumps
+    free = _compute_gpu_free_gb()
+    while time.monotonic() < deadline and free < min_free_gb:
+        time.sleep(2)
+        free = _compute_gpu_free_gb()
+    ready = free >= min_free_gb
+    detail = f"free_gb={free} min={min_free_gb}"
+    if stop_errors:
+        detail += f" stop_errors={stop_errors}"
+    _audit("guardian_hold", target, "ok" if ready else "error",
+           detail, correlation_id=_correlation_id(request))
+    return {"ok": True, "held": True, "target": target, "free_gb": free, "ready": ready}
+
+
+@app.post("/guardian/release")
+def guardian_release(body: ConfirmBody, request: Request, _: None = Depends(verify_token)):
+    """Release the GPU lease: clear the hold and restart the target (llamacpp).
+    Auth required.
+
+    The hold is cleared even if the restart fails; in that case the response has
+    ``ok: false`` plus the ``errors`` list and the audit entry is recorded as
+    "error", so a target left down is visible to the caller. Plain ``def`` so
+    FastAPI runs the blocking container start in its threadpool."""
+    target = COMFYUI_GUARDIAN_TARGET
+    if body.dry_run:
+        return {"would": "release", "target": target}
+    with _guardian_lock:
+        _guardian_status["held"] = False
+        _guardian_status["held_reason"] = ""
+    errors = []
+    for c in _containers_for_service(target):
+        try:
+            c.start()
+        except Exception as exc:
+            errors.append(str(exc))
+    _guardian_transition("idle")
+    _audit("guardian_release", target, "error" if errors else "ok",
+           "; ".join(errors), correlation_id=_correlation_id(request))
+    return {"ok": not errors, "held": False, "target": target, "errors": errors}
+
+
 # Start the guardian thread at module import. Doing it here instead of via
 # @app.on_event("startup") (deprecated in recent FastAPI) guarantees the thread
 # spawns regardless of the app lifecycle and surfaces errors immediately.