From 768ff040557eeabee1464bbeca7e92057eba9e04 Mon Sep 17 00:00:00 2001
From: Hermes Bot <hermes@ordo-ai-stack.local>
Date: Thu, 2 Jul 2026 10:47:03 -0400
Subject: [PATCH] ops-controller: first-class guardian GPU lease
 (/guardian/hold + /guardian/release)

The reactive ComfyUI guardian can't reliably free the 32GB card for a VibeVoice-Large
render (~30GB): its 30s graceful stop races the model load and its 20s drain resumes
llamacpp mid-render, so a multi-line render flaps and OOMs.

Add an explicit GPU lease:
- POST /guardian/hold  -> set a 'held' flag so the guardian loop keeps the target
  (llamacpp) stopped and never resumes it, stop the target, then wait (up to a 90s
  cap) until VRAM is actually free; the response's 'ready' flag reports the outcome.
- POST /guardian/release -> clear the hold and restart the target.
- guardian loop honours 'held' (keeps target down, skips queue logic) and resets a
  stuck 'held' state back to idle once released.

Callers (the dialogue-reel render) bracket the render with hold/release for a
deterministic free card: 0 OOMs, no flapping. Additive; normal guardian behaviour
and existing endpoints unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 ops-controller/main.py | 83 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/ops-controller/main.py b/ops-controller/main.py
index 512afd8..0241c68 100644
--- a/ops-controller/main.py
+++ b/ops-controller/main.py
@@ -394,6 +394,15 @@ def _live_gpus() -> dict:
         return {}
 
 
+def _compute_gpu_free_gb() -> float:
+    """Free VRAM (GB) on the GPU with the most total memory; 0.0 when no GPU is reported."""
+    cards = _live_gpus()
+    if cards:
+        largest = max(cards.values(), key=lambda card: card.get("total_gb", 0.0))
+        return round(largest.get("total_gb", 0.0) - largest.get("used_gb", 0.0), 1)
+    return 0.0
+
+
 # Model download (ComfyUI files)
 COMFYUI_MODELS_DIR = Path(os.environ.get("COMFYUI_MODELS_DIR", "/models/comfyui"))
 # Same layout as docker-compose: ${BASE_PATH}/data/comfyui-storage → comfyui /root
@@ -489,6 +498,8 @@ def _live_gpus() -> dict:
     "last_transition": None,
     "last_error": "",
     "paused_by_us": False,
+    "held": False,          # external GPU lease — see /guardian/hold + /guardian/release
+    "held_reason": "",
 }
 
 
@@ -2079,6 +2090,29 @@ def _guardian_loop() -> None:
 
     while True:
         try:
+            with _guardian_lock:
+                held = _guardian_status["held"]
+                held_state = _guardian_status["state"]
+            if held:
+                # External GPU lease (/guardian/hold): keep the target STOPPED and
+                # never resume it until /guardian/release. Ignore queue state while
+                # held, so a VRAM-heavy ComfyUI job owns the whole card.
+                for c in _containers_for_service(target):
+                    if c.status == "running":
+                        try:
+                            c.stop(timeout=30)
+                        except Exception:
+                            pass
+                with _guardian_lock:
+                    _guardian_status["state"] = "held"
+                    _guardian_status["paused_by_us"] = False
+                time.sleep(COMFYUI_QUEUE_POLL_SECONDS)
+                continue
+            if held_state == "held":
+                # Lease was just released (held flipped False while state lagged at
+                # "held"). Reset to idle so the normal pause/resume machine resumes.
+                _guardian_transition("idle")
+
             depth = _comfyui_queue_depth()
             if depth is None:
                 with _guardian_lock:
@@ -2191,6 +2225,55 @@ async def guardian_status(_: None = Depends(verify_token)):
         return dict(_guardian_status)
 
 
+@app.post("/guardian/hold")
+def guardian_hold(body: ConfirmBody, request: Request, _: None = Depends(verify_token)):
+    """Acquire a GPU lease: stop the guardian target (llamacpp) and HOLD it down until
+    /guardian/release, so a VRAM-heavy ComfyUI job owns the whole card. Blocks until the
+    largest GPU reports >= ?min_free_gb (default 20) GB free or 90s elapse; "ready" in the
+    response tells which. ?reason (default "gpu-lease") is kept as held_reason. Auth required."""
+    target = COMFYUI_GUARDIAN_TARGET
+    if body.dry_run:
+        return {"would": "hold", "target": target}
+    min_free_gb = float(request.query_params.get("min_free_gb", "20"))
+    with _guardian_lock:  # flag first, so the guardian loop will not resume the target
+        _guardian_status["held"] = True
+        _guardian_status["held_reason"] = request.query_params.get("reason", "gpu-lease")
+    for c in _containers_for_service(target):
+        try:
+            c.stop(timeout=30)
+        except Exception:  # best-effort: the guardian loop retries the stop while held
+            pass
+    deadline = time.time() + 90
+    free = _compute_gpu_free_gb()
+    while time.time() < deadline and free < min_free_gb:
+        time.sleep(2)  # blocking wait: OK in a plain `def` handler (FastAPI runs it in a threadpool)
+        free = _compute_gpu_free_gb()
+    ready = free >= min_free_gb
+    _audit("guardian_hold", target, "ok" if ready else "error",
+           f"free_gb={free} min={min_free_gb}", correlation_id=_correlation_id(request))
+    return {"ok": True, "held": True, "target": target, "free_gb": free, "ready": ready}
+
+
+@app.post("/guardian/release")
+async def guardian_release(body: ConfirmBody, request: Request, _: None = Depends(verify_token)):
+    """Release the GPU lease: clear the hold, best-effort restart the target. Auth required."""
+    target = COMFYUI_GUARDIAN_TARGET
+    if body.dry_run:
+        return {"would": "release", "target": target}
+    with _guardian_lock:
+        _guardian_status.update(held=False, held_reason="")
+    errs = []  # restart failures; reported in the audit entry instead of being dropped
+    for c in _containers_for_service(target):
+        try:
+            c.start()
+        except Exception as exc:  # keep going so one bad container doesn't block the rest
+            errs.append(f"{type(exc).__name__}: {exc}")
+    _guardian_transition("idle")
+    _audit("guardian_release", target, "error" if errs else "ok", "; ".join(errs),
+           correlation_id=_correlation_id(request))
+    return {"ok": True, "held": False, "target": target}
+
+
 # Start the guardian thread at module import. Doing it here instead of via
 # @app.on_event("startup") (deprecated in recent FastAPI) guarantees the thread
 # spawns regardless of the app lifecycle and surfaces errors immediately.