swelljoe · swelljoe · May 30, 2026 · May 30, 2026 · May 30, 2026
diff --git a/competitors.example.yaml b/competitors.example.yaml
@@ -195,34 +195,42 @@
   size_class: medium
   knowledge_cutoff: 2025-06
 
-# Self-hosted Qwen 3.6 27B (8-bit) via LM Studio's OpenAI-compatible server, on the
-# shared raw-api-loop harness — apples-to-apples with the hosted models above, and the
-# point of comparison for "can a model you can actually run yourself do this work?".
-# Tool-calling verified live 2026-05-30: the model emits clean OpenAI tool_calls and
-# reports usage (reasoning tokens folded into the completion, which usage_delta handles).
+# Self-hosted Qwen 3.6 27B (8-bit) on the shared raw-api-loop harness — apples-to-apples
+# with the hosted models above, and the point of comparison for "can a model you can
+# actually run yourself do this work?". Tool-calling verified live 2026-05-30 on both
+# backends: the model emits clean OpenAI tool_calls and reports usage (reasoning tokens
+# folded into the completion, which usage_delta handles).
+#
+# SERVER: now llama-server (llama.cpp) on the Strix Halo (model id
+# unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL), reachable at http://10.20.30.2:8080/v1. It moved
+# here from a desktop LM Studio (qwen/qwen3.6-27b @ http://192.168.122.1:1234/v1) because
+# the desktop's GPUs OOM-CRASHED LM Studio on the large freerdp planar.c case (HTTP 400
+# "model has crashed", which then took the server down and cascaded onto the next case).
+# The Strix Halo has more memory (slower, but survives the heavy cases). NOTE: the UD-Q8_K_XL
+# unsloth dynamic quant differs slightly from LM Studio's vanilla Q8 — a minor methodological
+# seam for the 2 cases (freerdp, Ghost) re-run here vs the other 7 done on LM Studio.
 #
 # NETWORKING: the harness runs in a podman container with its own netns, so the endpoint
-# must be reachable at a real routable IP — NOT a loopback ssh-tunnel on the VM (the
-# container can't see the VM's 127.0.0.1). Enable LM Studio > Developer > "Serve on Local
-# Network" (binds 0.0.0.0) on the physical host and point base_url at that host's LAN IP.
-# >>> REPLACE the placeholder IP below before a live run. <<<
+# must be a real routable IP the container can reach (verified: 10.20.30.2:8080 → HTTP 200
+# from inside a container) — NOT a loopback ssh-tunnel on the VM (the container can't see
+# the VM's 127.0.0.1). Bind the server on 0.0.0.0.
 #
 # COST: self-hosted has no per-token dollar price, so no pricing is declared — cost stays
 # null (electricity/amortization is out of scope) while tokens are still recorded, which is
 # exactly the comparison you want (free-to-run vs paid frontier on the same matrix).
 #
-# AUTH: LM Studio doesn't check the key, but raw-api-loop requires an auth_profile; export
-# a dummy `LMSTUDIO_API_KEY=lm-studio` on the host (see lmstudio-api-key in nelson/auth.py).
+# AUTH: llama-server/LM Studio don't check the key, but raw-api-loop requires an auth_profile;
+# export a dummy `LMSTUDIO_API_KEY=lm-studio` on the host (see lmstudio-api-key in nelson/auth.py).
 #
 # knowledge_cutoff 2025-06 matches the cohort and keeps every (May-2026) corpus case in
 # the gate while staying plausibly pre-disclosure. VERIFY-AT-WIRING: if Qwen 3.6's real
 # training cutoff is later than a case's disclosure, raise this so memorized cases gate out.
 - name: raw-api-loop/qwen3.6-27b
-  model: qwen/qwen3.6-27b          # must match LM Studio's loaded model id exactly
+  model: unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL  # must match the server's loaded model id exactly
   runtime: raw-api-loop
   tool_profile: read-grep
   auth_profile: lmstudio-api-key   # value (any non-empty) from $LMSTUDIO_API_KEY on the host
-  cost_model: '{"base_url": "http://192.168.122.1:1234/v1"}'  # libvirt host bridge gateway
+  cost_model: '{"base_url": "http://10.20.30.2:8080/v1", "http_timeout": 1800}'  # Strix Halo, llama-server (slow APU → longer per-call timeout)
   size_class: small
   knowledge_cutoff: 2025-06
 

diff --git a/nelson/raw_api_loop.py b/nelson/raw_api_loop.py
@@ -33,6 +33,7 @@
 import os
 import subprocess
 import sys
+import time
 import urllib.error
 import urllib.request
 from collections.abc import Callable
@@ -43,8 +44,40 @@
 # Per-API-call read timeout. Reasoning models (Gemini 3.x pro, MiMo) can think for
 # minutes on a single turn over a large C file; a tight cap aborts the whole run as
 # an infra_error and silently drops the slowest (often strongest) models from the
-# matrix. 600s lets a slow reasoner finish a turn rather than penalising it.
-HTTP_TIMEOUT = 600  # seconds per API call
+# matrix. 600s lets a slow reasoner finish a turn rather than penalising it. A slow
+# self-hosted box (e.g. a 27B on an APU, where one big-context turn can exceed 600s)
+# raises it via NELSON_HTTP_TIMEOUT (set from the competitor's cost_model http_timeout).
+HTTP_TIMEOUT = 600  # default seconds per API call
+
+
+def _http_timeout() -> int:
+    """Per-call read timeout: NELSON_HTTP_TIMEOUT if set and valid, else the default.
+
+    "Valid" means a positive integer number of seconds. Zero would switch the socket
+    to non-blocking mode and a negative value is rejected by the socket layer, so
+    either one would fail every API call outright; both fall back to HTTP_TIMEOUT,
+    as do empty and non-numeric values.
+    """
+    raw = os.environ.get("NELSON_HTTP_TIMEOUT")
+    if raw:
+        with contextlib.suppress(ValueError):
+            seconds = int(raw)
+            if seconds > 0:
+                return seconds
+    return HTTP_TIMEOUT
+
+
+# Transient faults that warrant a retry rather than failing the whole run: provider
+# rate limits (429 — Mistral caps tokens/minute, which an agentic burst of large-
+# context calls trips even on a paid tier) and the 5xx family, plus transport errors
+# (a self-hosted endpoint like LM Studio closing the socket mid-response on a heavy
+# request). We pace-and-recover with exponential backoff; a persistent outage still
+# surfaces after the cap so the runner records a real infra_error, never a false
+# "found nothing".
+HTTP_RETRY_STATUS = (429, 500, 502, 503, 504)
+MAX_HTTP_RETRIES = 5  # retries after the first attempt (6 tries total)
+BACKOFF_BASE_S = 2.0  # exponential: 1, 2, 4, 8, 16 s ...
+BACKOFF_CAP_S = 60.0  # never wait longer than this between tries
 
 SYSTEM_PROMPT = (
     "You are a tool-using security auditor. Use the provided tools to read the "
@@ -199,25 +224,81 @@ def dispatch_tool(name: str, args: dict[str, Any], src_root: str | None = None)
 # -- HTTP + cost -------------------------------------------------------------
 
 
-def _post_chat(url: str, payload: dict[str, Any], api_key: str) -> dict[str, Any]:
+def _retry_after_seconds(err: urllib.error.HTTPError) -> float | None:
+    """Parse a Retry-After header (delta-seconds form) from a 429/503, if present.
+
+    Returns the delay in seconds (negative values clamp to 0), or None when the
+    header is absent, uses the HTTP-date form, or is not a finite number, so the
+    caller falls back to its own backoff curve.
+    """
+    with contextlib.suppress(Exception):
+        raw = err.headers.get("Retry-After") if err.headers else None
+        if raw is not None:
+            seconds = float(raw.strip())
+            # Rejects inf and NaN too: every ordering comparison with NaN is false.
+            if seconds < float("inf"):
+                return max(0.0, seconds)
+    return None
+
+
+def _post_chat(
+    url: str,
+    payload: dict[str, Any],
+    api_key: str,
+    *,
+    max_retries: int = MAX_HTTP_RETRIES,
+    sleep: Callable[[float], None] = time.sleep,
+) -> dict[str, Any]:
     """POST a chat/completions request and return the parsed JSON body.
 
-    Raises urllib.error.HTTPError / URLError on HTTP or transport failure; main()
-    turns those into a non-zero exit + the provider error so the runner classifies
-    auth/rate/infra. Injectable in tests via ``run_loop(post=...)``.
+    Transient faults are retried with exponential backoff (honoring Retry-After, but
+    never waiting longer than BACKOFF_CAP_S): HTTP 429/5xx, URLError from the
+    connect/send phase, and the errors urllib raises unwrapped while the reply is
+    being read — a connection reset or the provider closing the socket mid-response
+    (ConnectionError, or http.client.HTTPException such as IncompleteRead). A read
+    timeout is not retried: with a long per-call timeout that could stall for hours.
+    This lets a rate-limited provider and a flaky self-hosted endpoint
+    pace-and-recover instead of failing the whole run on the first hiccup. After
+    ``max_retries`` the last error propagates so main() still exits non-zero with
+    the provider error — the runner then classifies a persistent failure as
+    auth/infra, never masking it as a model finding nothing.
+    Injectable in tests via ``run_loop(post=...)``; ``sleep`` is injectable too.
     """
+    import http.client  # function-scope: only the transport except clause needs it
+
     data = json.dumps(payload).encode("utf-8")
-
-    # operator-configured base_url, not model-controlled, so this S310 audit warning
-    # is acceptable here.
-    req = urllib.request.Request(url, data=data, method="POST")  # noqa: S310
-    req.add_header("Content-Type", "application/json")
-    if api_key:
-        req.add_header("Authorization", f"Bearer {api_key}")
-    with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT) as resp:  # noqa: S310
-        body = resp.read().decode("utf-8", errors="replace")
-    parsed = json.loads(body)
-    return parsed if isinstance(parsed, dict) else {}
+    for attempt in range(max_retries + 1):
+        # operator-configured base_url, not model-controlled, so this S310 audit
+        # warning is acceptable here. A fresh Request per attempt keeps retries clean.
+        req = urllib.request.Request(url, data=data, method="POST")  # noqa: S310
+        req.add_header("Content-Type", "application/json")
+        if api_key:
+            req.add_header("Authorization", f"Bearer {api_key}")
+        try:
+            with urllib.request.urlopen(req, timeout=_http_timeout()) as resp:  # noqa: S310
+                body = resp.read().decode("utf-8", errors="replace")
+        except urllib.error.HTTPError as e:
+            # Must precede the transport clause: HTTPError is a URLError subclass.
+            if e.code not in HTTP_RETRY_STATUS or attempt >= max_retries:
+                raise
+            delay = _retry_after_seconds(e)
+            if delay is None:
+                delay = BACKOFF_BASE_S**attempt
+        except (urllib.error.URLError, ConnectionError, http.client.HTTPException):
+            # No usable HTTP response. urllib wraps only connect/send failures in
+            # URLError; a reset or a socket closed while the status line or body is
+            # being read surfaces raw as ConnectionError / http.client.HTTPException.
+            if attempt >= max_retries:
+                raise
+            delay = BACKOFF_BASE_S**attempt
+        else:
+            parsed = json.loads(body)
+            return parsed if isinstance(parsed, dict) else {}
+        # One capped wait for every retry path, so a huge Retry-After cannot stall the
+        # run past BACKOFF_CAP_S per attempt.
+        sleep(min(BACKOFF_CAP_S, delay))
+    # Unreachable: the final attempt either returns or raises. Satisfy type checkers.
+    raise RuntimeError("retry loop exited without returning")  # pragma: no cover
 
 
 def usage_delta(usage: dict[str, Any]) -> tuple[int, int]:

diff --git a/nelson/runtimes.py b/nelson/runtimes.py
@@ -380,6 +380,10 @@ def build_spec(self, ctx: RuntimeContext) -> ContainerSpec:
             env["NELSON_INPUT_USD_PER_MTOK"] = str(cfg["input_usd_per_mtok"])
         if cfg.get("output_usd_per_mtok") is not None:
             env["NELSON_OUTPUT_USD_PER_MTOK"] = str(cfg["output_usd_per_mtok"])
+        if cfg.get("http_timeout") is not None:
+            # Slow self-hosted endpoints raise the per-API-call read timeout so a single
+            # big-context turn isn't cut off as an infra_error (see _http_timeout).
+            env["NELSON_HTTP_TIMEOUT"] = str(cfg["http_timeout"])
         mounts = [
             (str(_RAW_API_SCRIPT_HOST), _RAW_API_SCRIPT_CONTAINER, "ro"),
             (str(ctx.src_dir), "/src", "ro"),

diff --git a/tests/test_raw_api_loop.py b/tests/test_raw_api_loop.py
@@ -6,8 +6,15 @@
 """
 
 import json
+import urllib.error
+from email.message import Message
 
+import pytest
+
+import nelson.raw_api_loop as ral
 from nelson.raw_api_loop import (
+    MAX_HTTP_RETRIES,
+    _post_chat,
     _resolve_in_src,
     compute_cost,
     dispatch_tool,
@@ -195,3 +202,124 @@ def test_usage_delta_falls_back_to_completion_without_total():
         {"prompt_tokens": 100, "completion_tokens": 40, "total_tokens": 140}
     ) == (100, 40)
     assert usage_delta({}) == (0, 0)
+
+
+# -- _post_chat retry / backoff ----------------------------------------------
+#
+# Transient faults (provider 429 tokens/min, a self-hosted endpoint dropping the
+# socket mid-response) must be retried with backoff, not fail the whole run. The
+# real urlopen is monkeypatched; sleep is injected so no test actually waits.
+
+
+class _FakeResp:
+    """Minimal stand-in for urlopen's response: a context manager with read()."""
+    def __init__(self, body: str):
+        self._body = body.encode("utf-8")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_a):
+        return False
+
+    def read(self):
+        return self._body
+
+
+def _http_error(code: int, retry_after=None) -> urllib.error.HTTPError:
+    """Build an HTTPError with the given status and an optional Retry-After header."""
+    hdrs = Message()
+    if retry_after is not None:
+        hdrs["Retry-After"] = str(retry_after)
+    return urllib.error.HTTPError(
+        "http://x/v1/chat/completions", code, "err", hdrs, None
+    )
+
+
+def _scripted_urlopen(events):
+    """Return a fake urlopen yielding each event: raise Exceptions, return responses."""
+    it = iter(events)
+
+    def fake(_req, timeout=None):
+        ev = next(it)
+        if isinstance(ev, Exception):
+            raise ev
+        return ev
+
+    return fake
+
+
+def test_post_chat_retries_on_429_then_succeeds(monkeypatch):
+    slept = []
+    monkeypatch.setattr(
+        ral.urllib.request,
+        "urlopen",
+        _scripted_urlopen(
+            [_http_error(429), _http_error(429), _FakeResp('{"ok": true}')]
+        ),
+    )
+    out = _post_chat("http://x", {"m": 1}, "k", sleep=slept.append)
+    assert out == {"ok": True}
+    assert slept == [1.0, 2.0]  # 2.0**0 then 2.0**1: a growing curve, not a constant wait
+
+
+def test_post_chat_honors_retry_after_header(monkeypatch):
+    slept = []
+    monkeypatch.setattr(
+        ral.urllib.request,
+        "urlopen",
+        _scripted_urlopen([_http_error(429, retry_after=7), _FakeResp('{"ok": 1}')]),
+    )
+    _post_chat("http://x", {}, "k", sleep=slept.append)
+    assert slept == [7.0]  # honored the server's Retry-After, not the backoff curve
+
+
+def test_post_chat_retries_on_transport_error(monkeypatch):
+    slept = []
+    monkeypatch.setattr(
+        ral.urllib.request,
+        "urlopen",
+        _scripted_urlopen(
+            [urllib.error.URLError("connection reset"), _FakeResp('{"ok": 1}')]
+        ),
+    )
+    out = _post_chat("http://x", {}, "k", sleep=slept.append)
+    assert out == {"ok": 1}
+    assert slept == [1.0]  # first retry waits 2.0**0 seconds
+
+
+def test_post_chat_gives_up_after_max_retries(monkeypatch):
+    slept = []
+    monkeypatch.setattr(
+        ral.urllib.request,
+        "urlopen",
+        _scripted_urlopen([_http_error(429)] * (MAX_HTTP_RETRIES + 1)),
+    )
+    # A persistent rate limit still surfaces (so the runner records infra_error).
+    with pytest.raises(urllib.error.HTTPError):
+        _post_chat("http://x", {}, "k", sleep=slept.append)
+    assert len(slept) == MAX_HTTP_RETRIES  # one backoff per retry, none after the last try
+
+
+def test_http_timeout_defaults_and_env_override(monkeypatch):
+    monkeypatch.delenv("NELSON_HTTP_TIMEOUT", raising=False)
+    assert ral._http_timeout() == ral.HTTP_TIMEOUT  # default when unset
+    monkeypatch.setenv("NELSON_HTTP_TIMEOUT", "1800")
+    assert ral._http_timeout() == 1800  # slow self-hosted box raises it
+    monkeypatch.setenv("NELSON_HTTP_TIMEOUT", "not-an-int")
+    assert ral._http_timeout() == ral.HTTP_TIMEOUT  # garbage falls back to default
+
+
+def test_post_chat_does_not_retry_non_retryable_status(monkeypatch):
+    calls = []
+    slept = []
+
+    def fake(_req, timeout=None):
+        calls.append(1)
+        raise _http_error(400)
+
+    monkeypatch.setattr(ral.urllib.request, "urlopen", fake)
+    with pytest.raises(urllib.error.HTTPError):
+        _post_chat("http://x", {}, "k", sleep=slept.append)
+    assert len(calls) == 1  # 400 is a client error, not retried
+    assert slept == []  # and it fails immediately, with no backoff wait