diff --git a/competitors.example.yaml b/competitors.example.yaml index 9d73ad6..7a7f42a 100644 --- a/competitors.example.yaml +++ b/competitors.example.yaml @@ -195,34 +195,42 @@ size_class: medium knowledge_cutoff: 2025-06 -# Self-hosted Qwen 3.6 27B (8-bit) via LM Studio's OpenAI-compatible server, on the -# shared raw-api-loop harness — apples-to-apples with the hosted models above, and the -# point of comparison for "can a model you can actually run yourself do this work?". -# Tool-calling verified live 2026-05-30: the model emits clean OpenAI tool_calls and -# reports usage (reasoning tokens folded into the completion, which usage_delta handles). +# Self-hosted Qwen 3.6 27B (8-bit) on the shared raw-api-loop harness — apples-to-apples +# with the hosted models above, and the point of comparison for "can a model you can +# actually run yourself do this work?". Tool-calling verified live 2026-05-30 on both +# backends: the model emits clean OpenAI tool_calls and reports usage (reasoning tokens +# folded into the completion, which usage_delta handles). +# +# SERVER: now llama-server (llama.cpp) on the Strix Halo (model id +# unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL), reachable at http://10.20.30.2:8080/v1. It moved +# here from a desktop LM Studio (qwen/qwen3.6-27b @ http://192.168.122.1:1234/v1) because +# the desktop's GPUs OOM-CRASHED LM Studio on the large freerdp planar.c case (HTTP 400 +# "model has crashed", which then took the server down and cascaded onto the next case). +# The Strix Halo has more memory (slower, but survives the heavy cases). NOTE: the UD-Q8_K_XL +# unsloth dynamic quant differs slightly from LM Studio's vanilla Q8 — a minor methodological +# seam for the 2 cases (freerdp, Ghost) re-run here vs the other 7 done on LM Studio. # # NETWORKING: the harness runs in a podman container with its own netns, so the endpoint -# must be reachable at a real routable IP — NOT a loopback ssh-tunnel on the VM (the -# container can't see the VM's 127.0.0.1). 
Enable LM Studio > Developer > "Serve on Local -# Network" (binds 0.0.0.0) on the physical host and point base_url at that host's LAN IP. -# >>> REPLACE the placeholder IP below before a live run. <<< +# must be a real routable IP the container can reach (verified: 10.20.30.2:8080 → HTTP 200 +# from inside a container) — NOT a loopback ssh-tunnel on the VM (the container can't see +# the VM's 127.0.0.1). Bind the server on 0.0.0.0. # # COST: self-hosted has no per-token dollar price, so no pricing is declared — cost stays # null (electricity/amortization is out of scope) while tokens are still recorded, which is # exactly the comparison you want (free-to-run vs paid frontier on the same matrix). # -# AUTH: LM Studio doesn't check the key, but raw-api-loop requires an auth_profile; export -# a dummy `LMSTUDIO_API_KEY=lm-studio` on the host (see lmstudio-api-key in nelson/auth.py). +# AUTH: llama-server/LM Studio don't check the key, but raw-api-loop requires an auth_profile; +# export a dummy `LMSTUDIO_API_KEY=lm-studio` on the host (see lmstudio-api-key in auth.py). # # knowledge_cutoff 2025-06 matches the cohort and keeps every (May-2026) corpus case in # the gate while staying plausibly pre-disclosure. VERIFY-AT-WIRING: if Qwen 3.6's real # training cutoff is later than a case's disclosure, raise this so memorized cases gate out. 
- name: raw-api-loop/qwen3.6-27b - model: qwen/qwen3.6-27b # must match LM Studio's loaded model id exactly + model: unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL # must match the server's loaded model id exactly runtime: raw-api-loop tool_profile: read-grep auth_profile: lmstudio-api-key # value (any non-empty) from $LMSTUDIO_API_KEY on the host - cost_model: '{"base_url": "http://192.168.122.1:1234/v1"}' # libvirt host bridge gateway + cost_model: '{"base_url": "http://10.20.30.2:8080/v1", "http_timeout": 1800}' # Strix Halo, llama-server (slow APU → longer per-call timeout) size_class: small knowledge_cutoff: 2025-06 diff --git a/nelson/raw_api_loop.py b/nelson/raw_api_loop.py index c92436d..0f20af2 100644 --- a/nelson/raw_api_loop.py +++ b/nelson/raw_api_loop.py @@ -33,6 +33,7 @@ import os import subprocess import sys +import time import urllib.error import urllib.request from collections.abc import Callable @@ -43,8 +44,32 @@ # Per-API-call read timeout. Reasoning models (Gemini 3.x pro, MiMo) can think for # minutes on a single turn over a large C file; a tight cap aborts the whole run as # an infra_error and silently drops the slowest (often strongest) models from the -# matrix. 600s lets a slow reasoner finish a turn rather than penalising it. -HTTP_TIMEOUT = 600 # seconds per API call +# matrix. 600s lets a slow reasoner finish a turn rather than penalising it. A slow +# self-hosted box (e.g. a 27B on an APU, where one big-context turn can exceed 600s) +# raises it via NELSON_HTTP_TIMEOUT (set from the competitor's cost_model http_timeout). 
HTTP_TIMEOUT = 600  # default seconds per API call


def _http_timeout() -> int:
    """Return the per-call read timeout in whole seconds.

    ``NELSON_HTTP_TIMEOUT`` overrides the default when it holds a positive number.
    The value is parsed through ``float`` so a stringified JSON float such as
    ``"1800.0"`` is honoured instead of silently falling back. Anything that is
    unset, empty, non-numeric, non-finite, or not at least one whole second falls
    back to ``HTTP_TIMEOUT``: a zero or negative timeout would make every request
    fail immediately instead of lengthening the wait.
    """
    raw = os.environ.get("NELSON_HTTP_TIMEOUT")
    if not raw:
        return HTTP_TIMEOUT
    try:
        # int() truncates the fraction; it raises ValueError for "nan"/garbage and
        # OverflowError for "inf".
        seconds = int(float(raw))
    except (ValueError, OverflowError):
        return HTTP_TIMEOUT
    return seconds if seconds > 0 else HTTP_TIMEOUT


# Transient faults that warrant a retry rather than failing the whole run: provider
# rate limits (429 — Mistral caps tokens/minute, which an agentic burst of large-
# context calls trips even on a paid tier) and the 5xx family, plus transport errors
# (a self-hosted endpoint like LM Studio closing the socket mid-response on a heavy
# request). We pace-and-recover with exponential backoff; a persistent outage still
# surfaces after the cap so the runner records a real infra_error, never a false
# "found nothing".
HTTP_RETRY_STATUS = (429, 500, 502, 503, 504)
MAX_HTTP_RETRIES = 5  # retries after the first attempt (6 tries total)
BACKOFF_BASE_S = 2.0  # exponential: 1, 2, 4, 8, 16 s ...
BACKOFF_CAP_S = 60.0  # never wait longer than this between tries


def _retry_after_seconds(err: urllib.error.HTTPError) -> float | None:
    """Parse a Retry-After header (delta-seconds form) from an HTTP error.

    Returns the non-negative number of seconds the server asked for, or ``None``
    when the header is absent or unusable (the HTTP-date form and ``nan`` both
    count as unusable). The caller then falls back to the exponential backoff
    curve. The value is not capped here; ``_post_chat`` applies ``BACKOFF_CAP_S``.
    """
    try:
        headers = err.headers
        raw = headers.get("Retry-After") if headers else None
        if raw is None:
            return None
        seconds = float(raw.strip())
    except (AttributeError, TypeError, ValueError):
        return None
    if seconds != seconds:  # NaN never equals itself; treat it as "no hint"
        return None
    return max(0.0, seconds)


def _post_chat(
    url: str,
    payload: dict[str, Any],
    api_key: str,
    *,
    max_retries: int = MAX_HTTP_RETRIES,
    sleep: Callable[[float], None] = time.sleep,
) -> dict[str, Any]:
    """POST a chat/completions request and return the parsed JSON body.

    Transient faults are retried with exponential backoff: HTTP 429/5xx (honoring a
    delta-seconds Retry-After) and transport errors. Transport errors cover both
    what urllib reports as ``URLError`` (connection refused, failures while sending
    the request) and the raw exceptions raised while reading the response
    (``ConnectionError`` such as a reset or ``RemoteDisconnected``, and
    ``http.client.HTTPException`` such as ``IncompleteRead``). Every wait,
    including a server-supplied Retry-After, is capped at ``BACKOFF_CAP_S``.

    A read timeout (``TimeoutError``) is not retried: each attempt may already
    have waited the full per-call timeout, so it propagates as before.

    Args:
        url: Full chat/completions endpoint URL (operator-configured).
        payload: JSON-serialisable request body.
        api_key: Bearer token; no Authorization header is sent when empty.
        max_retries: Retries after the first attempt; negative values are treated
            as 0 so at least one attempt is always made.
        sleep: Called with the delay in seconds before each retry; injectable so
            tests never actually wait.

    Returns:
        The decoded JSON object, or ``{}`` when the body is valid JSON but not an
        object.

    Raises:
        urllib.error.HTTPError: Non-retryable status, or a retryable one that
            persisted through every attempt.
        urllib.error.URLError, ConnectionError, http.client.HTTPException:
            Transport failure that persisted through every attempt.
        json.JSONDecodeError: The response body was not valid JSON.

    After the last attempt the final error propagates so main() still exits
    non-zero with the provider error — a persistent failure is never masked as a
    model finding nothing. Injectable in tests via ``run_loop(post=...)``.
    """
    # Function-scope import keeps this block self-contained; urllib.request has
    # already loaded http.client, so this only binds the name.
    import http.client

    # CPython's urlopen wraps only request-sending failures in URLError; errors
    # raised while reading the status line or body surface as these raw types.
    transient_errors = (
        urllib.error.URLError,
        ConnectionError,
        http.client.HTTPException,
    )
    data = json.dumps(payload).encode("utf-8")
    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        is_last = attempt == attempts - 1
        backoff = BACKOFF_BASE_S**attempt
        # operator-configured base_url, not model-controlled, so this S310 audit
        # warning is acceptable here. A fresh Request per attempt keeps retries clean.
        req = urllib.request.Request(url, data=data, method="POST")  # noqa: S310
        req.add_header("Content-Type", "application/json")
        if api_key:
            req.add_header("Authorization", f"Bearer {api_key}")
        try:
            with urllib.request.urlopen(req, timeout=_http_timeout()) as resp:  # noqa: S310
                body = resp.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as err:
            # HTTPError is a URLError subclass, so it must be matched first.
            if is_last or err.code not in HTTP_RETRY_STATUS:
                raise
            hinted = _retry_after_seconds(err)
            delay = backoff if hinted is None else hinted
        except transient_errors:
            if is_last:
                raise
            delay = backoff
        else:
            parsed = json.loads(body)
            return parsed if isinstance(parsed, dict) else {}
        # One cap for both sources of delay, so a huge Retry-After cannot stall the run.
        sleep(min(BACKOFF_CAP_S, delay))
    # Unreachable: the final attempt either returns or raises. Satisfy type checkers.
    raise RuntimeError("retry loop exited without returning")  # pragma: no cover
class _FakeResp:
    """Minimal stand-in for the context-manager response that urlopen returns."""

    def __init__(self, body: str):
        # Stored pre-encoded because read() hands back bytes.
        self._payload = body.encode("utf-8")

    def __enter__(self):
        return self

    def __exit__(self, *_exc_info):
        # False: never swallow an exception raised inside the ``with`` body.
        return False

    def read(self):
        return self._payload


def _http_error(code: int, retry_after=None) -> urllib.error.HTTPError:
    """Build an HTTPError with status ``code`` and an optional Retry-After header."""
    headers = Message()
    if retry_after is not None:
        headers["Retry-After"] = str(retry_after)
    endpoint = "http://x/v1/chat/completions"
    return urllib.error.HTTPError(endpoint, code, "err", headers, None)


def _scripted_urlopen(events):
    """Return a fake urlopen that replays ``events`` in order.

    Each call consumes the next event: Exception instances are raised, anything
    else is returned as the response.
    """
    remaining = iter(events)

    def fake(_req, timeout=None):
        event = next(remaining)
        if not isinstance(event, Exception):
            return event
        raise event

    return fake


def test_post_chat_retries_on_429_then_succeeds(monkeypatch):
    """Two rate-limit responses followed by a success yield the success body."""
    script = [_http_error(429), _http_error(429), _FakeResp('{"ok": true}')]
    monkeypatch.setattr(ral.urllib.request, "urlopen", _scripted_urlopen(script))
    pauses = []
    result = _post_chat("http://x", {"m": 1}, "k", sleep=pauses.append)
    assert result == {"ok": True}
    assert len(pauses) == 2  # backed off before each of the two retries


def test_post_chat_honors_retry_after_header(monkeypatch):
    """A Retry-After value from the server replaces the computed backoff delay."""
    script = [_http_error(429, retry_after=7), _FakeResp('{"ok": 1}')]
    monkeypatch.setattr(ral.urllib.request, "urlopen", _scripted_urlopen(script))
    pauses = []
    _post_chat("http://x", {}, "k", sleep=pauses.append)
    assert pauses == [7.0]  # honored the server's Retry-After, not the backoff curve


def test_post_chat_retries_on_transport_error(monkeypatch):
    """A URLError (no HTTP response at all) is retried once, then succeeds."""
    script = [urllib.error.URLError("connection reset"), _FakeResp('{"ok": 1}')]
    monkeypatch.setattr(ral.urllib.request, "urlopen", _scripted_urlopen(script))
    pauses = []
    result = _post_chat("http://x", {}, "k", sleep=pauses.append)
    assert result == {"ok": 1}
    assert len(pauses) == 1


def test_post_chat_gives_up_after_max_retries(monkeypatch):
    """A rate limit on every attempt ends with the HTTPError propagating."""
    always_limited = [_http_error(429)] * (MAX_HTTP_RETRIES + 1)
    monkeypatch.setattr(
        ral.urllib.request, "urlopen", _scripted_urlopen(always_limited)
    )
    # A persistent rate limit still surfaces (so the runner records infra_error).
    with pytest.raises(urllib.error.HTTPError):
        _post_chat("http://x", {}, "k", sleep=lambda _s: None)


def test_http_timeout_defaults_and_env_override(monkeypatch):
    """NELSON_HTTP_TIMEOUT overrides the default; unset or garbage falls back."""
    default = ral.HTTP_TIMEOUT
    monkeypatch.delenv("NELSON_HTTP_TIMEOUT", raising=False)
    assert ral._http_timeout() == default  # default when unset
    monkeypatch.setenv("NELSON_HTTP_TIMEOUT", "1800")
    assert ral._http_timeout() == 1800  # slow self-hosted box raises it
    monkeypatch.setenv("NELSON_HTTP_TIMEOUT", "not-an-int")
    assert ral._http_timeout() == default  # garbage falls back to default


def test_post_chat_does_not_retry_non_retryable_status(monkeypatch):
    """A 400 is raised after exactly one call to urlopen."""
    attempts = []

    def always_400(_req, timeout=None):
        attempts.append(1)
        raise _http_error(400)

    monkeypatch.setattr(ral.urllib.request, "urlopen", always_400)
    with pytest.raises(urllib.error.HTTPError):
        _post_chat("http://x", {}, "k", sleep=lambda _s: None)
    assert len(attempts) == 1  # 400 is a client error, not retried