Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 21 additions & 13 deletions competitors.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -195,34 +195,42 @@
size_class: medium
knowledge_cutoff: 2025-06

# Self-hosted Qwen 3.6 27B (8-bit) via LM Studio's OpenAI-compatible server, on the
# shared raw-api-loop harness — apples-to-apples with the hosted models above, and the
# point of comparison for "can a model you can actually run yourself do this work?".
# Tool-calling verified live 2026-05-30: the model emits clean OpenAI tool_calls and
# reports usage (reasoning tokens folded into the completion, which usage_delta handles).
# Self-hosted Qwen 3.6 27B (8-bit) on the shared raw-api-loop harness — apples-to-apples
# with the hosted models above, and the point of comparison for "can a model you can
# actually run yourself do this work?". Tool-calling verified live 2026-05-30 on both
# backends: the model emits clean OpenAI tool_calls and reports usage (reasoning tokens
# folded into the completion, which usage_delta handles).
#
# SERVER: now llama-server (llama.cpp) on the Strix Halo (model id
# unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL), reachable at http://10.20.30.2:8080/v1. It moved
# here from a desktop LM Studio (qwen/qwen3.6-27b @ http://192.168.122.1:1234/v1) because
# the desktop's GPUs OOM-CRASHED LM Studio on the large freerdp planar.c case (HTTP 400
# "model has crashed", which then took the server down and cascaded onto the next case).
# The Strix Halo has more memory (slower, but survives the heavy cases). NOTE: the UD-Q8_K_XL
# unsloth dynamic quant differs slightly from LM Studio's vanilla Q8 — a minor methodological
# seam for the 2 cases (freerdp, Ghost) re-run here vs the other 7 done on LM Studio.
#
# NETWORKING: the harness runs in a podman container with its own netns, so the endpoint
# must be reachable at a real routable IP — NOT a loopback ssh-tunnel on the VM (the
# container can't see the VM's 127.0.0.1). Enable LM Studio > Developer > "Serve on Local
# Network" (binds 0.0.0.0) on the physical host and point base_url at that host's LAN IP.
# >>> REPLACE the placeholder IP below before a live run. <<<
# must be a real routable IP the container can reach (verified: 10.20.30.2:8080 → HTTP 200
# from inside a container) — NOT a loopback ssh-tunnel on the VM (the container can't see
# the VM's 127.0.0.1). Bind the server on 0.0.0.0.
#
# COST: self-hosted has no per-token dollar price, so no pricing is declared — cost stays
# null (electricity/amortization is out of scope) while tokens are still recorded, which is
# exactly the comparison you want (free-to-run vs paid frontier on the same matrix).
#
# AUTH: LM Studio doesn't check the key, but raw-api-loop requires an auth_profile; export
# a dummy `LMSTUDIO_API_KEY=lm-studio` on the host (see lmstudio-api-key in nelson/auth.py).
# AUTH: llama-server/LM Studio don't check the key, but raw-api-loop requires an auth_profile;
# export a dummy `LMSTUDIO_API_KEY=lm-studio` on the host (see lmstudio-api-key in auth.py).
#
# knowledge_cutoff 2025-06 matches the cohort and keeps every (May-2026) corpus case in
# the gate while staying plausibly pre-disclosure. VERIFY-AT-WIRING: if Qwen 3.6's real
# training cutoff is later than a case's disclosure, raise this so memorized cases gate out.
- name: raw-api-loop/qwen3.6-27b
model: qwen/qwen3.6-27b # must match LM Studio's loaded model id exactly
model: unsloth/Qwen3.6-27B-GGUF:UD-Q8_K_XL # must match the server's loaded model id exactly
runtime: raw-api-loop
tool_profile: read-grep
auth_profile: lmstudio-api-key # value (any non-empty) from $LMSTUDIO_API_KEY on the host
cost_model: '{"base_url": "http://192.168.122.1:1234/v1"}' # libvirt host bridge gateway
cost_model: '{"base_url": "http://10.20.30.2:8080/v1", "http_timeout": 1800}' # Strix Halo, llama-server (slow APU → longer per-call timeout)
size_class: small
knowledge_cutoff: 2025-06

Expand Down
99 changes: 82 additions & 17 deletions nelson/raw_api_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import os
import subprocess
import sys
import time
import urllib.error
import urllib.request
from collections.abc import Callable
Expand All @@ -43,8 +44,32 @@
# Per-API-call read timeout. Reasoning models (Gemini 3.x pro, MiMo) can think for
# minutes on a single turn over a large C file; a tight cap aborts the whole run as
# an infra_error and silently drops the slowest (often strongest) models from the
# matrix. 600s lets a slow reasoner finish a turn rather than penalising it.
HTTP_TIMEOUT = 600 # seconds per API call
# matrix. 600s lets a slow reasoner finish a turn rather than penalising it. A slow
# self-hosted box (e.g. a 27B on an APU, where one big-context turn can exceed 600s)
# raises it via NELSON_HTTP_TIMEOUT (set from the competitor's cost_model http_timeout).
HTTP_TIMEOUT = 600  # default seconds per API call


def _http_timeout() -> int:
    """Return the per-API-call read timeout in whole seconds.

    Reads the NELSON_HTTP_TIMEOUT environment variable. The value is accepted
    when it parses as a number and truncates to a positive whole number of
    seconds; integer ("1800") and float-formatted ("1800.0") spellings are both
    understood, because a numeric config value that was stringified may carry a
    decimal point.

    Anything else falls back to ``HTTP_TIMEOUT``: unset or empty, non-numeric
    text, non-finite values, and zero or negative numbers (which are not usable
    as a blocking socket timeout).
    """
    raw = os.environ.get("NELSON_HTTP_TIMEOUT")
    if not raw:
        return HTTP_TIMEOUT
    try:
        # float() first so "1800.0" is understood; int() then truncates.
        # ValueError: not a number (or NaN); OverflowError: infinity.
        seconds = int(float(raw))
    except (ValueError, OverflowError):
        return HTTP_TIMEOUT
    # A zero or negative timeout would make every request fail immediately.
    return seconds if seconds > 0 else HTTP_TIMEOUT


# Transient faults that warrant a retry rather than failing the whole run: provider
# rate limits (429 — Mistral caps tokens/minute, which an agentic burst of large-
# context calls trips even on a paid tier) and the 5xx family, plus transport errors
# (a self-hosted endpoint like LM Studio closing the socket mid-response on a heavy
# request). We pace-and-recover with exponential backoff; a persistent outage still
# surfaces after the cap so the runner records a real infra_error, never a false
# "found nothing".
HTTP_RETRY_STATUS = (429, 500, 502, 503, 504)  # HTTP status codes treated as transient
MAX_HTTP_RETRIES = 5  # retries after the first attempt (6 tries total)
BACKOFF_BASE_S = 2.0  # delay before retry N is BACKOFF_BASE_S**N: 1, 2, 4, 8, 16 s ...
# Upper bound on the *computed* backoff delay only. A server-supplied Retry-After
# value is used as-is by _post_chat and is not clamped to this cap.
BACKOFF_CAP_S = 60.0

SYSTEM_PROMPT = (
"You are a tool-using security auditor. Use the provided tools to read the "
Expand Down Expand Up @@ -199,25 +224,65 @@ def dispatch_tool(name: str, args: dict[str, Any], src_root: str | None = None)
# -- HTTP + cost -------------------------------------------------------------


def _post_chat(url: str, payload: dict[str, Any], api_key: str) -> dict[str, Any]:
def _retry_after_seconds(err: urllib.error.HTTPError) -> float | None:
"""Parse a Retry-After header (delta-seconds form) from a 429/503, if present."""
with contextlib.suppress(Exception):
raw = err.headers.get("Retry-After") if err.headers else None
if raw is not None:
return max(0.0, float(raw.strip()))
return None


def _post_chat(
    url: str,
    payload: dict[str, Any],
    api_key: str,
    *,
    max_retries: int = MAX_HTTP_RETRIES,
    sleep: Callable[[float], None] = time.sleep,
) -> dict[str, Any]:
    """POST a chat/completions request and return the parsed JSON body.

    Transient faults are retried with exponential backoff:

    * HTTP responses whose status is in ``HTTP_RETRY_STATUS``; a usable
      Retry-After header takes precedence over the backoff schedule.
    * Transport failures with no HTTP response: ``URLError`` (what urllib raises
      for request-phase errors such as connection refused), and the errors that
      reach the caller unwrapped when the peer drops the connection while the
      response is being received or read — ``ConnectionError`` subclasses and
      ``http.client.HTTPException`` (``RemoteDisconnected``, ``IncompleteRead``).

    A raw read timeout is deliberately NOT retried: the per-call timeout is
    already generous, and retrying would multiply it by the retry count.

    Args:
        url: Full chat/completions endpoint URL (operator-configured).
        payload: JSON-serialisable request body.
        api_key: Bearer token; no Authorization header is sent when empty.
        max_retries: Retries allowed after the first attempt. Negative values
            are treated as 0 (a single attempt).
        sleep: Called with the delay in seconds before each retry; injectable
            so tests do not actually wait.

    Returns:
        The decoded JSON object, or ``{}`` when the body is valid JSON but not
        an object.

    Raises:
        urllib.error.HTTPError: Non-retryable status, or retries exhausted.
        urllib.error.URLError, ConnectionError, http.client.HTTPException:
            Transport failure that persisted through every retry.
        ValueError: The response body is not valid JSON.
    """
    # Function-scope import: only needed to name the mid-response exceptions.
    import http.client

    data = json.dumps(payload).encode("utf-8")

    for attempt in range(max(0, max_retries) + 1):
        # operator-configured base_url, not model-controlled, so this S310 audit
        # warning is acceptable here. A fresh Request per attempt keeps retries clean.
        req = urllib.request.Request(url, data=data, method="POST")  # noqa: S310
        req.add_header("Content-Type", "application/json")
        if api_key:
            req.add_header("Authorization", f"Bearer {api_key}")
        try:
            with urllib.request.urlopen(req, timeout=_http_timeout()) as resp:  # noqa: S310
                body = resp.read().decode("utf-8", errors="replace")
            parsed = json.loads(body)
            return parsed if isinstance(parsed, dict) else {}
        except urllib.error.HTTPError as e:
            # Must precede the transport clause: HTTPError is a URLError subclass.
            if e.code in HTTP_RETRY_STATUS and attempt < max_retries:
                delay = _retry_after_seconds(e)
                if delay is None:
                    delay = min(BACKOFF_CAP_S, BACKOFF_BASE_S**attempt)
                sleep(delay)
                continue
            raise
        except (urllib.error.URLError, ConnectionError, http.client.HTTPException):
            # No usable HTTP response. urllib only wraps request-phase OSErrors in
            # URLError; a socket closed while the response is received or read
            # surfaces as ConnectionError / http.client.HTTPException instead.
            if attempt < max_retries:
                sleep(min(BACKOFF_CAP_S, BACKOFF_BASE_S**attempt))
                continue
            raise
    # Unreachable: the final attempt either returns or raises. Satisfy type checkers.
    raise RuntimeError("retry loop exited without returning")  # pragma: no cover


def usage_delta(usage: dict[str, Any]) -> tuple[int, int]:
Expand Down
4 changes: 4 additions & 0 deletions nelson/runtimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,10 @@ def build_spec(self, ctx: RuntimeContext) -> ContainerSpec:
env["NELSON_INPUT_USD_PER_MTOK"] = str(cfg["input_usd_per_mtok"])
if cfg.get("output_usd_per_mtok") is not None:
env["NELSON_OUTPUT_USD_PER_MTOK"] = str(cfg["output_usd_per_mtok"])
if cfg.get("http_timeout") is not None:
# Slow self-hosted endpoints raise the per-API-call read timeout so a single
# big-context turn isn't cut off as an infra_error (see _http_timeout).
env["NELSON_HTTP_TIMEOUT"] = str(cfg["http_timeout"])
mounts = [
(str(_RAW_API_SCRIPT_HOST), _RAW_API_SCRIPT_CONTAINER, "ro"),
(str(ctx.src_dir), "/src", "ro"),
Expand Down
122 changes: 122 additions & 0 deletions tests/test_raw_api_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,15 @@
"""

import json
import urllib.error
from email.message import Message

import pytest

import nelson.raw_api_loop as ral
from nelson.raw_api_loop import (
MAX_HTTP_RETRIES,
_post_chat,
_resolve_in_src,
compute_cost,
dispatch_tool,
Expand Down Expand Up @@ -195,3 +202,118 @@ def test_usage_delta_falls_back_to_completion_without_total():
{"prompt_tokens": 100, "completion_tokens": 40, "total_tokens": 140}
) == (100, 40)
assert usage_delta({}) == (0, 0)


# -- _post_chat retry / backoff ----------------------------------------------
#
# Transient faults (provider 429 tokens/min, a self-hosted endpoint dropping the
# socket mid-response) must be retried with backoff, not fail the whole run. The
# real urlopen is monkeypatched; sleep is injected so no test actually waits.


class _FakeResp:
    """Stand-in for the response object urlopen returns.

    Supports just what the code under test uses: the context-manager protocol
    and ``read()``, which yields the UTF-8 encoded body given at construction.
    """

    def __init__(self, body: str):
        self._body = bytes(body, encoding="utf-8")

    def __enter__(self):
        # The response is its own context value, like a real HTTPResponse.
        return self

    def __exit__(self, *_exc_info):
        # Falsy result: never swallow an exception raised inside the with-body.
        return False

    def read(self):
        return self._body


def _http_error(code: int, retry_after=None) -> urllib.error.HTTPError:
    """Build an HTTPError with status ``code`` and an optional Retry-After header."""
    headers = Message()
    if retry_after is not None:
        headers["Retry-After"] = str(retry_after)
    endpoint = "http://x/v1/chat/completions"
    # No response body is attached (fp is None).
    return urllib.error.HTTPError(endpoint, code, "err", headers, None)


def _scripted_urlopen(events):
    """Return a fake urlopen that plays back ``events`` one per call.

    An event that is an Exception instance is raised; any other event is
    returned as the response.
    """
    pending = iter(events)

    def fake(_req, timeout=None):
        outcome = next(pending)
        if not isinstance(outcome, Exception):
            return outcome
        raise outcome

    return fake


def test_post_chat_retries_on_429_then_succeeds(monkeypatch):
    """Two 429 responses followed by a success yield the body after two sleeps."""
    script = [_http_error(429), _http_error(429), _FakeResp('{"ok": true}')]
    monkeypatch.setattr(ral.urllib.request, "urlopen", _scripted_urlopen(script))

    delays = []
    result = _post_chat("http://x", {"m": 1}, "k", sleep=delays.append)

    assert result == {"ok": True}
    assert len(delays) == 2  # backed off before each of the two retries


def test_post_chat_honors_retry_after_header(monkeypatch):
    """A Retry-After header on a 429 sets the delay instead of the backoff curve."""
    fake_urlopen = _scripted_urlopen(
        [_http_error(429, retry_after=7), _FakeResp('{"ok": 1}')]
    )
    monkeypatch.setattr(ral.urllib.request, "urlopen", fake_urlopen)

    delays = []
    _post_chat("http://x", {}, "k", sleep=delays.append)

    assert delays == [7.0]  # honored the server's Retry-After, not the backoff curve


def test_post_chat_retries_on_transport_error(monkeypatch):
    """A URLError on the first attempt is retried once and then succeeds."""
    script = [urllib.error.URLError("connection reset"), _FakeResp('{"ok": 1}')]
    monkeypatch.setattr(ral.urllib.request, "urlopen", _scripted_urlopen(script))

    delays = []
    result = _post_chat("http://x", {}, "k", sleep=delays.append)

    assert result == {"ok": 1}
    assert len(delays) == 1


def test_post_chat_gives_up_after_max_retries(monkeypatch):
    """A 429 on every attempt propagates once the retry budget is spent."""
    always_limited = [_http_error(429)] * (MAX_HTTP_RETRIES + 1)
    monkeypatch.setattr(
        ral.urllib.request, "urlopen", _scripted_urlopen(always_limited)
    )

    def no_wait(_seconds):
        return None

    # A persistent rate limit still surfaces (so the runner records infra_error).
    with pytest.raises(urllib.error.HTTPError):
        _post_chat("http://x", {}, "k", sleep=no_wait)


def test_http_timeout_defaults_and_env_override(monkeypatch):
    """_http_timeout uses the default unless NELSON_HTTP_TIMEOUT is a valid int."""
    env_var = "NELSON_HTTP_TIMEOUT"

    monkeypatch.delenv(env_var, raising=False)
    assert ral._http_timeout() == ral.HTTP_TIMEOUT  # default when unset

    cases = (
        ("1800", 1800),  # slow self-hosted box raises it
        ("not-an-int", ral.HTTP_TIMEOUT),  # garbage falls back to default
    )
    for raw, expected in cases:
        monkeypatch.setenv(env_var, raw)
        assert ral._http_timeout() == expected


def test_post_chat_does_not_retry_non_retryable_status(monkeypatch):
    """An HTTP 400 is raised after exactly one attempt."""
    attempts = []

    def always_400(_req, timeout=None):
        attempts.append(1)
        raise _http_error(400)

    def no_wait(_seconds):
        return None

    monkeypatch.setattr(ral.urllib.request, "urlopen", always_400)
    with pytest.raises(urllib.error.HTTPError):
        _post_chat("http://x", {}, "k", sleep=no_wait)
    assert len(attempts) == 1  # 400 is a client error, not retried
Loading