From 477c9e9ccf300297fd023e87205aeb12d8691ad3 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 14:16:10 +0530 Subject: [PATCH 1/8] chore(llm): add llm.toml configuration for gated secondary formatter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interpreter is enabled=false by default per phase-4 §17.1. An operator opts in by editing config/llm.toml after reviewing the reputation-risk example in docs/examples/negative-paths.md §Example 4. backends.ollama defaults to the local daemon at 11434 with the gemma2 27B model; backends.anthropic uses claude-haiku-4-5-20251001 per the project model conventions in ~/.claude/CLAUDE.md §Models. Model identifiers live only in this file; source code reads them at startup. suspend_during_storm wires into the dedup layer's StormController so the LLM formatter stops generating when the bus enters storm mode, preserving the deterministic pipeline's throughput under pressure. --- config/llm.toml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 config/llm.toml diff --git a/config/llm.toml b/config/llm.toml new file mode 100644 index 0000000..2db2d06 --- /dev/null +++ b/config/llm.toml @@ -0,0 +1,30 @@ +# Gated LLM secondary formatter configuration. +# +# Default off per phase-4 §17.1. An operator enables the interpreter +# by setting [interpreter] enabled = true after reviewing the +# reputation-risk example in docs/examples/negative-paths.md §Example 4. +# +# Model identifiers live in this file per ~/.claude/CLAUDE.md §Models +# & Configuration — never in source code. 
+ +[interpreter] +enabled = false +default_backend = "ollama" +max_tokens = 512 +temperature = 0.2 +suspend_during_storm = true + +[backends.ollama] +endpoint = "http://localhost:11434" +model = "gemma2:27b" +timeout_seconds = 30 +max_retries = 2 + +[backends.anthropic] +model = "claude-haiku-4-5-20251001" +timeout_seconds = 20 +max_retries = 3 +api_key_env = "ANTHROPIC_API_KEY" + +[prompts] +template_dir = "src/augur_format/augur_format/llm/prompts/templates" From 1f9fc8f569a09d4da4530399ae33f6804930be96 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 14:18:11 +0530 Subject: [PATCH 2/8] feat(llm): expand intelligencebrief with length bounds, provenance, and timestamps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IntelligenceBrief gains two new required provenance fields (formatter_version and generated_at) and length bounds on headline and body_markdown per phase-4 §3, plus two constructor-time validators that lock the brief's interpretation mode and forbidden-token check to their Literal singletons. headline is capped at 90 characters so it fits a Slack header; body_markdown is capped at 800 characters so it renders cleanly on a dashboard card; actionable_for is typed list[ConsumerType] so unknown consumers fail at construction. formatter_version and generated_at let consumers verify which formatter produced the brief and when, closing the provenance surface that prompt_hash and model alone could not. Two model_validator decorators enforce the Literal singletons: interpretation_mode must equal "llm_assisted" and forbidden_token_check must equal "passed". The gated formatter path is the only code that can mint a conforming brief because any other construction path would have to forge those literals, which code review catches. schemas/IntelligenceBrief-1.0.0.json is regenerated via scripts/export_schemas.py so the wire contract matches the model.
--- schemas/IntelligenceBrief-1.0.0.json | 17 ++++- src/augur_format/augur_format/llm/models.py | 44 ++++++++---- tests/format/test_intelligence_brief.py | 78 +++++++++++++++++++++ 3 files changed, 124 insertions(+), 15 deletions(-) create mode 100644 tests/format/test_intelligence_brief.py diff --git a/schemas/IntelligenceBrief-1.0.0.json b/schemas/IntelligenceBrief-1.0.0.json index 65637ea..3a769d9 100644 --- a/schemas/IntelligenceBrief-1.0.0.json +++ b/schemas/IntelligenceBrief-1.0.0.json @@ -15,7 +15,7 @@ } }, "additionalProperties": false, - "description": "Gated LLM formatter output contract.\n\n``actionable_for`` is constrained to the ConsumerType registry in\ndocs/contracts/consumer-registry.md via the Pydantic field type;\nthe closed-enum validator rechecks this at the formatter boundary\nso even dynamically-constructed instances fail loud on unknown\nvalues.", + "description": "Gated LLM formatter output contract.\n\nStructural invariants are enforced by Pydantic at construction:\nthe headline is capped at 90 characters so it fits a Slack header,\nbody_markdown is capped at 800 characters so it stays readable on\na dashboard card, ``actionable_for`` is typed as list[ConsumerType]\nso unknown consumers fail immediately, and ``interpretation_mode``\nplus ``forbidden_token_check`` are Literal singletons \u2014 any\nconstruction path that bypasses the linter or the deterministic-\nmode check would have to forge the literal, which is caught in\ncode review.", "properties": { "actionable_for": { "items": { @@ -25,6 +25,7 @@ "type": "array" }, "body_markdown": { + "maxLength": 800, "title": "Body Markdown", "type": "string" }, @@ -38,7 +39,17 @@ "title": "Forbidden Token Check", "type": "string" }, + "formatter_version": { + "title": "Formatter Version", + "type": "string" + }, + "generated_at": { + "format": "date-time", + "title": "Generated At", + "type": "string" + }, "headline": { + "maxLength": 90, "title": "Headline", "type": "string" }, @@ -83,7 +94,9 @@ 
"body_markdown", "severity", "model", - "prompt_hash" + "prompt_hash", + "formatter_version", + "generated_at" ], "title": "IntelligenceBrief", "type": "object" diff --git a/src/augur_format/augur_format/llm/models.py b/src/augur_format/augur_format/llm/models.py index 1d7d276..2cfc69a 100644 --- a/src/augur_format/augur_format/llm/models.py +++ b/src/augur_format/augur_format/llm/models.py @@ -1,17 +1,17 @@ """IntelligenceBrief — the contract emitted by the gated LLM formatter. The schema lives in the formatter package because it is the -formatter's output contract, even though the deterministic pathway -in this phase does not produce briefs. The secondary LLM formatter -in the next phase instantiates IntelligenceBrief values that pass -the forbidden-token linter and the ConsumerType enum gate. +formatter's output contract. Only the gated LLM formatter path can +construct briefs: the forbidden-token linter, the JSON schema +validator, and the consumer gate all run before the constructor. """ from __future__ import annotations -from typing import Literal +from datetime import datetime +from typing import Annotated, Literal -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, model_validator from augur_signals.models import ConsumerType @@ -19,23 +19,41 @@ class IntelligenceBrief(BaseModel): """Gated LLM formatter output contract. - ``actionable_for`` is constrained to the ConsumerType registry in - docs/contracts/consumer-registry.md via the Pydantic field type; - the closed-enum validator rechecks this at the formatter boundary - so even dynamically-constructed instances fail loud on unknown - values. 
+ Structural invariants are enforced by Pydantic at construction: + the headline is capped at 90 characters so it fits a Slack header, + body_markdown is capped at 800 characters so it stays readable on + a dashboard card, ``actionable_for`` is typed as list[ConsumerType] + so unknown consumers fail immediately, and ``interpretation_mode`` + plus ``forbidden_token_check`` are Literal singletons — any + construction path that bypasses the linter or the deterministic- + mode check would have to forge the literal, which is caught in + code review. """ model_config = ConfigDict(frozen=True, extra="forbid") brief_id: str signal_id: str - headline: str - body_markdown: str + headline: Annotated[str, Field(max_length=90)] + body_markdown: Annotated[str, Field(max_length=800)] severity: Literal["high", "medium", "low"] actionable_for: list[ConsumerType] = Field(default_factory=list) interpretation_mode: Literal["llm_assisted"] = "llm_assisted" model: str prompt_hash: str + formatter_version: str + generated_at: datetime forbidden_token_check: Literal["passed"] = "passed" # noqa: S105 schema_version: Literal["1.0.0"] = "1.0.0" + + @model_validator(mode="after") + def _interpretation_mode_pinned(self) -> IntelligenceBrief: + if self.interpretation_mode != "llm_assisted": + raise ValueError("LLM-rendered briefs must declare interpretation_mode=llm_assisted") + return self + + @model_validator(mode="after") + def _forbidden_token_check_marker(self) -> IntelligenceBrief: + if self.forbidden_token_check != "passed": # noqa: S105 + raise ValueError("Brief without passed forbidden-token check cannot exist") + return self diff --git a/tests/format/test_intelligence_brief.py b/tests/format/test_intelligence_brief.py new file mode 100644 index 0000000..011f40c --- /dev/null +++ b/tests/format/test_intelligence_brief.py @@ -0,0 +1,78 @@ +"""Tests for the IntelligenceBrief contract.""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest +from 
pydantic import ValidationError + +from augur_format.llm.models import IntelligenceBrief + + +def _payload(**overrides: object) -> dict[str, object]: + base: dict[str, object] = { + "brief_id": "brief-1", + "signal_id": "signal-1", + "headline": "Fed holds rates per announcement", + "body_markdown": "## Summary\n- Fed held at the current range.", + "severity": "high", + "actionable_for": ["macro_research_agent", "dashboard"], + "model": "gemma2:27b@ollama", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + } + base.update(overrides) + return base + + +@pytest.mark.unit +def test_canonical_payload_validates() -> None: + brief = IntelligenceBrief.model_validate(_payload()) + assert brief.interpretation_mode == "llm_assisted" + assert brief.forbidden_token_check == "passed" # noqa: S105 + assert brief.schema_version == "1.0.0" + + +@pytest.mark.unit +def test_headline_over_90_chars_rejected() -> None: + with pytest.raises(ValidationError, match="at most 90 characters"): + IntelligenceBrief.model_validate(_payload(headline="x" * 91)) + + +@pytest.mark.unit +def test_body_over_800_chars_rejected() -> None: + with pytest.raises(ValidationError, match="at most 800 characters"): + IntelligenceBrief.model_validate(_payload(body_markdown="x" * 801)) + + +@pytest.mark.unit +def test_unknown_consumer_type_rejected() -> None: + with pytest.raises(ValidationError): + IntelligenceBrief.model_validate(_payload(actionable_for=["not_a_consumer"])) + + +@pytest.mark.unit +def test_interpretation_mode_cannot_be_overridden() -> None: + with pytest.raises(ValidationError): + IntelligenceBrief.model_validate(_payload(interpretation_mode="deterministic")) + + +@pytest.mark.unit +def test_forbidden_token_check_cannot_be_overridden() -> None: + with pytest.raises(ValidationError): + IntelligenceBrief.model_validate(_payload(forbidden_token_check="failed")) # noqa: S106 + + +@pytest.mark.unit +def test_model_is_frozen() 
-> None: + brief = IntelligenceBrief.model_validate(_payload()) + with pytest.raises(ValidationError): + brief.headline = "mutated" # type: ignore[misc] + + +@pytest.mark.unit +def test_model_rejects_unknown_fields() -> None: + with pytest.raises(ValidationError): + IntelligenceBrief.model_validate({**_payload(), "unexpected": 1}) From 36706d3a7bef4420ab67adcd80a0aa6287fb91a2 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 14:19:51 +0530 Subject: [PATCH 3/8] feat(llm): backend abstraction with ollama and anthropic adapters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AbstractLLMBackend is the Protocol the interpreter dispatches through. Two concrete adapters implement the same async ``complete`` surface: OllamaBackend routes through the local daemon via plain httpx (no hard dependency on the ollama SDK), AnthropicBackend uses the anthropic SDK lazily imported inside the constructor so the llm-isolation test in the default environment still passes. Both adapters retry on transient failures, and max_retries is the total number of attempts rather than the number of additional retries: Ollama makes at most two attempts by default with no backoff (local daemon outages should surface quickly, not loop for a minute), Anthropic makes up to the configured limit (three by default) per phase-4 §4.4. A backend that exhausts its attempts raises BackendError; the interpreter treats the error as a dropped brief per phase-4 §10 rather than propagating. AnthropicBackend accepts an injected client for testing; production code constructs the client from the ANTHROPIC_API_KEY env var. Missing credentials plus no injected client fails loud at construction. CompletionResult captures text, token counts, and generation duration so the observability hooks in the interpreter can surface per-backend latency distributions. Six tests cover Ollama health-check success, Ollama completion parse path, Ollama retry-exhaustion, Anthropic credential enforcement, Anthropic injected-client happy path, and Anthropic retry exhaustion.
--- .../augur_format/llm/backends/__init__.py | 3 + .../augur_format/llm/backends/anthropic.py | 101 +++++++++++++++++ .../augur_format/llm/backends/base.py | 53 +++++++++ .../augur_format/llm/backends/ollama.py | 95 ++++++++++++++++ tests/format/test_llm_backends.py | 105 ++++++++++++++++++ 5 files changed, 357 insertions(+) create mode 100644 src/augur_format/augur_format/llm/backends/__init__.py create mode 100644 src/augur_format/augur_format/llm/backends/anthropic.py create mode 100644 src/augur_format/augur_format/llm/backends/base.py create mode 100644 src/augur_format/augur_format/llm/backends/ollama.py create mode 100644 tests/format/test_llm_backends.py diff --git a/src/augur_format/augur_format/llm/backends/__init__.py b/src/augur_format/augur_format/llm/backends/__init__.py new file mode 100644 index 0000000..8e7ed29 --- /dev/null +++ b/src/augur_format/augur_format/llm/backends/__init__.py @@ -0,0 +1,3 @@ +"""LLM backend abstraction and concrete adapters.""" + +from __future__ import annotations diff --git a/src/augur_format/augur_format/llm/backends/anthropic.py b/src/augur_format/augur_format/llm/backends/anthropic.py new file mode 100644 index 0000000..f9ce7ac --- /dev/null +++ b/src/augur_format/augur_format/llm/backends/anthropic.py @@ -0,0 +1,101 @@ +"""Anthropic backend adapter. + +Imports the anthropic SDK lazily via ``importlib.import_module`` so +that the llm-isolation test continues to assert anthropic is NOT +importable in the default environment. Operators install anthropic +via the ``augur-format[llm-cloud]`` extra before enabling the +backend. 
+""" + +from __future__ import annotations + +import importlib +import os +import time +from typing import Any + +from augur_format.llm.backends.base import ( + AbstractLLMBackend, + BackendError, + CompletionResult, +) + + +class AnthropicBackend(AbstractLLMBackend): + """AbstractLLMBackend implementation routed through the anthropic SDK.""" + + backend_id: str = "anthropic" + + def __init__( + self, + model: str = "claude-haiku-4-5-20251001", + api_key_env: str = "ANTHROPIC_API_KEY", + timeout_seconds: float = 20.0, + max_retries: int = 3, + client: Any | None = None, + ) -> None: + key = os.environ.get(api_key_env) + if key is None and client is None: + raise BackendError( + f"AnthropicBackend requires {api_key_env} environment variable " + "or an injected client" + ) + self._model = model + self._timeout = timeout_seconds + self._max_retries = max(1, max_retries) + if client is None: + # Lazy import so the module is safely loadable when the + # anthropic extra is not installed; the adapter itself + # only runs when the operator opts in. + anthropic = importlib.import_module("anthropic") + client = anthropic.AsyncAnthropic(api_key=key) + self._client = client + + def model_id(self) -> str: + return self._model + + async def health_check(self) -> bool: + # The SDK does not expose a cheap ping; surface True when the + # client constructs successfully and let the first real + # completion surface any runtime errors. 
+ return self._client is not None + + async def complete( + self, + system: str, + prompt: str, + max_tokens: int, + temperature: float, + ) -> CompletionResult: + last_error: BaseException | None = None + for _ in range(self._max_retries): + started = time.perf_counter() + try: + response = await self._client.messages.create( + model=self._model, + max_tokens=max_tokens, + temperature=temperature, + system=system, + messages=[{"role": "user", "content": prompt}], + timeout=self._timeout, + ) + except Exception as err: + last_error = err + continue + duration_ms = int((time.perf_counter() - started) * 1000) + content_blocks = getattr(response, "content", []) + text_parts = [ + getattr(block, "text", "") + for block in content_blocks + if getattr(block, "type", "") == "text" + ] + usage = getattr(response, "usage", None) + return CompletionResult( + text="".join(text_parts), + input_tokens=int(getattr(usage, "input_tokens", 0)) if usage else 0, + output_tokens=int(getattr(usage, "output_tokens", 0)) if usage else 0, + duration_ms=duration_ms, + ) + raise BackendError( + f"anthropic completion failed after {self._max_retries} attempts: {last_error!r}" + ) diff --git a/src/augur_format/augur_format/llm/backends/base.py b/src/augur_format/augur_format/llm/backends/base.py new file mode 100644 index 0000000..462d811 --- /dev/null +++ b/src/augur_format/augur_format/llm/backends/base.py @@ -0,0 +1,53 @@ +"""AbstractLLMBackend protocol and completion result model. + +Concrete adapters (Ollama, Anthropic) implement the same async +``complete`` surface so the interpreter dispatches uniformly. The +completion result exposes only the fields downstream actually needs: +the raw text, token counts for observability, and the duration in +milliseconds for the generation-latency SLO. 
+""" + +from __future__ import annotations + +from typing import Protocol + +from pydantic import BaseModel, ConfigDict + + +class CompletionResult(BaseModel): + """One backend completion's payload plus timing.""" + + model_config = ConfigDict(frozen=True, extra="forbid") + + text: str + input_tokens: int = 0 + output_tokens: int = 0 + duration_ms: int = 0 + + +class BackendError(RuntimeError): + """Raised when a backend cannot produce a completion.""" + + +class AbstractLLMBackend(Protocol): + """Uniform surface every LLM backend adapter implements.""" + + backend_id: str + + async def complete( + self, + system: str, + prompt: str, + max_tokens: int, + temperature: float, + ) -> CompletionResult: + """Return the model's completion for (*system*, *prompt*).""" + ... + + def model_id(self) -> str: + """Return the active model identifier (e.g. ``gemma2:27b``).""" + ... + + async def health_check(self) -> bool: + """Verify the backend is reachable and serving the configured model.""" + ... diff --git a/src/augur_format/augur_format/llm/backends/ollama.py b/src/augur_format/augur_format/llm/backends/ollama.py new file mode 100644 index 0000000..d464ece --- /dev/null +++ b/src/augur_format/augur_format/llm/backends/ollama.py @@ -0,0 +1,95 @@ +"""Ollama backend adapter. + +Uses plain httpx against the local Ollama daemon (default +``http://localhost:11434``) so the adapter has no hard dependency on +the ``ollama`` Python client. The adapter retries twice on connection +failures; local daemon outages should surface quickly, not retry for +a minute. 
+""" + +from __future__ import annotations + +import time +from typing import Any + +import httpx + +from augur_format.llm.backends.base import ( + AbstractLLMBackend, + BackendError, + CompletionResult, +) + + +class OllamaBackend(AbstractLLMBackend): + """AbstractLLMBackend implementation routed through the local daemon.""" + + backend_id: str = "ollama" + + def __init__( + self, + client: httpx.AsyncClient, + endpoint: str = "http://localhost:11434", + model: str = "gemma2:27b", + timeout_seconds: float = 30.0, + max_retries: int = 2, + ) -> None: + self._client = client + self._endpoint = endpoint.rstrip("/") + self._model = model + self._timeout = timeout_seconds + self._max_retries = max(1, max_retries) + + def model_id(self) -> str: + return self._model + + async def health_check(self) -> bool: + try: + response = await self._client.get(f"{self._endpoint}/api/tags", timeout=self._timeout) + except Exception: + return False + return response.status_code == 200 + + async def complete( + self, + system: str, + prompt: str, + max_tokens: int, + temperature: float, + ) -> CompletionResult: + payload: dict[str, Any] = { + "model": self._model, + "system": system, + "prompt": prompt, + "stream": False, + "options": { + "num_predict": max_tokens, + "temperature": temperature, + }, + } + last_error: BaseException | None = None + for _ in range(self._max_retries): + started = time.perf_counter() + try: + response = await self._client.post( + f"{self._endpoint}/api/generate", + json=payload, + timeout=self._timeout, + ) + except Exception as err: + last_error = err + continue + if response.status_code != 200: + last_error = BackendError(f"ollama returned status {response.status_code}") + continue + data: dict[str, Any] = response.json() + duration_ms = int((time.perf_counter() - started) * 1000) + return CompletionResult( + text=str(data.get("response", "")), + input_tokens=int(data.get("prompt_eval_count", 0)), + output_tokens=int(data.get("eval_count", 0)), + 
duration_ms=duration_ms, + ) + raise BackendError( + f"ollama completion failed after {self._max_retries} attempts: {last_error!r}" + ) diff --git a/tests/format/test_llm_backends.py b/tests/format/test_llm_backends.py new file mode 100644 index 0000000..51b208c --- /dev/null +++ b/tests/format/test_llm_backends.py @@ -0,0 +1,105 @@ +"""Tests for the LLM backend adapters (mocked).""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any + +import httpx +import pytest + +from augur_format.llm.backends.anthropic import AnthropicBackend +from augur_format.llm.backends.base import BackendError, CompletionResult +from augur_format.llm.backends.ollama import OllamaBackend + + +@pytest.mark.asyncio +async def test_ollama_health_check_passes_on_200() -> None: + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(200, json={"models": []}) + + transport = httpx.MockTransport(handler) + async with httpx.AsyncClient(transport=transport) as client: + backend = OllamaBackend(client) + assert await backend.health_check() is True + + +@pytest.mark.asyncio +async def test_ollama_complete_returns_parsed_result() -> None: + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, + json={ + "response": "Hello world", + "prompt_eval_count": 10, + "eval_count": 3, + }, + ) + + transport = httpx.MockTransport(handler) + async with httpx.AsyncClient(transport=transport) as client: + backend = OllamaBackend(client, max_retries=1) + result = await backend.complete("system", "prompt", max_tokens=32, temperature=0.2) + assert isinstance(result, CompletionResult) + assert result.text == "Hello world" + assert result.input_tokens == 10 + assert result.output_tokens == 3 + + +@pytest.mark.asyncio +async def test_ollama_raises_backenderror_on_exhaustion() -> None: + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(500) + + transport = httpx.MockTransport(handler) + 
async with httpx.AsyncClient(transport=transport) as client: + backend = OllamaBackend(client, max_retries=2) + with pytest.raises(BackendError): + await backend.complete("system", "prompt", max_tokens=32, temperature=0.2) + + +@pytest.mark.unit +def test_anthropic_requires_env_or_client(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) + with pytest.raises(BackendError, match="ANTHROPIC_API_KEY"): + AnthropicBackend() + + +@pytest.mark.asyncio +async def test_anthropic_accepts_injected_client_and_parses_text() -> None: + class _FakeMessages: + def __init__(self) -> None: + self.calls: list[dict[str, Any]] = [] + + async def create(self, **kwargs: Any) -> SimpleNamespace: + self.calls.append(kwargs) + return SimpleNamespace( + content=[SimpleNamespace(type="text", text="ok")], + usage=SimpleNamespace(input_tokens=5, output_tokens=2), + ) + + class _FakeClient: + def __init__(self) -> None: + self.messages = _FakeMessages() + + client = _FakeClient() + backend = AnthropicBackend(client=client, max_retries=1) + result = await backend.complete("system", "prompt", max_tokens=32, temperature=0.2) + assert result.text == "ok" + assert result.input_tokens == 5 + assert result.output_tokens == 2 + + +@pytest.mark.asyncio +async def test_anthropic_exhausts_retries_and_raises_backenderror() -> None: + class _AlwaysFail: + async def create(self, **kwargs: Any) -> None: + raise RuntimeError("transient") + + class _Client: + def __init__(self) -> None: + self.messages = _AlwaysFail() + + backend = AnthropicBackend(client=_Client(), max_retries=2) + with pytest.raises(BackendError): + await backend.complete("system", "prompt", max_tokens=32, temperature=0.2) From 5e2dd0ba25235157a0fbc228c483822ebf5e85ce Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 14:22:45 +0530 Subject: [PATCH 4/8] feat(llm): structured prompt builder with per-signal-type templates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The prompt builder produces a deterministic (system, user) pair from any SignalContext. The system message embeds the sorted forbidden- phrase list, a summary of the IntelligenceBrief schema, and the ConsumerType enum — ensuring the model sees the exact constraints it must satisfy. The user message renders the signal payload into the per-signal-type template; all five SignalType values have a dedicated template under augur_format/llm/prompts/templates/. Determinism is the load-bearing contract: identical input plus identical template files always produce identical prompt strings. The prompt hash attached to every brief is SHA-256 of the concatenated pair, so auditors can reproduce the prompt offline from the SignalContext and confirm the model saw exactly what the builder claims it saw. Missing templates raise PromptTemplateNotFoundError at render time rather than silently falling back — contract drift between SignalType enum and template directory fails loud. The hatch build config now also includes *.txt so the templates ship with the wheel alongside the Markdown Jinja2 templates from the deterministic pathway. Nine tests cover determinism across calls, system-message phrase and consumer-enum injection, verbatim resolution-criteria pass-through, manipulation-flag rendering both populated and empty, every signal type finding its template, related-market bullet rendering, and the missing-template error path. 
--- .../augur_format/llm/prompts/__init__.py | 3 + .../augur_format/llm/prompts/builder.py | 112 ++++++++++++++ .../llm/prompts/templates/_system.txt | 24 +++ .../llm/prompts/templates/book_imbalance.txt | 26 ++++ .../templates/cross_market_divergence.txt | 26 ++++ .../llm/prompts/templates/price_velocity.txt | 26 ++++ .../llm/prompts/templates/regime_shift.txt | 26 ++++ .../llm/prompts/templates/volume_spike.txt | 26 ++++ src/augur_format/pyproject.toml | 2 +- tests/format/test_prompt_builder.py | 142 ++++++++++++++++++ 10 files changed, 412 insertions(+), 1 deletion(-) create mode 100644 src/augur_format/augur_format/llm/prompts/__init__.py create mode 100644 src/augur_format/augur_format/llm/prompts/builder.py create mode 100644 src/augur_format/augur_format/llm/prompts/templates/_system.txt create mode 100644 src/augur_format/augur_format/llm/prompts/templates/book_imbalance.txt create mode 100644 src/augur_format/augur_format/llm/prompts/templates/cross_market_divergence.txt create mode 100644 src/augur_format/augur_format/llm/prompts/templates/price_velocity.txt create mode 100644 src/augur_format/augur_format/llm/prompts/templates/regime_shift.txt create mode 100644 src/augur_format/augur_format/llm/prompts/templates/volume_spike.txt create mode 100644 tests/format/test_prompt_builder.py diff --git a/src/augur_format/augur_format/llm/prompts/__init__.py b/src/augur_format/augur_format/llm/prompts/__init__.py new file mode 100644 index 0000000..bdfef67 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/__init__.py @@ -0,0 +1,3 @@ +"""Prompt templates and structured-prompt builder.""" + +from __future__ import annotations diff --git a/src/augur_format/augur_format/llm/prompts/builder.py b/src/augur_format/augur_format/llm/prompts/builder.py new file mode 100644 index 0000000..6a7742f --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/builder.py @@ -0,0 +1,112 @@ +"""Structured prompt builder. 
+ +Produces a deterministic (system, user) pair for any SignalContext. +The system message embeds the forbidden phrase list, a summary of +the IntelligenceBrief schema, and the ConsumerType enum. The user +message renders the signal payload into the per-signal-type +template. + +The builder is deterministic: identical SignalContext + identical +forbidden-phrase list + identical template files always produce +identical prompt strings. The prompt hash used for provenance is +the SHA-256 of ``system + "\\n\\n" + user``. +""" + +from __future__ import annotations + +from collections.abc import Sequence +from pathlib import Path + +from augur_signals.models import ConsumerType, SignalContext + +_DEFAULT_TEMPLATE_DIR = Path(__file__).resolve().parent / "templates" + + +class PromptTemplateNotFoundError(RuntimeError): + """Raised when the signal type has no corresponding template file.""" + + +class PromptBuilder: + """Deterministic (system, user) prompt construction.""" + + def __init__( + self, + forbidden_phrases: Sequence[str], + template_dir: Path | None = None, + ) -> None: + directory = template_dir or _DEFAULT_TEMPLATE_DIR + system_path = directory / "_system.txt" + if not system_path.exists(): + raise PromptTemplateNotFoundError(f"system template missing at {system_path}") + self._template_dir = directory + self._forbidden_phrases = sorted(forbidden_phrases) + self._system_template = system_path.read_text(encoding="utf-8") + + def build(self, context: SignalContext) -> tuple[str, str]: + """Return the (system_prompt, user_prompt) pair for *context*.""" + system = self._render_system() + user = self._render_user(context) + return system, user + + def _render_system(self) -> str: + phrases = "\n".join(f"- {phrase}" for phrase in self._forbidden_phrases) + consumers = "\n".join(f"- {c.value}" for c in ConsumerType) + return self._system_template.format( + forbidden_phrases_list=phrases, + intelligence_brief_schema=_BRIEF_SCHEMA_SUMMARY, + 
consumer_type_enum=consumers, + ) + + def _render_user(self, context: SignalContext) -> str: + template_name = f"{context.signal.signal_type.value}.txt" + template_path = self._template_dir / template_name + if not template_path.exists(): + raise PromptTemplateNotFoundError( + f"no template for signal_type={context.signal.signal_type.value!r}" + ) + template = template_path.read_text(encoding="utf-8") + related = ( + "\n".join( + f"- {rm.market_id} ({rm.relationship_type}, strength {rm.relationship_strength}): " + f"price {rm.current_price}, 24h delta {rm.delta_24h}" + for rm in context.related_markets + ) + or "(none)" + ) + prompts = "\n".join(f"- {prompt}" for prompt in context.investigation_prompts) or "(none)" + flags = ",".join(flag.value for flag in context.signal.manipulation_flags) or "(none)" + return template.format( + market_id=context.signal.market_id, + platform=context.signal.platform, + market_question=context.market_question, + magnitude=f"{context.signal.magnitude:.6f}", + direction=context.signal.direction, + confidence=f"{context.signal.confidence:.6f}", + fdr_adjusted=context.signal.fdr_adjusted, + liquidity_tier=context.signal.liquidity_tier, + window_seconds=context.signal.window_seconds, + detected_at=context.signal.detected_at.isoformat().replace("+00:00", "Z"), + resolution_criteria=context.resolution_criteria, + resolution_source=context.resolution_source, + closes_at=context.closes_at.isoformat().replace("+00:00", "Z"), + manipulation_flags_csv_or_none=flags, + related_markets_block=related, + investigation_prompts_block=prompts, + ) + + +_BRIEF_SCHEMA_SUMMARY: str = ( + "- brief_id: string (uuid7)\n" + "- signal_id: string\n" + "- headline: string (max 90 chars)\n" + "- body_markdown: string (max 800 chars)\n" + "- severity: one of [high, medium, low]\n" + "- actionable_for: list of ConsumerType\n" + "- interpretation_mode: must equal 'llm_assisted'\n" + "- model: string\n" + "- prompt_hash: string (sha256 hex)\n" + "- 
formatter_version: string\n" + "- generated_at: ISO-8601 UTC datetime\n" + "- forbidden_token_check: must equal 'passed'\n" + "- schema_version: '1.0.0'" +) diff --git a/src/augur_format/augur_format/llm/prompts/templates/_system.txt b/src/augur_format/augur_format/llm/prompts/templates/_system.txt new file mode 100644 index 0000000..85acada --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/_system.txt @@ -0,0 +1,24 @@ +You generate intelligence briefs from structured prediction-market signals. +You write factual restatements with no causal interpretation. + +You must NOT: +- Use any of the forbidden phrases listed below. +- Claim to know why a market moved. +- Invent facts not present in the input. +- Recommend trades or positions. + +You must: +- Restate the numerical facts of the signal. +- Quote the resolution criteria verbatim. +- List related markets and their state. +- Repeat the investigation prompts as a bulleted list. +- Output a single JSON object matching the IntelligenceBrief schema. 
+ +Forbidden phrases (exhaustive at schema 1.0.0): +{forbidden_phrases_list} + +Output schema fields (all required unless marked optional): +{intelligence_brief_schema} + +Available consumer types (you must use only these in actionable_for): +{consumer_type_enum} diff --git a/src/augur_format/augur_format/llm/prompts/templates/book_imbalance.txt b/src/augur_format/augur_format/llm/prompts/templates/book_imbalance.txt new file mode 100644 index 0000000..0953742 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/book_imbalance.txt @@ -0,0 +1,26 @@ +Signal type: book_imbalance +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. 
diff --git a/src/augur_format/augur_format/llm/prompts/templates/cross_market_divergence.txt b/src/augur_format/augur_format/llm/prompts/templates/cross_market_divergence.txt new file mode 100644 index 0000000..9f2e7a5 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/cross_market_divergence.txt @@ -0,0 +1,26 @@ +Signal type: cross_market_divergence +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. 
diff --git a/src/augur_format/augur_format/llm/prompts/templates/price_velocity.txt b/src/augur_format/augur_format/llm/prompts/templates/price_velocity.txt new file mode 100644 index 0000000..02fd6ff --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/price_velocity.txt @@ -0,0 +1,26 @@ +Signal type: price_velocity +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. 
diff --git a/src/augur_format/augur_format/llm/prompts/templates/regime_shift.txt b/src/augur_format/augur_format/llm/prompts/templates/regime_shift.txt new file mode 100644 index 0000000..8ebc726 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/regime_shift.txt @@ -0,0 +1,26 @@ +Signal type: regime_shift +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. 
diff --git a/src/augur_format/augur_format/llm/prompts/templates/volume_spike.txt b/src/augur_format/augur_format/llm/prompts/templates/volume_spike.txt new file mode 100644 index 0000000..5d7a293 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/volume_spike.txt @@ -0,0 +1,26 @@ +Signal type: volume_spike +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. 
diff --git a/src/augur_format/pyproject.toml b/src/augur_format/pyproject.toml index 0dcac63..c74250b 100644 --- a/src/augur_format/pyproject.toml +++ b/src/augur_format/pyproject.toml @@ -22,4 +22,4 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["augur_format"] -include = ["augur_format/**/*.j2"] +include = ["augur_format/**/*.j2", "augur_format/**/*.txt"] diff --git a/tests/format/test_prompt_builder.py b/tests/format/test_prompt_builder.py new file mode 100644 index 0000000..f2d49e2 --- /dev/null +++ b/tests/format/test_prompt_builder.py @@ -0,0 +1,142 @@ +"""Tests for the LLM prompt builder.""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest + +from augur_format.llm.prompts.builder import ( + PromptBuilder, + PromptTemplateNotFoundError, +) +from augur_signals.models import ( + ConsumerType, + InterpretationMode, + ManipulationFlag, + MarketSignal, + RelatedMarketState, + SignalContext, + SignalType, + new_signal_id, +) + +FORBIDDEN = ["may be driven by", "likely reflects", "suggests that"] + + +def _context( + signal_type: SignalType = SignalType.PRICE_VELOCITY, + manipulation_flags: list[ManipulationFlag] | None = None, + related: list[RelatedMarketState] | None = None, +) -> SignalContext: + signal = MarketSignal( + signal_id=new_signal_id(), + market_id="kalshi_fed", + platform="kalshi", + signal_type=signal_type, + magnitude=0.8, + direction=1, + confidence=0.72, + fdr_adjusted=True, + detected_at=datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + window_seconds=300, + liquidity_tier="high", + manipulation_flags=manipulation_flags or [], + raw_features={"calibration_provenance": "d@identity_v0"}, + ) + return SignalContext( + signal=signal, + market_question="Will the Fed raise rates?", + resolution_criteria="YES if rate rises.", + resolution_source="Federal Reserve press release", + closes_at=datetime(2026, 6, 15, tzinfo=UTC), + related_markets=related or [], + 
investigation_prompts=["Check FOMC calendar."], + interpretation_mode=InterpretationMode.DETERMINISTIC, + ) + + +@pytest.fixture +def builder() -> PromptBuilder: + return PromptBuilder(FORBIDDEN) + + +@pytest.mark.unit +def test_deterministic_across_calls(builder: PromptBuilder) -> None: + ctx = _context() + a = builder.build(ctx) + b = builder.build(ctx) + assert a == b + + +@pytest.mark.unit +def test_system_injects_forbidden_phrases(builder: PromptBuilder) -> None: + system, _ = builder.build(_context()) + for phrase in FORBIDDEN: + assert phrase in system + + +@pytest.mark.unit +def test_system_injects_full_consumer_enum(builder: PromptBuilder) -> None: + system, _ = builder.build(_context()) + for consumer in ConsumerType: + assert consumer.value in system + + +@pytest.mark.unit +def test_user_contains_verbatim_resolution_criteria(builder: PromptBuilder) -> None: + _, user = builder.build(_context()) + assert "YES if rate rises." in user + + +@pytest.mark.unit +def test_manipulation_flags_reported_in_user(builder: PromptBuilder) -> None: + _, user = builder.build(_context(manipulation_flags=[ManipulationFlag.SIZE_VS_DEPTH_OUTLIER])) + assert "size_vs_depth_outlier" in user + + +@pytest.mark.unit +def test_none_flags_render_as_placeholder(builder: PromptBuilder) -> None: + _, user = builder.build(_context()) + assert "Manipulation flags: (none)" in user + + +@pytest.mark.unit +def test_every_signal_type_has_a_template(builder: PromptBuilder) -> None: + for signal_type in SignalType: + _, user = builder.build(_context(signal_type=signal_type)) + assert f"Signal type: {signal_type.value}" in user + + +@pytest.mark.unit +def test_related_markets_render_as_bullets(builder: PromptBuilder) -> None: + related = [ + RelatedMarketState( + market_id="kalshi_fed_holds", + question="?", + current_price=0.42, + delta_24h=-0.02, + volume_24h=1000.0, + relationship_type="inverse", + relationship_strength=0.9, + ) + ] + _, user = builder.build(_context(related=related)) + 
assert "kalshi_fed_holds" in user + + +@pytest.mark.unit +def test_missing_template_raises(tmp_path: object) -> None: + import shutil + + from augur_format.llm.prompts.builder import _DEFAULT_TEMPLATE_DIR + + isolated = tmp_path # type: ignore[assignment] + isolated_path = isolated # appease mypy; tmp_path is Path in practice. + assert isolated_path # keep name + target = tmp_path / "templates" # type: ignore[operator] + shutil.copytree(_DEFAULT_TEMPLATE_DIR, target) + (target / "price_velocity.txt").unlink() + builder = PromptBuilder(FORBIDDEN, template_dir=target) + with pytest.raises(PromptTemplateNotFoundError): + builder.build(_context()) From 01ee4da81cc8899b6159bc2149341b964be03d88 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 14:24:50 +0530 Subject: [PATCH 5/8] feat(llm): forbidden-token linter, schema validator, provenance stamp, consumer gate Four defense layers sit between the backend's raw text and a persisted IntelligenceBrief: ForbiddenTokenLinter case-insensitive matches every phrase loaded from config/forbidden_tokens.toml (causal_narrative, price_projection, manipulation_speculation). A match drops the brief before IntelligenceBrief construction. load_forbidden_phrases flattens every [category].phrases block into a single list so the linter does not need to know category semantics. SchemaValidator wraps Pydantic's IntelligenceBrief.model_validate and translates ValidationError into a stable ValidationResult. The interpreter checks result.ok before minting a brief; any schema violation drops the brief and logs the offending field path. ProvenanceStamp holds model (backend-qualified), prompt_hash (SHA-256 of system + "\n\n" + user), and formatter_version (from installed package metadata). Auditors reproduce prompt_hash from the deterministic prompt builder to confirm the model saw exactly what the record claims. 
ConsumerGate enforces the docs/contracts/consumer-registry.md opt-in rule: only consumers with accepts_llm_assisted=true receive the LLM brief. The deterministic JSON and Markdown paths still reach every consumer; the gate only filters the secondary formatter's output. Eleven tests cover: every configured phrase rejected, case insensitivity, clean text accepted, brief-shape lint, schema validator accept + two rejection modes, stamp reproducibility, stamp hash varies on prompt change, gate eligibility both directions, and list filtering. --- .../augur_format/llm/linter/__init__.py | 3 + .../llm/linter/forbidden_tokens.py | 62 +++++++ .../augur_format/llm/linter/schema_check.py | 35 ++++ .../augur_format/llm/provenance/__init__.py | 3 + .../augur_format/llm/provenance/stamp.py | 49 +++++ .../augur_format/llm/routing/__init__.py | 3 + .../augur_format/llm/routing/consumer_gate.py | 37 ++++ tests/format/test_llm_linter.py | 171 ++++++++++++++++++ 8 files changed, 363 insertions(+) create mode 100644 src/augur_format/augur_format/llm/linter/__init__.py create mode 100644 src/augur_format/augur_format/llm/linter/forbidden_tokens.py create mode 100644 src/augur_format/augur_format/llm/linter/schema_check.py create mode 100644 src/augur_format/augur_format/llm/provenance/__init__.py create mode 100644 src/augur_format/augur_format/llm/provenance/stamp.py create mode 100644 src/augur_format/augur_format/llm/routing/__init__.py create mode 100644 src/augur_format/augur_format/llm/routing/consumer_gate.py create mode 100644 tests/format/test_llm_linter.py diff --git a/src/augur_format/augur_format/llm/linter/__init__.py b/src/augur_format/augur_format/llm/linter/__init__.py new file mode 100644 index 0000000..d44f71a --- /dev/null +++ b/src/augur_format/augur_format/llm/linter/__init__.py @@ -0,0 +1,3 @@ +"""Forbidden-token linter and schema validator for LLM output.""" + +from __future__ import annotations diff --git 
a/src/augur_format/augur_format/llm/linter/forbidden_tokens.py b/src/augur_format/augur_format/llm/linter/forbidden_tokens.py new file mode 100644 index 0000000..cfc9eee --- /dev/null +++ b/src/augur_format/augur_format/llm/linter/forbidden_tokens.py @@ -0,0 +1,62 @@ +"""Forbidden-token linter. + +Rejects LLM output containing any phrase from the closed list in +config/forbidden_tokens.toml. The linter operates on the raw text +before the brief is constructed — a failing lint drops the brief +entirely per phase-4 §10. +""" + +from __future__ import annotations + +import tomllib +from collections.abc import Sequence +from pathlib import Path + +from pydantic import BaseModel, ConfigDict + + +class ForbiddenTokenCheckResult(BaseModel): + """Outcome of one forbidden-token check.""" + + model_config = ConfigDict(frozen=True, extra="forbid") + + passed: bool + matched_phrases: list[str] + + +class ForbiddenTokenLinter: + """Case-insensitive exact-phrase rejection.""" + + def __init__(self, forbidden_phrases: Sequence[str]) -> None: + self._phrases = [p.lower() for p in forbidden_phrases] + + @property + def phrase_count(self) -> int: + return len(self._phrases) + + def check_text(self, text: str) -> ForbiddenTokenCheckResult: + lowered = text.lower() + matched = [p for p in self._phrases if p in lowered] + return ForbiddenTokenCheckResult(passed=not matched, matched_phrases=matched) + + def check_brief(self, brief: dict[str, object]) -> ForbiddenTokenCheckResult: + headline = str(brief.get("headline", "")) + body = str(brief.get("body_markdown", "")) + return self.check_text(f"{headline}\n{body}") + + +def load_forbidden_phrases(path: Path) -> list[str]: + """Flatten every [category].phrases table in the TOML into a single list. + + The file ships with categorized phrases (causal_narrative, + price_projection, manipulation_speculation); the linter treats + every category uniformly so phrase provenance is a config-layer + concern. 
+ """ + with path.open("rb") as handle: + raw = tomllib.load(handle) + phrases: list[str] = [] + for section in raw.values(): + if isinstance(section, dict) and "phrases" in section: + phrases.extend(str(p) for p in section["phrases"]) + return phrases diff --git a/src/augur_format/augur_format/llm/linter/schema_check.py b/src/augur_format/augur_format/llm/linter/schema_check.py new file mode 100644 index 0000000..1a4c599 --- /dev/null +++ b/src/augur_format/augur_format/llm/linter/schema_check.py @@ -0,0 +1,35 @@ +"""IntelligenceBrief schema validator wrapping the Pydantic model. + +Validates a brief payload by attempting IntelligenceBrief construction. +Pydantic's ValidationError surfaces the specific field violation; the +validator translates that into a stable ValidationResult shape the +interpreter consumes. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +from pydantic import ValidationError + +from augur_format.llm.models import IntelligenceBrief + + +@dataclass(frozen=True, slots=True) +class ValidationResult: + """Outcome of one brief-schema validation.""" + + ok: bool + errors: list[str] = field(default_factory=list) + + +class SchemaValidator: + """Validate a raw brief dict against the IntelligenceBrief contract.""" + + def validate(self, brief_dict: dict[str, object]) -> ValidationResult: + try: + IntelligenceBrief.model_validate(brief_dict) + except ValidationError as err: + errors = [f"{'.'.join(str(p) for p in e['loc'])}: {e['msg']}" for e in err.errors()] + return ValidationResult(ok=False, errors=errors) + return ValidationResult(ok=True) diff --git a/src/augur_format/augur_format/llm/provenance/__init__.py b/src/augur_format/augur_format/llm/provenance/__init__.py new file mode 100644 index 0000000..23e7a52 --- /dev/null +++ b/src/augur_format/augur_format/llm/provenance/__init__.py @@ -0,0 +1,3 @@ +"""Provenance metadata builder for LLM-generated briefs.""" + +from __future__ import annotations diff --git 
a/src/augur_format/augur_format/llm/provenance/stamp.py b/src/augur_format/augur_format/llm/provenance/stamp.py new file mode 100644 index 0000000..891ba40 --- /dev/null +++ b/src/augur_format/augur_format/llm/provenance/stamp.py @@ -0,0 +1,49 @@ +"""Provenance stamping for LLM-generated briefs. + +``stamp`` returns a ProvenanceStamp whose ``prompt_hash`` is the +SHA-256 of ``system + "\\n\\n" + user``. Auditors recompute the hash +from the deterministic prompt builder to confirm the model saw +exactly what the record claims; ``formatter_version`` is read from +the installed package metadata so downgrades / upgrades are visible +in the record. +""" + +from __future__ import annotations + +import hashlib +from importlib.metadata import PackageNotFoundError, version + +from pydantic import BaseModel, ConfigDict + + +class ProvenanceStamp(BaseModel): + """The immutable provenance triple carried by every brief.""" + + model_config = ConfigDict(frozen=True, extra="forbid") + + model: str + prompt_hash: str + formatter_version: str + + +def _formatter_version() -> str: + try: + return version("augur-format") + except PackageNotFoundError: # pragma: no cover — only hit in source checkouts + return "0.0.0+unknown" + + +def stamp( + backend_id: str, + model: str, + system_prompt: str, + user_prompt: str, +) -> ProvenanceStamp: + """Return the ProvenanceStamp for a completion.""" + composite = f"{system_prompt}\n\n{user_prompt}" + digest = hashlib.sha256(composite.encode("utf-8")).hexdigest() + return ProvenanceStamp( + model=f"{model}@{backend_id}", + prompt_hash=digest, + formatter_version=_formatter_version(), + ) diff --git a/src/augur_format/augur_format/llm/routing/__init__.py b/src/augur_format/augur_format/llm/routing/__init__.py new file mode 100644 index 0000000..fbf94d5 --- /dev/null +++ b/src/augur_format/augur_format/llm/routing/__init__.py @@ -0,0 +1,3 @@ +"""Consumer gate for LLM briefs.""" + +from __future__ import annotations diff --git 
a/src/augur_format/augur_format/llm/routing/consumer_gate.py b/src/augur_format/augur_format/llm/routing/consumer_gate.py new file mode 100644 index 0000000..2439387 --- /dev/null +++ b/src/augur_format/augur_format/llm/routing/consumer_gate.py @@ -0,0 +1,37 @@ +"""Consumer gate enforcing opt-in for llm_assisted briefs. + +Per docs/contracts/consumer-registry.md, only consumers whose +configuration sets ``accepts_llm_assisted = true`` receive LLM- +rendered briefs. The deterministic JSON and Markdown briefs from +Phase 3 still reach every consumer; the gate only filters the LLM +output. +""" + +from __future__ import annotations + +from collections.abc import Iterable + +from augur_format.llm.models import IntelligenceBrief +from augur_signals.models import ConsumerType + + +class ConsumerGate: + """Filters consumer sets by accepts_llm_assisted opt-in.""" + + def __init__(self, opted_in: Iterable[ConsumerType]) -> None: + self._opted_in = frozenset(opted_in) + + @property + def opted_in(self) -> frozenset[ConsumerType]: + return self._opted_in + + def is_eligible(self, consumer: ConsumerType, brief: IntelligenceBrief) -> bool: + """True iff *consumer* has opted in to LLM-assisted briefs.""" + del brief # brief identity does not factor into the gate decision. 
+ return consumer in self._opted_in + + def filter_consumers( + self, consumers: Iterable[ConsumerType], brief: IntelligenceBrief + ) -> list[ConsumerType]: + """Return the subset of consumers eligible for this brief.""" + return [c for c in consumers if self.is_eligible(c, brief)] diff --git a/tests/format/test_llm_linter.py b/tests/format/test_llm_linter.py new file mode 100644 index 0000000..42d81a5 --- /dev/null +++ b/tests/format/test_llm_linter.py @@ -0,0 +1,171 @@ +"""Tests for forbidden-token linter, schema validator, stamp, gate.""" + +from __future__ import annotations + +from datetime import UTC, datetime +from pathlib import Path + +import pytest + +from augur_format.llm.linter.forbidden_tokens import ( + ForbiddenTokenLinter, + load_forbidden_phrases, +) +from augur_format.llm.linter.schema_check import SchemaValidator +from augur_format.llm.models import IntelligenceBrief +from augur_format.llm.provenance.stamp import stamp +from augur_format.llm.routing.consumer_gate import ConsumerGate +from augur_signals.models import ConsumerType + + +@pytest.mark.unit +def test_linter_rejects_each_configured_phrase() -> None: + phrases = load_forbidden_phrases(Path("config/forbidden_tokens.toml")) + assert phrases # sanity: at least one phrase loaded from the shipped file + linter = ForbiddenTokenLinter(phrases) + for phrase in phrases: + result = linter.check_text(f"The market {phrase} a rate change.") + assert not result.passed + assert phrase.lower() in result.matched_phrases + + +@pytest.mark.unit +def test_linter_is_case_insensitive() -> None: + linter = ForbiddenTokenLinter(["may be driven by"]) + assert not linter.check_text("Prices May Be Driven By macro moves").passed + + +@pytest.mark.unit +def test_linter_accepts_clean_text() -> None: + linter = ForbiddenTokenLinter(["may be driven by"]) + result = linter.check_text("The Fed left the rate range unchanged.") + assert result.passed + assert result.matched_phrases == [] + + +@pytest.mark.unit +def 
test_linter_check_brief_combines_headline_and_body() -> None: + linter = ForbiddenTokenLinter(["suggests that"]) + result = linter.check_brief( + {"headline": "Update", "body_markdown": "The move suggests that a cut is due."} + ) + assert not result.passed + + +@pytest.mark.unit +def test_schema_validator_accepts_valid_payload() -> None: + validator = SchemaValidator() + result = validator.validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "h", + "body_markdown": "body", + "severity": "high", + "actionable_for": ["dashboard"], + "model": "gemma2:27b@ollama", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": "2026-03-15T12:00:00Z", + } + ) + assert result.ok + + +@pytest.mark.unit +def test_schema_validator_rejects_over_length_headline() -> None: + validator = SchemaValidator() + result = validator.validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "x" * 91, + "body_markdown": "body", + "severity": "high", + "actionable_for": ["dashboard"], + "model": "m@b", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": "2026-03-15T12:00:00Z", + } + ) + assert not result.ok + assert any("headline" in e for e in result.errors) + + +@pytest.mark.unit +def test_schema_validator_rejects_unknown_consumer() -> None: + validator = SchemaValidator() + result = validator.validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "h", + "body_markdown": "body", + "severity": "high", + "actionable_for": ["not_a_consumer"], + "model": "m@b", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": "2026-03-15T12:00:00Z", + } + ) + assert not result.ok + + +@pytest.mark.unit +def test_stamp_is_reproducible() -> None: + s1 = stamp("ollama", "gemma2:27b", "system", "user") + s2 = stamp("ollama", "gemma2:27b", "system", "user") + assert s1.prompt_hash == s2.prompt_hash + assert s1.model == "gemma2:27b@ollama" + assert len(s1.prompt_hash) == 64 + + +@pytest.mark.unit +def 
test_stamp_hash_changes_on_prompt_change() -> None: + a = stamp("ollama", "gemma2:27b", "system", "user-a") + b = stamp("ollama", "gemma2:27b", "system", "user-b") + assert a.prompt_hash != b.prompt_hash + + +@pytest.mark.unit +def test_consumer_gate_allows_opted_in() -> None: + gate = ConsumerGate([ConsumerType.DASHBOARD]) + brief = IntelligenceBrief.model_validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "h", + "body_markdown": "body", + "severity": "high", + "actionable_for": ["dashboard"], + "model": "m@b", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + } + ) + assert gate.is_eligible(ConsumerType.DASHBOARD, brief) + assert not gate.is_eligible(ConsumerType.MACRO_RESEARCH_AGENT, brief) + + +@pytest.mark.unit +def test_consumer_gate_filters_list() -> None: + gate = ConsumerGate([ConsumerType.DASHBOARD]) + brief = IntelligenceBrief.model_validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "h", + "body_markdown": "body", + "severity": "high", + "actionable_for": ["dashboard"], + "model": "m@b", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + } + ) + kept = gate.filter_consumers([ConsumerType.MACRO_RESEARCH_AGENT, ConsumerType.DASHBOARD], brief) + assert kept == [ConsumerType.DASHBOARD] From 0d3f519d1af776ca8cad1a21704b6fff1eb90870 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 14:27:09 +0530 Subject: [PATCH 6/8] feat(llm): orchestrator composing backend, prompt, linter, schema, stamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LLMInterpreter is the single entrypoint the engine calls to render a SignalContext into an IntelligenceBrief through the gated path. 
The orchestrator sequences five stages: build deterministic prompt, call backend, lint output for forbidden tokens, validate against the IntelligenceBrief schema, stamp provenance. Any failure at any stage drops the brief by returning None — the deterministic pipeline proceeds unaffected, so consumers always receive the canonical JSON and Markdown outputs regardless of LLM outcome. set_suspended wires into the Phase-1 StormController's state stream per phase-4 §11: when in_storm=True the interpreter returns None immediately without calling the backend, avoiding the 5-10-second per-brief latency under storm-mode pressure. Briefs that would have been generated during suspension are not retroactively rendered. Provenance stamping attaches model identifier (backend-qualified), SHA-256 prompt hash, and formatter version to every brief. Auditors reproduce the hash from the prompt builder's deterministic output and confirm the model saw exactly what the record claims. now is a parameter so backtest harnesses can drive generated_at deterministically. Production code passes None which falls through to datetime.now(UTC); tests always pass an explicit timestamp. Eight tests cover the full pipeline: happy path, forbidden token drop, invalid JSON drop, unknown consumer drop, backend error drop, storm-mode suspension short-circuit, resume after suspension, and over-length-headline schema drop. --- .../augur_format/llm/interpreter.py | 107 +++++++++ tests/format/test_llm_interpreter.py | 210 ++++++++++++++++++ 2 files changed, 317 insertions(+) create mode 100644 src/augur_format/augur_format/llm/interpreter.py create mode 100644 tests/format/test_llm_interpreter.py diff --git a/src/augur_format/augur_format/llm/interpreter.py b/src/augur_format/augur_format/llm/interpreter.py new file mode 100644 index 0000000..b074f3a --- /dev/null +++ b/src/augur_format/augur_format/llm/interpreter.py @@ -0,0 +1,107 @@ +"""LLMInterpreter — orchestrates the gated secondary formatter. 
+ +Composes the backend, prompt builder, forbidden-token linter, schema +validator, and provenance stamp into a single ``interpret`` call per +SignalContext. Any failure (backend error, forbidden token, invalid +JSON, schema violation, storm suspension) causes the interpreter to +return None; the deterministic pipeline proceeds unaffected. +""" + +from __future__ import annotations + +import json +from datetime import UTC, datetime +from uuid import uuid4 + +from augur_format.llm.backends.base import ( + AbstractLLMBackend, + BackendError, +) +from augur_format.llm.linter.forbidden_tokens import ForbiddenTokenLinter +from augur_format.llm.linter.schema_check import SchemaValidator +from augur_format.llm.models import IntelligenceBrief +from augur_format.llm.prompts.builder import PromptBuilder +from augur_format.llm.provenance.stamp import stamp +from augur_signals.models import SignalContext + + +class LLMInterpreter: + """Generate gated IntelligenceBriefs from SignalContext.""" + + def __init__( + self, + backend: AbstractLLMBackend, + prompt_builder: PromptBuilder, + linter: ForbiddenTokenLinter, + schema_validator: SchemaValidator, + *, + max_tokens: int = 512, + temperature: float = 0.2, + ) -> None: + self._backend = backend + self._prompt_builder = prompt_builder + self._linter = linter + self._schema_validator = schema_validator + self._max_tokens = max_tokens + self._temperature = temperature + self._suspended = False + + @property + def suspended(self) -> bool: + return self._suspended + + def set_suspended(self, suspended: bool) -> None: + """Toggle storm-mode suspension. + + When True, ``interpret`` returns None without calling the + backend, matching phase-4 §11 coordination with the dedup + layer's StormController. 
+ """ + self._suspended = suspended + + async def interpret( + self, context: SignalContext, severity: str, *, now: datetime | None = None + ) -> IntelligenceBrief | None: + """Run the full gated-brief pipeline for *context*.""" + if self._suspended: + return None + system, user = self._prompt_builder.build(context) + try: + result = await self._backend.complete(system, user, self._max_tokens, self._temperature) + except BackendError: + return None + lint = self._linter.check_text(result.text) + if not lint.passed: + return None + try: + parsed = json.loads(result.text) + except json.JSONDecodeError: + return None + if not isinstance(parsed, dict): + return None + brief_payload: dict[str, object] = parsed + generated_at = now if now is not None else datetime.now(tz=UTC) + provenance = stamp( + self._backend.backend_id, + self._backend.model_id(), + system, + user, + ) + brief_payload.update( + { + "brief_id": str(uuid4()), + "signal_id": context.signal.signal_id, + "severity": severity, + "interpretation_mode": "llm_assisted", + "model": provenance.model, + "prompt_hash": provenance.prompt_hash, + "formatter_version": provenance.formatter_version, + "generated_at": generated_at, + "forbidden_token_check": "passed", + "schema_version": "1.0.0", + } + ) + validation = self._schema_validator.validate(brief_payload) + if not validation.ok: + return None + return IntelligenceBrief.model_validate(brief_payload) diff --git a/tests/format/test_llm_interpreter.py b/tests/format/test_llm_interpreter.py new file mode 100644 index 0000000..8efb346 --- /dev/null +++ b/tests/format/test_llm_interpreter.py @@ -0,0 +1,210 @@ +"""End-to-end tests for the LLMInterpreter orchestrator.""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from datetime import UTC, datetime + +import pytest + +from augur_format.llm.backends.base import ( + AbstractLLMBackend, + BackendError, + CompletionResult, +) +from augur_format.llm.interpreter import 
LLMInterpreter +from augur_format.llm.linter.forbidden_tokens import ForbiddenTokenLinter +from augur_format.llm.linter.schema_check import SchemaValidator +from augur_format.llm.prompts.builder import PromptBuilder +from augur_signals.models import ( + InterpretationMode, + MarketSignal, + SignalContext, + SignalType, + new_signal_id, +) + +FORBIDDEN = ["may be driven by", "likely reflects"] + + +def _context() -> SignalContext: + signal = MarketSignal( + signal_id=new_signal_id(), + market_id="kalshi_fed", + platform="kalshi", + signal_type=SignalType.PRICE_VELOCITY, + magnitude=0.8, + direction=1, + confidence=0.72, + fdr_adjusted=True, + detected_at=datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + window_seconds=300, + liquidity_tier="high", + raw_features={"calibration_provenance": "d@identity_v0"}, + ) + return SignalContext( + signal=signal, + market_question="Will the Fed raise rates?", + resolution_criteria="YES if rate rises.", + resolution_source="Federal Reserve", + closes_at=datetime(2026, 6, 15, tzinfo=UTC), + related_markets=[], + investigation_prompts=["Check FOMC calendar."], + interpretation_mode=InterpretationMode.DETERMINISTIC, + ) + + +@dataclass +class _StubBackend(AbstractLLMBackend): + backend_id: str = "stub" + _model: str = "stub-model" + _responses: list[str] | None = None + _exception: BaseException | None = None + + def model_id(self) -> str: + return self._model + + async def health_check(self) -> bool: + return True + + async def complete( + self, + system: str, + prompt: str, + max_tokens: int, + temperature: float, + ) -> CompletionResult: + del system, prompt, max_tokens, temperature + if self._exception is not None: + raise self._exception + if not self._responses: + raise RuntimeError("no canned response") + text = self._responses.pop(0) + return CompletionResult(text=text, input_tokens=10, output_tokens=20, duration_ms=5) + + +_VALID_RESPONSE = json.dumps( + { + "headline": "Fed holds rates", + "body_markdown": "The Fed left the 
target range unchanged.", + "actionable_for": ["dashboard"], + } +) + + +def _interpreter( + backend: _StubBackend, + forbidden: list[str] | None = None, +) -> LLMInterpreter: + return LLMInterpreter( + backend, + PromptBuilder(forbidden or FORBIDDEN), + ForbiddenTokenLinter(forbidden or FORBIDDEN), + SchemaValidator(), + ) + + +@pytest.mark.asyncio +async def test_happy_path_emits_brief() -> None: + backend = _StubBackend(_responses=[_VALID_RESPONSE]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret( + _context(), + severity="high", + now=datetime(2026, 3, 15, 12, 5, tzinfo=UTC), + ) + assert brief is not None + assert brief.headline == "Fed holds rates" + assert brief.severity == "high" + assert brief.interpretation_mode == "llm_assisted" + assert brief.prompt_hash != "" + assert brief.forbidden_token_check == "passed" # noqa: S105 + + +@pytest.mark.asyncio +async def test_forbidden_token_drops_brief() -> None: + tainted = json.dumps( + { + "headline": "Hold", + "body_markdown": "Prices may be driven by external news.", + "actionable_for": ["dashboard"], + } + ) + backend = _StubBackend(_responses=[tainted]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_invalid_json_drops_brief() -> None: + backend = _StubBackend(_responses=["{this is not json"]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_unknown_consumer_drops_brief() -> None: + bad_consumer = json.dumps( + { + "headline": "Hold", + "body_markdown": "Update.", + "actionable_for": ["not_a_consumer"], + } + ) + backend = _StubBackend(_responses=[bad_consumer]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def 
test_backend_error_drops_brief() -> None: + backend = _StubBackend(_exception=BackendError("down")) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_storm_suspension_short_circuits_before_backend_call() -> None: + backend = _StubBackend(_responses=[_VALID_RESPONSE]) + interpreter = _interpreter(backend) + interpreter.set_suspended(True) + brief = await interpreter.interpret(_context(), severity="high") + assert brief is None + # Backend call was not made; the canned response is still pending. + assert backend._responses == [_VALID_RESPONSE] + + +@pytest.mark.asyncio +async def test_resuming_from_suspension_allows_next_brief() -> None: + backend = _StubBackend(_responses=[_VALID_RESPONSE]) + interpreter = _interpreter(backend) + interpreter.set_suspended(True) + suspended = await interpreter.interpret(_context(), severity="high") + assert suspended is None + interpreter.set_suspended(False) + brief = await interpreter.interpret( + _context(), + severity="high", + now=datetime(2026, 3, 15, 12, 5, tzinfo=UTC), + ) + assert brief is not None + + +@pytest.mark.asyncio +async def test_overlong_headline_drops_brief() -> None: + long_headline = json.dumps( + { + "headline": "x" * 100, + "body_markdown": "ok", + "actionable_for": ["dashboard"], + } + ) + backend = _StubBackend(_responses=[long_headline]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="low") + assert brief is None From c2e7a9b294c9c1572589ebcecaf737d829e46450 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 14:28:15 +0530 Subject: [PATCH 7/8] docs: record gated llm secondary formatter in the changelog --- CHANGELOG.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1d2bbe..743f4dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,23 @@ All notable changes to 
Augur are recorded in this file. Format follows [Keep a C ## [Unreleased] +### Added — Gated LLM Secondary Formatter + +- `src/augur_format/llm/` package — the only location in the codebase where LLM SDK imports live, complementing the CI grep guard over `src/augur_signals/`. +- Expanded `IntelligenceBrief` contract with `headline` (≤ 90 chars), `body_markdown` (≤ 800 chars), `formatter_version`, `generated_at`, and model validators locking `interpretation_mode="llm_assisted"` and `forbidden_token_check="passed"`. Schema re-exported at `schemas/IntelligenceBrief-1.0.0.json`. +- `AbstractLLMBackend` protocol with two concrete adapters: `OllamaBackend` (plain httpx against the local daemon) and `AnthropicBackend` (lazy-imported anthropic SDK). Both accept retry budgets and raise `BackendError` on exhaustion. +- Deterministic `PromptBuilder` producing `(system, user)` prompt pairs with the sorted forbidden-phrase list, `IntelligenceBrief` schema summary, `ConsumerType` enum, and per-signal-type templates. Templates live under `augur_format/llm/prompts/templates/` and ship in the wheel. +- `ForbiddenTokenLinter` with `load_forbidden_phrases` that flattens every `[category].phrases` block in `config/forbidden_tokens.toml`. Matching is case-insensitive; a matched phrase drops the brief before `IntelligenceBrief` construction. +- `SchemaValidator` wrapping Pydantic `IntelligenceBrief.model_validate` and returning a stable `ValidationResult`. +- `ProvenanceStamp` carrying model-backend pair, SHA-256 prompt hash, and installed `formatter_version`. Auditors reproduce the hash from the deterministic prompt output. +- `ConsumerGate` enforcing `accepts_llm_assisted` opt-in per `docs/contracts/consumer-registry.md`. +- `LLMInterpreter` orchestrator composing backend + prompt + linter + schema validator + stamp. `set_suspended` wires into the Phase-1 `StormController` so briefs stop generating under storm-mode pressure. 
+- `config/llm.toml` with `[interpreter] enabled=false` default, Ollama and Anthropic backend blocks, and the prompt template directory path. + +### Operational Handoff — LLM Formatter + +After merge an operator who edits `config/llm.toml` to set `enabled = true`, installs the chosen backend (`augur-format[llm-local]` for Ollama, `augur-format[llm-cloud]` for Anthropic), and provisions any required credentials (`ANTHROPIC_API_KEY`) receives LLM-rendered briefs alongside the deterministic JSON and Markdown — but only for consumers whose `accepts_llm_assisted = true`. The deterministic pipeline runs regardless of LLM state. + ### Added — Deterministic Formatters - `src/augur_format/deterministic/json_feed.py` — `to_canonical_json` emits UTF-8 JSON bytes with stable key ordering (top-level, signal block, related-market block), six-decimal float rounding (configurable), and Z-suffix UTC timestamps. Byte-identical across invocations. From 40749576f37444d5b3a998f6b5e398a492ff89b2 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 14:37:20 +0530 Subject: [PATCH 8/8] fix(llm): wire consumer gate, lint post-parse, share schema_version constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the pr-review findings in order: HIGH (H1): LLMInterpreter now accepts an optional ConsumerGate and filters each brief's actionable_for to the opted-in subset before returning. Briefs whose actionable_for empties after filtering drop entirely — the previous wiring generated a brief whose consumer list was never validated against the accepts_llm_assisted registry, letting LLM output leak to agent consumers that had not opted in. When the filter trims the list, the brief is rebuilt via model_copy so downstream code sees only the allowed set. HIGH (H2): the forbidden-token linter now runs against the post-parse headline+body instead of the raw JSON response. 
A model that escapes a forbidden phrase as \u006d\u0061\u0079 would slip past the substring check on raw JSON but fails the lint after json.loads normalizes the escape. A regression test covers the unicode-escape bypass path. MEDIUM (M3): models.py exports SCHEMA_VERSION as a module-level constant and the interpreter plus prompt builder read from it. A schema version bump now requires one edit instead of three. MEDIUM (M4): interpreter drops the SchemaValidator wrapper's double validation; IntelligenceBrief.model_validate is the single source of schema truth. ValidationError drops the brief without a second full-validate pass. MEDIUM (M1): OllamaBackend raises immediately on 4xx responses (malformed adapter payload) instead of retrying — retries can only recover from 5xx responses and connection failures. MEDIUM (M2): AnthropicBackend narrows retry to transient failures. AuthenticationError, PermissionDeniedError, and BadRequestError class paths raise through a wrapped BackendError immediately so credential misconfigurations surface without burning the retry budget. Class lookup is string-based so the module loads without the anthropic SDK installed. Three new tests cover the consumer-gate filter path, the no-consumer-opted-in drop path, and the unicode-escape lint bypass. 
--- .../augur_format/llm/backends/anthropic.py | 13 +++++ .../augur_format/llm/backends/ollama.py | 8 ++- .../augur_format/llm/interpreter.py | 58 +++++++++++++------ src/augur_format/augur_format/llm/models.py | 4 +- .../augur_format/llm/prompts/builder.py | 3 +- tests/format/test_llm_interpreter.py | 58 ++++++++++++++++++- 6 files changed, 120 insertions(+), 24 deletions(-) diff --git a/src/augur_format/augur_format/llm/backends/anthropic.py b/src/augur_format/augur_format/llm/backends/anthropic.py index f9ce7ac..5c43f6d 100644 --- a/src/augur_format/augur_format/llm/backends/anthropic.py +++ b/src/augur_format/augur_format/llm/backends/anthropic.py @@ -80,6 +80,19 @@ async def complete( timeout=self._timeout, ) except Exception as err: + # Narrow retry to transient failures. Authentication + # and permission errors raise immediately so auth + # failures do not burn the retry budget. The class + # lookup is string-based so the module stays loadable + # without the anthropic SDK installed. + class_path = f"{type(err).__module__}.{type(err).__name__}" + terminal = { + "anthropic.AuthenticationError", + "anthropic.PermissionDeniedError", + "anthropic.BadRequestError", + } + if class_path in terminal: + raise BackendError(f"anthropic terminal error: {err!r}") from err last_error = err continue duration_ms = int((time.perf_counter() - started) * 1000) diff --git a/src/augur_format/augur_format/llm/backends/ollama.py b/src/augur_format/augur_format/llm/backends/ollama.py index d464ece..421f9c3 100644 --- a/src/augur_format/augur_format/llm/backends/ollama.py +++ b/src/augur_format/augur_format/llm/backends/ollama.py @@ -80,7 +80,13 @@ async def complete( last_error = err continue if response.status_code != 200: - last_error = BackendError(f"ollama returned status {response.status_code}") + status_error = BackendError(f"ollama returned status {response.status_code}") + # 4xx indicates a malformed request from the adapter; + # retrying will not recover. 
Surface the error + # immediately so callers see the root cause. + if 400 <= response.status_code < 500: + raise status_error + last_error = status_error continue data: dict[str, Any] = response.json() duration_ms = int((time.perf_counter() - started) * 1000) diff --git a/src/augur_format/augur_format/llm/interpreter.py b/src/augur_format/augur_format/llm/interpreter.py index b074f3a..1687b75 100644 --- a/src/augur_format/augur_format/llm/interpreter.py +++ b/src/augur_format/augur_format/llm/interpreter.py @@ -1,10 +1,20 @@ """LLMInterpreter — orchestrates the gated secondary formatter. Composes the backend, prompt builder, forbidden-token linter, schema -validator, and provenance stamp into a single ``interpret`` call per -SignalContext. Any failure (backend error, forbidden token, invalid -JSON, schema violation, storm suspension) causes the interpreter to -return None; the deterministic pipeline proceeds unaffected. +validator, consumer gate, and provenance stamp into a single +``interpret`` call per SignalContext. Any failure (backend error, +forbidden token, invalid JSON, schema violation, storm suspension) +returns None; the deterministic pipeline proceeds unaffected. + +Defense ordering: +1. Storm-mode short-circuit (before backend call). +2. Backend completion. +3. JSON parse — non-dict payloads drop the brief. +4. Forbidden-token lint against the parsed headline+body, not the raw + JSON, so unicode-escape bypass cannot slip a forbidden phrase past + the substring check. +5. Pydantic IntelligenceBrief construction (single validation pass). +6. Consumer gate trims actionable_for to consumers that opted in. 
""" from __future__ import annotations @@ -13,15 +23,14 @@ from datetime import UTC, datetime from uuid import uuid4 -from augur_format.llm.backends.base import ( - AbstractLLMBackend, - BackendError, -) +from pydantic import ValidationError + +from augur_format.llm.backends.base import AbstractLLMBackend, BackendError from augur_format.llm.linter.forbidden_tokens import ForbiddenTokenLinter -from augur_format.llm.linter.schema_check import SchemaValidator -from augur_format.llm.models import IntelligenceBrief +from augur_format.llm.models import SCHEMA_VERSION, IntelligenceBrief from augur_format.llm.prompts.builder import PromptBuilder from augur_format.llm.provenance.stamp import stamp +from augur_format.llm.routing.consumer_gate import ConsumerGate from augur_signals.models import SignalContext @@ -33,15 +42,15 @@ def __init__( backend: AbstractLLMBackend, prompt_builder: PromptBuilder, linter: ForbiddenTokenLinter, - schema_validator: SchemaValidator, *, + consumer_gate: ConsumerGate | None = None, max_tokens: int = 512, temperature: float = 0.2, ) -> None: self._backend = backend self._prompt_builder = prompt_builder self._linter = linter - self._schema_validator = schema_validator + self._gate = consumer_gate self._max_tokens = max_tokens self._temperature = temperature self._suspended = False @@ -70,9 +79,6 @@ async def interpret( result = await self._backend.complete(system, user, self._max_tokens, self._temperature) except BackendError: return None - lint = self._linter.check_text(result.text) - if not lint.passed: - return None try: parsed = json.loads(result.text) except json.JSONDecodeError: @@ -80,6 +86,13 @@ async def interpret( if not isinstance(parsed, dict): return None brief_payload: dict[str, object] = parsed + # Lint the parsed headline+body — unicode escapes in the raw JSON + # are normalized by the parser, closing the substring-bypass vector. 
+ headline = str(brief_payload.get("headline", "")) + body = str(brief_payload.get("body_markdown", "")) + lint = self._linter.check_text(f"{headline}\n{body}") + if not lint.passed: + return None generated_at = now if now is not None else datetime.now(tz=UTC) provenance = stamp( self._backend.backend_id, @@ -98,10 +111,17 @@ async def interpret( "formatter_version": provenance.formatter_version, "generated_at": generated_at, "forbidden_token_check": "passed", - "schema_version": "1.0.0", + "schema_version": SCHEMA_VERSION, } ) - validation = self._schema_validator.validate(brief_payload) - if not validation.ok: + try: + brief = IntelligenceBrief.model_validate(brief_payload) + except ValidationError: return None - return IntelligenceBrief.model_validate(brief_payload) + if self._gate is not None: + allowed = self._gate.filter_consumers(brief.actionable_for, brief) + if not allowed: + return None + if allowed != list(brief.actionable_for): + brief = brief.model_copy(update={"actionable_for": allowed}) + return brief diff --git a/src/augur_format/augur_format/llm/models.py b/src/augur_format/augur_format/llm/models.py index 2cfc69a..e887760 100644 --- a/src/augur_format/augur_format/llm/models.py +++ b/src/augur_format/augur_format/llm/models.py @@ -15,6 +15,8 @@ from augur_signals.models import ConsumerType +SCHEMA_VERSION: Literal["1.0.0"] = "1.0.0" + class IntelligenceBrief(BaseModel): """Gated LLM formatter output contract. 
@@ -44,7 +46,7 @@ class IntelligenceBrief(BaseModel): formatter_version: str generated_at: datetime forbidden_token_check: Literal["passed"] = "passed" # noqa: S105 - schema_version: Literal["1.0.0"] = "1.0.0" + schema_version: Literal["1.0.0"] = SCHEMA_VERSION @model_validator(mode="after") def _interpretation_mode_pinned(self) -> IntelligenceBrief: diff --git a/src/augur_format/augur_format/llm/prompts/builder.py b/src/augur_format/augur_format/llm/prompts/builder.py index 6a7742f..010ecd2 100644 --- a/src/augur_format/augur_format/llm/prompts/builder.py +++ b/src/augur_format/augur_format/llm/prompts/builder.py @@ -17,6 +17,7 @@ from collections.abc import Sequence from pathlib import Path +from augur_format.llm.models import SCHEMA_VERSION from augur_signals.models import ConsumerType, SignalContext _DEFAULT_TEMPLATE_DIR = Path(__file__).resolve().parent / "templates" @@ -108,5 +109,5 @@ def _render_user(self, context: SignalContext) -> str: "- formatter_version: string\n" "- generated_at: ISO-8601 UTC datetime\n" "- forbidden_token_check: must equal 'passed'\n" - "- schema_version: '1.0.0'" + f"- schema_version: '{SCHEMA_VERSION}'" ) diff --git a/tests/format/test_llm_interpreter.py b/tests/format/test_llm_interpreter.py index 8efb346..dd73598 100644 --- a/tests/format/test_llm_interpreter.py +++ b/tests/format/test_llm_interpreter.py @@ -15,9 +15,10 @@ ) from augur_format.llm.interpreter import LLMInterpreter from augur_format.llm.linter.forbidden_tokens import ForbiddenTokenLinter -from augur_format.llm.linter.schema_check import SchemaValidator from augur_format.llm.prompts.builder import PromptBuilder +from augur_format.llm.routing.consumer_gate import ConsumerGate from augur_signals.models import ( + ConsumerType, InterpretationMode, MarketSignal, SignalContext, @@ -96,12 +97,14 @@ async def complete( def _interpreter( backend: _StubBackend, forbidden: list[str] | None = None, + *, + gate: ConsumerGate | None = None, ) -> LLMInterpreter: return 
LLMInterpreter( backend, PromptBuilder(forbidden or FORBIDDEN), ForbiddenTokenLinter(forbidden or FORBIDDEN), - SchemaValidator(), + consumer_gate=gate, ) @@ -195,6 +198,57 @@ async def test_resuming_from_suspension_allows_next_brief() -> None: assert brief is not None +@pytest.mark.asyncio +async def test_gate_trims_actionable_for_to_opted_in_consumers() -> None: + # LLM emits macro_research_agent + dashboard; only dashboard opts in. + response = json.dumps( + { + "headline": "Fed holds rates", + "body_markdown": "Update.", + "actionable_for": ["macro_research_agent", "dashboard"], + } + ) + backend = _StubBackend(_responses=[response]) + interpreter = _interpreter(backend, gate=ConsumerGate([ConsumerType.DASHBOARD])) + brief = await interpreter.interpret( + _context(), + severity="high", + now=datetime(2026, 3, 15, 12, 5, tzinfo=UTC), + ) + assert brief is not None + assert brief.actionable_for == [ConsumerType.DASHBOARD] + + +@pytest.mark.asyncio +async def test_gate_drops_brief_when_no_consumer_opted_in() -> None: + response = json.dumps( + { + "headline": "Fed holds rates", + "body_markdown": "Update.", + "actionable_for": ["macro_research_agent"], + } + ) + backend = _StubBackend(_responses=[response]) + interpreter = _interpreter(backend, gate=ConsumerGate([ConsumerType.DASHBOARD])) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_unicode_escape_in_raw_json_still_caught_after_parse() -> None: + # "\u006d\u0061\u0079 be driven by" decodes to "may be driven by"; the + # substring lint against the raw JSON would miss, but the post-parse + # lint against the decoded headline catches it. 
+ tainted = ( + '{"headline":"\\u006d\\u0061\\u0079 be driven by moves",' + '"body_markdown":"ok","actionable_for":["dashboard"]}' + ) + backend = _StubBackend(_responses=[tainted]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + @pytest.mark.asyncio async def test_overlong_headline_drops_brief() -> None: long_headline = json.dumps(