diff --git a/CHANGELOG.md b/CHANGELOG.md index f1d2bbe..743f4dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,23 @@ All notable changes to Augur are recorded in this file. Format follows [Keep a C ## [Unreleased] +### Added — Gated LLM Secondary Formatter + +- `src/augur_format/llm/` package — the only location in the codebase where LLM SDK imports live, complementing the CI grep guard over `src/augur_signals/`. +- Expanded `IntelligenceBrief` contract with `headline` (≤ 90 chars), `body_markdown` (≤ 800 chars), `formatter_version`, `generated_at`, and model validators locking `interpretation_mode="llm_assisted"` and `forbidden_token_check="passed"`. Schema re-exported at `schemas/IntelligenceBrief-1.0.0.json`. +- `AbstractLLMBackend` protocol with two concrete adapters: `OllamaBackend` (plain httpx against the local daemon) and `AnthropicBackend` (lazy-imported anthropic SDK). Both accept retry budgets and raise `BackendError` on exhaustion. +- Deterministic `PromptBuilder` producing `(system, user)` prompt pairs with the sorted forbidden-phrase list, `IntelligenceBrief` schema summary, `ConsumerType` enum, and per-signal-type templates. Templates live under `augur_format/llm/prompts/templates/` and ship in the wheel. +- `ForbiddenTokenLinter` with `load_forbidden_phrases` that flattens every `[category].phrases` block in `config/forbidden_tokens.toml`. Matching is case-insensitive; a matched phrase drops the brief before `IntelligenceBrief` construction. +- `SchemaValidator` wrapping Pydantic `IntelligenceBrief.model_validate` and returning a stable `ValidationResult`. +- `ProvenanceStamp` carrying model-backend pair, SHA-256 prompt hash, and installed `formatter_version`. Auditors reproduce the hash from the deterministic prompt output. +- `ConsumerGate` enforcing `accepts_llm_assisted` opt-in per `docs/contracts/consumer-registry.md`. +- `LLMInterpreter` orchestrator composing backend + prompt + linter + schema validator + stamp. 
`set_suspended` wires into the Phase-1 `StormController` so briefs stop generating under storm-mode pressure. +- `config/llm.toml` with `[interpreter] enabled=false` default, Ollama and Anthropic backend blocks, and the prompt template directory path. + +### Operational Handoff — LLM Formatter + +After merge an operator who edits `config/llm.toml` to set `enabled = true`, installs the chosen backend (`augur-format[llm-local]` for Ollama, `augur-format[llm-cloud]` for Anthropic), and provisions any required credentials (`ANTHROPIC_API_KEY`) receives LLM-rendered briefs alongside the deterministic JSON and Markdown — but only for consumers whose `accepts_llm_assisted = true`. The deterministic pipeline runs regardless of LLM state. + ### Added — Deterministic Formatters - `src/augur_format/deterministic/json_feed.py` — `to_canonical_json` emits UTF-8 JSON bytes with stable key ordering (top-level, signal block, related-market block), six-decimal float rounding (configurable), and Z-suffix UTC timestamps. Byte-identical across invocations. diff --git a/config/llm.toml b/config/llm.toml new file mode 100644 index 0000000..2db2d06 --- /dev/null +++ b/config/llm.toml @@ -0,0 +1,30 @@ +# Gated LLM secondary formatter configuration. +# +# Default off per phase-4 §17.1. An operator enables the interpreter +# by setting [interpreter] enabled = true after reviewing the +# reputation-risk example in docs/examples/negative-paths.md §Example 4. +# +# Model identifiers live in this file per ~/.claude/CLAUDE.md §Models +# & Configuration — never in source code. 
+ +[interpreter] +enabled = false +default_backend = "ollama" +max_tokens = 512 +temperature = 0.2 +suspend_during_storm = true + +[backends.ollama] +endpoint = "http://localhost:11434" +model = "gemma2:27b" +timeout_seconds = 30 +max_retries = 2 + +[backends.anthropic] +model = "claude-haiku-4-5-20251001" +timeout_seconds = 20 +max_retries = 3 +api_key_env = "ANTHROPIC_API_KEY" + +[prompts] +template_dir = "src/augur_format/augur_format/llm/prompts/templates" diff --git a/schemas/IntelligenceBrief-1.0.0.json b/schemas/IntelligenceBrief-1.0.0.json index 65637ea..3a769d9 100644 --- a/schemas/IntelligenceBrief-1.0.0.json +++ b/schemas/IntelligenceBrief-1.0.0.json @@ -15,7 +15,7 @@ } }, "additionalProperties": false, - "description": "Gated LLM formatter output contract.\n\n``actionable_for`` is constrained to the ConsumerType registry in\ndocs/contracts/consumer-registry.md via the Pydantic field type;\nthe closed-enum validator rechecks this at the formatter boundary\nso even dynamically-constructed instances fail loud on unknown\nvalues.", + "description": "Gated LLM formatter output contract.\n\nStructural invariants are enforced by Pydantic at construction:\nthe headline is capped at 90 characters so it fits a Slack header,\nbody_markdown is capped at 800 characters so it stays readable on\na dashboard card, ``actionable_for`` is typed as list[ConsumerType]\nso unknown consumers fail immediately, and ``interpretation_mode``\nplus ``forbidden_token_check`` are Literal singletons \u2014 any\nconstruction path that bypasses the linter or the deterministic-\nmode check would have to forge the literal, which is caught in\ncode review.", "properties": { "actionable_for": { "items": { @@ -25,6 +25,7 @@ "type": "array" }, "body_markdown": { + "maxLength": 800, "title": "Body Markdown", "type": "string" }, @@ -38,7 +39,17 @@ "title": "Forbidden Token Check", "type": "string" }, + "formatter_version": { + "title": "Formatter Version", + "type": "string" + }, + 
"generated_at": { + "format": "date-time", + "title": "Generated At", + "type": "string" + }, "headline": { + "maxLength": 90, "title": "Headline", "type": "string" }, @@ -83,7 +94,9 @@ "body_markdown", "severity", "model", - "prompt_hash" + "prompt_hash", + "formatter_version", + "generated_at" ], "title": "IntelligenceBrief", "type": "object" diff --git a/src/augur_format/augur_format/llm/backends/__init__.py b/src/augur_format/augur_format/llm/backends/__init__.py new file mode 100644 index 0000000..8e7ed29 --- /dev/null +++ b/src/augur_format/augur_format/llm/backends/__init__.py @@ -0,0 +1,3 @@ +"""LLM backend abstraction and concrete adapters.""" + +from __future__ import annotations diff --git a/src/augur_format/augur_format/llm/backends/anthropic.py b/src/augur_format/augur_format/llm/backends/anthropic.py new file mode 100644 index 0000000..5c43f6d --- /dev/null +++ b/src/augur_format/augur_format/llm/backends/anthropic.py @@ -0,0 +1,114 @@ +"""Anthropic backend adapter. + +Imports the anthropic SDK lazily via ``importlib.import_module`` so +that the llm-isolation test continues to assert anthropic is NOT +importable in the default environment. Operators install anthropic +via the ``augur-format[llm-cloud]`` extra before enabling the +backend. 
class AnthropicBackend(AbstractLLMBackend):
    """AbstractLLMBackend implementation routed through the anthropic SDK.

    The SDK is imported lazily inside ``__init__`` so this module stays
    importable when the ``anthropic`` extra is not installed.

    Args:
        model: Model identifier sent with every completion request.
        api_key_env: Name of the environment variable holding the API key.
        timeout_seconds: Per-request timeout forwarded to the SDK call.
        max_retries: Total number of attempts per ``complete`` call;
            values below 1 are clamped to 1.
        client: Pre-built async client. When supplied, the SDK is not
            imported and the API key is optional.

    Raises:
        BackendError: If neither the API-key variable nor an injected
            client is available.
    """

    backend_id: str = "anthropic"

    # Error class names that cannot succeed on retry. Matching is by name
    # so the module never needs the SDK imported to classify an error.
    _TERMINAL_ERROR_NAMES = frozenset(
        {
            "AuthenticationError",
            "PermissionDeniedError",
            "BadRequestError",
        }
    )

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        api_key_env: str = "ANTHROPIC_API_KEY",
        timeout_seconds: float = 20.0,
        max_retries: int = 3,
        client: Any | None = None,
    ) -> None:
        key = os.environ.get(api_key_env)
        if key is None and client is None:
            raise BackendError(
                f"AnthropicBackend requires {api_key_env} environment variable "
                "or an injected client"
            )
        self._model = model
        self._timeout = timeout_seconds
        self._max_retries = max(1, max_retries)
        if client is None:
            # Lazy import so the module is safely loadable when the
            # anthropic extra is not installed; the adapter itself
            # only runs when the operator opts in.
            anthropic = importlib.import_module("anthropic")
            client = anthropic.AsyncAnthropic(api_key=key)
        self._client = client

    @classmethod
    def _is_terminal(cls, err: BaseException) -> bool:
        """Return True when *err* is, or derives from, a non-retryable SDK error.

        The whole MRO is inspected so subclasses of a terminal error are
        terminal too, and only the top-level package name is compared so
        both ``anthropic`` and ``anthropic._exceptions`` module paths match.
        """
        for klass in type(err).__mro__:
            package = (getattr(klass, "__module__", "") or "").split(".", 1)[0]
            if package == "anthropic" and klass.__name__ in cls._TERMINAL_ERROR_NAMES:
                return True
        return False

    def model_id(self) -> str:
        """Return the configured model identifier."""
        return self._model

    async def health_check(self) -> bool:
        """Report whether a client object is present; no network call is made."""
        # The SDK does not expose a cheap ping; surface True when the
        # client constructs successfully and let the first real
        # completion surface any runtime errors.
        return self._client is not None

    async def complete(
        self,
        system: str,
        prompt: str,
        max_tokens: int,
        temperature: float,
    ) -> CompletionResult:
        """Request one completion, retrying non-terminal failures.

        Returns:
            The concatenated text blocks of the response, the reported
            token usage (0 when absent), and the duration of the
            successful attempt in milliseconds.

        Raises:
            BackendError: Immediately on a terminal SDK error, or after
                the attempt budget is exhausted.
        """
        last_error: BaseException | None = None
        for _ in range(self._max_retries):
            started = time.perf_counter()
            try:
                response = await self._client.messages.create(
                    model=self._model,
                    max_tokens=max_tokens,
                    temperature=temperature,
                    system=system,
                    messages=[{"role": "user", "content": prompt}],
                    timeout=self._timeout,
                )
            except Exception as err:
                # Authentication, permission and bad-request errors raise
                # immediately so they do not burn the retry budget; every
                # other failure is treated as transient and retried.
                if self._is_terminal(err):
                    raise BackendError(f"anthropic terminal error: {err!r}") from err
                last_error = err
                continue
            duration_ms = int((time.perf_counter() - started) * 1000)
            content_blocks = getattr(response, "content", [])
            # Only text blocks contribute to the completion text.
            text_parts = [
                getattr(block, "text", "")
                for block in content_blocks
                if getattr(block, "type", "") == "text"
            ]
            usage = getattr(response, "usage", None)
            return CompletionResult(
                text="".join(text_parts),
                input_tokens=int(getattr(usage, "input_tokens", 0)) if usage else 0,
                output_tokens=int(getattr(usage, "output_tokens", 0)) if usage else 0,
                duration_ms=duration_ms,
            )
        raise BackendError(
            f"anthropic completion failed after {self._max_retries} attempts: {last_error!r}"
        ) from last_error
class CompletionResult(BaseModel):
    """One backend completion's payload plus timing.

    Instances are frozen and reject unknown fields, as configured in
    ``model_config`` below.
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    # Raw completion text returned by the backend.
    text: str
    # Token counts for observability; 0 when the backend reports none.
    input_tokens: int = 0
    output_tokens: int = 0
    # Duration of the completion call in milliseconds.
    duration_ms: int = 0


class BackendError(RuntimeError):
    """Raised when a backend cannot produce a completion."""


class AbstractLLMBackend(Protocol):
    """Uniform surface every LLM backend adapter implements.

    Adapters expose a ``backend_id`` string plus the three methods
    below; ``complete`` and ``health_check`` are coroutines.
    """

    # Short identifier of the adapter (the adapters in this package use
    # "ollama" and "anthropic").
    backend_id: str

    async def complete(
        self,
        system: str,
        prompt: str,
        max_tokens: int,
        temperature: float,
    ) -> CompletionResult:
        """Return the model's completion for (*system*, *prompt*)."""
        ...

    def model_id(self) -> str:
        """Return the active model identifier (e.g. ``gemma2:27b``)."""
        ...

    async def health_check(self) -> bool:
        """Verify the backend is reachable and serving the configured model."""
        ...
class OllamaBackend(AbstractLLMBackend):
    """AbstractLLMBackend implementation routed through the local daemon.

    Args:
        client: Shared ``httpx.AsyncClient`` used for every request.
        endpoint: Base URL of the Ollama daemon; a trailing slash is removed.
        model: Model tag sent with every generate request.
        timeout_seconds: Per-request timeout.
        max_retries: Total number of attempts per ``complete`` call;
            values below 1 are clamped to 1.
    """

    backend_id: str = "ollama"

    def __init__(
        self,
        client: httpx.AsyncClient,
        endpoint: str = "http://localhost:11434",
        model: str = "gemma2:27b",
        timeout_seconds: float = 30.0,
        max_retries: int = 2,
    ) -> None:
        self._client = client
        self._endpoint = endpoint.rstrip("/")
        self._model = model
        self._timeout = timeout_seconds
        self._max_retries = max(1, max_retries)

    def model_id(self) -> str:
        """Return the configured model tag."""
        return self._model

    async def health_check(self) -> bool:
        """Return True when ``GET /api/tags`` answers with HTTP 200."""
        try:
            response = await self._client.get(f"{self._endpoint}/api/tags", timeout=self._timeout)
        except Exception:
            # Any transport failure means the daemon is not usable.
            return False
        return response.status_code == 200

    async def complete(
        self,
        system: str,
        prompt: str,
        max_tokens: int,
        temperature: float,
    ) -> CompletionResult:
        """Request one non-streaming completion from ``/api/generate``.

        Transport failures, 5xx responses and unparseable bodies are
        retried within the attempt budget; 4xx responses fail at once.

        Raises:
            BackendError: On a 4xx response, or once every attempt has
                failed. No other exception type escapes this method.
        """
        payload: dict[str, Any] = {
            "model": self._model,
            "system": system,
            "prompt": prompt,
            "stream": False,
            "options": {
                "num_predict": max_tokens,
                "temperature": temperature,
            },
        }
        last_error: BaseException | None = None
        for _ in range(self._max_retries):
            started = time.perf_counter()
            try:
                response = await self._client.post(
                    f"{self._endpoint}/api/generate",
                    json=payload,
                    timeout=self._timeout,
                )
            except Exception as err:
                last_error = err
                continue
            if response.status_code != 200:
                status_error = BackendError(f"ollama returned status {response.status_code}")
                # 4xx indicates a malformed request from the adapter;
                # retrying will not recover. Surface the error
                # immediately so callers see the root cause.
                if 400 <= response.status_code < 500:
                    raise status_error
                last_error = status_error
                continue
            try:
                data = response.json()
            except ValueError as err:
                # A 200 with a non-JSON body (proxy page, truncated
                # stream) must not escape as a bare decode error:
                # callers only handle BackendError.
                last_error = err
                continue
            if not isinstance(data, dict):
                last_error = BackendError(
                    f"ollama returned non-object JSON: {type(data).__name__}"
                )
                continue
            duration_ms = int((time.perf_counter() - started) * 1000)
            # ``or`` guards against explicit JSON nulls as well as
            # missing keys, so int(None) can never be evaluated.
            return CompletionResult(
                text=str(data.get("response") or ""),
                input_tokens=int(data.get("prompt_eval_count") or 0),
                output_tokens=int(data.get("eval_count") or 0),
                duration_ms=duration_ms,
            )
        raise BackendError(
            f"ollama completion failed after {self._max_retries} attempts: {last_error!r}"
        ) from last_error
class LLMInterpreter:
    """Generate gated IntelligenceBriefs from SignalContext.

    Args:
        backend: Adapter that produces the raw completion.
        prompt_builder: Builds the (system, user) prompt pair.
        linter: Forbidden-phrase linter applied to the parsed output.
        consumer_gate: Optional gate that trims ``actionable_for``;
            when None the brief is returned untrimmed.
        max_tokens: Completion token budget passed to the backend.
        temperature: Sampling temperature passed to the backend.
    """

    def __init__(
        self,
        backend: AbstractLLMBackend,
        prompt_builder: PromptBuilder,
        linter: ForbiddenTokenLinter,
        *,
        consumer_gate: ConsumerGate | None = None,
        max_tokens: int = 512,
        temperature: float = 0.2,
    ) -> None:
        self._backend = backend
        self._prompt_builder = prompt_builder
        self._linter = linter
        self._gate = consumer_gate
        self._max_tokens = max_tokens
        self._temperature = temperature
        self._suspended = False

    @property
    def suspended(self) -> bool:
        """Whether storm-mode suspension is currently active."""
        return self._suspended

    def set_suspended(self, suspended: bool) -> None:
        """Toggle storm-mode suspension.

        When True, ``interpret`` returns None without calling the
        backend, matching phase-4 §11 coordination with the dedup
        layer's StormController.
        """
        self._suspended = suspended

    async def interpret(
        self, context: SignalContext, severity: str, *, now: datetime | None = None
    ) -> IntelligenceBrief | None:
        """Run the full gated-brief pipeline for *context*.

        Args:
            context: Signal context rendered into the prompt.
            severity: Severity value stamped onto the brief.
            now: Timestamp to use for ``generated_at``; defaults to the
                current UTC time.

        Returns:
            The validated brief, or None when the interpreter is
            suspended, the backend raises BackendError, the output is
            not a JSON object, the lint fails, validation fails, or the
            consumer gate leaves no consumers.
        """
        if self._suspended:
            return None
        system, user = self._prompt_builder.build(context)
        try:
            result = await self._backend.complete(system, user, self._max_tokens, self._temperature)
        except BackendError:
            return None
        try:
            parsed = json.loads(result.text)
        except json.JSONDecodeError:
            return None
        if not isinstance(parsed, dict):
            return None
        brief_payload: dict[str, object] = parsed
        # Lint the parsed headline+body — unicode escapes in the raw JSON
        # are normalized by the parser, closing the substring-bypass vector.
        # The linter's own helper assembles the text so the two call sites
        # cannot drift apart.
        lint = self._linter.check_brief(brief_payload)
        if not lint.passed:
            return None
        generated_at = now if now is not None else datetime.now(tz=UTC)
        provenance = stamp(
            self._backend.backend_id,
            self._backend.model_id(),
            system,
            user,
        )
        # Trusted fields always overwrite whatever the model emitted for
        # the same keys.
        brief_payload.update(
            {
                "brief_id": str(uuid4()),
                "signal_id": context.signal.signal_id,
                "severity": severity,
                "interpretation_mode": "llm_assisted",
                "model": provenance.model,
                "prompt_hash": provenance.prompt_hash,
                "formatter_version": provenance.formatter_version,
                "generated_at": generated_at,
                "forbidden_token_check": "passed",
                "schema_version": SCHEMA_VERSION,
            }
        )
        try:
            brief = IntelligenceBrief.model_validate(brief_payload)
        except ValidationError:
            return None
        if self._gate is not None:
            allowed = self._gate.filter_consumers(brief.actionable_for, brief)
            if not allowed:
                return None
            if allowed != list(brief.actionable_for):
                brief = brief.model_copy(update={"actionable_for": allowed})
        return brief
class ForbiddenTokenCheckResult(BaseModel):
    """Outcome of one forbidden-token check."""

    model_config = ConfigDict(frozen=True, extra="forbid")

    # True when no forbidden phrase was found.
    passed: bool
    # Case-folded phrases that occurred in the checked text.
    matched_phrases: list[str]


class ForbiddenTokenLinter:
    """Case-insensitive exact-phrase rejection.

    Phrases are matched as substrings after Unicode case folding.
    Blank phrases are discarded at construction: an empty string is a
    substring of every text and would reject every brief.
    """

    def __init__(self, forbidden_phrases: Sequence[str]) -> None:
        # Only blank entries are dropped; surrounding whitespace on real
        # phrases is kept because it may be deliberate (word boundaries).
        self._phrases = [p.casefold() for p in forbidden_phrases if p.strip()]

    @property
    def phrase_count(self) -> int:
        """Number of non-blank phrases the linter matches against."""
        return len(self._phrases)

    def check_text(self, text: str) -> ForbiddenTokenCheckResult:
        """Check *text* and report every forbidden phrase it contains."""
        # casefold() rather than lower(): it also equates forms such as
        # "ß" and "SS" that lower() leaves distinct.
        folded = text.casefold()
        matched = [p for p in self._phrases if p in folded]
        return ForbiddenTokenCheckResult(passed=not matched, matched_phrases=matched)

    def check_brief(self, brief: dict[str, object]) -> ForbiddenTokenCheckResult:
        """Check the ``headline`` and ``body_markdown`` values of *brief*.

        Missing keys are treated as empty strings; the two values are
        joined with a newline before matching.
        """
        headline = str(brief.get("headline", ""))
        body = str(brief.get("body_markdown", ""))
        return self.check_text(f"{headline}\n{body}")
every [category].phrases table in the TOML into a single list. + + The file ships with categorized phrases (causal_narrative, + price_projection, manipulation_speculation); the linter treats + every category uniformly so phrase provenance is a config-layer + concern. + """ + with path.open("rb") as handle: + raw = tomllib.load(handle) + phrases: list[str] = [] + for section in raw.values(): + if isinstance(section, dict) and "phrases" in section: + phrases.extend(str(p) for p in section["phrases"]) + return phrases diff --git a/src/augur_format/augur_format/llm/linter/schema_check.py b/src/augur_format/augur_format/llm/linter/schema_check.py new file mode 100644 index 0000000..1a4c599 --- /dev/null +++ b/src/augur_format/augur_format/llm/linter/schema_check.py @@ -0,0 +1,35 @@ +"""IntelligenceBrief schema validator wrapping the Pydantic model. + +Validates a brief payload by attempting IntelligenceBrief construction. +Pydantic's ValidationError surfaces the specific field violation; the +validator translates that into a stable ValidationResult shape the +interpreter consumes. 
@dataclass(frozen=True, slots=True)
class ValidationResult:
    """Result of validating one brief payload against the schema."""

    # True when the payload validated.
    ok: bool
    # One "<dotted.location>: <message>" entry per violation.
    errors: list[str] = field(default_factory=list)


class SchemaValidator:
    """Check a raw brief dict against the IntelligenceBrief contract."""

    def validate(self, brief_dict: dict[str, object]) -> ValidationResult:
        """Validate *brief_dict* and report every violation found."""
        try:
            IntelligenceBrief.model_validate(brief_dict)
        except ValidationError as exc:
            messages: list[str] = []
            for issue in exc.errors():
                location = ".".join(str(part) for part in issue["loc"])
                messages.append(f"{location}: {issue['msg']}")
            return ValidationResult(ok=False, errors=messages)
        else:
            return ValidationResult(ok=True)
# Single source for the schema version; used as the default of
# ``IntelligenceBrief.schema_version`` below.
SCHEMA_VERSION: Literal["1.0.0"] = "1.0.0"


class IntelligenceBrief(BaseModel):
    """Gated LLM formatter output contract.

    Structural invariants are enforced by Pydantic at construction:
    the headline is capped at 90 characters so it fits a Slack header,
    body_markdown is capped at 800 characters so it stays readable on
    a dashboard card, ``actionable_for`` is typed as list[ConsumerType]
    so unknown consumers fail immediately, and ``interpretation_mode``
    plus ``forbidden_token_check`` are Literal singletons — any
    construction path that bypasses the linter or the deterministic-
    mode check would have to forge the literal, which is caught in
    code review.

    Instances are frozen and reject unknown fields (see ``model_config``).
    """

    model_config = ConfigDict(frozen=True, extra="forbid")

    brief_id: str
    signal_id: str
    # Length caps are declared on the field type itself.
    headline: Annotated[str, Field(max_length=90)]
    body_markdown: Annotated[str, Field(max_length=800)]
    severity: Literal["high", "medium", "low"]
    # Defaults to an empty list when the payload omits the key.
    actionable_for: list[ConsumerType] = Field(default_factory=list)
    interpretation_mode: Literal["llm_assisted"] = "llm_assisted"
    model: str
    prompt_hash: str
    formatter_version: str
    generated_at: datetime
    forbidden_token_check: Literal["passed"] = "passed"  # noqa: S105
    schema_version: Literal["1.0.0"] = SCHEMA_VERSION

    # The two validators below re-assert values that the Literal field
    # types above already constrain.
    @model_validator(mode="after")
    def _interpretation_mode_pinned(self) -> IntelligenceBrief:
        """Reject any brief whose interpretation_mode is not ``llm_assisted``."""
        if self.interpretation_mode != "llm_assisted":
            raise ValueError("LLM-rendered briefs must declare interpretation_mode=llm_assisted")
        return self

    @model_validator(mode="after")
    def _forbidden_token_check_marker(self) -> IntelligenceBrief:
        """Reject any brief whose forbidden_token_check is not ``passed``."""
        if self.forbidden_token_check != "passed":  # noqa: S105
            raise ValueError("Brief without passed forbidden-token check cannot exist")
        return self
_DEFAULT_TEMPLATE_DIR = Path(__file__).resolve().parent / "templates"


class PromptTemplateNotFoundError(RuntimeError):
    """Signals that a required prompt template file does not exist."""


class PromptBuilder:
    """Build the deterministic (system, user) prompt pair for a signal."""

    def __init__(
        self,
        forbidden_phrases: Sequence[str],
        template_dir: Path | None = None,
    ) -> None:
        base_dir = template_dir or _DEFAULT_TEMPLATE_DIR
        system_file = base_dir / "_system.txt"
        if not system_file.exists():
            raise PromptTemplateNotFoundError(f"system template missing at {system_file}")
        self._template_dir = base_dir
        # Sorted once so the rendered list does not depend on caller order.
        self._forbidden_phrases = sorted(forbidden_phrases)
        self._system_template = system_file.read_text(encoding="utf-8")

    def build(self, context: SignalContext) -> tuple[str, str]:
        """Render and return ``(system_prompt, user_prompt)`` for *context*."""
        system_prompt = self._render_system()
        user_prompt = self._render_user(context)
        return system_prompt, user_prompt

    def _render_system(self) -> str:
        """Fill the system template with phrases, schema summary and consumers."""
        phrase_lines = [f"- {phrase}" for phrase in self._forbidden_phrases]
        consumer_lines = [f"- {member.value}" for member in ConsumerType]
        return self._system_template.format(
            forbidden_phrases_list="\n".join(phrase_lines),
            intelligence_brief_schema=_BRIEF_SCHEMA_SUMMARY,
            consumer_type_enum="\n".join(consumer_lines),
        )

    def _render_user(self, context: SignalContext) -> str:
        """Fill the per-signal-type template with the fields of *context*."""
        signal = context.signal
        type_name = signal.signal_type.value
        template_file = self._template_dir / f"{type_name}.txt"
        if not template_file.exists():
            raise PromptTemplateNotFoundError(f"no template for signal_type={type_name!r}")
        template_text = template_file.read_text(encoding="utf-8")

        related_lines = [
            f"- {rm.market_id} ({rm.relationship_type}, strength {rm.relationship_strength}): "
            f"price {rm.current_price}, 24h delta {rm.delta_24h}"
            for rm in context.related_markets
        ]
        related_block = "\n".join(related_lines)
        if not related_block:
            related_block = "(none)"

        prompt_lines = [f"- {item}" for item in context.investigation_prompts]
        prompts_block = "\n".join(prompt_lines)
        if not prompts_block:
            prompts_block = "(none)"

        flags_csv = ",".join(flag.value for flag in signal.manipulation_flags)
        if not flags_csv:
            flags_csv = "(none)"

        values = {
            "market_id": signal.market_id,
            "platform": signal.platform,
            "market_question": context.market_question,
            "magnitude": f"{signal.magnitude:.6f}",
            "direction": signal.direction,
            "confidence": f"{signal.confidence:.6f}",
            "fdr_adjusted": signal.fdr_adjusted,
            "liquidity_tier": signal.liquidity_tier,
            "window_seconds": signal.window_seconds,
            "detected_at": signal.detected_at.isoformat().replace("+00:00", "Z"),
            "resolution_criteria": context.resolution_criteria,
            "resolution_source": context.resolution_source,
            "closes_at": context.closes_at.isoformat().replace("+00:00", "Z"),
            "manipulation_flags_csv_or_none": flags_csv,
            "related_markets_block": related_block,
            "investigation_prompts_block": prompts_block,
        }
        return template_text.format(**values)


# One line per IntelligenceBrief field, embedded verbatim in the system prompt.
_BRIEF_SCHEMA_SUMMARY: str = "\n".join(
    [
        "- brief_id: string (uuid7)",
        "- signal_id: string",
        "- headline: string (max 90 chars)",
        "- body_markdown: string (max 800 chars)",
        "- severity: one of [high, medium, low]",
        "- actionable_for: list of ConsumerType",
        "- interpretation_mode: must equal 'llm_assisted'",
        "- model: string",
        "- prompt_hash: string (sha256 hex)",
        "- formatter_version: string",
        "- generated_at: ISO-8601 UTC datetime",
        "- forbidden_token_check: must equal 'passed'",
        f"- schema_version: '{SCHEMA_VERSION}'",
    ]
)
a/src/augur_format/augur_format/llm/prompts/templates/_system.txt b/src/augur_format/augur_format/llm/prompts/templates/_system.txt new file mode 100644 index 0000000..85acada --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/_system.txt @@ -0,0 +1,24 @@ +You generate intelligence briefs from structured prediction-market signals. +You write factual restatements with no causal interpretation. + +You must NOT: +- Use any of the forbidden phrases listed below. +- Claim to know why a market moved. +- Invent facts not present in the input. +- Recommend trades or positions. + +You must: +- Restate the numerical facts of the signal. +- Quote the resolution criteria verbatim. +- List related markets and their state. +- Repeat the investigation prompts as a bulleted list. +- Output a single JSON object matching the IntelligenceBrief schema. + +Forbidden phrases (exhaustive at schema 1.0.0): +{forbidden_phrases_list} + +Output schema fields (all required unless marked optional): +{intelligence_brief_schema} + +Available consumer types (you must use only these in actionable_for): +{consumer_type_enum} diff --git a/src/augur_format/augur_format/llm/prompts/templates/book_imbalance.txt b/src/augur_format/augur_format/llm/prompts/templates/book_imbalance.txt new file mode 100644 index 0000000..0953742 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/book_imbalance.txt @@ -0,0 +1,26 @@ +Signal type: book_imbalance +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: 
{manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. diff --git a/src/augur_format/augur_format/llm/prompts/templates/cross_market_divergence.txt b/src/augur_format/augur_format/llm/prompts/templates/cross_market_divergence.txt new file mode 100644 index 0000000..9f2e7a5 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/cross_market_divergence.txt @@ -0,0 +1,26 @@ +Signal type: cross_market_divergence +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. 
diff --git a/src/augur_format/augur_format/llm/prompts/templates/price_velocity.txt b/src/augur_format/augur_format/llm/prompts/templates/price_velocity.txt new file mode 100644 index 0000000..02fd6ff --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/price_velocity.txt @@ -0,0 +1,26 @@ +Signal type: price_velocity +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. 
diff --git a/src/augur_format/augur_format/llm/prompts/templates/regime_shift.txt b/src/augur_format/augur_format/llm/prompts/templates/regime_shift.txt new file mode 100644 index 0000000..8ebc726 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/regime_shift.txt @@ -0,0 +1,26 @@ +Signal type: regime_shift +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. 
diff --git a/src/augur_format/augur_format/llm/prompts/templates/volume_spike.txt b/src/augur_format/augur_format/llm/prompts/templates/volume_spike.txt new file mode 100644 index 0000000..5d7a293 --- /dev/null +++ b/src/augur_format/augur_format/llm/prompts/templates/volume_spike.txt @@ -0,0 +1,26 @@ +Signal type: volume_spike +Market: {market_id} ({platform}) +Question (verbatim): {market_question} + +Movement: +- Magnitude: {magnitude} +- Direction: {direction} (1 = up, -1 = down, 0 = neither) +- Calibrated confidence: {confidence} +- FDR-adjusted: {fdr_adjusted} +- Liquidity tier: {liquidity_tier} +- Detection window: {window_seconds} seconds +- Detected at: {detected_at} + +Resolution criteria (verbatim): {resolution_criteria} +Resolution source (verbatim): {resolution_source} +Closes at: {closes_at} + +Manipulation flags: {manipulation_flags_csv_or_none} + +Related markets: +{related_markets_block} + +Investigation prompts: +{investigation_prompts_block} + +Generate the IntelligenceBrief now as a JSON object. diff --git a/src/augur_format/augur_format/llm/provenance/__init__.py b/src/augur_format/augur_format/llm/provenance/__init__.py new file mode 100644 index 0000000..23e7a52 --- /dev/null +++ b/src/augur_format/augur_format/llm/provenance/__init__.py @@ -0,0 +1,3 @@ +"""Provenance metadata builder for LLM-generated briefs.""" + +from __future__ import annotations diff --git a/src/augur_format/augur_format/llm/provenance/stamp.py b/src/augur_format/augur_format/llm/provenance/stamp.py new file mode 100644 index 0000000..891ba40 --- /dev/null +++ b/src/augur_format/augur_format/llm/provenance/stamp.py @@ -0,0 +1,49 @@ +"""Provenance stamping for LLM-generated briefs. + +``stamp`` returns a ProvenanceStamp whose ``prompt_hash`` is the +SHA-256 of ``system + "\\n\\n" + user``. 
Auditors recompute the hash +from the deterministic prompt builder to confirm the model saw +exactly what the record claims; ``formatter_version`` is read from +the installed package metadata so downgrades / upgrades are visible +in the record. +""" + +from __future__ import annotations + +import hashlib +from importlib.metadata import PackageNotFoundError, version + +from pydantic import BaseModel, ConfigDict + + +class ProvenanceStamp(BaseModel): + """The immutable provenance triple carried by every brief.""" + + model_config = ConfigDict(frozen=True, extra="forbid") + + model: str + prompt_hash: str + formatter_version: str + + +def _formatter_version() -> str: + try: + return version("augur-format") + except PackageNotFoundError: # pragma: no cover — only hit in source checkouts + return "0.0.0+unknown" + + +def stamp( + backend_id: str, + model: str, + system_prompt: str, + user_prompt: str, +) -> ProvenanceStamp: + """Return the ProvenanceStamp for a completion.""" + composite = f"{system_prompt}\n\n{user_prompt}" + digest = hashlib.sha256(composite.encode("utf-8")).hexdigest() + return ProvenanceStamp( + model=f"{model}@{backend_id}", + prompt_hash=digest, + formatter_version=_formatter_version(), + ) diff --git a/src/augur_format/augur_format/llm/routing/__init__.py b/src/augur_format/augur_format/llm/routing/__init__.py new file mode 100644 index 0000000..fbf94d5 --- /dev/null +++ b/src/augur_format/augur_format/llm/routing/__init__.py @@ -0,0 +1,3 @@ +"""Consumer gate for LLM briefs.""" + +from __future__ import annotations diff --git a/src/augur_format/augur_format/llm/routing/consumer_gate.py b/src/augur_format/augur_format/llm/routing/consumer_gate.py new file mode 100644 index 0000000..2439387 --- /dev/null +++ b/src/augur_format/augur_format/llm/routing/consumer_gate.py @@ -0,0 +1,37 @@ +"""Consumer gate enforcing opt-in for llm_assisted briefs. 
+ +Per docs/contracts/consumer-registry.md, only consumers whose +configuration sets ``accepts_llm_assisted = true`` receive LLM- +rendered briefs. The deterministic JSON and Markdown briefs from +Phase 3 still reach every consumer; the gate only filters the LLM +output. +""" + +from __future__ import annotations + +from collections.abc import Iterable + +from augur_format.llm.models import IntelligenceBrief +from augur_signals.models import ConsumerType + + +class ConsumerGate: + """Filters consumer sets by accepts_llm_assisted opt-in.""" + + def __init__(self, opted_in: Iterable[ConsumerType]) -> None: + self._opted_in = frozenset(opted_in) + + @property + def opted_in(self) -> frozenset[ConsumerType]: + return self._opted_in + + def is_eligible(self, consumer: ConsumerType, brief: IntelligenceBrief) -> bool: + """True iff *consumer* has opted in to LLM-assisted briefs.""" + del brief # brief identity does not factor into the gate decision. + return consumer in self._opted_in + + def filter_consumers( + self, consumers: Iterable[ConsumerType], brief: IntelligenceBrief + ) -> list[ConsumerType]: + """Return the subset of consumers eligible for this brief.""" + return [c for c in consumers if self.is_eligible(c, brief)] diff --git a/src/augur_format/pyproject.toml b/src/augur_format/pyproject.toml index 0dcac63..c74250b 100644 --- a/src/augur_format/pyproject.toml +++ b/src/augur_format/pyproject.toml @@ -22,4 +22,4 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["augur_format"] -include = ["augur_format/**/*.j2"] +include = ["augur_format/**/*.j2", "augur_format/**/*.txt"] diff --git a/tests/format/test_intelligence_brief.py b/tests/format/test_intelligence_brief.py new file mode 100644 index 0000000..011f40c --- /dev/null +++ b/tests/format/test_intelligence_brief.py @@ -0,0 +1,78 @@ +"""Tests for the IntelligenceBrief contract.""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest +from 
pydantic import ValidationError + +from augur_format.llm.models import IntelligenceBrief + + +def _payload(**overrides: object) -> dict[str, object]: + base: dict[str, object] = { + "brief_id": "brief-1", + "signal_id": "signal-1", + "headline": "Fed holds rates per announcement", + "body_markdown": "## Summary\n- Fed held at the current range.", + "severity": "high", + "actionable_for": ["macro_research_agent", "dashboard"], + "model": "gemma2:27b@ollama", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + } + base.update(overrides) + return base + + +@pytest.mark.unit +def test_canonical_payload_validates() -> None: + brief = IntelligenceBrief.model_validate(_payload()) + assert brief.interpretation_mode == "llm_assisted" + assert brief.forbidden_token_check == "passed" # noqa: S105 + assert brief.schema_version == "1.0.0" + + +@pytest.mark.unit +def test_headline_over_90_chars_rejected() -> None: + with pytest.raises(ValidationError, match="at most 90 characters"): + IntelligenceBrief.model_validate(_payload(headline="x" * 91)) + + +@pytest.mark.unit +def test_body_over_800_chars_rejected() -> None: + with pytest.raises(ValidationError, match="at most 800 characters"): + IntelligenceBrief.model_validate(_payload(body_markdown="x" * 801)) + + +@pytest.mark.unit +def test_unknown_consumer_type_rejected() -> None: + with pytest.raises(ValidationError): + IntelligenceBrief.model_validate(_payload(actionable_for=["not_a_consumer"])) + + +@pytest.mark.unit +def test_interpretation_mode_cannot_be_overridden() -> None: + with pytest.raises(ValidationError): + IntelligenceBrief.model_validate(_payload(interpretation_mode="deterministic")) + + +@pytest.mark.unit +def test_forbidden_token_check_cannot_be_overridden() -> None: + with pytest.raises(ValidationError): + IntelligenceBrief.model_validate(_payload(forbidden_token_check="failed")) # noqa: S106 + + +@pytest.mark.unit +def test_model_is_frozen() 
-> None: + brief = IntelligenceBrief.model_validate(_payload()) + with pytest.raises(ValidationError): + brief.headline = "mutated" # type: ignore[misc] + + +@pytest.mark.unit +def test_model_rejects_unknown_fields() -> None: + with pytest.raises(ValidationError): + IntelligenceBrief.model_validate({**_payload(), "unexpected": 1}) diff --git a/tests/format/test_llm_backends.py b/tests/format/test_llm_backends.py new file mode 100644 index 0000000..51b208c --- /dev/null +++ b/tests/format/test_llm_backends.py @@ -0,0 +1,105 @@ +"""Tests for the LLM backend adapters (mocked).""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any + +import httpx +import pytest + +from augur_format.llm.backends.anthropic import AnthropicBackend +from augur_format.llm.backends.base import BackendError, CompletionResult +from augur_format.llm.backends.ollama import OllamaBackend + + +@pytest.mark.asyncio +async def test_ollama_health_check_passes_on_200() -> None: + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(200, json={"models": []}) + + transport = httpx.MockTransport(handler) + async with httpx.AsyncClient(transport=transport) as client: + backend = OllamaBackend(client) + assert await backend.health_check() is True + + +@pytest.mark.asyncio +async def test_ollama_complete_returns_parsed_result() -> None: + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response( + 200, + json={ + "response": "Hello world", + "prompt_eval_count": 10, + "eval_count": 3, + }, + ) + + transport = httpx.MockTransport(handler) + async with httpx.AsyncClient(transport=transport) as client: + backend = OllamaBackend(client, max_retries=1) + result = await backend.complete("system", "prompt", max_tokens=32, temperature=0.2) + assert isinstance(result, CompletionResult) + assert result.text == "Hello world" + assert result.input_tokens == 10 + assert result.output_tokens == 3 + + +@pytest.mark.asyncio 
+async def test_ollama_raises_backenderror_on_exhaustion() -> None: + def handler(request: httpx.Request) -> httpx.Response: + return httpx.Response(500) + + transport = httpx.MockTransport(handler) + async with httpx.AsyncClient(transport=transport) as client: + backend = OllamaBackend(client, max_retries=2) + with pytest.raises(BackendError): + await backend.complete("system", "prompt", max_tokens=32, temperature=0.2) + + +@pytest.mark.unit +def test_anthropic_requires_env_or_client(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) + with pytest.raises(BackendError, match="ANTHROPIC_API_KEY"): + AnthropicBackend() + + +@pytest.mark.asyncio +async def test_anthropic_accepts_injected_client_and_parses_text() -> None: + class _FakeMessages: + def __init__(self) -> None: + self.calls: list[dict[str, Any]] = [] + + async def create(self, **kwargs: Any) -> SimpleNamespace: + self.calls.append(kwargs) + return SimpleNamespace( + content=[SimpleNamespace(type="text", text="ok")], + usage=SimpleNamespace(input_tokens=5, output_tokens=2), + ) + + class _FakeClient: + def __init__(self) -> None: + self.messages = _FakeMessages() + + client = _FakeClient() + backend = AnthropicBackend(client=client, max_retries=1) + result = await backend.complete("system", "prompt", max_tokens=32, temperature=0.2) + assert result.text == "ok" + assert result.input_tokens == 5 + assert result.output_tokens == 2 + + +@pytest.mark.asyncio +async def test_anthropic_exhausts_retries_and_raises_backenderror() -> None: + class _AlwaysFail: + async def create(self, **kwargs: Any) -> None: + raise RuntimeError("transient") + + class _Client: + def __init__(self) -> None: + self.messages = _AlwaysFail() + + backend = AnthropicBackend(client=_Client(), max_retries=2) + with pytest.raises(BackendError): + await backend.complete("system", "prompt", max_tokens=32, temperature=0.2) diff --git a/tests/format/test_llm_interpreter.py 
b/tests/format/test_llm_interpreter.py new file mode 100644 index 0000000..dd73598 --- /dev/null +++ b/tests/format/test_llm_interpreter.py @@ -0,0 +1,264 @@ +"""End-to-end tests for the LLMInterpreter orchestrator.""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from datetime import UTC, datetime + +import pytest + +from augur_format.llm.backends.base import ( + AbstractLLMBackend, + BackendError, + CompletionResult, +) +from augur_format.llm.interpreter import LLMInterpreter +from augur_format.llm.linter.forbidden_tokens import ForbiddenTokenLinter +from augur_format.llm.prompts.builder import PromptBuilder +from augur_format.llm.routing.consumer_gate import ConsumerGate +from augur_signals.models import ( + ConsumerType, + InterpretationMode, + MarketSignal, + SignalContext, + SignalType, + new_signal_id, +) + +FORBIDDEN = ["may be driven by", "likely reflects"] + + +def _context() -> SignalContext: + signal = MarketSignal( + signal_id=new_signal_id(), + market_id="kalshi_fed", + platform="kalshi", + signal_type=SignalType.PRICE_VELOCITY, + magnitude=0.8, + direction=1, + confidence=0.72, + fdr_adjusted=True, + detected_at=datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + window_seconds=300, + liquidity_tier="high", + raw_features={"calibration_provenance": "d@identity_v0"}, + ) + return SignalContext( + signal=signal, + market_question="Will the Fed raise rates?", + resolution_criteria="YES if rate rises.", + resolution_source="Federal Reserve", + closes_at=datetime(2026, 6, 15, tzinfo=UTC), + related_markets=[], + investigation_prompts=["Check FOMC calendar."], + interpretation_mode=InterpretationMode.DETERMINISTIC, + ) + + +@dataclass +class _StubBackend(AbstractLLMBackend): + backend_id: str = "stub" + _model: str = "stub-model" + _responses: list[str] | None = None + _exception: BaseException | None = None + + def model_id(self) -> str: + return self._model + + async def health_check(self) -> bool: + return True + + 
async def complete( + self, + system: str, + prompt: str, + max_tokens: int, + temperature: float, + ) -> CompletionResult: + del system, prompt, max_tokens, temperature + if self._exception is not None: + raise self._exception + if not self._responses: + raise RuntimeError("no canned response") + text = self._responses.pop(0) + return CompletionResult(text=text, input_tokens=10, output_tokens=20, duration_ms=5) + + +_VALID_RESPONSE = json.dumps( + { + "headline": "Fed holds rates", + "body_markdown": "The Fed left the target range unchanged.", + "actionable_for": ["dashboard"], + } +) + + +def _interpreter( + backend: _StubBackend, + forbidden: list[str] | None = None, + *, + gate: ConsumerGate | None = None, +) -> LLMInterpreter: + return LLMInterpreter( + backend, + PromptBuilder(forbidden or FORBIDDEN), + ForbiddenTokenLinter(forbidden or FORBIDDEN), + consumer_gate=gate, + ) + + +@pytest.mark.asyncio +async def test_happy_path_emits_brief() -> None: + backend = _StubBackend(_responses=[_VALID_RESPONSE]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret( + _context(), + severity="high", + now=datetime(2026, 3, 15, 12, 5, tzinfo=UTC), + ) + assert brief is not None + assert brief.headline == "Fed holds rates" + assert brief.severity == "high" + assert brief.interpretation_mode == "llm_assisted" + assert brief.prompt_hash != "" + assert brief.forbidden_token_check == "passed" # noqa: S105 + + +@pytest.mark.asyncio +async def test_forbidden_token_drops_brief() -> None: + tainted = json.dumps( + { + "headline": "Hold", + "body_markdown": "Prices may be driven by external news.", + "actionable_for": ["dashboard"], + } + ) + backend = _StubBackend(_responses=[tainted]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_invalid_json_drops_brief() -> None: + backend = _StubBackend(_responses=["{this is not json"]) + 
interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_unknown_consumer_drops_brief() -> None: + bad_consumer = json.dumps( + { + "headline": "Hold", + "body_markdown": "Update.", + "actionable_for": ["not_a_consumer"], + } + ) + backend = _StubBackend(_responses=[bad_consumer]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_backend_error_drops_brief() -> None: + backend = _StubBackend(_exception=BackendError("down")) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_storm_suspension_short_circuits_before_backend_call() -> None: + backend = _StubBackend(_responses=[_VALID_RESPONSE]) + interpreter = _interpreter(backend) + interpreter.set_suspended(True) + brief = await interpreter.interpret(_context(), severity="high") + assert brief is None + # Backend call was not made; the canned response is still pending. + assert backend._responses == [_VALID_RESPONSE] + + +@pytest.mark.asyncio +async def test_resuming_from_suspension_allows_next_brief() -> None: + backend = _StubBackend(_responses=[_VALID_RESPONSE]) + interpreter = _interpreter(backend) + interpreter.set_suspended(True) + suspended = await interpreter.interpret(_context(), severity="high") + assert suspended is None + interpreter.set_suspended(False) + brief = await interpreter.interpret( + _context(), + severity="high", + now=datetime(2026, 3, 15, 12, 5, tzinfo=UTC), + ) + assert brief is not None + + +@pytest.mark.asyncio +async def test_gate_trims_actionable_for_to_opted_in_consumers() -> None: + # LLM emits macro_research_agent + dashboard; only dashboard opts in. 
+ response = json.dumps( + { + "headline": "Fed holds rates", + "body_markdown": "Update.", + "actionable_for": ["macro_research_agent", "dashboard"], + } + ) + backend = _StubBackend(_responses=[response]) + interpreter = _interpreter(backend, gate=ConsumerGate([ConsumerType.DASHBOARD])) + brief = await interpreter.interpret( + _context(), + severity="high", + now=datetime(2026, 3, 15, 12, 5, tzinfo=UTC), + ) + assert brief is not None + assert brief.actionable_for == [ConsumerType.DASHBOARD] + + +@pytest.mark.asyncio +async def test_gate_drops_brief_when_no_consumer_opted_in() -> None: + response = json.dumps( + { + "headline": "Fed holds rates", + "body_markdown": "Update.", + "actionable_for": ["macro_research_agent"], + } + ) + backend = _StubBackend(_responses=[response]) + interpreter = _interpreter(backend, gate=ConsumerGate([ConsumerType.DASHBOARD])) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_unicode_escape_in_raw_json_still_caught_after_parse() -> None: + # "\u006d\u0061\u0079 be driven by" decodes to "may be driven by"; the + # substring lint against the raw JSON would miss, but the post-parse + # lint against the decoded headline catches it. 
+ tainted = ( + '{"headline":"\\u006d\\u0061\\u0079 be driven by moves",' + '"body_markdown":"ok","actionable_for":["dashboard"]}' + ) + backend = _StubBackend(_responses=[tainted]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="medium") + assert brief is None + + +@pytest.mark.asyncio +async def test_overlong_headline_drops_brief() -> None: + long_headline = json.dumps( + { + "headline": "x" * 100, + "body_markdown": "ok", + "actionable_for": ["dashboard"], + } + ) + backend = _StubBackend(_responses=[long_headline]) + interpreter = _interpreter(backend) + brief = await interpreter.interpret(_context(), severity="low") + assert brief is None diff --git a/tests/format/test_llm_linter.py b/tests/format/test_llm_linter.py new file mode 100644 index 0000000..42d81a5 --- /dev/null +++ b/tests/format/test_llm_linter.py @@ -0,0 +1,171 @@ +"""Tests for forbidden-token linter, schema validator, stamp, gate.""" + +from __future__ import annotations + +from datetime import UTC, datetime +from pathlib import Path + +import pytest + +from augur_format.llm.linter.forbidden_tokens import ( + ForbiddenTokenLinter, + load_forbidden_phrases, +) +from augur_format.llm.linter.schema_check import SchemaValidator +from augur_format.llm.models import IntelligenceBrief +from augur_format.llm.provenance.stamp import stamp +from augur_format.llm.routing.consumer_gate import ConsumerGate +from augur_signals.models import ConsumerType + + +@pytest.mark.unit +def test_linter_rejects_each_configured_phrase() -> None: + phrases = load_forbidden_phrases(Path("config/forbidden_tokens.toml")) + assert phrases # sanity: at least one phrase loaded from the shipped file + linter = ForbiddenTokenLinter(phrases) + for phrase in phrases: + result = linter.check_text(f"The market {phrase} a rate change.") + assert not result.passed + assert phrase.lower() in result.matched_phrases + + +@pytest.mark.unit +def test_linter_is_case_insensitive() -> None: + 
linter = ForbiddenTokenLinter(["may be driven by"]) + assert not linter.check_text("Prices May Be Driven By macro moves").passed + + +@pytest.mark.unit +def test_linter_accepts_clean_text() -> None: + linter = ForbiddenTokenLinter(["may be driven by"]) + result = linter.check_text("The Fed left the rate range unchanged.") + assert result.passed + assert result.matched_phrases == [] + + +@pytest.mark.unit +def test_linter_check_brief_combines_headline_and_body() -> None: + linter = ForbiddenTokenLinter(["suggests that"]) + result = linter.check_brief( + {"headline": "Update", "body_markdown": "The move suggests that a cut is due."} + ) + assert not result.passed + + +@pytest.mark.unit +def test_schema_validator_accepts_valid_payload() -> None: + validator = SchemaValidator() + result = validator.validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "h", + "body_markdown": "body", + "severity": "high", + "actionable_for": ["dashboard"], + "model": "gemma2:27b@ollama", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": "2026-03-15T12:00:00Z", + } + ) + assert result.ok + + +@pytest.mark.unit +def test_schema_validator_rejects_over_length_headline() -> None: + validator = SchemaValidator() + result = validator.validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "x" * 91, + "body_markdown": "body", + "severity": "high", + "actionable_for": ["dashboard"], + "model": "m@b", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": "2026-03-15T12:00:00Z", + } + ) + assert not result.ok + assert any("headline" in e for e in result.errors) + + +@pytest.mark.unit +def test_schema_validator_rejects_unknown_consumer() -> None: + validator = SchemaValidator() + result = validator.validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "h", + "body_markdown": "body", + "severity": "high", + "actionable_for": ["not_a_consumer"], + "model": "m@b", + "prompt_hash": "a" * 64, + 
"formatter_version": "0.0.0", + "generated_at": "2026-03-15T12:00:00Z", + } + ) + assert not result.ok + + +@pytest.mark.unit +def test_stamp_is_reproducible() -> None: + s1 = stamp("ollama", "gemma2:27b", "system", "user") + s2 = stamp("ollama", "gemma2:27b", "system", "user") + assert s1.prompt_hash == s2.prompt_hash + assert s1.model == "gemma2:27b@ollama" + assert len(s1.prompt_hash) == 64 + + +@pytest.mark.unit +def test_stamp_hash_changes_on_prompt_change() -> None: + a = stamp("ollama", "gemma2:27b", "system", "user-a") + b = stamp("ollama", "gemma2:27b", "system", "user-b") + assert a.prompt_hash != b.prompt_hash + + +@pytest.mark.unit +def test_consumer_gate_allows_opted_in() -> None: + gate = ConsumerGate([ConsumerType.DASHBOARD]) + brief = IntelligenceBrief.model_validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "h", + "body_markdown": "body", + "severity": "high", + "actionable_for": ["dashboard"], + "model": "m@b", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + } + ) + assert gate.is_eligible(ConsumerType.DASHBOARD, brief) + assert not gate.is_eligible(ConsumerType.MACRO_RESEARCH_AGENT, brief) + + +@pytest.mark.unit +def test_consumer_gate_filters_list() -> None: + gate = ConsumerGate([ConsumerType.DASHBOARD]) + brief = IntelligenceBrief.model_validate( + { + "brief_id": "b1", + "signal_id": "s1", + "headline": "h", + "body_markdown": "body", + "severity": "high", + "actionable_for": ["dashboard"], + "model": "m@b", + "prompt_hash": "a" * 64, + "formatter_version": "0.0.0", + "generated_at": datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + } + ) + kept = gate.filter_consumers([ConsumerType.MACRO_RESEARCH_AGENT, ConsumerType.DASHBOARD], brief) + assert kept == [ConsumerType.DASHBOARD] diff --git a/tests/format/test_prompt_builder.py b/tests/format/test_prompt_builder.py new file mode 100644 index 0000000..f2d49e2 --- /dev/null +++ 
b/tests/format/test_prompt_builder.py @@ -0,0 +1,142 @@ +"""Tests for the LLM prompt builder.""" + +from __future__ import annotations + +from datetime import UTC, datetime + +import pytest + +from augur_format.llm.prompts.builder import ( + PromptBuilder, + PromptTemplateNotFoundError, +) +from augur_signals.models import ( + ConsumerType, + InterpretationMode, + ManipulationFlag, + MarketSignal, + RelatedMarketState, + SignalContext, + SignalType, + new_signal_id, +) + +FORBIDDEN = ["may be driven by", "likely reflects", "suggests that"] + + +def _context( + signal_type: SignalType = SignalType.PRICE_VELOCITY, + manipulation_flags: list[ManipulationFlag] | None = None, + related: list[RelatedMarketState] | None = None, +) -> SignalContext: + signal = MarketSignal( + signal_id=new_signal_id(), + market_id="kalshi_fed", + platform="kalshi", + signal_type=signal_type, + magnitude=0.8, + direction=1, + confidence=0.72, + fdr_adjusted=True, + detected_at=datetime(2026, 3, 15, 12, 0, tzinfo=UTC), + window_seconds=300, + liquidity_tier="high", + manipulation_flags=manipulation_flags or [], + raw_features={"calibration_provenance": "d@identity_v0"}, + ) + return SignalContext( + signal=signal, + market_question="Will the Fed raise rates?", + resolution_criteria="YES if rate rises.", + resolution_source="Federal Reserve press release", + closes_at=datetime(2026, 6, 15, tzinfo=UTC), + related_markets=related or [], + investigation_prompts=["Check FOMC calendar."], + interpretation_mode=InterpretationMode.DETERMINISTIC, + ) + + +@pytest.fixture +def builder() -> PromptBuilder: + return PromptBuilder(FORBIDDEN) + + +@pytest.mark.unit +def test_deterministic_across_calls(builder: PromptBuilder) -> None: + ctx = _context() + a = builder.build(ctx) + b = builder.build(ctx) + assert a == b + + +@pytest.mark.unit +def test_system_injects_forbidden_phrases(builder: PromptBuilder) -> None: + system, _ = builder.build(_context()) + for phrase in FORBIDDEN: + assert phrase in system 
+ + +@pytest.mark.unit +def test_system_injects_full_consumer_enum(builder: PromptBuilder) -> None: + system, _ = builder.build(_context()) + for consumer in ConsumerType: + assert consumer.value in system + + +@pytest.mark.unit +def test_user_contains_verbatim_resolution_criteria(builder: PromptBuilder) -> None: + _, user = builder.build(_context()) + assert "YES if rate rises." in user + + +@pytest.mark.unit +def test_manipulation_flags_reported_in_user(builder: PromptBuilder) -> None: + _, user = builder.build(_context(manipulation_flags=[ManipulationFlag.SIZE_VS_DEPTH_OUTLIER])) + assert "size_vs_depth_outlier" in user + + +@pytest.mark.unit +def test_none_flags_render_as_placeholder(builder: PromptBuilder) -> None: + _, user = builder.build(_context()) + assert "Manipulation flags: (none)" in user + + +@pytest.mark.unit +def test_every_signal_type_has_a_template(builder: PromptBuilder) -> None: + for signal_type in SignalType: + _, user = builder.build(_context(signal_type=signal_type)) + assert f"Signal type: {signal_type.value}" in user + + +@pytest.mark.unit +def test_related_markets_render_as_bullets(builder: PromptBuilder) -> None: + related = [ + RelatedMarketState( + market_id="kalshi_fed_holds", + question="?", + current_price=0.42, + delta_24h=-0.02, + volume_24h=1000.0, + relationship_type="inverse", + relationship_strength=0.9, + ) + ] + _, user = builder.build(_context(related=related)) + assert "kalshi_fed_holds" in user + + +@pytest.mark.unit +def test_missing_template_raises(tmp_path: object) -> None: + import shutil + + from augur_format.llm.prompts.builder import _DEFAULT_TEMPLATE_DIR + + isolated = tmp_path # type: ignore[assignment] + isolated_path = isolated # appease mypy; tmp_path is Path in practice. 
+ assert isolated_path # keep name + target = tmp_path / "templates" # type: ignore[operator] + shutil.copytree(_DEFAULT_TEMPLATE_DIR, target) + (target / "price_velocity.txt").unlink() + builder = PromptBuilder(FORBIDDEN, template_dir=target) + with pytest.raises(PromptTemplateNotFoundError): + builder.build(_context())