From 11155a29c5d7865a98718f8bb9497552a1aec834 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 16:39:28 +0530 Subject: [PATCH 1/4] chore: bump workspace to 0.1.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stamp the three workspace packages and the workspace root at 0.1.0. The root pyproject keeps tool.uv.package = false so only the three workspace distributions — augur-signals, augur-labels, augur-format — ship to PyPI; the 'augur' name on PyPI is held by a 2018 placeholder upload and is not needed for publication. --- pyproject.toml | 2 +- src/augur_format/pyproject.toml | 2 +- src/augur_labels/pyproject.toml | 2 +- src/augur_signals/pyproject.toml | 2 +- uv.lock | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 441ae7d..66e2839 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "augur" -version = "0.0.0" +version = "0.1.0" description = "Structured market anomaly detection for prediction markets" readme = "README.md" requires-python = ">=3.12" diff --git a/src/augur_format/pyproject.toml b/src/augur_format/pyproject.toml index c74250b..9ac6ba7 100644 --- a/src/augur_format/pyproject.toml +++ b/src/augur_format/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "augur-format" -version = "0.0.0" +version = "0.1.0" description = "Formatters for Augur: deterministic JSON, Markdown, and gated LLM briefs" requires-python = ">=3.12" diff --git a/src/augur_labels/pyproject.toml b/src/augur_labels/pyproject.toml index 8f91f12..5d57620 100644 --- a/src/augur_labels/pyproject.toml +++ b/src/augur_labels/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "augur-labels" -version = "0.0.0" +version = "0.1.0" description = "Newsworthy event labeling pipeline for Augur" requires-python = ">=3.12" diff --git a/src/augur_signals/pyproject.toml b/src/augur_signals/pyproject.toml index ffe3268..932e0ae 100644 --- a/src/augur_signals/pyproject.toml +++ b/src/augur_signals/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "augur-signals" -version = "0.0.0" +version = "0.1.0" description = "Signal extraction core for Augur: detectors, calibration, context assembly" requires-python = ">=3.12" diff --git a/uv.lock b/uv.lock index b0d31b9..c407f70 100644 --- a/uv.lock +++ b/uv.lock @@ -174,7 +174,7 @@ wheels = [ [[package]] name = "augur" -version = "0.0.0" +version = "0.1.0" source = { virtual = "." } dependencies = [ { name = "augur-format" }, @@ -229,7 +229,7 @@ dev = [ [[package]] name = "augur-format" -version = "0.0.0" +version = "0.1.0" source = { editable = "src/augur_format" } dependencies = [ { name = "augur-signals" }, @@ -261,7 +261,7 @@ provides-extras = ["llm-local", "llm-cloud"] [[package]] name = "augur-labels" -version = "0.0.0" +version = "0.1.0" source = { editable = "src/augur_labels" } dependencies = [ { name = "augur-signals" }, @@ -284,7 +284,7 @@ requires-dist = [ [[package]] name = "augur-signals" -version = "0.0.0" +version = "0.1.0" source = { editable = "src/augur_signals" } dependencies = [ { name = "aiohttp" }, From d392b6e9b33aa3ee2c68f9ae3865830134d35b61 Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 16:39:41 +0530 Subject: [PATCH 2/4] feat(workers): bootstrap helpers and entrypoint catalog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit augur_signals.workers.bootstrap centralizes the startup plumbing: load_runtime_configs() reads bus.toml / storage.toml / observability.toml from $AUGUR_CONFIG_DIR, activate_observability configures the Prometheus + OTel backends and starts the metrics listener, build_event_bus dispatches the NATS or Redis factory and rejects the 'memory' backend with a clear pointer to the monolith, resolve_replica_id reads AUGUR_REPLICA_ID / POD_NAME, parse_shard_arg parses 'index/count' shard args with bounds checking. augur_signals.workers.__main__ is a tiny catalog that python -m augur_signals.workers prints so operators can enumerate the worker kinds and their Kubernetes CMD strings. Per-kind __main__ modules for feature / detector / manipulation / calibration / dedup / context_format / augur_format.workers.llm are deferred; they require a design decision on the per-subject BusMessage payload schema plus bus-friendly adapters over the Phase 1-4 stateful transforms. See docs/operations/manual-testing.md §3 for the gap list. --- .../augur_signals/workers/__main__.py | 43 ++++++ .../augur_signals/workers/bootstrap.py | 130 ++++++++++++++++++ tests/signals/test_worker_bootstrap.py | 87 ++++++++++++ 3 files changed, 260 insertions(+) create mode 100644 src/augur_signals/augur_signals/workers/__main__.py create mode 100644 src/augur_signals/augur_signals/workers/bootstrap.py create mode 100644 tests/signals/test_worker_bootstrap.py diff --git a/src/augur_signals/augur_signals/workers/__main__.py b/src/augur_signals/augur_signals/workers/__main__.py new file mode 100644 index 0000000..dea595a --- /dev/null +++ b/src/augur_signals/augur_signals/workers/__main__.py @@ -0,0 +1,43 @@ +"""Print a list of runnable worker entrypoints. + +``python -m augur_signals.workers`` prints the catalogue of supported +worker kinds and their CMD strings. Each kind has its own ``__main__`` +under ``augur_signals.workers.`` and is invoked as +``python -m augur_signals.workers.`` in the Kubernetes manifests. +""" + +from __future__ import annotations + +_CATALOGUE: tuple[tuple[str, str], ...] = ( + ("poller", "python -m augur_signals.workers.poller --platform "), + ("feature", "python -m augur_signals.workers.feature --shard /"), + ("detector", "python -m augur_signals.workers.detector --shard /"), + ("manipulation", "python -m augur_signals.workers.manipulation"), + ("calibration", "python -m augur_signals.workers.calibration"), + ("dedup", "python -m augur_signals.workers.dedup"), + ("context_format", "python -m augur_signals.workers.context_format"), +) + + +def main() -> int: + print("Runnable augur-signals worker entrypoints:") + print() + for kind, invocation in _CATALOGUE: + print(f" {kind:<16} {invocation}") + print() + print("LLM formatter entrypoint:") + print(" llm python -m augur_format.workers.llm") + print() + print( + "Each entrypoint reads config/{bus,storage,observability}.toml " + "from $AUGUR_CONFIG_DIR and connects to the bus backend " + "declared in bus.toml. See docs/operations/manual-testing.md " + "for the local smoke-test stack." + ) + return 0 + + +if __name__ == "__main__": # pragma: no cover — CLI entrypoint wrapper + import sys + + sys.exit(main()) diff --git a/src/augur_signals/augur_signals/workers/bootstrap.py b/src/augur_signals/augur_signals/workers/bootstrap.py new file mode 100644 index 0000000..ef9ec72 --- /dev/null +++ b/src/augur_signals/augur_signals/workers/bootstrap.py @@ -0,0 +1,130 @@ +"""Shared bootstrap helpers for worker `__main__` modules. + +Every worker entrypoint follows the same startup sequence: + +1. Load `BusConfig`, `StorageConfig`, `ObservabilityConfig` from the + TOML files under `$AUGUR_CONFIG_DIR` (default `config/`). +2. Activate the observability backend and open the Prometheus + scrape listener. +3. Build the `EventBus` and (if the worker needs it) the storage + adapter. + +This module centralizes that plumbing so per-worker modules stay +focused on the transform. Every helper fails loud on missing or +inconsistent config — no silent fallbacks. +""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from pathlib import Path + +from augur_signals._config import load_config +from augur_signals._observability import configure_observability, start_metrics_server +from augur_signals._observability_config import ObservabilityConfig +from augur_signals.bus._config import BusConfig +from augur_signals.bus.base import EventBus +from augur_signals.bus.factory import make_event_bus +from augur_signals.bus.memory import InProcessAsyncBus +from augur_signals.storage._config import StorageConfig + + +@dataclass(frozen=True, slots=True) +class RuntimeConfigs: + """Triple of configs every worker loads at startup.""" + + bus: BusConfig + storage: StorageConfig + observability: ObservabilityConfig + + +def config_dir() -> Path: + """Resolve the active config directory from `AUGUR_CONFIG_DIR`.""" + return Path(os.environ.get("AUGUR_CONFIG_DIR", "config")).resolve() + + +def load_runtime_configs(config_dir_override: Path | None = None) -> RuntimeConfigs: + """Load the three TOML configs every worker depends on.""" + root = config_dir_override if config_dir_override is not None else config_dir() + return RuntimeConfigs( + bus=load_config(root / "bus.toml", BusConfig), + storage=load_config(root / "storage.toml", StorageConfig), + observability=load_config(root / "observability.toml", ObservabilityConfig), + ) + + +def activate_observability(config: ObservabilityConfig) -> None: + """Configure the observability backend and open the metrics port.""" + configure_observability(config) + start_metrics_server(config) + + +def build_event_bus(config: BusConfig) -> EventBus: + """Return an `EventBus` for *config*. + + The memory backend is served by the monolith's `InProcessAsyncBus` + wrapper — not the byte-level `EventBus` factory. For a worker to + use the memory backend it must wrap the `InProcessAsyncBus` with a + subject-aware shim, which is out of scope for this bootstrap. The + factory call covers `nats` and `redis`; `memory` raises a clear + error pointing to the monolith engine. + """ + if config.backend.kind == "memory": + raise RuntimeError( + "Worker bootstrap does not serve the 'memory' bus backend. " + "Set bus.backend.kind to 'nats' or 'redis' in bus.toml, or " + "run the monolith engine which uses InProcessAsyncBus directly." + ) + return make_event_bus(config) + + +def resolve_replica_id() -> str: + """Read the replica's stable identifier from the environment. + + Kubernetes pods set `POD_NAME` via the Downward API; plain-container + deployments supply `AUGUR_REPLICA_ID`. Missing both is a fatal + configuration error because per-replica metric labels and + distributed-lock holder ids depend on the value. + """ + replica = os.environ.get("AUGUR_REPLICA_ID") or os.environ.get("POD_NAME") + if not replica: + raise RuntimeError( + "Replica id is unset. Populate AUGUR_REPLICA_ID or POD_NAME in the worker environment." + ) + return replica + + +def parse_shard_arg(shard: str) -> tuple[int, int]: + """Parse a `"index/count"` shard argument to `(index, count)`. + + Raises: + ValueError: The argument is malformed or uses a non-positive + count, or the index is out of range. + """ + parts = shard.split("/") + if len(parts) != 2: + raise ValueError(f"Expected 'index/count' shard arg, got {shard!r}") + try: + index = int(parts[0]) + count = int(parts[1]) + except ValueError as exc: + raise ValueError(f"Shard components must be integers: {shard!r}") from exc + if count <= 0: + raise ValueError(f"Shard count must be positive, got {count}") + if index < 0 or index >= count: + raise ValueError(f"Shard index {index} out of range for count {count}") + return index, count + + +# Re-export so callers do not need to reach into bus.memory directly. +__all__ = [ + "InProcessAsyncBus", + "RuntimeConfigs", + "activate_observability", + "build_event_bus", + "config_dir", + "load_runtime_configs", + "parse_shard_arg", + "resolve_replica_id", +] diff --git a/tests/signals/test_worker_bootstrap.py b/tests/signals/test_worker_bootstrap.py new file mode 100644 index 0000000..8be75a5 --- /dev/null +++ b/tests/signals/test_worker_bootstrap.py @@ -0,0 +1,87 @@ +"""Tests for the worker bootstrap helpers.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from augur_signals.bus._config import BackendBody, BusConfig +from augur_signals.bus.nats import NATSBus +from augur_signals.bus.redis_streams import RedisStreamsBus +from augur_signals.workers.bootstrap import ( + build_event_bus, + load_runtime_configs, + parse_shard_arg, + resolve_replica_id, +) + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +@pytest.mark.unit +def test_load_runtime_configs_parses_shipped_defaults() -> None: + cfg = load_runtime_configs(REPO_ROOT / "config") + assert cfg.bus.backend.kind == "memory" + assert cfg.storage.backend.kind == "duckdb" + assert cfg.observability.metrics.kind == "prometheus" + + +@pytest.mark.unit +def test_load_runtime_configs_missing_dir_raises(tmp_path: Path) -> None: + with pytest.raises(FileNotFoundError): + load_runtime_configs(tmp_path) + + +@pytest.mark.unit +def test_build_event_bus_memory_rejects_loudly() -> None: + cfg = BusConfig(backend=BackendBody(kind="memory")) + with pytest.raises(RuntimeError, match="memory"): + build_event_bus(cfg) + + +@pytest.mark.unit +def test_build_event_bus_nats_returns_nats_bus() -> None: + cfg = BusConfig(backend=BackendBody(kind="nats")) + assert isinstance(build_event_bus(cfg), NATSBus) + + +@pytest.mark.unit +def test_build_event_bus_redis_returns_redis_bus() -> None: + cfg = BusConfig(backend=BackendBody(kind="redis")) + assert isinstance(build_event_bus(cfg), RedisStreamsBus) + + +@pytest.mark.unit +def test_parse_shard_arg_happy_path() -> None: + assert parse_shard_arg("0/2") == (0, 2) + assert parse_shard_arg("3/4") == (3, 4) + + +@pytest.mark.unit +@pytest.mark.parametrize("bad", ["foo", "0", "0/0", "2/2", "-1/2", "a/b"]) +def test_parse_shard_arg_rejects_malformed(bad: str) -> None: + with pytest.raises(ValueError): + parse_shard_arg(bad) + + +@pytest.mark.unit +def test_resolve_replica_id_from_pod_name(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("AUGUR_REPLICA_ID", raising=False) + monkeypatch.setenv("POD_NAME", "augur-feature-0") + assert resolve_replica_id() == "augur-feature-0" + + +@pytest.mark.unit +def test_resolve_replica_id_augur_env_wins(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("AUGUR_REPLICA_ID", "replica-a") + monkeypatch.setenv("POD_NAME", "augur-feature-0") + assert resolve_replica_id() == "replica-a" + + +@pytest.mark.unit +def test_resolve_replica_id_missing_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("AUGUR_REPLICA_ID", raising=False) + monkeypatch.delenv("POD_NAME", raising=False) + with pytest.raises(RuntimeError, match="Replica id"): + resolve_replica_id() From f65f216bb5b61ec446a460746e3d9c0fbd01242c Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 16:39:51 +0530 Subject: [PATCH 3/4] feat(ops): local smoke compose stack with nats, redis, timescaledb, prometheus docker compose -f ops/docker/compose.yaml up -d brings up every external dependency the distributed runtime needs, with pre-baked bus/storage/observability TOML configs under ops/docker/config that point at the compose services. Workers run as host processes so each one is launchable separately for piecewise testing. The OTel collector is optional (traces.kind = 'disabled' in the smoke config). Prometheus scrapes localhost:9091..9097 via host.docker.internal so workers on the host are visible from the compose-internal Prometheus instance. --- ops/docker/compose.yaml | 88 ++++++++++++++++++++++++++++ ops/docker/config/bus.toml | 22 +++++++ ops/docker/config/observability.toml | 22 +++++++ ops/docker/config/storage.toml | 35 +++++++++++ ops/docker/otel-collector.yaml | 21 +++++++ ops/docker/prometheus.yml | 24 ++++++++ 6 files changed, 212 insertions(+) create mode 100644 ops/docker/compose.yaml create mode 100644 ops/docker/config/bus.toml create mode 100644 ops/docker/config/observability.toml create mode 100644 ops/docker/config/storage.toml create mode 100644 ops/docker/otel-collector.yaml create mode 100644 ops/docker/prometheus.yml diff --git a/ops/docker/compose.yaml b/ops/docker/compose.yaml new file mode 100644 index 0000000..5521c1b --- /dev/null +++ b/ops/docker/compose.yaml @@ -0,0 +1,88 @@ +# Local smoke-test stack for the distributed runtime. +# +# Brings up NATS JetStream, Redis, TimescaleDB, and a Prometheus +# scraper pointed at the worker /metrics ports. Workers are NOT +# launched by this compose file — operators run them in separate +# terminals with `python -m augur_signals.workers.` so the +# stack can be exercised piecewise. +# +# Usage: +# docker compose -f ops/docker/compose.yaml up -d +# export AUGUR_CONFIG_DIR=$(pwd)/ops/docker/config +# python -m augur_signals.workers.poller --platform polymarket +# docker compose -f ops/docker/compose.yaml down -v + +name: augur-smoke + +services: + nats: + image: nats:2.11-alpine + command: + - "-js" + - "-sd" + - "/data" + - "--http_port" + - "8222" + ports: + - "4222:4222" # client + - "8222:8222" # http monitoring + volumes: + - nats_data:/data + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8222/healthz"] + interval: 5s + timeout: 2s + retries: 10 + + redis: + image: redis:7-alpine + ports: + - "6379:6379" + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 2s + retries: 10 + + timescaledb: + image: timescale/timescaledb:latest-pg16 + environment: + POSTGRES_USER: augur + POSTGRES_PASSWORD: augur + POSTGRES_DB: augur + ports: + - "5432:5432" + volumes: + - timescale_data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U augur -d augur"] + interval: 5s + timeout: 3s + retries: 15 + + prometheus: + image: prom/prometheus:v2.54.1 + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus_data:/prometheus + + otel-collector: + # Optional — only needed if you exercise the OTel tracer. + # Comment out if your workers have traces.kind = "disabled". + image: otel/opentelemetry-collector:0.108.0 + command: ["--config=/etc/otelcol/config.yaml"] + ports: + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + volumes: + - ./otel-collector.yaml:/etc/otelcol/config.yaml:ro + +volumes: + nats_data: + timescale_data: + prometheus_data: diff --git a/ops/docker/config/bus.toml b/ops/docker/config/bus.toml new file mode 100644 index 0000000..f235985 --- /dev/null +++ b/ops/docker/config/bus.toml @@ -0,0 +1,22 @@ +# Smoke-stack bus config pointing at the local compose services. + +[backend] +kind = "nats" +capacity = 256 + +[nats] +servers = ["nats://localhost:4222"] +credentials_file_env = "NATS_CREDENTIALS_FILE" +stream_name = "augur" +replication_factor = 1 +subject_prefix = "augur" + +[redis] +url_env = "REDIS_URL" +stream_max_length = 10000 +consumer_group_prefix = "augur" +block_ms = 1000 + +[lock] +ttl_seconds = 30 +renew_interval_seconds = 10 diff --git a/ops/docker/config/observability.toml b/ops/docker/config/observability.toml new file mode 100644 index 0000000..44db814 --- /dev/null +++ b/ops/docker/config/observability.toml @@ -0,0 +1,22 @@ +# Smoke-stack observability config. +# +# The Prometheus service in compose scrapes localhost:9091..9097. Pick +# a port per worker by overriding `AUGUR_METRICS_PORT` at launch time, +# or leave the default 9090 if you run one worker at a time. + +[metrics] +kind = "prometheus" +prometheus_bind = "127.0.0.1" +prometheus_port = 9090 + +[traces] +# Set to "otlp" to exercise the OTel collector service. Leave as +# "disabled" for faster iteration. +kind = "disabled" +otlp_endpoint = "http://localhost:4317" +service_name = "augur-smoke" +sampling_ratio = 1.0 + +[logs] +level = "DEBUG" +format = "json" diff --git a/ops/docker/config/storage.toml b/ops/docker/config/storage.toml new file mode 100644 index 0000000..5e47428 --- /dev/null +++ b/ops/docker/config/storage.toml @@ -0,0 +1,35 @@ +# Smoke-stack storage config. TimescaleDB runs under docker compose at +# localhost:5432 with superuser augur/augur. +# +# Export: +# export AUGUR_TIMESCALE_URL=postgresql://augur:augur@localhost:5432/augur +# before running workers or migration scripts. + +[backend] +kind = "timescaledb" +duckdb_path = "data/augur.duckdb" +timescale_url_env = "AUGUR_TIMESCALE_URL" + +[connection] +pool_size = 5 +max_overflow = 5 +pool_timeout_seconds = 10 + +[migration] +parquet_archive_root = "labels/snapshots_archive" +dual_write_lag_alert_seconds = 10 + +[hypertable] +snapshot_chunk_interval_days = 1 +feature_chunk_interval_days = 1 +signal_chunk_interval_days = 7 + +[retention] +snapshot_retention_days = 0 +feature_retention_days = 30 +signal_retention_days = 0 + +[compression] +snapshot_compress_after_days = 7 +feature_compress_after_days = 7 +signal_compress_after_days = 30 diff --git a/ops/docker/otel-collector.yaml b/ops/docker/otel-collector.yaml new file mode 100644 index 0000000..d7b2887 --- /dev/null +++ b/ops/docker/otel-collector.yaml @@ -0,0 +1,21 @@ +# Minimal OTel collector config for the local smoke stack. +# Accepts OTLP on 4317/4318 and logs every span to stdout so local +# traces are inspectable without wiring Jaeger or Tempo. + +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +exporters: + debug: + verbosity: basic + +service: + pipelines: + traces: + receivers: [otlp] + exporters: [debug] diff --git a/ops/docker/prometheus.yml b/ops/docker/prometheus.yml new file mode 100644 index 0000000..95f865e --- /dev/null +++ b/ops/docker/prometheus.yml @@ -0,0 +1,24 @@ +# Prometheus scrape config for the local smoke stack. +# +# Scrapes the host-bound /metrics endpoints workers expose on +# localhost:9091..9097. When running workers inside `docker compose` +# alongside Prometheus, replace `host.docker.internal` with the +# service name and match the port the worker binds to. + +global: + scrape_interval: 10s + evaluation_interval: 10s + +scrape_configs: + - job_name: augur-workers + static_configs: + - targets: + - host.docker.internal:9091 + - host.docker.internal:9092 + - host.docker.internal:9093 + - host.docker.internal:9094 + - host.docker.internal:9095 + - host.docker.internal:9096 + - host.docker.internal:9097 + labels: + stack: smoke From c863e5961519603cf9df28ac3af5fdc63b06096c Mon Sep 17 00:00:00 2001 From: Mathews-Tom Date: Fri, 17 Apr 2026 16:40:03 +0530 Subject: [PATCH 4/4] docs: refresh README, add manual-testing guide, sync repo-structure block README: document the v0.1.0 release, the optional-dependency groups (llm-local, llm-cloud, bus-nats, bus-redis, storage-timescale, observability, distributed), every runnable surface (labeling CLI, worker catalog, migration scripts, smoke compose, container build), and a per-phase status table. docs/operations/manual-testing.md: end-to-end guide covering quality gates, labeling CLI, distributed-runtime smoke stack (compose bring-up, TimescaleDB init, worker catalog invocation, known gaps), migration scripts, container build, Kubernetes dry-run, and observability endpoints. docs/README.md group index: add the operations/ group so readers find the runbook and the manual-testing guide. Extend the implementer reading order with both. docs/architecture/system-design.md: rewrite the repository-structure block so workers/, ops/docker/, ops/deploy/, the three new TOML configs, and the migration + sidecar scripts are discoverable from the canonical architecture doc. --- README.md | 115 +++++++++++++++- docs/README.md | 23 ++-- docs/architecture/system-design.md | 63 ++++++--- docs/operations/manual-testing.md | 214 +++++++++++++++++++++++++++++ 4 files changed, 378 insertions(+), 37 deletions(-) create mode 100644 docs/operations/manual-testing.md diff --git a/README.md b/README.md index ae3a2e2..e0413a1 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,8 @@ Structured market anomaly detection for prediction markets. Augur observes Polym Augur is not a forecaster, an arbitrage engine, or a news writer. It is a deterministic structured-signal pipeline. See `docs/foundations/overview.md` for the full product framing and `docs/foundations/non-goals.md` for what Augur explicitly does not do. +Current version: **0.1.0**. Phase 1-5 scaffolding landed; runnable surfaces are the test suite, the labeling CLI, and the distributed-runtime smoke stack. See `docs/operations/manual-testing.md` for the end-to-end guide. + ## Documentation Authoritative documentation lives in `docs/`: @@ -12,6 +14,7 @@ Authoritative documentation lives in `docs/`: - `docs/contracts/` — schemas, versioning policy, consumer registry - `docs/methodology/` — calibration, manipulation taxonomy, labeling protocol - `docs/architecture/` — system design, polling spec, deduplication and storms, storage and scaling +- `docs/operations/` — distributed runbook, manual testing guide - `docs/examples/` — worked positive and negative signal paths - `docs/strategy/` — risk register, defensibility thesis @@ -21,6 +24,7 @@ Start with `docs/README.md` for the documentation index. - Python 3.12 or newer - [uv](https://docs.astral.sh/uv/) 0.6 or newer for dependency management +- Optional: Docker + Docker Compose for the phase-5 smoke stack ## Local Development @@ -34,6 +38,67 @@ uv run pytest # run the test suite with coverage All three workspace packages — `augur-signals`, `augur-labels`, `augur-format` — are installed in editable mode by `uv sync`. Configuration lives under `config/`; data and label artifacts live under `data/` and `labels/`. Exported JSON schemas are committed to `schemas/` and kept in sync by `scripts/export_schemas.py`. +## Optional Dependency Groups + +Each workspace package exposes extras for opt-in integrations. Install only what a deployment needs: + +```bash +# LLM secondary formatter (phase 4) +uv sync --extra llm-local # augur-format[llm-local] — Ollama client +uv sync --extra llm-cloud # augur-format[llm-cloud] — Anthropic SDK + +# Distributed runtime (phase 5) +uv sync --extra bus-nats # NATS JetStream adapter +uv sync --extra bus-redis # Redis Streams adapter +uv sync --extra storage-timescale # TimescaleDB via psycopg +uv sync --extra observability # Prometheus + OpenTelemetry +uv sync --extra distributed # all of the above +``` + +The dev dependency group in the repo root already pulls every extra so CI exercises every adapter against injected fakes. + +## Runnable Surfaces + +### Labeling CLI (phase 2) + +```bash +uv run python scripts/label.py --help +uv run python scripts/label.py candidates +uv run python scripts/label.py decide +``` + +### Worker entrypoints (phase 5) + +```bash +uv run python -m augur_signals.workers # catalog +uv run python -m augur_signals.workers.poller --help # per-kind entrypoints +``` + +The `workers` package exposes bootstrap helpers (`augur_signals.workers.bootstrap`) that every `__main__` module uses for config loading, observability activation, and bus connection. Per-kind transform wiring for feature / detector / manipulation / calibration / dedup / context_format / llm requires a follow-up commit — see `docs/operations/manual-testing.md §3`. + +### Migration scripts (phase 5) + +```bash +uv run python scripts/migrate_to_timescale.py backfill --from labels/snapshots_archive +uv run python scripts/migrate_to_timescale.py verify --start 2026-01-01 --end 2026-04-01 --duckdb data/augur.duckdb +uv run python scripts/dual_write_sidecar.py --lag-alert-seconds 10 +``` + +### Smoke stack (phase 5) + +```bash +docker compose -f ops/docker/compose.yaml up -d # NATS + Redis + TimescaleDB + Prometheus +export AUGUR_CONFIG_DIR=$(pwd)/ops/docker/config +export AUGUR_TIMESCALE_URL=postgresql://augur:augur@localhost:5432/augur +``` + +### Container build + +```bash +docker build -f ops/docker/Dockerfile -t augur:dev . +kubectl apply -k ops/deploy/ --dry-run=client -o yaml +``` + ## Quality Gates The following commands must pass before any commit reaches `main`: @@ -50,23 +115,61 @@ Coverage thresholds (80 % overall, 90 % new code, 95 % critical paths) follow `~ ## Repository Layout -``` +```text augur/ -├── pyproject.toml # uv workspace root +├── pyproject.toml # uv workspace root (v0.1.0) ├── uv.lock ├── config/ # TOML configuration +│ ├── bus.toml # phase 5 — message bus backend +│ ├── storage.toml # phase 5 — DuckDB / TimescaleDB selector +│ ├── observability.toml # phase 5 — Prometheus + OTel exporters +│ ├── llm.toml # phase 4 — gated LLM formatter +│ └── ... # polling, detectors, dedup, formatters, consumers, labeling, markets, forbidden_tokens ├── data/ # market taxonomy, investigation prompts, calibration state ├── labels/ # newsworthy-event labels (Parquet) ├── schemas/ # exported JSON schemas per Pydantic model -├── scripts/ # export_schemas, backtest, calibrate, label +├── scripts/ +│ ├── backtest.py # stub +│ ├── calibrate.py # stub +│ ├── export_schemas.py +│ ├── label.py # labeling CLI wrapper +│ ├── lint_detector_now.py +│ ├── migrate_to_timescale.py # phase 5 backfill + verify +│ └── dual_write_sidecar.py # phase 5 tee replay ├── src/ │ ├── augur_signals/ # signal extraction core (no LLM imports — CI enforced) -│ ├── augur_labels/ # labeling pipeline -│ └── augur_format/ # deterministic and gated-LLM formatters +│ │ └── augur_signals/ +│ │ ├── bus/ # EventBus protocol + NATS + Redis + distributed lock +│ │ ├── workers/ # harness, singleton runner, bootstrap, subject helpers +│ │ ├── storage/ # DuckDB + TimescaleDB adapters +│ │ └── ... # ingestion, features, detectors, manipulation, calibration, dedup, context +│ ├── augur_labels/ # labeling pipeline (phase 2) +│ └── augur_format/ # deterministic and gated-LLM formatters (phases 3 + 4) ├── tests/ -└── ops/ # deployment and observability assets (populated later) +├── ops/ +│ ├── docker/ # multi-stage Dockerfile + local compose smoke stack +│ │ ├── Dockerfile +│ │ ├── compose.yaml +│ │ ├── prometheus.yml +│ │ ├── otel-collector.yaml +│ │ └── config/ # smoke-specific bus/storage/observability TOMLs +│ └── deploy/ # Kubernetes manifests (Deployments, StatefulSets, HPA, Services) +└── .docs/ # phase specs and development plan ``` +## Phase Status + +| Phase | Scope | State | +| --- | --- | --- | +| 0 | Project workspace, CI scaffolding | Merged | +| 1 | Signal extraction core, detectors, calibration, dedup, context | Merged | +| 2 | Labeling pipeline + annotator CLI | Merged | +| 3 | Deterministic formatters (JSON, Markdown, WebSocket, Webhook) | Merged | +| 4 | Gated LLM secondary formatter | Merged | +| 5 | Distributed runtime scaffolding (bus, TimescaleDB, workers, ops) | Merged | + +`CHANGELOG.md` records per-phase operational handoff notes. Release notes for v0.1.0 will aggregate these on tag. + ## License See `LICENSE`. diff --git a/docs/README.md b/docs/README.md index 3143504..7f2942a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -32,22 +32,25 @@ Read these before writing or modifying any code: 3. `methodology/calibration-methodology.md` — confidence pipeline 4. `methodology/labeling-protocol.md` — ground-truth definition 5. `methodology/manipulation-taxonomy.md` — manipulation signatures -6. `architecture/system-design.md` — layer-by-layer architecture +6. `architecture/system-design.md` — layer-by-layer architecture (includes Deployment Modes) 7. `architecture/adaptive-polling-spec.md` — polling state machine 8. `architecture/deduplication-and-storms.md` — signal merge algorithm 9. `architecture/storage-and-scaling.md` — storage architecture and migration triggers +10. `operations/distributed-runbook.md` — cutover, rollback, failover procedures +11. `operations/manual-testing.md` — runnable surfaces and local smoke stack ## Group Index -| Group | Purpose | -| ------------------- | ----------------------------------------------------- | -| `foundations/` | Project framing, scope, vocabulary, outward case | -| `contracts/` | Data schemas and registries that bind layers together | -| `methodology/` | Statistical, algorithmic, and process methodology | -| `architecture/` | System architecture, storage, polling, signal merging | -| `strategy/` | Risk register and defensibility analysis | -| `examples/` | Worked positive-path and negative-path examples | -| `open-questions.md` | Unresolved decisions with current best answers | +| Group | Purpose | +| ------------------- | ------------------------------------------------------------ | +| `foundations/` | Project framing, scope, vocabulary, outward case | +| `contracts/` | Data schemas and registries that bind layers together | +| `methodology/` | Statistical, algorithmic, and process methodology | +| `architecture/` | System architecture, storage, polling, signal merging | +| `operations/` | Distributed-runtime runbook and manual-testing guide | +| `strategy/` | Risk register and defensibility analysis | +| `examples/` | Worked positive-path and negative-path examples | +| `open-questions.md` | Unresolved decisions with current best answers | ## Conventions diff --git a/docs/architecture/system-design.md b/docs/architecture/system-design.md index ecb386a..2e8559d 100644 --- a/docs/architecture/system-design.md +++ b/docs/architecture/system-design.md @@ -75,12 +75,20 @@ The diagram reflects the deterministic-context-primary architecture. The LLM for ```text augur/ -├── pyproject.toml +├── pyproject.toml # uv workspace root (v0.1.0) ├── README.md ├── config/ -│ ├── default.toml -│ ├── markets.toml +│ ├── bus.toml # phase 5 — message bus backend selector +│ ├── storage.toml # phase 5 — DuckDB / TimescaleDB selector +│ ├── observability.toml # phase 5 — Prometheus + OTel exporters +│ ├── llm.toml # phase 4 — gated LLM formatter +│ ├── polling.toml │ ├── detectors.toml +│ ├── dedup.toml +│ ├── formatters.toml +│ ├── consumers.toml +│ ├── labeling.toml +│ ├── markets.toml │ └── forbidden_tokens.toml ├── data/ │ ├── markets/ @@ -90,26 +98,39 @@ augur/ │ └── newsworthy_events.parquet ├── src/ │ ├── augur_signals/ -│ │ ├── models/ # MarketSnapshot, FeatureVector, MarketSignal, enums -│ │ ├── ingestion/ # Pollers, normalizer -│ │ ├── features/ # Rolling-window feature pipeline -│ │ ├── detectors/ # 5 detectors + base protocol -│ │ ├── manipulation/ # Signature catalog + evaluator -│ │ ├── calibration/ # FPR, BH-FDR, reliability curves, drift, FDR controller -│ │ ├── context/ # Deterministic context assembler -│ │ ├── storage/ # DuckDB persistence -│ │ ├── bus/ # Async event bus -│ │ ├── dedup/ # Signal dedup + storm handling -│ │ └── engine.py # Orchestrator -│ ├── augur_labels/ # Labeling pipeline +│ │ ├── models/ # MarketSnapshot, FeatureVector, MarketSignal, enums +│ │ ├── ingestion/ # Pollers, normalizer +│ │ ├── features/ # Rolling-window feature pipeline +│ │ ├── detectors/ # 5 detectors + base protocol +│ │ ├── manipulation/ # Signature catalog + evaluator +│ │ ├── calibration/ # FPR, BH-FDR, reliability curves, drift, FDR controller +│ │ ├── context/ # Deterministic context assembler +│ │ ├── storage/ # DuckDB + TimescaleDB adapters (phase 5) +│ │ ├── bus/ # EventBus protocol + NATS + Redis + distributed lock (phase 5) +│ │ ├── workers/ # Harness, singleton runner, bootstrap (phase 5) +│ │ ├── dedup/ # Signal dedup + storm handling +│ │ └── engine.py # Monolith orchestrator +│ ├── augur_labels/ # Labeling pipeline (phase 2) │ └── augur_format/ -│ ├── deterministic/ # JSON, Markdown templates -│ └── llm/ # Gated LLM formatter (Phase 4) +│ ├── deterministic/ # JSON, Markdown, webhook, websocket (phase 3) +│ └── llm/ # Gated LLM formatter (phase 4) ├── tests/ -└── scripts/ - ├── backtest.py - ├── calibrate.py - └── label.py +├── scripts/ +│ ├── backtest.py # stub +│ ├── calibrate.py # stub +│ ├── export_schemas.py +│ ├── label.py +│ ├── lint_detector_now.py +│ ├── migrate_to_timescale.py # phase 5 — backfill + verify +│ └── dual_write_sidecar.py # phase 5 — tee replay +└── ops/ + ├── docker/ # Dockerfile + local smoke compose stack + │ ├── Dockerfile + │ ├── compose.yaml + │ ├── prometheus.yml + │ ├── otel-collector.yaml + │ └── config/ # smoke-specific bus/storage/observability TOMLs + └── deploy/ # Kubernetes manifests (Deployments, StatefulSets, HPA, Services) ``` The `src/augur_signals/` package contains zero LLM imports. CI enforces this via grep. The `src/augur_format/llm/` package is the only location where LLM code lives; it is gated behind `interpretation_mode = LLM_ASSISTED` and is opt-in per consumer. diff --git a/docs/operations/manual-testing.md b/docs/operations/manual-testing.md new file mode 100644 index 0000000..56ec199 --- /dev/null +++ b/docs/operations/manual-testing.md @@ -0,0 +1,214 @@ +# Manual Testing Guide + +Augur has three runnable surfaces today: the test suite, the labeling CLI, and the distributed-runtime smoke stack. This document enumerates what can be exercised locally and what remains operator-wiring work. + +## 1. Quality gates and tests + +Every commit must pass the same gates CI runs: + +```bash +uv sync # install workspace + dev extras +uv run pre-commit install # wire git hooks +uv run pytest # full suite with coverage +uv run ruff check . +uv run ruff format --check . +uv run mypy --strict src/ +uv run python scripts/export_schemas.py --check +``` + +Targeted runs: + +```bash +uv run pytest tests/signals/ -q # phase 1 + 2 + 5 unit layer +uv run pytest tests/format/ -q # phase 3 + 4 formatters +uv run pytest tests/labels/ -q # phase 2 pipeline +uv run pytest -m unit # fast unit subset +uv run pytest -m integration # opt-in; needs compose stack +``` + +## 2. Labeling CLI + +The one end-to-end workflow that runs today is the annotator tool from phase 2. + +```bash +uv run python scripts/label.py --help +uv run python scripts/label.py candidates # list candidate events +uv run python scripts/label.py decide # record decision +uv run python scripts/label.py promote # gate through two-annotator rule +uv run python scripts/label.py coverage # per-category coverage +``` + +State persists to `labels/queue.json` and promoted rows land as partitioned Parquet under `labels/newsworthy_events/date=YYYY-MM-DD/`. + +## 3. Distributed-runtime smoke stack + +The phase 5 compose stack brings up every external dependency the workers need: NATS JetStream, Redis, TimescaleDB, Prometheus, and (optionally) an OTel collector. Workers run as separate host processes so each one is inspectable. + +### Start infrastructure + +```bash +docker compose -f ops/docker/compose.yaml up -d +docker compose -f ops/docker/compose.yaml ps +``` + +### Point workers at the smoke config + +```bash +export AUGUR_CONFIG_DIR=$(pwd)/ops/docker/config +export AUGUR_TIMESCALE_URL=postgresql://augur:augur@localhost:5432/augur +export AUGUR_REPLICA_ID=$(hostname)-local +``` + +### Initialize the TimescaleDB schema + +Run the monolith's migration script against the compose database to create hypertables + policies: + +```bash +uv run python -c " +import asyncio +import os +import psycopg +from augur_signals._config import load_config +from augur_signals.storage._config import StorageConfig +from augur_signals.storage.factory import make_timescaledb_store +from pathlib import Path + +async def init() -> None: + cfg = load_config(Path('ops/docker/config/storage.toml'), StorageConfig) + async with await psycopg.AsyncConnection.connect(os.environ['AUGUR_TIMESCALE_URL']) as conn: + store = await make_timescaledb_store(cfg, connection=conn) + await store.initialize() + +asyncio.run(init()) +" +``` + +### List worker entrypoints + +```bash +uv run python -m augur_signals.workers +``` + +Output: + +``` + poller python -m augur_signals.workers.poller --platform + feature python -m augur_signals.workers.feature --shard / + detector python -m augur_signals.workers.detector --shard / + manipulation python -m augur_signals.workers.manipulation + calibration python -m augur_signals.workers.calibration + dedup python -m augur_signals.workers.dedup + context_format python -m augur_signals.workers.context_format +``` + +### Current worker status + +| Worker | Entrypoint state | +| --- | --- | +| `workers` (catalog) | Runnable — prints the list above | +| `poller` | Bootstrapped; requires `SnapshotSource` from `augur_signals.ingestion` to be wired by the deployment's bootstrap module. `python -m augur_signals.workers.poller --help` shows argparse; invocation exits with the wiring requirement. | +| `feature` / `detector` / `manipulation` / `calibration` / `dedup` / `context_format` / `augur_format.workers.llm` | **Deferred** — bus message-schema per subject (e.g., is an `augur.candidates.*` payload a raw `MarketSignal`, or a `MarketSignal` plus recent trades and book events?) is not fixed by the Phase 5 spec. A follow-up commit must: (1) define `BusMessage` payloads per subject, (2) expose the Phase 1-4 transforms (FeaturePipeline, DetectorRegistry, ManipulationDetector, Calibration, ClusterMerge, ContextAssembler, LLMInterpreter) behind a bus-friendly API with state persistence, (3) write per-kind `__main__.py` wrappers that load the transform via `augur_signals.workers.bootstrap`. | + +### Bootstrap helpers (already runnable) + +The bootstrap module is complete and covered by `tests/signals/test_worker_bootstrap.py`: + +```python +from augur_signals.workers.bootstrap import ( + load_runtime_configs, + activate_observability, + build_event_bus, + resolve_replica_id, + parse_shard_arg, +) + +cfg = load_runtime_configs() # from $AUGUR_CONFIG_DIR +activate_observability(cfg.observability) # prometheus listener + OTel tracer +bus = build_event_bus(cfg.bus) # nats or redis +await bus.connect() +``` + +## 4. Migration scripts + +Both scripts are fully runnable against the smoke stack once TimescaleDB is initialized. + +### Backfill from the Parquet archive + +```bash +uv run python scripts/migrate_to_timescale.py backfill \ + --from labels/snapshots_archive \ + --batch-size 10000 +``` + +The script enumerates partitions chronologically, rejects partitions with unexpected columns, and aborts on row-count parity failure. + +### Verify per-(market, day) parity + +```bash +uv run python scripts/migrate_to_timescale.py verify \ + --start 2026-01-01 \ + --end 2026-04-01 \ + --duckdb data/augur.duckdb +``` + +### Dual-write sidecar + +```bash +uv run python scripts/dual_write_sidecar.py \ + --lag-alert-seconds 10 \ + --bus-backend nats \ + --tee-subject augur.writes +``` + +Requires the engine to publish to `augur.writes` — this path is not wired in the monolith yet, so the sidecar is smoke-testable against handcrafted fixtures for now. + +## 5. Container build and Kubernetes + +### Build the image + +```bash +docker build -f ops/docker/Dockerfile -t augur:dev . +``` + +The multi-stage build copies the uv venv + source + `config/` into a non-root user and exposes the monolith engine as the default CMD. Per-worker launch is a `CMD` override in the Kubernetes manifests. + +### Dry-run the Kubernetes manifests + +```bash +kubectl apply -k ops/deploy/ --dry-run=client -o yaml | less +``` + +Populate `ConfigMap` and `Secret` data before a real apply: + +```bash +kubectl -n augur create configmap augur-config \ + --from-file=config/ \ + --dry-run=client -o yaml | kubectl apply -f - + +kubectl -n augur create secret generic augur-secrets \ + --from-literal=AUGUR_TIMESCALE_URL="$AUGUR_TIMESCALE_URL" \ + --from-literal=REDIS_URL="redis://redis:6379/0" \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +## 6. Observability + +- Prometheus: `http://localhost:9090` after compose is up. Scrapes `host.docker.internal:9091..9097`. +- NATS admin: `http://localhost:8222/varz`. +- Redis CLI: `redis-cli -h localhost ping`. +- TimescaleDB: `psql $AUGUR_TIMESCALE_URL -c 'select * from timescaledb_information.hypertables'`. +- OTel collector: spans print to the container stdout (`docker compose logs otel-collector`). + +## 7. Tear down + +```bash +docker compose -f ops/docker/compose.yaml down -v +unset AUGUR_CONFIG_DIR AUGUR_TIMESCALE_URL AUGUR_REPLICA_ID +``` + +## 8. Known gaps + +- No end-to-end monolith launcher (`python -m augur_signals.engine` has no `__main__`). The engine is driveable from Python and from `tests/signals/test_engine_integration.py`, but no production script. +- `scripts/backtest.py` and `scripts/calibrate.py` are stubs that raise `NotImplementedError`. +- Worker entrypoints for feature / detector / manipulation / calibration / dedup / context_format / llm require the bus message-schema work described in §3 above. +- Live failover tests against a real NATS or Redis cluster are operator-owned; CI uses dependency-injected fakes.