diff --git a/.luacheckrc b/.luacheckrc index e6c8fb7..d31a16a 100644 --- a/.luacheckrc +++ b/.luacheckrc @@ -12,7 +12,7 @@ globals = { max_line_length = 140 -- Exclude generated or vendored paths -exclude_files = { "spec/helpers/mock_ngx.lua" } +exclude_files = { "spec/helpers/mock_ngx.lua", "cli/lib/mock_ngx.lua" } -- Test files monkey-patch standard globals (math.random, os.getenv) -- and use intentionally-unused callback arguments (self, ctx). diff --git a/README.md b/README.md index c922ef1..342d6cf 100644 --- a/README.md +++ b/README.md @@ -6,66 +6,53 @@

FAIRVISOR

-

Turn API limits into enforceable business policy.

- -

- Every API that charges per token, serves paying tenants, or runs agentic pipelines needs
- enforceable limits — not just rate-limit middleware bolted on as an afterthought.
-
- Open-source edge enforcement engine for rate limits, quotas, and cost budgets.
- Runs standalone or with a SaaS control plane for team governance. -

+

The LLM rate limiter your multi-tenant product was missing.

License: MPL-2.0 Latest release - CI + CI Lua coverage GHCR image Platforms: linux/amd64 · linux/arm64 Docs

-

- Latency: < 70 µs enforcement overhead · 195k RPS max throughput · No external state (no Redis / DB) -

- --- ## Table of Contents -- [What is Fairvisor?](#what-is-fairvisor) +- [Why we built this](#why-we-built-this) - [Why not nginx / Kong / Envoy?](#why-not-nginx--kong--envoy) - [Quick start](#quick-start) - [LLM token budget in 30 seconds](#llm-token-budget-in-30-seconds) - [How a request flows](#how-a-request-flows) + - [Architecture](#architecture) - [Enforcement capabilities](#enforcement-capabilities) -- [Policy as code](#policy-as-code) - [Performance](#performance) - [Deployment](#deployment) - [CLI](#cli) - [SaaS control plane (optional)](#saas-control-plane-optional) - [Project layout](#project-layout) -- [Contributing](#contributing) - [License](#license) --- -## What is Fairvisor? +When multiple tenants, agents, or services share an API, one misbehaving caller can exhaust the budget for everyone — whether that's LLM tokens, API credits, or request quotas. Fairvisor is a lightweight enforcement engine that gives each tenant isolated limits at the edge: token budgets, cost caps, rate limits, and kill switches — keyed on JWT claims, API keys, or IP. One container, one JSON policy file, no Redis. -Fairvisor Edge is a **policy enforcement layer** that sits between your API gateway and your upstream services. Every request is evaluated against a declarative JSON policy bundle and receives a deterministic allow or reject verdict — with machine-readable rejection headers and sub-millisecond latency. +## Why we built this -It is **not** a reverse proxy replacement. It is **not** a WAF. It is a dedicated, composable enforcement point for: +API gateways count requests. LLM providers bill by the token. 
-- **Rate limits and quotas** — per route, per tenant, per JWT claim, per API key -- **Cost budgets** — cumulative spend caps per org, team, or endpoint -- **LLM token limits** — TPM/TPD budgets with pre-request reservation and post-response refund -- **Kill switches** — instant traffic blocking per descriptor, no restart required -- **Shadow mode** — dry-run enforcement against real traffic before going live -- **Loop detection** — stops runaway agentic workflows at the edge -- **Circuit breaker** — auto-trips on spend spikes, auto-resets after cooldown +When you serve multiple tenants — customers, teams, or agentic pipelines — that gap becomes a real problem. One runaway agent can consume a month's token budget overnight. Your gateway sees one request per second; your invoice shows 3 million tokens. -All controls are defined in one versioned policy bundle. Policies hot-reload without restarting the process. +We needed something that: +- Understood token budgets, not just request counts +- Could key limits on JWT claims (`org_id`, `plan`, `user_id`), not just IPs +- Kept every request fast — no Redis round-trip, no extra network call in the hot path +- Could plug into nginx or Envoy *or* run standalone as a transparent LLM proxy + +We couldn't find it, so we built Fairvisor. ## Why not nginx / Kong / Envoy? @@ -84,17 +71,41 @@ If you have an existing gateway, the question is whether Fairvisor adds anything **If nginx `limit_req` is enough for you**, use it. It has zero overhead and is the right tool for simple per-IP global throttling. Fairvisor becomes relevant when you need per-tenant awareness, JWT-claim-based bucketing, or cost/token tracking that `limit_req` has no model for. -**If you are already running Kong**, the built-in rate limiting plugin stores counters in Redis or Postgres — every decision is a network call. Fairvisor can run alongside Kong as an `auth_request` decision service with no external state. 
+**If you are already running Kong**, the built-in rate limiting plugin stores counters in Redis or Postgres — every decision is a network call. Fairvisor can run alongside Kong as an `auth_request` decision service with no external state. See [Kong / Traefik integration →](https://docs.fairvisor.com/docs/gateway/) -**If you are running Envoy**, the [global rate limit service](https://github.com/envoyproxy/ratelimit) requires deploying a separate Redis-backed service with its own config language. Fairvisor is one container, one JSON file, and integrates via `ext_authz` in the same position. +**If you are running Envoy**, the [global rate limit service](https://github.com/envoyproxy/ratelimit) requires deploying a separate Redis-backed service with its own config language. Fairvisor is one container, one JSON file, and integrates via `ext_authz` in the same position. See [Envoy ext_authz integration →](https://docs.fairvisor.com/docs/gateway/envoy/) **If you are on Cloudflare or Akamai**, per-JWT-claim limits, LLM token budgets, and cost caps are not in the platform's model. If your limits are tenant-aware or cost-aware, you need something that runs in your own stack. -Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacement. See [docs/gateway-integration.md](docs/gateway-integration.md) for integration patterns. +Fairvisor can run alongside Kong, nginx, and Envoy — or as a standalone reverse proxy if you don't need a separate gateway. See [nginx auth_request →](https://docs.fairvisor.com/docs/gateway/nginx/) · [Envoy ext_authz →](https://docs.fairvisor.com/docs/gateway/envoy/) · [Kong / Traefik →](https://docs.fairvisor.com/docs/gateway/) for integration patterns. ## Quick start -### 1. Create a policy +> **Which mode is right for you?** +> - **Wrapper** — your app calls OpenAI / Anthropic / Gemini directly → point your client at Fairvisor instead, no other code changes needed. 
*Fastest to try.* +> - **Reverse proxy** — you have a single upstream service → Fairvisor sits in front and enforces before forwarding. +> - **Decision service** — you already run nginx, Envoy, or Kong → call `POST /v1/decision` from `auth_request` / `ext_authz`. + +### Fastest path + +```bash +git clone https://github.com/fairvisor/edge.git +cd edge/examples/quickstart +docker compose up -d +``` + +Run your first enforce/reject test in under a minute — full walkthrough in [`examples/quickstart/README.md`](examples/quickstart/README.md). + +**Recipes:** `examples/recipes/` — team budgets, runaway agent guard, circuit-breaker. + +**Sample artifacts:** `fixtures/` — canonical enforce/reject fixtures (OpenAI, Anthropic, Gemini). + +### Minimal decision\_service example + +
+Expand — manual setup with a single docker run + +**1. Create a policy** ```bash mkdir fairvisor-demo && cd fairvisor-demo @@ -127,7 +138,7 @@ mkdir fairvisor-demo && cd fairvisor-demo } ``` -### 2. Run the edge +**2. Run the edge** ```bash docker run -d \ @@ -136,31 +147,48 @@ docker run -d \ -v "$(pwd)/policy.json:/etc/fairvisor/policy.json:ro" \ -e FAIRVISOR_CONFIG_FILE=/etc/fairvisor/policy.json \ -e FAIRVISOR_MODE=decision_service \ - ghcr.io/fairvisor/fairvisor-edge:v0.1.0 + ghcr.io/fairvisor/fairvisor-edge:latest ``` -### 3. Verify +**3. Verify** ```bash curl -sf http://localhost:8080/readyz # {"status":"ok"} -curl -s -w "\nHTTP %{http_code}\n" \ +# Allowed request → HTTP 200 +curl -s -o /dev/null -w "HTTP %{http_code}\n" \ -H "X-Original-Method: GET" \ -H "X-Original-URI: /api/data" \ -H "X-Forwarded-For: 10.0.0.1" \ http://localhost:8080/v1/decision + +# Rejected request — exhaust the burst (>10 requests) +for i in $(seq 1 12); do + curl -s -o /dev/null -w "HTTP %{http_code}\n" \ + -H "X-Original-Method: GET" \ + -H "X-Original-URI: /api/data" \ + -H "X-Forwarded-For: 10.0.0.1" \ + http://localhost:8080/v1/decision +done +# last requests → HTTP 429 X-Fairvisor-Reason: rate_limit_exceeded ``` +
+ > Full walkthrough: [docs.fairvisor.com/docs/quickstart](https://docs.fairvisor.com/docs/quickstart/) ## LLM token budget in 30 seconds +The fastest path is **wrapper mode**: Fairvisor sits in front of the LLM API, enforces budgets, and strips the upstream key from the client. No gateway changes needed — just point your client at Fairvisor instead of OpenAI. + +**1. Policy** — one rule, per-org TPM + daily cap: + ```json { "id": "llm-budget", "spec": { - "selector": { "pathPrefix": "/v1/chat" }, + "selector": { "pathPrefix": "/" }, "mode": "enforce", "rules": [ { @@ -178,9 +206,32 @@ curl -s -w "\nHTTP %{http_code}\n" \ } ``` -Each organization (from the JWT `org_id` claim) gets its own independent 60k TPM / 1.2M TPD budget. Requests over the limit return a `429` with an OpenAI-compatible error body — no client changes needed. +**2. Call the API** — token format `Bearer CLIENT_JWT:UPSTREAM_KEY`: + +```bash +curl https://your-fairvisor-host/openai/v1/chat/completions \ + -H "Authorization: Bearer eyJhbGc...:sk-proj-..." \ + -H "Content-Type: application/json" \ + -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}' +``` + +Fairvisor parses the JWT claims (no signature validation — the JWT is trusted as-is), extracts `org_id`, charges tokens against the budget, strips the `Authorization` header, and forwards with the upstream key. The upstream never sees the client JWT. -Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible endpoint. +When the budget is exhausted: + +```http +HTTP/1.1 429 Too Many Requests +X-Fairvisor-Reason: tpm_exceeded +Retry-After: 12 +RateLimit-Limit: 60000 +RateLimit-Remaining: 0 +``` + +Each organization gets its own independent 60k TPM / 1.2M TPD budget. Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible endpoint. + +The selector matches the incoming wrapper path. Use `pathPrefix: "/"` to cover all providers, or `pathPrefix: "/openai"` to limit to one provider only. 
+ +> **Decision service / reverse proxy mode:** if you already have a gateway, use `selector: { "pathPrefix": "/v1/chat" }` and call `POST /v1/decision` from your existing `auth_request` or `ext_authz` hook instead. ## How a request flows @@ -188,7 +239,9 @@ Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible e **Reverse proxy mode** — Fairvisor sits inline. Traffic arrives at Fairvisor directly, gets evaluated, and is proxied to the upstream if allowed. No separate gateway needed. -Both modes use the same policy bundle and return the same rejection headers. +**Wrapper mode** — Fairvisor acts as a transparent LLM proxy. Clients send requests to Fairvisor's OpenAI-compatible endpoint (`/openai/v1/chat/completions`, `/anthropic/v1/messages`, `/gemini/v1/generateContent`). Fairvisor enforces token budgets and cost limits, strips the client auth header, injects the upstream API key, and forwards the request. No changes needed in the client — swap the base URL and you're done. + +All three modes use the same policy bundle and return the same rejection headers. When a request is rejected: @@ -206,46 +259,76 @@ Headers follow [RFC 9333 RateLimit Fields](https://www.rfc-editor.org/rfc/rfc933 ### Architecture -**Decision service mode** (sidecar — your gateway calls `/v1/decision`, handles forwarding itself): - -``` - Client ──► Your gateway (nginx / Envoy / Kong) - │ - │ POST /v1/decision - │ (auth_request / ext_authz) - ▼ - ┌─────────────────────┐ - │ Fairvisor Edge │ - │ decision_service │ - │ │ - │ rule_engine │ - │ ngx.shared.dict │ ◄── no Redis, no network - └──────────┬──────────┘ - │ - 204 allow │ 429 reject - ▼ - gateway proxies or returns rejection +**Decision service mode** — sidecar: your gateway calls `/v1/decision`, handles forwarding itself. + +```mermaid +sequenceDiagram + participant C as Client + participant G as Your Gateway
(nginx / Envoy / Kong) + participant F as Fairvisor Edge
decision_service + participant U as Upstream service + + C->>G: Request + G->>F: POST /v1/decision
(auth_request / ext_authz) + alt allow + F-->>G: 204 No Content + G->>U: Forward request + U-->>G: Response + G-->>C: Response + else reject + F-->>G: 429 + RateLimit headers + G-->>C: 429 Too Many Requests + end ``` -**Reverse proxy mode** (inline — Fairvisor handles proxying): - +**Reverse proxy mode** — inline: Fairvisor handles both enforcement and proxying. + +```mermaid +sequenceDiagram + participant C as Client + participant F as Fairvisor Edge
reverse_proxy + participant U as Upstream service + + C->>F: Request + alt allow + F->>U: Forward request + U-->>F: Response + F-->>C: Response + else reject + F-->>C: 429 + RFC 9333 headers + end ``` - Client ──► Fairvisor Edge (reverse_proxy) - │ - │ access.lua → rule_engine - │ ngx.shared.dict - │ - allow ──► upstream service - reject ──► 429 + RFC 9333 headers + +**Wrapper mode** — transparent LLM proxy: swap the base URL, no other client changes needed. + +```mermaid +sequenceDiagram + participant C as Client + participant F as Fairvisor Edge
wrapper + participant U as Upstream LLM
(OpenAI / Anthropic / Gemini) + + C->>F: POST /openai/v1/chat/completions
Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY + F->>F: 1. Parse JWT claims (org_id, user_id) + F->>F: 2. Enforce TPM / TPD / cost budget + alt budget ok + F->>U: POST /v1/chat/completions
Authorization: Bearer UPSTREAM_KEY + U-->>F: 200 OK + token usage + F->>F: 3. Count tokens · refund unused reservation + F-->>C: 200 OK (Authorization stripped from reply) + else budget exceeded + F-->>C: 429 X-Fairvisor-Reason: tpm_exceeded + end ``` -Both modes use the same policy bundle and produce the same rejection headers. +Supported upstream paths: `/openai/*`, `/anthropic/*`, `/gemini/*`, `/grok/*`. + +All three modes use the same policy bundle and produce the same rejection headers. ## Enforcement capabilities | If you need to… | Algorithm | Typical identity keys | Reject reason | |---|---|---|---| -| Cap request frequency | `token_bucket` | `jwt:user_id`, `header:x-api-key`, `ip:addr` | `rate_limit_exceeded` | +| Cap request frequency | `token_bucket` | `jwt:user_id`, `header:x-api-key`, `ip:address` | `rate_limit_exceeded` | | Cap cumulative spend | `cost_based` | `jwt:org_id`, `jwt:plan` | `budget_exhausted` | | Cap LLM tokens (TPM/TPD) | `token_bucket_llm` | `jwt:org_id`, `jwt:user_id` | `tpm_exceeded`, `tpd_exceeded` | | Instantly block a segment | kill switch | any descriptor | `kill_switch_active` | @@ -253,38 +336,10 @@ Both modes use the same policy bundle and produce the same rejection headers. | Stop runaway agent loops | loop detection | request fingerprint | `loop_detected` | | Clamp spend spikes | circuit breaker | global or policy scope | `circuit_breaker_open` | -Identity keys can be **JWT claims** (`jwt:org_id`, `jwt:plan`), **HTTP headers** (`header:x-api-key`), or **IP attributes** (`ip:addr`, `ip:country`). Combine multiple keys per rule for compound matching. 
- -## Policy as code - -Define policies in JSON, validate against the schema, test in shadow mode, then promote: - -```bash -# Validate bundle structure and rule semantics -fairvisor validate ./policies.json - -# Replay real traffic without blocking anything -fairvisor test --dry-run - -# Apply a new bundle (hot-reload, no restart) -fairvisor connect --push ./policies.json -``` - -Policies are versioned JSON — commit them to Git, review changes in PRs, roll back with confidence. +Identity keys can be **JWT claims** (`jwt:org_id`, `jwt:plan`), **HTTP headers** (`header:x-api-key`), or **IP attributes** (`ip:address`, `ip:country`). Combine multiple keys per rule for compound matching. ## Performance -### Benchmark methodology (March 2026) - -- **Hosts:** 2 × AWS `c7i.xlarge` (4 vCPU, 8 GiB each), cluster placement group, eu-central-1 -- **OS:** Ubuntu 24.04 LTS -- **Runtime:** OpenResty 1.29.2.1, Fairvisor latest `main` (no Docker) -- **Load tool:** `k6` v0.54.0, `constant-arrival-rate`, 10,000 RPS for 60s, 10s warmup -- **Benchmark script:** `run-all.sh` from `fairvisor/benchmark` -- **Topology:** two-host — Fairvisor and k6 on separate machines (VPC private network) -- **Decision endpoint contract:** `POST /v1/decision` with `X-Original-Method` and `X-Original-URI` -- **Note:** reverse proxy numbers include policy evaluation and upstream proxy hop to backend nginx. - ### Latest measured latency @ 10,000 RPS | Percentile | Decision service | Reverse proxy | Raw nginx (baseline) | @@ -303,9 +358,8 @@ Policies are versioned JSON — commit them to Git, review changes in PRs, roll | Simple rate limit (1 rule) | 195,000 | | Complex policy (5 rules, JWT parsing, loop detection) | 195,000 | -**No external datastore.** All enforcement state lives in in-process shared memory (`ngx.shared.dict`). No Redis, no Postgres, no network round-trips in the decision path. 
+Reproduce: see [fairvisor/benchmark](https://github.com/fairvisor/benchmark) — the canonical benchmark source of truth for Fairvisor Edge performance numbers. -> Reproduce: `git clone https://github.com/fairvisor/benchmark && cd benchmark && bash run-all.sh` ## Deployment @@ -318,7 +372,7 @@ Policies are versioned JSON — commit them to Git, review changes in PRs, roll | Envoy `ext_authz` | [docs/gateway/envoy](https://docs.fairvisor.com/docs/gateway/envoy/) | | Kong / Traefik | [docs/gateway](https://docs.fairvisor.com/docs/gateway/) | -Fairvisor integrates **alongside** Kong, nginx, Envoy, and Traefik — it does not replace them. +Fairvisor works alongside Kong, nginx, Envoy, and Traefik — or runs standalone as a reverse proxy when you don't need a separate gateway. ## CLI @@ -328,7 +382,6 @@ fairvisor validate policy.json # validate before deploying fairvisor test --dry-run # shadow-mode replay fairvisor status # edge health and loaded bundle info fairvisor logs # tail rejection events -fairvisor connect # connect to SaaS control plane ``` ## SaaS control plane (optional) @@ -349,25 +402,16 @@ If the SaaS is unreachable, the edge keeps enforcing with the last-known policy ## Project layout ``` -src/fairvisor/ runtime modules (OpenResty/LuaJIT) -cli/ command-line tooling -spec/ unit and integration tests (busted) -tests/e2e/ Docker-based E2E tests (pytest) -examples/ sample policy bundles -helm/ Helm chart -docker/ Docker artifacts -docs/ reference documentation -``` - -## Contributing - -See [CONTRIBUTING.md](CONTRIBUTING.md). Bug reports, issues, and pull requests welcome. 
- -Run the test suite: - -```bash -busted spec # unit + integration -pytest tests/e2e -v # E2E (requires Docker) +src/fairvisor/ runtime modules (OpenResty/LuaJIT) +cli/ command-line tooling +spec/ unit and integration tests (busted) +tests/e2e/ Docker-based E2E tests (pytest) +examples/quickstart/ runnable quickstart (docker compose up -d) +examples/recipes/ deployable policy recipes (team budgets, agent guard, circuit breaker) +fixtures/ canonical request/response sample artifacts +helm/ Helm chart +docker/ Docker artifacts +docs/ reference documentation ``` ## License diff --git a/bin/ci/check_unpinned_dependencies.py b/bin/ci/check_unpinned_dependencies.py index 433a0bf..00903f2 100644 --- a/bin/ci/check_unpinned_dependencies.py +++ b/bin/ci/check_unpinned_dependencies.py @@ -13,7 +13,7 @@ SHA_RE = re.compile(r"^[0-9a-f]{40}$") USES_RE = re.compile(r"uses:\s*([^\s#]+)") FROM_RE = re.compile(r"^\s*FROM\s+([^\s]+)", re.MULTILINE) -OPM_RE = re.compile(r"^\s*RUN\s+opm\s+get\s+([^\s\\]+)(?:\s+([^\s\\#]+))?", re.MULTILINE) +OPM_RE = re.compile(r"^\s*RUN\s+opm\s+get\s+([^\s\\]+)(?:[^\S\n]+([^\s\\#]+))?", re.MULTILINE) @dataclass(frozen=True) @@ -79,7 +79,9 @@ def scan_dockerfile(path: Path) -> list[Finding]: for match in OPM_RE.finditer(text): package = match.group(1) version = match.group(2) - if not version or version.startswith(("&", "|")): + # OPM uses "pkg=version" (embedded) or "pkg version" (space-separated) + is_pinned = "=" in package or (version and not version.startswith(("&", "|"))) + if not is_pinned: findings.append( Finding( category="opm-package", diff --git a/bin/fairvisor b/bin/fairvisor index f171b53..1b1a490 100755 --- a/bin/fairvisor +++ b/bin/fairvisor @@ -3,5 +3,5 @@ SCRIPT_DIR="$(cd "$(dirname "$0")/.." 
&& pwd)" -exec resty -I "${SCRIPT_DIR}/src" -I "${SCRIPT_DIR}/cli" \ +exec resty -I "${SCRIPT_DIR}/src" -I "${SCRIPT_DIR}" \ "${SCRIPT_DIR}/cli/main.lua" "$@" diff --git a/cli/README.md b/cli/README.md index bd18187..a68ef1a 100644 --- a/cli/README.md +++ b/cli/README.md @@ -18,10 +18,10 @@ From the repo root: Or with `resty` directly (e.g. from another directory, adjusting `-I` paths): ```bash -resty -I /path/to/fv-oss/src -I /path/to/fv-oss/cli /path/to/fv-oss/cli/main.lua [options] +resty -I /path/to/fv-oss/src -I /path/to/fv-oss /path/to/fv-oss/cli/main.lua [options] ``` -`bin/fairvisor` sets `-I` to the repo's `src` and `cli` so that `require("cli.commands.init")` and `require("fairvisor.bundle_loader")` resolve correctly. +`bin/fairvisor` sets `-I` to the repo's `src` and root (for `cli.*` modules) so that `require("cli.commands.init")` and `require("fairvisor.bundle_loader")` resolve correctly. ## Commands diff --git a/cli/commands/status.lua b/cli/commands/status.lua index 3931937..326c495 100644 --- a/cli/commands/status.lua +++ b/cli/commands/status.lua @@ -41,14 +41,28 @@ function _M.run(argv) return nil, 2 end + local policy_version = "unknown" + if health_res.body and health_res.body ~= "" then + local ok_json, cjson = pcall(require, "cjson") + if ok_json then + local ok_dec, decoded = pcall(cjson.decode, health_res.body) + if ok_dec and type(decoded) == "table" and decoded.policy_version ~= nil then + policy_version = tostring(decoded.policy_version) + end + end + end + local metrics_res = httpc:request_uri(edge_url .. 
"/metrics") local metrics_body = metrics_res and metrics_res.body or "" + local decisions_raw = _parse_metric(metrics_body, "fairvisor_decisions_total") + local decisions = (decisions_raw ~= "unknown") and decisions_raw or "0" + local data = { status = (health_res.status == 200 and "ready") or "not ready", - policy_version = _parse_metric(metrics_body, "fairvisor_bundle_version"), + policy_version = policy_version, saas = (_parse_metric(metrics_body, "fairvisor_saas_reachable") == "1") and "connected" or "disconnected", - decisions = _parse_metric(metrics_body, "fairvisor_decisions_total"), + decisions = decisions, } if format == "json" then diff --git a/cli/commands/test.lua b/cli/commands/test.lua index 17066e4..d620ecf 100644 --- a/cli/commands/test.lua +++ b/cli/commands/test.lua @@ -15,6 +15,9 @@ local function _read_file(path) local content = handle:read("*a") handle:close() + if content == nil then + return nil, "failed to read file content" + end return content end @@ -47,7 +50,7 @@ local function _generate_mock_requests(bundle) headers = {}, query_params = {}, ip_address = "127.0.0.1", - user_agent = "fairvisor-cli/test", + user_agent = "fairvisor/test", } end @@ -58,7 +61,7 @@ local function _generate_mock_requests(bundle) headers = {}, query_params = {}, ip_address = "127.0.0.1", - user_agent = "fairvisor-cli/test", + user_agent = "fairvisor/test", } end @@ -127,7 +130,7 @@ function _M.run(argv) local content, read_err = _read_file(file) if not content then - output.print_error("Cannot read file: " .. read_err) + output.print_error("Cannot read file: " .. (read_err or "unknown read error")) return nil, 1 end @@ -142,7 +145,7 @@ function _M.run(argv) return nil, 1 end - local ok_mock, mock_ngx = pcall(require, "spec.helpers.mock_ngx") + local ok_mock, mock_ngx = pcall(require, "cli.lib.mock_ngx") if not ok_mock then output.print_error("mock_ngx helper is unavailable: " .. 
mock_ngx) return nil, 1 @@ -168,7 +171,7 @@ function _M.run(argv) if requests_file then requests, read_err = _load_requests(requests_file) if not requests then - output.print_error("Cannot load requests: " .. read_err) + output.print_error("Cannot load requests: " .. (read_err or "unknown read error")) return nil, 1 end else diff --git a/cli/commands/validate.lua b/cli/commands/validate.lua index 115236f..af4ab62 100644 --- a/cli/commands/validate.lua +++ b/cli/commands/validate.lua @@ -20,6 +20,9 @@ local function _read_file(path) local content = handle:read("*a") handle:close() + if content == nil then + return nil, "failed to read file content" + end return content end @@ -101,7 +104,7 @@ function _M.run(argv) local content, read_err = _read_file(file) if not content then - output.print_error("Cannot read file: " .. read_err) + output.print_error("Cannot read file: " .. (read_err or "unknown read error")) return nil, 1 end diff --git a/cli/lib/mock_ngx.lua b/cli/lib/mock_ngx.lua new file mode 100644 index 0000000..643efd8 --- /dev/null +++ b/cli/lib/mock_ngx.lua @@ -0,0 +1,283 @@ +local type = type +local string_byte = string.byte +local string_char = string.char +local string_sub = string.sub +local string_find = string.find +local table_concat = table.concat + +local bit +local ok_bit, b = pcall(require, "bit") +if ok_bit then bit = b end + +local _M = {} +local string_byte = string.byte + +local _BASE64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + +local function _to_base64(input) + local bytes = { string_byte(input, 1, #input) } + local out = {} + local index = 1 + + while index <= #bytes do + local b1 = bytes[index] or 0 + local b2 = bytes[index + 1] or 0 + local b3 = bytes[index + 2] or 0 + local pad = 0 + + if bytes[index + 1] == nil then + pad = 2 + elseif bytes[index + 2] == nil then + pad = 1 + end + + local n = b1 * 65536 + b2 * 256 + b3 + local c1 = math.floor(n / 262144) % 64 + 1 + local c2 = math.floor(n / 4096) % 
64 + 1 + local c3 = math.floor(n / 64) % 64 + 1 + local c4 = n % 64 + 1 + + out[#out + 1] = string_sub(_BASE64_ALPHABET, c1, c1) + out[#out + 1] = string_sub(_BASE64_ALPHABET, c2, c2) + + if pad == 2 then + out[#out + 1] = "=" + out[#out + 1] = "=" + elseif pad == 1 then + out[#out + 1] = string_sub(_BASE64_ALPHABET, c3, c3) + out[#out + 1] = "=" + else + out[#out + 1] = string_sub(_BASE64_ALPHABET, c3, c3) + out[#out + 1] = string_sub(_BASE64_ALPHABET, c4, c4) + end + + index = index + 3 + end + + return table_concat(out) +end + +local function _from_base64(input) + local clean = input:gsub("%s", "") + local out = {} + local index = 1 + + while index <= #clean do + local c1 = string_sub(clean, index, index) + local c2 = string_sub(clean, index + 1, index + 1) + local c3 = string_sub(clean, index + 2, index + 2) + local c4 = string_sub(clean, index + 3, index + 3) + + if c1 == "" or c2 == "" then + break + end + + local v1 = string_find(_BASE64_ALPHABET, c1, 1, true) + local v2 = string_find(_BASE64_ALPHABET, c2, 1, true) + local v3 = c3 ~= "=" and string_find(_BASE64_ALPHABET, c3, 1, true) or nil + local v4 = c4 ~= "=" and string_find(_BASE64_ALPHABET, c4, 1, true) or nil + + if not v1 or not v2 then + return nil + end + + v1 = v1 - 1 + v2 = v2 - 1 + v3 = v3 and (v3 - 1) or 0 + v4 = v4 and (v4 - 1) or 0 + + local n = v1 * 262144 + v2 * 4096 + v3 * 64 + v4 + local b1 = math.floor(n / 65536) % 256 + local b2 = math.floor(n / 256) % 256 + local b3 = n % 256 + + out[#out + 1] = string_char(b1) + if c3 ~= "=" then + out[#out + 1] = string_char(b2) + end + if c4 ~= "=" then + out[#out + 1] = string_char(b3) + end + + index = index + 4 + end + + return table_concat(out) +end + +local function _simple_digest(input) + local h1 = 2166136261 + local h2 = 16777619 + for i = 1, #input do + local b = string_byte(input, i) + if bit then + h1 = bit.bxor(h1, b) % 4294967296 + else + -- Fallback if bit is not available (not cryptographically same, but avoids crash) + h1 = (h1 + b) 
% 4294967296 + end + h1 = (h1 * 16777619) % 4294967296 + h2 = (h2 + (b * i)) % 4294967296 + end + + local parts = {} + for i = 1, 8 do + local a = (h1 + (i * 2654435761)) % 4294967296 + local b = (h2 + (i * 2246822519)) % 4294967296 + parts[#parts + 1] = string_char(math.floor(a / 16777216) % 256) + parts[#parts + 1] = string_char(math.floor(a / 65536) % 256) + parts[#parts + 1] = string_char(math.floor(b / 256) % 256) + parts[#parts + 1] = string_char(b % 256) + end + + return table_concat(parts) +end + +function _M.mock_shared_dict() + local data = {} + + return { + get = function(_, key) + return data[key] + end, + set = function(_, key, value) + data[key] = value + return true + end, + incr = function(_, key, value, init, _init_ttl) + local current = data[key] + if current == nil then + if init then + data[key] = init + value + return data[key], nil, true + end + return nil, "not found" + end + + data[key] = current + value + return data[key], nil, false + end, + delete = function(_, key) + data[key] = nil + end, + flush_all = function(_) + data = {} + end, + } +end + +function _M.setup_time_mock() + local mock_time = 1000.000 + + local function now() + return mock_time + end + + local function advance_time(seconds) + mock_time = mock_time + seconds + end + + local function set_time(seconds) + mock_time = seconds + end + + return { + now = now, + advance_time = advance_time, + set_time = set_time, + } +end + +function _M.setup_package_mock() + package.loaded["resty.maxminddb"] = { + initted = function() return true end, + init = function() return true end, + lookup = function() return nil end, + } +end + +function _M.setup_ngx() + local time = _M.setup_time_mock() + local dict = _M.mock_shared_dict() + local logs = {} + local timers = {} + + local function crc32_short(value) + local hash = 0 + local input = tostring(value or "") + for i = 1, #input do + hash = (hash * 33 + string_byte(input, i)) % 4294967296 + end + return hash + end + + _G.ngx = { + now = 
time.now, + update_time = function() + end, + shared = { + fairvisor_counters = dict, + }, + req = { + read_body = function() end, + get_body_data = function() return nil end, + get_body_file = function() return nil end, + get_headers = function() return {} end, + get_uri_args = function() return {} end, + }, + var = { + request_method = "GET", + uri = "/", + host = "localhost", + remote_addr = "127.0.0.1", + geoip2_data_country_iso_code = nil, + asn = nil, + fairvisor_asn_type = nil, + is_tor_exit = nil, + }, + log = function(...) + logs[#logs + 1] = { ... } + end, + timer = { + every = function(interval, callback) + timers[#timers + 1] = { interval = interval, callback = callback } + return true + end, + }, + hmac_sha256 = function(key, payload) + return _simple_digest(key .. ":" .. payload) + end, + hmac_sha1 = function(key, payload) + return _simple_digest("sha1:" .. key .. ":" .. payload) + end, + sha1_bin = function(payload) + return _simple_digest("sha1bin:" .. payload) + end, + sha256_bin = function(payload) + return _simple_digest("sha256bin:" .. payload) + end, + encode_base64 = function(value) + return _to_base64(value) + end, + decode_base64 = function(value) + return _from_base64(value) + end, + md5 = function(payload) + return _to_base64(_simple_digest("md5:" .. 
payload)) + end, + ERR = 1, + WARN = 2, + INFO = 3, + DEBUG = 4, + crc32_short = crc32_short, + } + + return { + time = time, + dict = dict, + logs = logs, + timers = timers, + } +end + +return _M diff --git a/docker/Dockerfile b/docker/Dockerfile index 47e7b68..0d3888c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,6 +6,7 @@ RUN apt-get update && apt-get upgrade -y --no-install-recommends \ gettext-base \ python3 \ libmaxminddb0 \ + libmaxminddb-dev \ mmdb-bin \ && rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.cli b/docker/Dockerfile.cli index c72ab12..f91c407 100644 --- a/docker/Dockerfile.cli +++ b/docker/Dockerfile.cli @@ -6,6 +6,8 @@ RUN apt-get update && apt-get upgrade -y --no-install-recommends \ perl \ && rm -rf /var/lib/apt/lists/* +RUN opm get ledgetech/lua-resty-http=0.17.1 + WORKDIR /opt/fairvisor COPY src /opt/fairvisor/src diff --git a/docker/nginx.conf.template b/docker/nginx.conf.template index c0d4184..c745a65 100644 --- a/docker/nginx.conf.template +++ b/docker/nginx.conf.template @@ -25,6 +25,8 @@ worker_shutdown_timeout 35s; http { resolver 127.0.0.11 ipv6=off valid=30s; resolver_timeout 2s; + map_hash_max_size 262144; + map_hash_bucket_size 64; geo $is_tor_exit { default 0; @@ -51,7 +53,8 @@ http { location = /livez { default_type text/plain; - return 200 "ok\n"; + return 200 "ok +"; } location = /readyz { @@ -102,7 +105,8 @@ http { } default_type text/plain; - return 404 "not found\n"; + return 404 "not found +"; } } } diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md new file mode 100644 index 0000000..d9b637f --- /dev/null +++ b/examples/quickstart/README.md @@ -0,0 +1,111 @@ +# Fairvisor Edge — Quickstart + +Go from `git clone` to working policy enforcement in one step. 
+ +## Prerequisites + +- Docker with Compose V2 (`docker compose version`) +- Port 8080 free on localhost + +## Start + +```bash +docker compose up -d +``` + +The first run builds the `fairvisor` image locally from `docker/Dockerfile`, so no +GHCR login is required. + +Wait for the `fairvisor` service to report healthy: + +```bash +docker compose ps +# fairvisor should show "healthy" +``` + +## Verify enforcement + +This quickstart runs in `FAIRVISOR_MODE=reverse_proxy`. Requests to `/v1/*` +are enforced by the TPM policy and forwarded to a local mock LLM backend. +No real API keys are required. + +**Allowed request** — should return `200`: + +```bash +curl -s -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d @../../fixtures/normal_request.json +``` + +Expected response body shape matches `../../fixtures/allow_response.json`. + +**Over-limit request** — should return `429`: + +```bash +curl -s -X POST http://localhost:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d @../../fixtures/over_limit_request.json +``` + +Expected response body shape: `../../fixtures/reject_tpm_exceeded.json`. +The response will also include: +- `X-Fairvisor-Reason: tpm_exceeded` +- `Retry-After: 60` +- `RateLimit-Limit: 100` (matches the quickstart policy `tokens_per_minute`) +- `RateLimit-Remaining: 0` + +## How the policy works + +The quickstart policy (`policy.json`) enforces a TPM limit keyed on `ip:address`: + +- `tokens_per_minute: 100` — allows roughly 2 small requests per minute +- `tokens_per_day: 1000` — daily cap +- `default_max_completion: 50` — pessimistic reservation per request when `max_tokens` is not set + +Sending `over_limit_request.json` (which sets `max_tokens: 200000`) immediately +exceeds the 100-token per-minute budget and triggers a `429`. 
+ +## Wrapper mode (real provider routing) + +Wrapper mode routes requests to real upstream providers using provider-prefixed paths +and a composite Bearer token. It requires real provider API keys and cannot be +demonstrated with this mock stack. + +**Path and auth format:** + +``` +POST /openai/v1/chat/completions +Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY +``` + +Where: +- `CLIENT_JWT` — signed JWT identifying the calling client/tenant (used for policy enforcement) +- `UPSTREAM_KEY` — real upstream API key forwarded to the provider (e.g. `sk-...` for OpenAI) + +Fairvisor strips the composite header, injects the correct provider auth before forwarding, +and **never returns upstream auth headers to the caller** +(see `../../fixtures/allow_response.json`). + +**Provider-prefixed paths:** + +| Path prefix | Upstream | Auth header injected | +|---|---|---| +| `/openai/v1/...` | `https://api.openai.com/v1/...` | `Authorization: Bearer UPSTREAM_KEY` | +| `/anthropic/v1/...` | `https://api.anthropic.com/v1/...` | `x-api-key: UPSTREAM_KEY` | +| `/gemini/v1beta/...` | `https://generativelanguage.googleapis.com/v1beta/...` | `x-goog-api-key: UPSTREAM_KEY` | + +To run in wrapper mode, change the compose env to `FAIRVISOR_MODE: wrapper` and +supply real credentials in the `Authorization` header. 
+ +## Teardown + +```bash +docker compose down +``` + +## Next steps + +- See `../recipes/` for team budgets, runaway agent guard, and provider failover scenarios +- See `../../fixtures/` for all sample request/response artifacts +- See [fairvisor/benchmark](https://github.com/fairvisor/benchmark) for performance benchmarks +- See [docs/install/](../../docs/install/) for Kubernetes, VM, and SaaS deployment options diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml new file mode 100644 index 0000000..59a2487 --- /dev/null +++ b/examples/quickstart/docker-compose.yml @@ -0,0 +1,59 @@ +# Fairvisor Edge — Quickstart stack (standalone + reverse proxy mode) +# +# Usage: +# docker compose up -d +# curl -s http://localhost:8080/readyz # health check +# curl -s -X POST http://localhost:8080/v1/chat/completions \ +# -H "Content-Type: application/json" \ +# -d @../../fixtures/normal_request.json # expect 200 +# curl -s -X POST http://localhost:8080/v1/chat/completions \ +# -H "Content-Type: application/json" \ +# -d @../../fixtures/over_limit_request.json # expect 429 +# +# This stack runs in FAIRVISOR_MODE=reverse_proxy — requests to /v1/* are +# enforced by policy then forwarded to the local mock LLM backend. +# No real API keys required. +# +# Wrapper mode (routing by provider prefix, real upstream keys) is documented +# in README.md under "Wrapper mode". It requires real provider credentials and +# cannot be demonstrated with this mock stack. +# +# This file is also the base for the e2e-smoke CI check. +# CI expects the same port and volume contract; update CI too if those change. + +services: + fairvisor: + build: + context: ../.. 
+ dockerfile: docker/Dockerfile + ports: + - "8080:8080" + environment: + FAIRVISOR_CONFIG_FILE: /etc/fairvisor/policy.json + FAIRVISOR_MODE: reverse_proxy + FAIRVISOR_BACKEND_URL: http://mock_llm:80 + FAIRVISOR_SHARED_DICT_SIZE: 32m + FAIRVISOR_LOG_LEVEL: info + FAIRVISOR_WORKER_PROCESSES: "1" + volumes: + - ./policy.json:/etc/fairvisor/policy.json:ro + depends_on: + mock_llm: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-sf", "http://127.0.0.1:8080/readyz"] + interval: 2s + timeout: 2s + retries: 15 + start_period: 5s + + mock_llm: + image: nginx:1.27-alpine + volumes: + - ./mock-llm.conf:/etc/nginx/nginx.conf:ro + healthcheck: + test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:80/"] + interval: 2s + timeout: 2s + retries: 10 + start_period: 5s diff --git a/examples/quickstart/mock-llm.conf b/examples/quickstart/mock-llm.conf new file mode 100644 index 0000000..26603ab --- /dev/null +++ b/examples/quickstart/mock-llm.conf @@ -0,0 +1,10 @@ +events {} +http { + server { + listen 80; + location / { + default_type application/json; + return 200 '{"id":"chatcmpl-qs","object":"chat.completion","choices":[{"index":0,"message":{"role":"assistant","content":"Hello from the mock backend!"},"finish_reason":"stop"}],"usage":{"prompt_tokens":10,"completion_tokens":8,"total_tokens":18}}'; + } + } +} diff --git a/examples/quickstart/policy.json b/examples/quickstart/policy.json new file mode 100644 index 0000000..fb9b375 --- /dev/null +++ b/examples/quickstart/policy.json @@ -0,0 +1,31 @@ +{ + "bundle_version": 1, + "issued_at": "2026-01-01T00:00:00Z", + "expires_at": "2030-01-01T00:00:00Z", + "policies": [ + { + "id": "quickstart-tpm-policy", + "spec": { + "selector": { + "pathPrefix": "/v1/", + "methods": ["POST"] + }, + "mode": "enforce", + "rules": [ + { + "name": "tpm-limit", + "limit_keys": ["ip:address"], + "algorithm": "token_bucket_llm", + "algorithm_config": { + "tokens_per_minute": 100, + "tokens_per_day": 1000, + "burst_tokens": 100,
+ "default_max_completion": 50 + } + } + ] + } + } + ], + "kill_switches": [] +} diff --git a/examples/recipes/circuit-breaker/README.md b/examples/recipes/circuit-breaker/README.md new file mode 100644 index 0000000..ad1227e --- /dev/null +++ b/examples/recipes/circuit-breaker/README.md @@ -0,0 +1,43 @@ +# Recipe: Circuit Breaker — Cost Spike Auto-Shutdown + +Automatically block all LLM traffic when the aggregate token spend rate +exceeds a budget threshold, then self-reset after a cooldown period. + +## How it works + +- Normal traffic: per-org TPM limit enforced (`100 000 tokens/min`) +- Spike detection: if the rolling spend rate hits `500 000 tokens/min` + the circuit breaker opens and **all requests return `429`** with + `X-Fairvisor-Reason: circuit_breaker_open` +- Auto-reset: after 10 minutes without breaker-triggering load, the + circuit resets automatically — no manual intervention needed +- `alert: true` logs the trip event to the Fairvisor audit log + +## Deploy + +```bash +cp policy.json /etc/fairvisor/policy.json +``` + +## Expected behaviour + +```bash +# Normal request — passes +curl -s -o /dev/null -w "%{http_code}" \ + -H "Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY" \ + http://localhost:8080/v1/chat/completions \ + -d '{"model":"gpt-4o","messages":[{"role":"user","content":"hi"}]}' +# → 200 + +# After spend spike trips the breaker: +# → 429 X-Fairvisor-Reason: circuit_breaker_open +# Retry-After: 600 +``` + +## Tuning + +| Field | Description | +|---|---| +| `spend_rate_threshold_per_minute` | Tokens/min rolling spend that opens the breaker | +| `auto_reset_after_minutes` | Cooldown before automatic reset (0 = manual only) | +| `tokens_per_minute` | Per-org steady-state limit (independent of breaker) | diff --git a/examples/recipes/circuit-breaker/policy.json b/examples/recipes/circuit-breaker/policy.json new file mode 100644 index 0000000..7d58c8d --- /dev/null +++ b/examples/recipes/circuit-breaker/policy.json @@ -0,0 +1,37 @@ +{ + "bundle_version": 1, +
"issued_at": "2026-01-01T00:00:00Z", + "expires_at": "2030-01-01T00:00:00Z", + "policies": [ + { + "id": "cost-spike-guard", + "spec": { + "selector": { + "pathPrefix": "/v1/", + "methods": ["POST"] + }, + "mode": "enforce", + "rules": [ + { + "name": "per-org-tpm", + "limit_keys": ["jwt:org_id"], + "algorithm": "token_bucket_llm", + "algorithm_config": { + "tokens_per_minute": 100000, + "burst_tokens": 100000, + "default_max_completion": 2048 + } + } + ], + "circuit_breaker": { + "enabled": true, + "spend_rate_threshold_per_minute": 500000, + "action": "reject", + "alert": true, + "auto_reset_after_minutes": 10 + } + } + } + ], + "kill_switches": [] +} diff --git a/examples/recipes/runaway-agent-guard/README.md b/examples/recipes/runaway-agent-guard/README.md new file mode 100644 index 0000000..7b34491 --- /dev/null +++ b/examples/recipes/runaway-agent-guard/README.md @@ -0,0 +1,50 @@ +# Recipe: Runaway Agent Guard + +Stop runaway agentic workflows before they exhaust your token budget or +billing limit. + +## Problem + +Autonomous agents (LangChain, AutoGPT, custom loops) can enter retry storms +or infinite planning loops. Without enforcement, a single runaway agent +can consume thousands of dollars of API budget in minutes. + +## How it works + +Two rules cooperate: + +1. **Loop detector** — counts requests per `agent_id` in a sliding window. + If the agent fires more than 30 requests in 60 seconds, it trips a + 120-second cooldown. This catches tight retry loops. + +2. **TPM guard** — caps tokens per minute per agent. A burst-heavy agent + that passes the loop check still cannot drain the token pool. 
+ +## Deploy + +```bash +cp policy.json /etc/fairvisor/policy.json +``` + +## JWT shape expected + +```json +{ + "sub": "user-456", + "agent_id": "autoagent-prod-7", + "exp": 9999999999 +} +``` + +## Kill switch for incidents + +If an agent causes an incident, flip a kill switch without restarting edge: + +```bash +# Via CLI +fairvisor kill-switch enable agent-id=autoagent-prod-7 + +# Or update the policy bundle with a kill_switch entry and hot-reload +``` + +See `docs/cookbook/kill-switch-incident-response.md` for the full incident playbook. diff --git a/examples/recipes/runaway-agent-guard/policy.json b/examples/recipes/runaway-agent-guard/policy.json new file mode 100644 index 0000000..b3facab --- /dev/null +++ b/examples/recipes/runaway-agent-guard/policy.json @@ -0,0 +1,41 @@ +{ + "bundle_version": 1, + "issued_at": "2026-01-01T00:00:00Z", + "expires_at": "2030-01-01T00:00:00Z", + "policies": [ + { + "id": "runaway-agent-guard", + "spec": { + "selector": { + "pathPrefix": "/", + "methods": [ + "POST" + ] + }, + "mode": "enforce", + "loop_detection": { + "enabled": true, + "window_seconds": 60, + "threshold_identical_requests": 30, + "action": "reject", + "similarity": "exact" + }, + "rules": [ + { + "name": "agent-tpm-guard", + "limit_keys": [ + "jwt:agent_id" + ], + "algorithm": "token_bucket_llm", + "algorithm_config": { + "tokens_per_minute": 50000, + "burst_tokens": 50000, + "default_max_completion": 512 + } + } + ] + } + } + ], + "kill_switches": [] +} diff --git a/examples/recipes/team-budgets/README.md b/examples/recipes/team-budgets/README.md new file mode 100644 index 0000000..54c1551 --- /dev/null +++ b/examples/recipes/team-budgets/README.md @@ -0,0 +1,45 @@ +# Recipe: Team Budgets + +Enforce per-team token and cost limits using JWT claims. + +## How it works + +Each request carries a JWT with a `team_id` claim. Fairvisor uses this as +the bucket key for two independent rules: + +1. 
**TPM/TPD limit** — token-rate enforcement per minute and per day +2. **Weekly cost budget** — cumulative cost cap per 7-day period with staged warn/throttle/reject + +## Deploy + +```bash +# Copy policy to your edge config path +cp policy.json /etc/fairvisor/policy.json + +# Or use with docker compose (standalone mode): +FAIRVISOR_CONFIG_FILE=./policy.json FAIRVISOR_MODE=wrapper docker compose up -d +``` + +## JWT shape expected + +```json +{ + "sub": "user-123", + "team_id": "engineering", + "plan": "pro", + "exp": 9999999999 +} +``` + +## Staged actions at cost budget thresholds + +| Threshold | Action | +|---|---| +| 80% | Warn (allow, log, emit business event) | +| 95% | Throttle (allow with 500 ms delay) | +| 100% | Reject (429, `budget_exceeded`) | + +## Related fixtures + +- `../../../fixtures/reject_tpd_exceeded.json` — TPD reject body +- `../../../fixtures/reject_tpm_exceeded.json` — TPM reject body diff --git a/examples/recipes/team-budgets/policy.json b/examples/recipes/team-budgets/policy.json new file mode 100644 index 0000000..7a87d2c --- /dev/null +++ b/examples/recipes/team-budgets/policy.json @@ -0,0 +1,47 @@ +{ + "bundle_version": 1, + "issued_at": "2026-01-01T00:00:00Z", + "expires_at": "2030-01-01T00:00:00Z", + "policies": [ + { + "id": "team-token-budget", + "spec": { + "selector": { + "pathPrefix": "/openai/", + "methods": ["POST"] + }, + "mode": "enforce", + "rules": [ + { + "name": "per-team-tpm", + "limit_keys": ["jwt:team_id"], + "algorithm": "token_bucket_llm", + "algorithm_config": { + "tokens_per_minute": 120000, + "tokens_per_day": 2000000, + "burst_tokens": 120000, + "default_max_completion": 1024 + } + }, + { + "name": "per-team-cost-budget", + "limit_keys": ["jwt:team_id"], + "algorithm": "cost_based", + "algorithm_config": { + "budget": 50000, + "period": "7d", + "cost_key": "fixed", + "fixed_cost": 1, + "staged_actions": [ + { "threshold_percent": 80, "action": "warn" }, + { "threshold_percent": 95, "action": "throttle", "delay_ms": 500 }, + {
"threshold_percent": 100, "action": "reject" } + ] + } + } + ] + } + } + ], + "kill_switches": [] +} diff --git a/fixtures/allow_response.json b/fixtures/allow_response.json new file mode 100644 index 0000000..7cc0312 --- /dev/null +++ b/fixtures/allow_response.json @@ -0,0 +1,28 @@ +{ + "_comment": "Sample 200 response for an allowed request in wrapper mode. Note: no Authorization, x-api-key, or x-goog-api-key headers — upstream auth is stripped on the response side.", + "_status": 200, + "_headers": { + "Content-Type": "application/json", + "X-Fairvisor-Reason": null, + "Authorization": null, + "x-api-key": null, + "x-goog-api-key": null + }, + "id": "chatcmpl-example", + "object": "chat.completion", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "Hello! How can I help you today?" + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 10, + "completion_tokens": 9, + "total_tokens": 19 + } +} diff --git a/fixtures/anthropic_normal_request.json b/fixtures/anthropic_normal_request.json new file mode 100644 index 0000000..bcffdbf --- /dev/null +++ b/fixtures/anthropic_normal_request.json @@ -0,0 +1,10 @@ +{ + "model": "claude-3-5-haiku-20241022", + "max_tokens": 20, + "messages": [ + { + "role": "user", + "content": "Say hello in one sentence." + } + ] +} diff --git a/fixtures/normal_request.json b/fixtures/normal_request.json new file mode 100644 index 0000000..049a4e4 --- /dev/null +++ b/fixtures/normal_request.json @@ -0,0 +1,10 @@ +{ + "model": "gpt-4o-mini", + "messages": [ + { + "role": "user", + "content": "Say hello in one sentence." + } + ], + "max_tokens": 20 +} diff --git a/fixtures/over_limit_request.json b/fixtures/over_limit_request.json new file mode 100644 index 0000000..b3b554f --- /dev/null +++ b/fixtures/over_limit_request.json @@ -0,0 +1,10 @@ +{ + "model": "gpt-4o", + "messages": [ + { + "role": "user", + "content": "Say hello in one sentence." 
+ } + ], + "max_tokens": 200000 +} diff --git a/fixtures/reject_anthropic.json b/fixtures/reject_anthropic.json new file mode 100644 index 0000000..bdf468f --- /dev/null +++ b/fixtures/reject_anthropic.json @@ -0,0 +1,13 @@ +{ + "_comment": "Anthropic-native 429 reject body. Used for /anthropic/* paths.", + "_headers": { + "X-Fairvisor-Reason": "tpm_exceeded", + "Retry-After": "60", + "Content-Type": "application/json" + }, + "type": "error", + "error": { + "type": "rate_limit_error", + "message": "Token budget exceeded for this tenant." + } +} diff --git a/fixtures/reject_gemini.json b/fixtures/reject_gemini.json new file mode 100644 index 0000000..f0df901 --- /dev/null +++ b/fixtures/reject_gemini.json @@ -0,0 +1,13 @@ +{ + "_comment": "Gemini-native 429 reject body. Used for /gemini/* paths.", + "_headers": { + "X-Fairvisor-Reason": "tpm_exceeded", + "Retry-After": "60", + "Content-Type": "application/json" + }, + "error": { + "code": 429, + "message": "Token budget exceeded for this tenant.", + "status": "RESOURCE_EXHAUSTED" + } +} diff --git a/fixtures/reject_openai.json b/fixtures/reject_openai.json new file mode 100644 index 0000000..eabd023 --- /dev/null +++ b/fixtures/reject_openai.json @@ -0,0 +1,14 @@ +{ + "_comment": "OpenAI-native 429 reject body. 
Used for /openai/* paths and OpenAI-compatible providers.", + "_headers": { + "X-Fairvisor-Reason": "tpm_exceeded", + "Retry-After": "60", + "Content-Type": "application/json" + }, + "error": { + "type": "rate_limit_error", + "code": "tpm_exceeded", + "message": "Token budget exceeded for this tenant.", + "param": null + } +} diff --git a/fixtures/reject_prompt_too_large.json b/fixtures/reject_prompt_too_large.json new file mode 100644 index 0000000..9c4cf8c --- /dev/null +++ b/fixtures/reject_prompt_too_large.json @@ -0,0 +1,13 @@ +{ + "_comment": "429 body returned when the request exceeds max_prompt_tokens.", + "_headers": { + "X-Fairvisor-Reason": "prompt_too_large", + "Content-Type": "application/json" + }, + "error": { + "type": "rate_limit_error", + "code": "prompt_too_large", + "message": "Request prompt exceeds the maximum allowed token count for this policy.", + "param": null + } +} diff --git a/fixtures/reject_tpd_exceeded.json b/fixtures/reject_tpd_exceeded.json new file mode 100644 index 0000000..83cb2ea --- /dev/null +++ b/fixtures/reject_tpd_exceeded.json @@ -0,0 +1,16 @@ +{ + "_comment": "Illustrative 429 body returned when the per-day token budget is exhausted. RateLimit-Limit reflects the policy's tokens_per_day value.", + "_headers": { + "X-Fairvisor-Reason": "tpd_exceeded", + "Retry-After": "", + "RateLimit-Limit": "", + "RateLimit-Remaining": "0", + "Content-Type": "application/json" + }, + "error": { + "type": "rate_limit_error", + "code": "tpd_exceeded", + "message": "Token budget exceeded for this tenant.", + "param": null + } +} diff --git a/fixtures/reject_tpm_exceeded.json b/fixtures/reject_tpm_exceeded.json new file mode 100644 index 0000000..0805778 --- /dev/null +++ b/fixtures/reject_tpm_exceeded.json @@ -0,0 +1,17 @@ +{ + "_comment": "Illustrative 429 body returned when the per-minute token budget is exhausted. 
RateLimit-Limit reflects the policy's tokens_per_minute value.", + "_headers": { + "X-Fairvisor-Reason": "tpm_exceeded", + "Retry-After": "60", + "RateLimit-Limit": "", + "RateLimit-Remaining": "0", + "RateLimit-Reset": "", + "Content-Type": "application/json" + }, + "error": { + "type": "rate_limit_error", + "code": "tpm_exceeded", + "message": "Token budget exceeded for this tenant.", + "param": null + } +} diff --git a/spec/unit/features/llm_limiter.feature b/spec/unit/features/llm_limiter.feature index e9d3828..d33e4b1 100644 --- a/spec/unit/features/llm_limiter.feature +++ b/spec/unit/features/llm_limiter.feature @@ -190,3 +190,61 @@ Feature: LLM limiter module behavior And the llm limiter config is validated When I build error response for reason "tpm_exceeded" Then error response has OpenAI rate limit shape + + Rule: New features — max_completion_tokens and improved JSON parsing + Scenario: max_completion_tokens is extracted from body when max_tokens is missing + Given the nginx mock environment is reset + And a valid llm limiter config with tokens_per_minute 10000 + And the config has default_max_completion 1000 + And the llm limiter config is validated + And the request body is '{"messages":[{"role":"user","content":"hello"}],"max_completion_tokens":2000}' + When I run llm check at now 1700000000 + Then check is allowed + And reserved equals estimated_total 2002 + + Scenario: improved JSON parsing handles spaces and false positives + Given the nginx mock environment is reset + And a valid llm limiter config with tokens_per_minute 10000 + And the config uses estimator "simple_word" + And the llm limiter config is validated + And the request body is '{"messages":[{"role":"user", "content" : "12345678"}]}' + When I estimate prompt tokens + Then prompt estimate equals 2 + + Scenario: simple_word parsing multiple messages + Given the nginx mock environment is reset + And a valid llm limiter config with tokens_per_minute 10000 + And the config uses estimator 
"simple_word" + And the llm limiter config is validated + And the request body is '{"messages":[{"role":"user","content":"hello"},{"role":"assistant","content":"world!"}]}' + When I estimate prompt tokens + Then prompt estimate equals 3 + + Scenario: simple_word fallback when no messages key + Given the nginx mock environment is reset + And a valid llm limiter config with tokens_per_minute 10000 + And the config uses estimator "simple_word" + And the llm limiter config is validated + And the request body is '{"input":"test"}' + When I estimate prompt tokens + Then prompt estimate equals 4 + + Scenario: max_tokens field in body is used when request_context.max_tokens is absent + Given the nginx mock environment is reset + And a valid llm limiter config with tokens_per_minute 10000 + And the config has default_max_completion 1000 + And the llm limiter config is validated + And the request body is '{"messages":[{"role":"user","content":"hello"}],"max_tokens":500}' + When I run llm check at now 1700000000 + Then check is allowed + And reserved equals estimated_total 502 + + Scenario: body with no max_tokens field falls back to default_max_completion + Given the nginx mock environment is reset + And a valid llm limiter config with tokens_per_minute 10000 + And the config has default_max_completion 800 + And the llm limiter config is validated + And the request body is '{"messages":[{"role":"user","content":"hi"}]}' + When I run llm check at now 1700000000 + Then check is allowed + And reserved equals estimated_total 801 diff --git a/spec/unit/llm_limiter_spec.lua b/spec/unit/llm_limiter_spec.lua index aecf8e1..362b3ae 100644 --- a/spec/unit/llm_limiter_spec.lua +++ b/spec/unit/llm_limiter_spec.lua @@ -129,6 +129,11 @@ runner:given("^the request body is empty$", function(ctx) ctx.request_context.body = "" end) +runner:given("^the request body is '([^']+)'$", function(ctx, body) + ctx.request_context = ctx.request_context or {} + ctx.request_context.body = body +end) + 
runner:given("^the request body has (%d+) prompt characters in messages$", function(ctx, chars) local char_count = tonumber(chars) local content = string.rep("a", char_count) diff --git a/spec/unit/supply_chain_pins_spec.lua b/spec/unit/supply_chain_pins_spec.lua index 63190bf..f9b36bf 100644 --- a/spec/unit/supply_chain_pins_spec.lua +++ b/spec/unit/supply_chain_pins_spec.lua @@ -30,6 +30,11 @@ describe("supply chain pinning", function() assert.is_falsy(runtime_dockerfile:match("RUN opm get anjia0532/lua%-resty%-maxminddb%s*$")) end) + it("pins lua-resty-http to a concrete OPM version in the CLI Dockerfile", function() + assert.is_truthy(cli_dockerfile:match("RUN opm get ledgetech/lua%-resty%-http=0%.17%.1")) + assert.is_falsy(cli_dockerfile:match("RUN opm get ledgetech/lua%-resty%-http%s*$")) + end) + it("pins both CodeQL upload-sarif actions to the resolved commit SHA", function() local _, count = ci_workflow:gsub( "github/codeql%-action/upload%-sarif@cb06a0a8527b2c6970741b3a0baa15231dc74a4c", diff --git a/src/fairvisor/llm_limiter.lua b/src/fairvisor/llm_limiter.lua index 3b29bc0..ab3d3d4 100644 --- a/src/fairvisor/llm_limiter.lua +++ b/src/fairvisor/llm_limiter.lua @@ -189,23 +189,32 @@ local function _simple_word_estimate(request_context) local array_end = array_start and string_find(body, "]", array_start, true) if array_start and array_end then local segment = string_sub(body, array_start, array_end) - local marker = "\"content\":\"" - local marker_len = #marker local position = 1 local char_count = 0 while true do - local start_pos = string_find(segment, marker, position, true) - if not start_pos then + -- Find "content" key + local key_start = string_find(segment, "\"content\"", position, true) + if not key_start then break end - local content_start = start_pos + marker_len - local content_end = string_find(segment, "\"", content_start, true) - if not content_end then - break + + -- Look for value start: : "..." 
+ -- pattern: ^%s*:%s*" + local val_marker_start, val_marker_end = string_find(segment, "^%s*:%s*\"", key_start + 9) + + if not val_marker_start then + -- False positive (e.g. key was in a string), skip it + position = key_start + 7 + else + local content_start = val_marker_end + 1 + local content_end = string_find(segment, "\"", content_start, true) + if not content_end then + break + end + char_count = char_count + (content_end - content_start) + position = content_end + 1 end - char_count = char_count + (content_end - content_start) - position = content_end + 1 end return ceil(char_count / 4) @@ -215,6 +224,21 @@ local function _simple_word_estimate(request_context) return ceil(#body / 4) end +local function _extract_max_tokens(body) + if type(body) ~= "string" or body == "" then + return nil + end + -- simple regex scan for "max_tokens": 123 + -- or "max_completion_tokens": 123 + local _, _, val = string_find(body, '"max_tokens"%s*:%s*(%d+)') + if val then return tonumber(val) end + + _, _, val = string_find(body, '"max_completion_tokens"%s*:%s*(%d+)') + if val then return tonumber(val) end + + return nil +end + local function _check_tpd_budget(dict, key, config, cost, now) local ttl = _seconds_until_midnight_utc(now) local new_total, incr_err = dict:incr(key, cost, 0, ttl) @@ -365,6 +389,11 @@ function _M.check(dict, key, config, request_context, now) local max_completion = config.default_max_completion if request_context and type(request_context.max_tokens) == "number" and request_context.max_tokens > 0 then max_completion = request_context.max_tokens + elseif request_context and request_context.body then + local extracted = _extract_max_tokens(request_context.body) + if extracted and extracted > 0 then + max_completion = extracted + end end if config.max_completion_tokens and max_completion > config.max_completion_tokens then max_completion = config.max_completion_tokens diff --git a/src/fairvisor/rule_engine.lua b/src/fairvisor/rule_engine.lua index 
8808fdb..06dcbf9 100644 --- a/src/fairvisor/rule_engine.lua +++ b/src/fairvisor/rule_engine.lua @@ -8,6 +8,9 @@ local utils = require("fairvisor.utils") local _M = {} +local function _log_info(...) + if ngx and ngx.log then ngx.log(ngx.INFO, ...) end +end local function _log_warn(...) if ngx and ngx.log then ngx.log(ngx.WARN, ...) end end @@ -469,12 +472,12 @@ end local function _maybe_log_override_state(flags) if _last_global_shadow_active ~= flags.global_shadow_active then _last_global_shadow_active = flags.global_shadow_active - _log_warn("evaluate global_shadow_state active=", tostring(flags.global_shadow_active)) + _log_info("evaluate global_shadow_state active=", tostring(flags.global_shadow_active)) end if _last_kill_switch_override_active ~= flags.kill_switch_override_active then _last_kill_switch_override_active = flags.kill_switch_override_active - _log_warn("evaluate kill_switch_override_state active=", tostring(flags.kill_switch_override_active)) + _log_info("evaluate kill_switch_override_state active=", tostring(flags.kill_switch_override_active)) end end