diff --git a/.luacheckrc b/.luacheckrc
index e6c8fb7..d31a16a 100644
--- a/.luacheckrc
+++ b/.luacheckrc
@@ -12,7 +12,7 @@ globals = {
max_line_length = 140
-- Exclude generated or vendored paths
-exclude_files = { "spec/helpers/mock_ngx.lua" }
+exclude_files = { "spec/helpers/mock_ngx.lua", "cli/lib/mock_ngx.lua" }
-- Test files monkey-patch standard globals (math.random, os.getenv)
-- and use intentionally-unused callback arguments (self, ctx).
diff --git a/README.md b/README.md
index c922ef1..342d6cf 100644
--- a/README.md
+++ b/README.md
@@ -6,66 +6,53 @@
FAIRVISOR
-Turn API limits into enforceable business policy.
-
-
- Every API that charges per token, serves paying tenants, or runs agentic pipelines needs
- enforceable limits — not just rate-limit middleware bolted on as an afterthought.
-
- Open-source edge enforcement engine for rate limits, quotas, and cost budgets.
- Runs standalone or with a SaaS control plane for team governance.
-
+The LLM rate limiter your multi-tenant product was missing.
-
+
-
- Latency: < 70 µs enforcement overhead · 195k RPS max throughput · No external state (no Redis / DB)
-
-
---
## Table of Contents
-- [What is Fairvisor?](#what-is-fairvisor)
+- [Why we built this](#why-we-built-this)
- [Why not nginx / Kong / Envoy?](#why-not-nginx--kong--envoy)
- [Quick start](#quick-start)
- [LLM token budget in 30 seconds](#llm-token-budget-in-30-seconds)
- [How a request flows](#how-a-request-flows)
+ - [Architecture](#architecture)
- [Enforcement capabilities](#enforcement-capabilities)
-- [Policy as code](#policy-as-code)
- [Performance](#performance)
- [Deployment](#deployment)
- [CLI](#cli)
- [SaaS control plane (optional)](#saas-control-plane-optional)
- [Project layout](#project-layout)
-- [Contributing](#contributing)
- [License](#license)
---
-## What is Fairvisor?
+When multiple tenants, agents, or services share an API, one misbehaving caller can exhaust the budget for everyone — whether that's LLM tokens, API credits, or request quotas. Fairvisor is a lightweight enforcement engine that gives each tenant isolated limits at the edge: token budgets, cost caps, rate limits, and kill switches — keyed on JWT claims, API keys, or IP. One container, one JSON policy file, no Redis.
-Fairvisor Edge is a **policy enforcement layer** that sits between your API gateway and your upstream services. Every request is evaluated against a declarative JSON policy bundle and receives a deterministic allow or reject verdict — with machine-readable rejection headers and sub-millisecond latency.
+## Why we built this
-It is **not** a reverse proxy replacement. It is **not** a WAF. It is a dedicated, composable enforcement point for:
+API gateways count requests. LLM providers bill by the token.
-- **Rate limits and quotas** — per route, per tenant, per JWT claim, per API key
-- **Cost budgets** — cumulative spend caps per org, team, or endpoint
-- **LLM token limits** — TPM/TPD budgets with pre-request reservation and post-response refund
-- **Kill switches** — instant traffic blocking per descriptor, no restart required
-- **Shadow mode** — dry-run enforcement against real traffic before going live
-- **Loop detection** — stops runaway agentic workflows at the edge
-- **Circuit breaker** — auto-trips on spend spikes, auto-resets after cooldown
+When you serve multiple tenants — customers, teams, or agentic pipelines — that gap becomes a real problem. One runaway agent can consume a month's token budget overnight. Your gateway sees one request per second; your invoice shows 3 million tokens.
-All controls are defined in one versioned policy bundle. Policies hot-reload without restarting the process.
+We needed something that:
+- Understood token budgets, not just request counts
+- Could key limits on JWT claims (`org_id`, `plan`, `user_id`), not just IPs
+- Kept every request fast — no Redis round-trip, no extra network call in the hot path
+- Could plug into nginx or Envoy *or* run standalone as a transparent LLM proxy
+
+We couldn't find it, so we built Fairvisor.
## Why not nginx / Kong / Envoy?
@@ -84,17 +71,41 @@ If you have an existing gateway, the question is whether Fairvisor adds anything
**If nginx `limit_req` is enough for you**, use it. It has zero overhead and is the right tool for simple per-IP global throttling. Fairvisor becomes relevant when you need per-tenant awareness, JWT-claim-based bucketing, or cost/token tracking that `limit_req` has no model for.
-**If you are already running Kong**, the built-in rate limiting plugin stores counters in Redis or Postgres — every decision is a network call. Fairvisor can run alongside Kong as an `auth_request` decision service with no external state.
+**If you are already running Kong**, the built-in rate limiting plugin stores counters in Redis or Postgres — every decision is a network call. Fairvisor can run alongside Kong as an `auth_request` decision service with no external state. See [Kong / Traefik integration →](https://docs.fairvisor.com/docs/gateway/)
-**If you are running Envoy**, the [global rate limit service](https://github.com/envoyproxy/ratelimit) requires deploying a separate Redis-backed service with its own config language. Fairvisor is one container, one JSON file, and integrates via `ext_authz` in the same position.
+**If you are running Envoy**, the [global rate limit service](https://github.com/envoyproxy/ratelimit) requires deploying a separate Redis-backed service with its own config language. Fairvisor is one container, one JSON file, and integrates via `ext_authz` in the same position. See [Envoy ext_authz integration →](https://docs.fairvisor.com/docs/gateway/envoy/)
**If you are on Cloudflare or Akamai**, per-JWT-claim limits, LLM token budgets, and cost caps are not in the platform's model. If your limits are tenant-aware or cost-aware, you need something that runs in your own stack.
-Fairvisor integrates *alongside* Kong, nginx, and Envoy — it is not a replacement. See [docs/gateway-integration.md](docs/gateway-integration.md) for integration patterns.
+Fairvisor can run alongside Kong, nginx, and Envoy — or as a standalone reverse proxy if you don't need a separate gateway. See [nginx auth_request →](https://docs.fairvisor.com/docs/gateway/nginx/) · [Envoy ext_authz →](https://docs.fairvisor.com/docs/gateway/envoy/) · [Kong / Traefik →](https://docs.fairvisor.com/docs/gateway/) for integration patterns.
## Quick start
-### 1. Create a policy
+> **Which mode is right for you?**
+> - **Wrapper** — your app calls OpenAI / Anthropic / Gemini directly → point your client at Fairvisor instead, no other code changes needed. *Fastest to try.*
+> - **Reverse proxy** — you have a single upstream service → Fairvisor sits in front and enforces before forwarding.
+> - **Decision service** — you already run nginx, Envoy, or Kong → call `POST /v1/decision` from `auth_request` / `ext_authz`.
+
+### Fastest path
+
+```bash
+git clone https://github.com/fairvisor/edge.git
+cd edge/examples/quickstart
+docker compose up -d
+```
+
+Run your first enforce/reject test in under a minute — full walkthrough in [`examples/quickstart/README.md`](examples/quickstart/README.md).
+
+**Recipes:** `examples/recipes/` — team budgets, runaway agent guard, circuit-breaker.
+
+**Sample artifacts:** `fixtures/` — canonical enforce/reject fixtures (OpenAI, Anthropic, Gemini).
+
+### Minimal decision\_service example
+
+<details>
+<summary>Expand — manual setup with a single docker run</summary>
+
+**1. Create a policy**
```bash
mkdir fairvisor-demo && cd fairvisor-demo
@@ -127,7 +138,7 @@ mkdir fairvisor-demo && cd fairvisor-demo
}
```
-### 2. Run the edge
+**2. Run the edge**
```bash
docker run -d \
@@ -136,31 +147,48 @@ docker run -d \
-v "$(pwd)/policy.json:/etc/fairvisor/policy.json:ro" \
-e FAIRVISOR_CONFIG_FILE=/etc/fairvisor/policy.json \
-e FAIRVISOR_MODE=decision_service \
- ghcr.io/fairvisor/fairvisor-edge:v0.1.0
+ ghcr.io/fairvisor/fairvisor-edge:latest
```
-### 3. Verify
+**3. Verify**
```bash
curl -sf http://localhost:8080/readyz
# {"status":"ok"}
-curl -s -w "\nHTTP %{http_code}\n" \
+# Allowed request → HTTP 200
+curl -s -o /dev/null -w "HTTP %{http_code}\n" \
-H "X-Original-Method: GET" \
-H "X-Original-URI: /api/data" \
-H "X-Forwarded-For: 10.0.0.1" \
http://localhost:8080/v1/decision
+
+# Rejected request — exhaust the burst (>10 requests)
+for i in $(seq 1 12); do
+ curl -s -o /dev/null -w "HTTP %{http_code}\n" \
+ -H "X-Original-Method: GET" \
+ -H "X-Original-URI: /api/data" \
+ -H "X-Forwarded-For: 10.0.0.1" \
+ http://localhost:8080/v1/decision
+done
+# last requests → HTTP 429 with X-Fairvisor-Reason: rate_limit_exceeded
```
+
+</details>
> Full walkthrough: [docs.fairvisor.com/docs/quickstart](https://docs.fairvisor.com/docs/quickstart/)
## LLM token budget in 30 seconds
+The fastest path is **wrapper mode**: Fairvisor sits in front of the LLM API, enforces budgets, and strips the upstream key from the client. No gateway changes needed — just point your client at Fairvisor instead of OpenAI.
+
+**1. Policy** — one rule, per-org TPM + daily cap:
+
```json
{
"id": "llm-budget",
"spec": {
- "selector": { "pathPrefix": "/v1/chat" },
+ "selector": { "pathPrefix": "/" },
"mode": "enforce",
"rules": [
{
@@ -178,9 +206,32 @@ curl -s -w "\nHTTP %{http_code}\n" \
}
```
-Each organization (from the JWT `org_id` claim) gets its own independent 60k TPM / 1.2M TPD budget. Requests over the limit return a `429` with an OpenAI-compatible error body — no client changes needed.
+**2. Call the API** — token format `Bearer <CLIENT_JWT>:<UPSTREAM_KEY>`:
+
+```bash
+curl https://your-fairvisor-host/openai/v1/chat/completions \
+ -H "Authorization: Bearer eyJhbGc...:sk-proj-..." \
+ -H "Content-Type: application/json" \
+ -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"Hello"}]}'
+```
+
+Fairvisor parses the JWT claims (no signature validation — the JWT is trusted as-is), extracts `org_id`, charges tokens against the budget, strips the `Authorization` header, and forwards with the upstream key. The upstream never sees the client JWT.
-Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible endpoint.
+When the budget is exhausted:
+
+```http
+HTTP/1.1 429 Too Many Requests
+X-Fairvisor-Reason: tpm_exceeded
+Retry-After: 12
+RateLimit-Limit: 60000
+RateLimit-Remaining: 0
+```
+
+Each organization gets its own independent 60k TPM / 1.2M TPD budget. Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible endpoint.
+
+The selector matches the incoming wrapper path. Use `pathPrefix: "/"` to cover all providers, or `pathPrefix: "/openai"` to limit to one provider only.
+
+> **Decision service / reverse proxy mode:** if you already have a gateway, use `selector: { "pathPrefix": "/v1/chat" }` and call `POST /v1/decision` from your existing `auth_request` or `ext_authz` hook instead.
## How a request flows
@@ -188,7 +239,9 @@ Works with OpenAI, Anthropic, Azure OpenAI, Mistral, and any OpenAI-compatible e
**Reverse proxy mode** — Fairvisor sits inline. Traffic arrives at Fairvisor directly, gets evaluated, and is proxied to the upstream if allowed. No separate gateway needed.
-Both modes use the same policy bundle and return the same rejection headers.
+**Wrapper mode** — Fairvisor acts as a transparent LLM proxy. Clients send requests to Fairvisor's OpenAI-compatible endpoint (`/openai/v1/chat/completions`, `/anthropic/v1/messages`, `/gemini/v1/generateContent`). Fairvisor enforces token budgets and cost limits, strips the client auth header, injects the upstream API key, and forwards the request. No changes needed in the client — swap the base URL and you're done.
+
+All three modes use the same policy bundle and return the same rejection headers.
When a request is rejected:
@@ -206,46 +259,76 @@ Headers follow [RFC 9333 RateLimit Fields](https://www.rfc-editor.org/rfc/rfc933
### Architecture
-**Decision service mode** (sidecar — your gateway calls `/v1/decision`, handles forwarding itself):
-
-```
- Client ──► Your gateway (nginx / Envoy / Kong)
- │
- │ POST /v1/decision
- │ (auth_request / ext_authz)
- ▼
- ┌─────────────────────┐
- │ Fairvisor Edge │
- │ decision_service │
- │ │
- │ rule_engine │
- │ ngx.shared.dict │ ◄── no Redis, no network
- └──────────┬──────────┘
- │
- 204 allow │ 429 reject
- ▼
- gateway proxies or returns rejection
+**Decision service mode** — sidecar: your gateway calls `/v1/decision`, handles forwarding itself.
+
+```mermaid
+sequenceDiagram
+ participant C as Client
+ participant G as Your Gateway<br/>(nginx / Envoy / Kong)
+ participant F as Fairvisor Edge<br/>decision_service
+ participant U as Upstream service
+
+ C->>G: Request
+ G->>F: POST /v1/decision<br/>(auth_request / ext_authz)
+ alt allow
+ F-->>G: 204 No Content
+ G->>U: Forward request
+ U-->>G: Response
+ G-->>C: Response
+ else reject
+ F-->>G: 429 + RateLimit headers
+ G-->>C: 429 Too Many Requests
+ end
```
-**Reverse proxy mode** (inline — Fairvisor handles proxying):
-
+**Reverse proxy mode** — inline: Fairvisor handles both enforcement and proxying.
+
+```mermaid
+sequenceDiagram
+ participant C as Client
+ participant F as Fairvisor Edge<br/>reverse_proxy
+ participant U as Upstream service
+
+ C->>F: Request
+ alt allow
+ F->>U: Forward request
+ U-->>F: Response
+ F-->>C: Response
+ else reject
+ F-->>C: 429 + RFC 9333 headers
+ end
```
- Client ──► Fairvisor Edge (reverse_proxy)
- │
- │ access.lua → rule_engine
- │ ngx.shared.dict
- │
- allow ──► upstream service
- reject ──► 429 + RFC 9333 headers
+
+**Wrapper mode** — transparent LLM proxy: swap the base URL, no other client changes needed.
+
+```mermaid
+sequenceDiagram
+ participant C as Client
+ participant F as Fairvisor Edge<br/>wrapper
+ participant U as Upstream LLM<br/>(OpenAI / Anthropic / Gemini)
+
+ C->>F: POST /openai/v1/chat/completions<br/>Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY
+ F->>F: 1. Parse JWT claims (org_id, user_id)
+ F->>F: 2. Enforce TPM / TPD / cost budget
+ alt budget ok
+ F->>U: POST /v1/chat/completions<br/>Authorization: Bearer UPSTREAM_KEY
+ U-->>F: 200 OK + token usage
+ F->>F: 3. Count tokens · refund unused reservation
+ F-->>C: 200 OK (Authorization stripped from reply)
+ else budget exceeded
+ F-->>C: 429 X-Fairvisor-Reason: tpm_exceeded
+ end
```
-Both modes use the same policy bundle and produce the same rejection headers.
+Supported upstream paths: `/openai/*`, `/anthropic/*`, `/gemini/*`, `/grok/*`.
+
+All three modes use the same policy bundle and produce the same rejection headers.
## Enforcement capabilities
| If you need to… | Algorithm | Typical identity keys | Reject reason |
|---|---|---|---|
-| Cap request frequency | `token_bucket` | `jwt:user_id`, `header:x-api-key`, `ip:addr` | `rate_limit_exceeded` |
+| Cap request frequency | `token_bucket` | `jwt:user_id`, `header:x-api-key`, `ip:address` | `rate_limit_exceeded` |
| Cap cumulative spend | `cost_based` | `jwt:org_id`, `jwt:plan` | `budget_exhausted` |
| Cap LLM tokens (TPM/TPD) | `token_bucket_llm` | `jwt:org_id`, `jwt:user_id` | `tpm_exceeded`, `tpd_exceeded` |
| Instantly block a segment | kill switch | any descriptor | `kill_switch_active` |
@@ -253,38 +336,10 @@ Both modes use the same policy bundle and produce the same rejection headers.
| Stop runaway agent loops | loop detection | request fingerprint | `loop_detected` |
| Clamp spend spikes | circuit breaker | global or policy scope | `circuit_breaker_open` |
-Identity keys can be **JWT claims** (`jwt:org_id`, `jwt:plan`), **HTTP headers** (`header:x-api-key`), or **IP attributes** (`ip:addr`, `ip:country`). Combine multiple keys per rule for compound matching.
-
-## Policy as code
-
-Define policies in JSON, validate against the schema, test in shadow mode, then promote:
-
-```bash
-# Validate bundle structure and rule semantics
-fairvisor validate ./policies.json
-
-# Replay real traffic without blocking anything
-fairvisor test --dry-run
-
-# Apply a new bundle (hot-reload, no restart)
-fairvisor connect --push ./policies.json
-```
-
-Policies are versioned JSON — commit them to Git, review changes in PRs, roll back with confidence.
+Identity keys can be **JWT claims** (`jwt:org_id`, `jwt:plan`), **HTTP headers** (`header:x-api-key`), or **IP attributes** (`ip:address`, `ip:country`). Combine multiple keys per rule for compound matching.
## Performance
-### Benchmark methodology (March 2026)
-
-- **Hosts:** 2 × AWS `c7i.xlarge` (4 vCPU, 8 GiB each), cluster placement group, eu-central-1
-- **OS:** Ubuntu 24.04 LTS
-- **Runtime:** OpenResty 1.29.2.1, Fairvisor latest `main` (no Docker)
-- **Load tool:** `k6` v0.54.0, `constant-arrival-rate`, 10,000 RPS for 60s, 10s warmup
-- **Benchmark script:** `run-all.sh` from `fairvisor/benchmark`
-- **Topology:** two-host — Fairvisor and k6 on separate machines (VPC private network)
-- **Decision endpoint contract:** `POST /v1/decision` with `X-Original-Method` and `X-Original-URI`
-- **Note:** reverse proxy numbers include policy evaluation and upstream proxy hop to backend nginx.
-
### Latest measured latency @ 10,000 RPS
| Percentile | Decision service | Reverse proxy | Raw nginx (baseline) |
@@ -303,9 +358,8 @@ Policies are versioned JSON — commit them to Git, review changes in PRs, roll
| Simple rate limit (1 rule) | 195,000 |
| Complex policy (5 rules, JWT parsing, loop detection) | 195,000 |
-**No external datastore.** All enforcement state lives in in-process shared memory (`ngx.shared.dict`). No Redis, no Postgres, no network round-trips in the decision path.
+Reproduce: see [fairvisor/benchmark](https://github.com/fairvisor/benchmark) — the canonical benchmark source of truth for Fairvisor Edge performance numbers.
-> Reproduce: `git clone https://github.com/fairvisor/benchmark && cd benchmark && bash run-all.sh`
## Deployment
@@ -318,7 +372,7 @@ Policies are versioned JSON — commit them to Git, review changes in PRs, roll
| Envoy `ext_authz` | [docs/gateway/envoy](https://docs.fairvisor.com/docs/gateway/envoy/) |
| Kong / Traefik | [docs/gateway](https://docs.fairvisor.com/docs/gateway/) |
-Fairvisor integrates **alongside** Kong, nginx, Envoy, and Traefik — it does not replace them.
+Fairvisor works alongside Kong, nginx, Envoy, and Traefik — or runs standalone as a reverse proxy when you don't need a separate gateway.
## CLI
@@ -328,7 +382,6 @@ fairvisor validate policy.json # validate before deploying
fairvisor test --dry-run # shadow-mode replay
fairvisor status # edge health and loaded bundle info
fairvisor logs # tail rejection events
-fairvisor connect # connect to SaaS control plane
```
## SaaS control plane (optional)
@@ -349,25 +402,16 @@ If the SaaS is unreachable, the edge keeps enforcing with the last-known policy
## Project layout
```
-src/fairvisor/ runtime modules (OpenResty/LuaJIT)
-cli/ command-line tooling
-spec/ unit and integration tests (busted)
-tests/e2e/ Docker-based E2E tests (pytest)
-examples/ sample policy bundles
-helm/ Helm chart
-docker/ Docker artifacts
-docs/ reference documentation
-```
-
-## Contributing
-
-See [CONTRIBUTING.md](CONTRIBUTING.md). Bug reports, issues, and pull requests welcome.
-
-Run the test suite:
-
-```bash
-busted spec # unit + integration
-pytest tests/e2e -v # E2E (requires Docker)
+src/fairvisor/ runtime modules (OpenResty/LuaJIT)
+cli/ command-line tooling
+spec/ unit and integration tests (busted)
+tests/e2e/ Docker-based E2E tests (pytest)
+examples/quickstart/ runnable quickstart (docker compose up -d)
+examples/recipes/ deployable policy recipes (team budgets, agent guard, circuit breaker)
+fixtures/ canonical request/response sample artifacts
+helm/ Helm chart
+docker/ Docker artifacts
+docs/ reference documentation
```
## License
diff --git a/bin/ci/check_unpinned_dependencies.py b/bin/ci/check_unpinned_dependencies.py
index 433a0bf..00903f2 100644
--- a/bin/ci/check_unpinned_dependencies.py
+++ b/bin/ci/check_unpinned_dependencies.py
@@ -13,7 +13,7 @@
SHA_RE = re.compile(r"^[0-9a-f]{40}$")
USES_RE = re.compile(r"uses:\s*([^\s#]+)")
FROM_RE = re.compile(r"^\s*FROM\s+([^\s]+)", re.MULTILINE)
-OPM_RE = re.compile(r"^\s*RUN\s+opm\s+get\s+([^\s\\]+)(?:\s+([^\s\\#]+))?", re.MULTILINE)
+OPM_RE = re.compile(r"^\s*RUN\s+opm\s+get\s+([^\s\\]+)(?:[^\S\n]+([^\s\\#]+))?", re.MULTILINE)
@dataclass(frozen=True)
@@ -79,7 +79,9 @@ def scan_dockerfile(path: Path) -> list[Finding]:
for match in OPM_RE.finditer(text):
package = match.group(1)
version = match.group(2)
- if not version or version.startswith(("&", "|")):
+ # OPM uses "pkg=version" (embedded) or "pkg version" (space-separated)
+ is_pinned = "=" in package or (version and not version.startswith(("&", "|")))
+ if not is_pinned:
findings.append(
Finding(
category="opm-package",
diff --git a/bin/fairvisor b/bin/fairvisor
index f171b53..1b1a490 100755
--- a/bin/fairvisor
+++ b/bin/fairvisor
@@ -3,5 +3,5 @@
SCRIPT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
-exec resty -I "${SCRIPT_DIR}/src" -I "${SCRIPT_DIR}/cli" \
+exec resty -I "${SCRIPT_DIR}/src" -I "${SCRIPT_DIR}" \
"${SCRIPT_DIR}/cli/main.lua" "$@"
diff --git a/cli/README.md b/cli/README.md
index bd18187..a68ef1a 100644
--- a/cli/README.md
+++ b/cli/README.md
@@ -18,10 +18,10 @@ From the repo root:
Or with `resty` directly (e.g. from another directory, adjusting `-I` paths):
```bash
-resty -I /path/to/fv-oss/src -I /path/to/fv-oss/cli /path/to/fv-oss/cli/main.lua [options]
+resty -I /path/to/fv-oss/src -I /path/to/fv-oss /path/to/fv-oss/cli/main.lua [options]
```
-`bin/fairvisor` sets `-I` to the repo's `src` and `cli` so that `require("cli.commands.init")` and `require("fairvisor.bundle_loader")` resolve correctly.
+`bin/fairvisor` sets `-I` to the repo's `src` and root (for `cli.*` modules) so that `require("cli.commands.init")` and `require("fairvisor.bundle_loader")` resolve correctly.
## Commands
diff --git a/cli/commands/status.lua b/cli/commands/status.lua
index 3931937..326c495 100644
--- a/cli/commands/status.lua
+++ b/cli/commands/status.lua
@@ -41,14 +41,28 @@ function _M.run(argv)
return nil, 2
end
+ local policy_version = "unknown"
+ if health_res.body and health_res.body ~= "" then
+ local ok_json, cjson = pcall(require, "cjson")
+ if ok_json then
+ local ok_dec, decoded = pcall(cjson.decode, health_res.body)
+ if ok_dec and type(decoded) == "table" and decoded.policy_version ~= nil then
+ policy_version = tostring(decoded.policy_version)
+ end
+ end
+ end
+
local metrics_res = httpc:request_uri(edge_url .. "/metrics")
local metrics_body = metrics_res and metrics_res.body or ""
+ local decisions_raw = _parse_metric(metrics_body, "fairvisor_decisions_total")
+ local decisions = (decisions_raw ~= "unknown") and decisions_raw or "0"
+
local data = {
status = (health_res.status == 200 and "ready") or "not ready",
- policy_version = _parse_metric(metrics_body, "fairvisor_bundle_version"),
+ policy_version = policy_version,
saas = (_parse_metric(metrics_body, "fairvisor_saas_reachable") == "1") and "connected" or "disconnected",
- decisions = _parse_metric(metrics_body, "fairvisor_decisions_total"),
+ decisions = decisions,
}
if format == "json" then
diff --git a/cli/commands/test.lua b/cli/commands/test.lua
index 17066e4..d620ecf 100644
--- a/cli/commands/test.lua
+++ b/cli/commands/test.lua
@@ -15,6 +15,9 @@ local function _read_file(path)
local content = handle:read("*a")
handle:close()
+ if content == nil then
+ return nil, "failed to read file content"
+ end
return content
end
@@ -47,7 +50,7 @@ local function _generate_mock_requests(bundle)
headers = {},
query_params = {},
ip_address = "127.0.0.1",
- user_agent = "fairvisor-cli/test",
+ user_agent = "fairvisor/test",
}
end
@@ -58,7 +61,7 @@ local function _generate_mock_requests(bundle)
headers = {},
query_params = {},
ip_address = "127.0.0.1",
- user_agent = "fairvisor-cli/test",
+ user_agent = "fairvisor/test",
}
end
@@ -127,7 +130,7 @@ function _M.run(argv)
local content, read_err = _read_file(file)
if not content then
- output.print_error("Cannot read file: " .. read_err)
+ output.print_error("Cannot read file: " .. (read_err or "unknown read error"))
return nil, 1
end
@@ -142,7 +145,7 @@ function _M.run(argv)
return nil, 1
end
- local ok_mock, mock_ngx = pcall(require, "spec.helpers.mock_ngx")
+ local ok_mock, mock_ngx = pcall(require, "cli.lib.mock_ngx")
if not ok_mock then
output.print_error("mock_ngx helper is unavailable: " .. mock_ngx)
return nil, 1
@@ -168,7 +171,7 @@ function _M.run(argv)
if requests_file then
requests, read_err = _load_requests(requests_file)
if not requests then
- output.print_error("Cannot load requests: " .. read_err)
+ output.print_error("Cannot load requests: " .. (read_err or "unknown read error"))
return nil, 1
end
else
diff --git a/cli/commands/validate.lua b/cli/commands/validate.lua
index 115236f..af4ab62 100644
--- a/cli/commands/validate.lua
+++ b/cli/commands/validate.lua
@@ -20,6 +20,9 @@ local function _read_file(path)
local content = handle:read("*a")
handle:close()
+ if content == nil then
+ return nil, "failed to read file content"
+ end
return content
end
@@ -101,7 +104,7 @@ function _M.run(argv)
local content, read_err = _read_file(file)
if not content then
- output.print_error("Cannot read file: " .. read_err)
+ output.print_error("Cannot read file: " .. (read_err or "unknown read error"))
return nil, 1
end
diff --git a/cli/lib/mock_ngx.lua b/cli/lib/mock_ngx.lua
new file mode 100644
index 0000000..643efd8
--- /dev/null
+++ b/cli/lib/mock_ngx.lua
@@ -0,0 +1,283 @@
+local type = type
+local string_byte = string.byte
+local string_char = string.char
+local string_sub = string.sub
+local string_find = string.find
+local table_concat = table.concat
+
+local bit
+local ok_bit, b = pcall(require, "bit")
+if ok_bit then bit = b end
+
+local _M = {}
+
+
+local _BASE64_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+
+local function _to_base64(input)
+ local bytes = { string_byte(input, 1, #input) }
+ local out = {}
+ local index = 1
+
+ while index <= #bytes do
+ local b1 = bytes[index] or 0
+ local b2 = bytes[index + 1] or 0
+ local b3 = bytes[index + 2] or 0
+ local pad = 0
+
+ if bytes[index + 1] == nil then
+ pad = 2
+ elseif bytes[index + 2] == nil then
+ pad = 1
+ end
+
+ local n = b1 * 65536 + b2 * 256 + b3
+ local c1 = math.floor(n / 262144) % 64 + 1
+ local c2 = math.floor(n / 4096) % 64 + 1
+ local c3 = math.floor(n / 64) % 64 + 1
+ local c4 = n % 64 + 1
+
+ out[#out + 1] = string_sub(_BASE64_ALPHABET, c1, c1)
+ out[#out + 1] = string_sub(_BASE64_ALPHABET, c2, c2)
+
+ if pad == 2 then
+ out[#out + 1] = "="
+ out[#out + 1] = "="
+ elseif pad == 1 then
+ out[#out + 1] = string_sub(_BASE64_ALPHABET, c3, c3)
+ out[#out + 1] = "="
+ else
+ out[#out + 1] = string_sub(_BASE64_ALPHABET, c3, c3)
+ out[#out + 1] = string_sub(_BASE64_ALPHABET, c4, c4)
+ end
+
+ index = index + 3
+ end
+
+ return table_concat(out)
+end
+
+local function _from_base64(input)
+ local clean = input:gsub("%s", "")
+ local out = {}
+ local index = 1
+
+ while index <= #clean do
+ local c1 = string_sub(clean, index, index)
+ local c2 = string_sub(clean, index + 1, index + 1)
+ local c3 = string_sub(clean, index + 2, index + 2)
+ local c4 = string_sub(clean, index + 3, index + 3)
+
+ if c1 == "" or c2 == "" then
+ break
+ end
+
+ local v1 = string_find(_BASE64_ALPHABET, c1, 1, true)
+ local v2 = string_find(_BASE64_ALPHABET, c2, 1, true)
+ local v3 = c3 ~= "=" and string_find(_BASE64_ALPHABET, c3, 1, true) or nil
+ local v4 = c4 ~= "=" and string_find(_BASE64_ALPHABET, c4, 1, true) or nil
+
+ if not v1 or not v2 then
+ return nil
+ end
+
+ v1 = v1 - 1
+ v2 = v2 - 1
+ v3 = v3 and (v3 - 1) or 0
+ v4 = v4 and (v4 - 1) or 0
+
+ local n = v1 * 262144 + v2 * 4096 + v3 * 64 + v4
+ local b1 = math.floor(n / 65536) % 256
+ local b2 = math.floor(n / 256) % 256
+ local b3 = n % 256
+
+ out[#out + 1] = string_char(b1)
+ if c3 ~= "=" then
+ out[#out + 1] = string_char(b2)
+ end
+ if c4 ~= "=" then
+ out[#out + 1] = string_char(b3)
+ end
+
+ index = index + 4
+ end
+
+ return table_concat(out)
+end
+
+local function _simple_digest(input)
+ local h1 = 2166136261
+ local h2 = 16777619
+ for i = 1, #input do
+ local b = string_byte(input, i)
+ if bit then
+ h1 = bit.bxor(h1, b) % 4294967296
+ else
+ -- Fallback if bit is not available (not cryptographically same, but avoids crash)
+ h1 = (h1 + b) % 4294967296
+ end
+ h1 = (h1 * 16777619) % 4294967296
+ h2 = (h2 + (b * i)) % 4294967296
+ end
+
+ local parts = {}
+ for i = 1, 8 do
+ local a = (h1 + (i * 2654435761)) % 4294967296
+ local b = (h2 + (i * 2246822519)) % 4294967296
+ parts[#parts + 1] = string_char(math.floor(a / 16777216) % 256)
+ parts[#parts + 1] = string_char(math.floor(a / 65536) % 256)
+ parts[#parts + 1] = string_char(math.floor(b / 256) % 256)
+ parts[#parts + 1] = string_char(b % 256)
+ end
+
+ return table_concat(parts)
+end
+
+function _M.mock_shared_dict()
+ local data = {}
+
+ return {
+ get = function(_, key)
+ return data[key]
+ end,
+ set = function(_, key, value)
+ data[key] = value
+ return true
+ end,
+ incr = function(_, key, value, init, _init_ttl)
+ local current = data[key]
+ if current == nil then
+ if init then
+ data[key] = init + value
+ return data[key], nil, true
+ end
+ return nil, "not found"
+ end
+
+ data[key] = current + value
+ return data[key], nil, false
+ end,
+ delete = function(_, key)
+ data[key] = nil
+ end,
+ flush_all = function(_)
+ data = {}
+ end,
+ }
+end
+
+function _M.setup_time_mock()
+ local mock_time = 1000.000
+
+ local function now()
+ return mock_time
+ end
+
+ local function advance_time(seconds)
+ mock_time = mock_time + seconds
+ end
+
+ local function set_time(seconds)
+ mock_time = seconds
+ end
+
+ return {
+ now = now,
+ advance_time = advance_time,
+ set_time = set_time,
+ }
+end
+
+function _M.setup_package_mock()
+ package.loaded["resty.maxminddb"] = {
+ initted = function() return true end,
+ init = function() return true end,
+ lookup = function() return nil end,
+ }
+end
+
+function _M.setup_ngx()
+ local time = _M.setup_time_mock()
+ local dict = _M.mock_shared_dict()
+ local logs = {}
+ local timers = {}
+
+ local function crc32_short(value)
+ local hash = 0
+ local input = tostring(value or "")
+ for i = 1, #input do
+ hash = (hash * 33 + string_byte(input, i)) % 4294967296
+ end
+ return hash
+ end
+
+ _G.ngx = {
+ now = time.now,
+ update_time = function()
+ end,
+ shared = {
+ fairvisor_counters = dict,
+ },
+ req = {
+ read_body = function() end,
+ get_body_data = function() return nil end,
+ get_body_file = function() return nil end,
+ get_headers = function() return {} end,
+ get_uri_args = function() return {} end,
+ },
+ var = {
+ request_method = "GET",
+ uri = "/",
+ host = "localhost",
+ remote_addr = "127.0.0.1",
+ geoip2_data_country_iso_code = nil,
+ asn = nil,
+ fairvisor_asn_type = nil,
+ is_tor_exit = nil,
+ },
+ log = function(...)
+ logs[#logs + 1] = { ... }
+ end,
+ timer = {
+ every = function(interval, callback)
+ timers[#timers + 1] = { interval = interval, callback = callback }
+ return true
+ end,
+ },
+ hmac_sha256 = function(key, payload)
+ return _simple_digest(key .. ":" .. payload)
+ end,
+ hmac_sha1 = function(key, payload)
+ return _simple_digest("sha1:" .. key .. ":" .. payload)
+ end,
+ sha1_bin = function(payload)
+ return _simple_digest("sha1bin:" .. payload)
+ end,
+ sha256_bin = function(payload)
+ return _simple_digest("sha256bin:" .. payload)
+ end,
+ encode_base64 = function(value)
+ return _to_base64(value)
+ end,
+ decode_base64 = function(value)
+ return _from_base64(value)
+ end,
+ md5 = function(payload)
+ return _to_base64(_simple_digest("md5:" .. payload))
+ end,
+ ERR = 1,
+ WARN = 2,
+ INFO = 3,
+ DEBUG = 4,
+ crc32_short = crc32_short,
+ }
+
+ return {
+ time = time,
+ dict = dict,
+ logs = logs,
+ timers = timers,
+ }
+end
+
+return _M
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 47e7b68..0d3888c 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -6,6 +6,7 @@ RUN apt-get update && apt-get upgrade -y --no-install-recommends \
gettext-base \
python3 \
libmaxminddb0 \
+ libmaxminddb-dev \
mmdb-bin \
&& rm -rf /var/lib/apt/lists/*
diff --git a/docker/Dockerfile.cli b/docker/Dockerfile.cli
index c72ab12..f91c407 100644
--- a/docker/Dockerfile.cli
+++ b/docker/Dockerfile.cli
@@ -6,6 +6,8 @@ RUN apt-get update && apt-get upgrade -y --no-install-recommends \
perl \
&& rm -rf /var/lib/apt/lists/*
+RUN opm get ledgetech/lua-resty-http=0.17.1
+
WORKDIR /opt/fairvisor
COPY src /opt/fairvisor/src
diff --git a/docker/nginx.conf.template b/docker/nginx.conf.template
index c0d4184..c745a65 100644
--- a/docker/nginx.conf.template
+++ b/docker/nginx.conf.template
@@ -25,6 +25,8 @@ worker_shutdown_timeout 35s;
http {
resolver 127.0.0.11 ipv6=off valid=30s;
resolver_timeout 2s;
+ map_hash_max_size 262144;
+ map_hash_bucket_size 64;
geo $is_tor_exit {
default 0;
@@ -51,7 +53,8 @@ http {
location = /livez {
default_type text/plain;
- return 200 "ok\n";
+ return 200 "ok
+";
}
location = /readyz {
@@ -102,7 +105,8 @@ http {
}
default_type text/plain;
- return 404 "not found\n";
+ return 404 "not found
+";
}
}
}
diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md
new file mode 100644
index 0000000..d9b637f
--- /dev/null
+++ b/examples/quickstart/README.md
@@ -0,0 +1,111 @@
+# Fairvisor Edge — Quickstart
+
+Go from `git clone` to working policy enforcement in one step.
+
+## Prerequisites
+
+- Docker with Compose V2 (`docker compose version`)
+- Port 8080 free on localhost
+
+## Start
+
+```bash
+docker compose up -d
+```
+
+The first run builds the `fairvisor` image locally from `docker/Dockerfile`, so no
+GHCR login is required.
+
+Wait for the `fairvisor` service to report healthy:
+
+```bash
+docker compose ps
+# fairvisor should show "healthy"
+```
+
+## Verify enforcement
+
+This quickstart runs in `FAIRVISOR_MODE=reverse_proxy`. Requests to `/v1/*`
+are enforced by the TPM policy and forwarded to a local mock LLM backend.
+No real API keys are required.
+
+**Allowed request** — should return `200`:
+
+```bash
+curl -s -X POST http://localhost:8080/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d @../../fixtures/normal_request.json
+```
+
+Expected response body shape matches `../../fixtures/allow_response.json`.
+
+**Over-limit request** — should return `429`:
+
+```bash
+curl -s -X POST http://localhost:8080/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d @../../fixtures/over_limit_request.json
+```
+
+Expected response body shape: `../../fixtures/reject_tpm_exceeded.json`.
+The response will also include:
+- `X-Fairvisor-Reason: tpm_exceeded`
+- `Retry-After: 60`
+- `RateLimit-Limit: 100` (matches the quickstart policy `tokens_per_minute`)
+- `RateLimit-Remaining: 0`
+
+## How the policy works
+
+The quickstart policy (`policy.json`) enforces a TPM limit keyed on `ip:address`:
+
+- `tokens_per_minute: 100` — allows roughly 2 small requests per minute
+- `tokens_per_day: 1000` — daily cap
+- `default_max_completion: 50` — pessimistic reservation per request when `max_tokens` is not set
+
+Sending `over_limit_request.json` (which sets `max_tokens: 200000`) immediately
+exceeds the 100-token per-minute budget and triggers a `429`.
+
+## Wrapper mode (real provider routing)
+
+Wrapper mode routes requests to real upstream providers using provider-prefixed paths
+and a composite Bearer token. It requires real provider API keys and cannot be
+demonstrated with this mock stack.
+
+**Path and auth format:**
+
+```
+POST /openai/v1/chat/completions
+Authorization: Bearer CLIENT_JWT:UPSTREAM_KEY
+```
+
+Where:
+- `CLIENT_JWT` — signed JWT identifying the calling client/tenant (used for policy enforcement)
+- `UPSTREAM_KEY` — real upstream API key forwarded to the provider (e.g. `sk-...` for OpenAI)
+
+Fairvisor strips the composite header, injects the correct provider auth before forwarding,
+and **never returns upstream auth headers to the caller**
+(see `../../fixtures/allow_response.json`).
+
+**Provider-prefixed paths:**
+
+| Path prefix | Upstream | Auth header injected |
+|---|---|---|
+| `/openai/v1/...` | `https://api.openai.com/v1/...` | `Authorization: Bearer UPSTREAM_KEY` |
+| `/anthropic/v1/...` | `https://api.anthropic.com/v1/...` | `x-api-key: UPSTREAM_KEY` |
+| `/gemini/v1beta/...` | `https://generativelanguage.googleapis.com/v1beta/...` | `x-goog-api-key: UPSTREAM_KEY` |
+
+To run in wrapper mode, change the compose env to `FAIRVISOR_MODE: wrapper` and
+supply real credentials in the `Authorization` header.
+
+## Teardown
+
+```bash
+docker compose down
+```
+
+## Next steps
+
+- See `../recipes/` for team budgets, runaway agent guard, and provider failover scenarios
+- See `../../fixtures/` for all sample request/response artifacts
+- See [fairvisor/benchmark](https://github.com/fairvisor/benchmark) for performance benchmarks
+- See [docs/install/](../../docs/install/) for Kubernetes, VM, and SaaS deployment options
diff --git a/examples/quickstart/docker-compose.yml b/examples/quickstart/docker-compose.yml
new file mode 100644
index 0000000..59a2487
--- /dev/null
+++ b/examples/quickstart/docker-compose.yml
@@ -0,0 +1,59 @@
+# Fairvisor Edge — Quickstart stack (standalone + reverse proxy mode)
+#
+# Usage:
+# docker compose up -d
+# curl -s http://localhost:8080/readyz # health check
+# curl -s -X POST http://localhost:8080/v1/chat/completions \
+# -H "Content-Type: application/json" \
+# -d @../../fixtures/normal_request.json # expect 200
+# curl -s -X POST http://localhost:8080/v1/chat/completions \
+# -H "Content-Type: application/json" \
+# -d @../../fixtures/over_limit_request.json # expect 429
+#
+# This stack runs in FAIRVISOR_MODE=reverse_proxy — requests to /v1/* are
+# enforced by policy then forwarded to the local mock LLM backend.
+# No real API keys required.
+#
+# Wrapper mode (routing by provider prefix, real upstream keys) is documented
+# in README.md under "Wrapper mode". It requires real provider credentials and
+# cannot be demonstrated with this mock stack.
+#
+# This file is also the base for the e2e-smoke CI check.
+# CI expects the same port and volume contract; update CI too if those change.
+
+services:
+ fairvisor:
+ build:
+ context: ../..
+ dockerfile: docker/Dockerfile
+ ports:
+ - "8080:8080"
+ environment:
+ FAIRVISOR_CONFIG_FILE: /etc/fairvisor/policy.json
+ FAIRVISOR_MODE: reverse_proxy
+ FAIRVISOR_BACKEND_URL: http://mock_llm:80
+ FAIRVISOR_SHARED_DICT_SIZE: 32m
+ FAIRVISOR_LOG_LEVEL: info
+ FAIRVISOR_WORKER_PROCESSES: "1"
+ volumes:
+ - ./policy.json:/etc/fairvisor/policy.json:ro
+ depends_on:
+ mock_llm:
+ condition: service_healthy
+ healthcheck:
+ test: ["CMD", "curl", "-sf", "http://127.0.0.1:8080/readyz"]
+      interval: 2s
+ timeout: 2s
+ retries: 15
+ start_period: 5s
+
+ mock_llm:
+ image: nginx:1.27-alpine
+ volumes:
+ - ./mock-llm.conf:/etc/nginx/nginx.conf:ro
+ healthcheck:
+ test: ["CMD", "wget", "-q", "-O", "-", "http://127.0.0.1:80/"]
+      interval: 2s
+ timeout: 2s
+ retries: 10
+ start_period: 5s
diff --git a/examples/quickstart/mock-llm.conf b/examples/quickstart/mock-llm.conf
new file mode 100644
index 0000000..26603ab
--- /dev/null
+++ b/examples/quickstart/mock-llm.conf
@@ -0,0 +1,10 @@
+events {}
+http {
+ server {
+ listen 80;
+ location / {
+ default_type application/json;
+ return 200 '{"id":"chatcmpl-qs","object":"chat.completion","choices":[{"index":0,"message":{"role":"assistant","content":"Hello from the mock backend!"},"finish_reason":"stop"}],"usage":{"prompt_tokens":10,"completion_tokens":8,"total_tokens":18}}';
+ }
+ }
+}
diff --git a/examples/quickstart/policy.json b/examples/quickstart/policy.json
new file mode 100644
index 0000000..fb9b375
--- /dev/null
+++ b/examples/quickstart/policy.json
@@ -0,0 +1,31 @@
+{
+ "bundle_version": 1,
+ "issued_at": "2026-01-01T00:00:00Z",
+ "expires_at": "2030-01-01T00:00:00Z",
+ "policies": [
+ {
+ "id": "quickstart-tpm-policy",
+ "spec": {
+ "selector": {
+ "pathPrefix": "/v1/",
+ "methods": ["POST"]
+ },
+ "mode": "enforce",
+ "rules": [
+ {
+ "name": "tpm-limit",
+ "limit_keys": ["ip:address"],
+ "algorithm": "token_bucket_llm",
+ "algorithm_config": {
+ "tokens_per_minute": 100,
+ "tokens_per_day": 1000,
+ "burst_tokens": 100,
+ "default_max_completion": 50
+ }
+ }
+ ]
+ }
+ }
+ ],
+ "kill_switches": []
+}
diff --git a/examples/recipes/circuit-breaker/README.md b/examples/recipes/circuit-breaker/README.md
new file mode 100644
index 0000000..ad1227e
--- /dev/null
+++ b/examples/recipes/circuit-breaker/README.md
@@ -0,0 +1,43 @@
+# Recipe: Circuit Breaker — Cost Spike Auto-Shutdown
+
+Automatically block all LLM traffic when the aggregate token spend rate
+exceeds a budget threshold, then self-reset after a cooldown period.
+
+## How it works
+
+- Normal traffic: per-org TPM limit enforced (`100 000 tokens/min`)
+- Spike detection: if the rolling spend rate hits `500 000 tokens/min`
+ the circuit breaker opens and **all requests return `429`** with
+ `X-Fairvisor-Reason: circuit_breaker_open`
+- Auto-reset: after 10 minutes without breaker-triggering load, the
+ circuit resets automatically — no manual intervention needed
+- `alert: true` logs the trip event to the Fairvisor audit log
+
+## Deploy
+
+```bash
+cp policy.json /etc/fairvisor/policy.json
+```
+
+## Expected behaviour
+
+```bash
+# Normal request — passes
+curl -s -o /dev/null -w "%{http_code}" \
+  -H "Authorization: Bearer $CLIENT_JWT:$UPSTREAM_KEY" \
+ http://localhost:8080/v1/chat/completions \
+ -d '{"model":"gpt-4o","messages":[{"role":"user","content":"hi"}]}'
+# → 200
+
+# After spend spike trips the breaker:
+# → 429 X-Fairvisor-Reason: circuit_breaker_open
+# Retry-After: 600
+```
+
+## Tuning
+
+| Field | Description |
+|---|---|
+| `spend_rate_threshold_per_minute` | Tokens/min rolling spend that opens the breaker |
+| `auto_reset_after_minutes` | Cooldown before automatic reset (0 = manual only) |
+| `tokens_per_minute` | Per-org steady-state limit (independent of breaker) |
diff --git a/examples/recipes/circuit-breaker/policy.json b/examples/recipes/circuit-breaker/policy.json
new file mode 100644
index 0000000..7d58c8d
--- /dev/null
+++ b/examples/recipes/circuit-breaker/policy.json
@@ -0,0 +1,37 @@
+{
+ "bundle_version": 1,
+ "issued_at": "2026-01-01T00:00:00Z",
+ "expires_at": "2030-01-01T00:00:00Z",
+ "policies": [
+ {
+ "id": "cost-spike-guard",
+ "spec": {
+ "selector": {
+ "pathPrefix": "/v1/",
+ "methods": ["POST"]
+ },
+ "mode": "enforce",
+ "rules": [
+ {
+ "name": "per-org-tpm",
+ "limit_keys": ["jwt:org_id"],
+ "algorithm": "token_bucket_llm",
+ "algorithm_config": {
+ "tokens_per_minute": 100000,
+ "burst_tokens": 100000,
+ "default_max_completion": 2048
+ }
+ }
+ ],
+ "circuit_breaker": {
+ "enabled": true,
+ "spend_rate_threshold_per_minute": 500000,
+ "action": "reject",
+ "alert": true,
+ "auto_reset_after_minutes": 10
+ }
+ }
+ }
+ ],
+ "kill_switches": []
+}
diff --git a/examples/recipes/runaway-agent-guard/README.md b/examples/recipes/runaway-agent-guard/README.md
new file mode 100644
index 0000000..7b34491
--- /dev/null
+++ b/examples/recipes/runaway-agent-guard/README.md
@@ -0,0 +1,50 @@
+# Recipe: Runaway Agent Guard
+
+Stop runaway agentic workflows before they exhaust your token budget or
+billing limit.
+
+## Problem
+
+Autonomous agents (LangChain, AutoGPT, custom loops) can enter retry storms
+or infinite planning loops. Without enforcement, a single runaway agent
+can consume thousands of dollars of API budget in minutes.
+
+## How it works
+
+Two rules cooperate:
+
+1. **Loop detector** — counts requests per `agent_id` in a sliding window.
+ If the agent fires more than 30 requests in 60 seconds, it trips a
+ 120-second cooldown. This catches tight retry loops.
+
+2. **TPM guard** — caps tokens per minute per agent. A burst-heavy agent
+ that passes the loop check still cannot drain the token pool.
+
+## Deploy
+
+```bash
+cp policy.json /etc/fairvisor/policy.json
+```
+
+## JWT shape expected
+
+```json
+{
+ "sub": "user-456",
+ "agent_id": "autoagent-prod-7",
+ "exp": 9999999999
+}
+```
+
+## Kill switch for incidents
+
+If an agent causes an incident, flip a kill switch without restarting edge:
+
+```bash
+# Via CLI
+fairvisor kill-switch enable agent-id=autoagent-prod-7
+
+# Or update the policy bundle with a kill_switch entry and hot-reload
+```
+
+See `docs/cookbook/kill-switch-incident-response.md` for the full incident playbook.
diff --git a/examples/recipes/runaway-agent-guard/policy.json b/examples/recipes/runaway-agent-guard/policy.json
new file mode 100644
index 0000000..b3facab
--- /dev/null
+++ b/examples/recipes/runaway-agent-guard/policy.json
@@ -0,0 +1,41 @@
+{
+ "bundle_version": 1,
+ "issued_at": "2026-01-01T00:00:00Z",
+ "expires_at": "2030-01-01T00:00:00Z",
+ "policies": [
+ {
+ "id": "runaway-agent-guard",
+ "spec": {
+ "selector": {
+ "pathPrefix": "/",
+ "methods": [
+ "POST"
+ ]
+ },
+ "mode": "enforce",
+ "loop_detection": {
+ "enabled": true,
+ "window_seconds": 60,
+ "threshold_identical_requests": 30,
+ "action": "reject",
+ "similarity": "exact"
+ },
+ "rules": [
+ {
+ "name": "agent-tpm-guard",
+ "limit_keys": [
+ "jwt:agent_id"
+ ],
+ "algorithm": "token_bucket_llm",
+ "algorithm_config": {
+ "tokens_per_minute": 50000,
+ "burst_tokens": 50000,
+ "default_max_completion": 512
+ }
+ }
+ ]
+ }
+ }
+ ],
+ "kill_switches": []
+}
diff --git a/examples/recipes/team-budgets/README.md b/examples/recipes/team-budgets/README.md
new file mode 100644
index 0000000..54c1551
--- /dev/null
+++ b/examples/recipes/team-budgets/README.md
@@ -0,0 +1,45 @@
+# Recipe: Team Budgets
+
+Enforce per-team token and cost limits using JWT claims.
+
+## How it works
+
+Each request carries a JWT with a `team_id` claim. Fairvisor uses this as
+the bucket key for two independent rules:
+
+1. **TPM/TPD limit** — token-rate enforcement per minute and per day
+2. **Weekly cost budget** — cumulative cost cap (policy `"period": "7d"`) with staged warn/throttle/reject
+
+## Deploy
+
+```bash
+# Copy policy to your edge config path
+cp policy.json /etc/fairvisor/policy.json
+
+# Or use with docker compose (standalone mode):
+FAIRVISOR_CONFIG_FILE=./policy.json FAIRVISOR_MODE=wrapper docker compose up -d
+```
+
+## JWT shape expected
+
+```json
+{
+ "sub": "user-123",
+ "team_id": "engineering",
+ "plan": "pro",
+ "exp": 9999999999
+}
+```
+
+## Staged actions at cost budget thresholds
+
+| Threshold | Action |
+|---|---|
+| 80% | Warn (allow, log, emit business event) |
+| 95% | Throttle (allow with 500 ms delay) |
+| 100% | Reject (429, `budget_exceeded`) |
+
+## Related fixtures
+
+- `../../../fixtures/reject_tpd_exceeded.json` — TPD reject body
+- `../../../fixtures/reject_tpm_exceeded.json` — TPM reject body
diff --git a/examples/recipes/team-budgets/policy.json b/examples/recipes/team-budgets/policy.json
new file mode 100644
index 0000000..7a87d2c
--- /dev/null
+++ b/examples/recipes/team-budgets/policy.json
@@ -0,0 +1,47 @@
+{
+ "bundle_version": 1,
+ "issued_at": "2026-01-01T00:00:00Z",
+ "expires_at": "2030-01-01T00:00:00Z",
+ "policies": [
+ {
+ "id": "team-token-budget",
+ "spec": {
+ "selector": {
+ "pathPrefix": "/openai/",
+ "methods": ["POST"]
+ },
+ "mode": "enforce",
+ "rules": [
+ {
+ "name": "per-team-tpm",
+ "limit_keys": ["jwt:team_id"],
+ "algorithm": "token_bucket_llm",
+ "algorithm_config": {
+ "tokens_per_minute": 120000,
+ "tokens_per_day": 2000000,
+ "burst_tokens": 120000,
+ "default_max_completion": 1024
+ }
+ },
+ {
+ "name": "per-team-cost-budget",
+ "limit_keys": ["jwt:team_id"],
+ "algorithm": "cost_based",
+ "algorithm_config": {
+ "budget": 50000,
+ "period": "7d",
+ "cost_key": "fixed",
+ "fixed_cost": 1,
+ "staged_actions": [
+ { "threshold_percent": 80, "action": "warn" },
+ { "threshold_percent": 95, "action": "throttle", "delay_ms": 500 },
+ { "threshold_percent": 100, "action": "reject" }
+ ]
+ }
+ }
+ ]
+ }
+ }
+ ],
+ "kill_switches": []
+}
diff --git a/fixtures/allow_response.json b/fixtures/allow_response.json
new file mode 100644
index 0000000..7cc0312
--- /dev/null
+++ b/fixtures/allow_response.json
@@ -0,0 +1,28 @@
+{
+ "_comment": "Sample 200 response for an allowed request in wrapper mode. Note: no Authorization, x-api-key, or x-goog-api-key headers — upstream auth is stripped on the response side.",
+ "_status": 200,
+ "_headers": {
+ "Content-Type": "application/json",
+ "X-Fairvisor-Reason": null,
+ "Authorization": null,
+ "x-api-key": null,
+ "x-goog-api-key": null
+ },
+ "id": "chatcmpl-example",
+ "object": "chat.completion",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "Hello! How can I help you today?"
+ },
+ "finish_reason": "stop"
+ }
+ ],
+ "usage": {
+ "prompt_tokens": 10,
+ "completion_tokens": 9,
+ "total_tokens": 19
+ }
+}
diff --git a/fixtures/anthropic_normal_request.json b/fixtures/anthropic_normal_request.json
new file mode 100644
index 0000000..bcffdbf
--- /dev/null
+++ b/fixtures/anthropic_normal_request.json
@@ -0,0 +1,10 @@
+{
+ "model": "claude-3-5-haiku-20241022",
+ "max_tokens": 20,
+ "messages": [
+ {
+ "role": "user",
+ "content": "Say hello in one sentence."
+ }
+ ]
+}
diff --git a/fixtures/normal_request.json b/fixtures/normal_request.json
new file mode 100644
index 0000000..049a4e4
--- /dev/null
+++ b/fixtures/normal_request.json
@@ -0,0 +1,10 @@
+{
+ "model": "gpt-4o-mini",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Say hello in one sentence."
+ }
+ ],
+ "max_tokens": 20
+}
diff --git a/fixtures/over_limit_request.json b/fixtures/over_limit_request.json
new file mode 100644
index 0000000..b3b554f
--- /dev/null
+++ b/fixtures/over_limit_request.json
@@ -0,0 +1,10 @@
+{
+ "model": "gpt-4o",
+ "messages": [
+ {
+ "role": "user",
+ "content": "Say hello in one sentence."
+ }
+ ],
+ "max_tokens": 200000
+}
diff --git a/fixtures/reject_anthropic.json b/fixtures/reject_anthropic.json
new file mode 100644
index 0000000..bdf468f
--- /dev/null
+++ b/fixtures/reject_anthropic.json
@@ -0,0 +1,13 @@
+{
+ "_comment": "Anthropic-native 429 reject body. Used for /anthropic/* paths.",
+ "_headers": {
+ "X-Fairvisor-Reason": "tpm_exceeded",
+ "Retry-After": "60",
+ "Content-Type": "application/json"
+ },
+ "type": "error",
+ "error": {
+ "type": "rate_limit_error",
+ "message": "Token budget exceeded for this tenant."
+ }
+}
diff --git a/fixtures/reject_gemini.json b/fixtures/reject_gemini.json
new file mode 100644
index 0000000..f0df901
--- /dev/null
+++ b/fixtures/reject_gemini.json
@@ -0,0 +1,13 @@
+{
+ "_comment": "Gemini-native 429 reject body. Used for /gemini/* paths.",
+ "_headers": {
+ "X-Fairvisor-Reason": "tpm_exceeded",
+ "Retry-After": "60",
+ "Content-Type": "application/json"
+ },
+ "error": {
+ "code": 429,
+ "message": "Token budget exceeded for this tenant.",
+ "status": "RESOURCE_EXHAUSTED"
+ }
+}
diff --git a/fixtures/reject_openai.json b/fixtures/reject_openai.json
new file mode 100644
index 0000000..eabd023
--- /dev/null
+++ b/fixtures/reject_openai.json
@@ -0,0 +1,14 @@
+{
+ "_comment": "OpenAI-native 429 reject body. Used for /openai/* paths and OpenAI-compatible providers.",
+ "_headers": {
+ "X-Fairvisor-Reason": "tpm_exceeded",
+ "Retry-After": "60",
+ "Content-Type": "application/json"
+ },
+ "error": {
+ "type": "rate_limit_error",
+ "code": "tpm_exceeded",
+ "message": "Token budget exceeded for this tenant.",
+ "param": null
+ }
+}
diff --git a/fixtures/reject_prompt_too_large.json b/fixtures/reject_prompt_too_large.json
new file mode 100644
index 0000000..9c4cf8c
--- /dev/null
+++ b/fixtures/reject_prompt_too_large.json
@@ -0,0 +1,13 @@
+{
+ "_comment": "429 body returned when the request exceeds max_prompt_tokens.",
+ "_headers": {
+ "X-Fairvisor-Reason": "prompt_too_large",
+ "Content-Type": "application/json"
+ },
+ "error": {
+ "type": "rate_limit_error",
+ "code": "prompt_too_large",
+ "message": "Request prompt exceeds the maximum allowed token count for this policy.",
+ "param": null
+ }
+}
diff --git a/fixtures/reject_tpd_exceeded.json b/fixtures/reject_tpd_exceeded.json
new file mode 100644
index 0000000..83cb2ea
--- /dev/null
+++ b/fixtures/reject_tpd_exceeded.json
@@ -0,0 +1,16 @@
+{
+ "_comment": "Illustrative 429 body returned when the per-day token budget is exhausted. RateLimit-Limit reflects the policy's tokens_per_day value.",
+ "_headers": {
+ "X-Fairvisor-Reason": "tpd_exceeded",
+ "Retry-After": "",
+ "RateLimit-Limit": "",
+ "RateLimit-Remaining": "0",
+ "Content-Type": "application/json"
+ },
+ "error": {
+ "type": "rate_limit_error",
+ "code": "tpd_exceeded",
+ "message": "Token budget exceeded for this tenant.",
+ "param": null
+ }
+}
diff --git a/fixtures/reject_tpm_exceeded.json b/fixtures/reject_tpm_exceeded.json
new file mode 100644
index 0000000..0805778
--- /dev/null
+++ b/fixtures/reject_tpm_exceeded.json
@@ -0,0 +1,17 @@
+{
+ "_comment": "Illustrative 429 body returned when the per-minute token budget is exhausted. RateLimit-Limit reflects the policy's tokens_per_minute value.",
+ "_headers": {
+ "X-Fairvisor-Reason": "tpm_exceeded",
+ "Retry-After": "60",
+ "RateLimit-Limit": "",
+ "RateLimit-Remaining": "0",
+ "RateLimit-Reset": "",
+ "Content-Type": "application/json"
+ },
+ "error": {
+ "type": "rate_limit_error",
+ "code": "tpm_exceeded",
+ "message": "Token budget exceeded for this tenant.",
+ "param": null
+ }
+}
diff --git a/spec/unit/features/llm_limiter.feature b/spec/unit/features/llm_limiter.feature
index e9d3828..d33e4b1 100644
--- a/spec/unit/features/llm_limiter.feature
+++ b/spec/unit/features/llm_limiter.feature
@@ -190,3 +190,61 @@ Feature: LLM limiter module behavior
And the llm limiter config is validated
When I build error response for reason "tpm_exceeded"
Then error response has OpenAI rate limit shape
+
+ Rule: New features — max_completion_tokens and improved JSON parsing
+ Scenario: max_completion_tokens is extracted from body when max_tokens is missing
+ Given the nginx mock environment is reset
+ And a valid llm limiter config with tokens_per_minute 10000
+ And the config has default_max_completion 1000
+ And the llm limiter config is validated
+ And the request body is '{"messages":[{"role":"user","content":"hello"}],"max_completion_tokens":2000}'
+ When I run llm check at now 1700000000
+ Then check is allowed
+ And reserved equals estimated_total 2002
+
+ Scenario: improved JSON parsing handles spaces and false positives
+ Given the nginx mock environment is reset
+ And a valid llm limiter config with tokens_per_minute 10000
+ And the config uses estimator "simple_word"
+ And the llm limiter config is validated
+ And the request body is '{"messages":[{"role":"user", "content" : "12345678"}]}'
+ When I estimate prompt tokens
+ Then prompt estimate equals 2
+
+ Scenario: simple_word parsing multiple messages
+ Given the nginx mock environment is reset
+ And a valid llm limiter config with tokens_per_minute 10000
+ And the config uses estimator "simple_word"
+ And the llm limiter config is validated
+ And the request body is '{"messages":[{"role":"user","content":"hello"},{"role":"assistant","content":"world!"}]}'
+ When I estimate prompt tokens
+ Then prompt estimate equals 3
+
+ Scenario: simple_word fallback when no messages key
+ Given the nginx mock environment is reset
+ And a valid llm limiter config with tokens_per_minute 10000
+ And the config uses estimator "simple_word"
+ And the llm limiter config is validated
+ And the request body is '{"input":"test"}'
+ When I estimate prompt tokens
+ Then prompt estimate equals 4
+
+ Scenario: max_tokens field in body is used when request_context.max_tokens is absent
+ Given the nginx mock environment is reset
+ And a valid llm limiter config with tokens_per_minute 10000
+ And the config has default_max_completion 1000
+ And the llm limiter config is validated
+ And the request body is '{"messages":[{"role":"user","content":"hello"}],"max_tokens":500}'
+ When I run llm check at now 1700000000
+ Then check is allowed
+ And reserved equals estimated_total 502
+
+ Scenario: body with no max_tokens field falls back to default_max_completion
+ Given the nginx mock environment is reset
+ And a valid llm limiter config with tokens_per_minute 10000
+ And the config has default_max_completion 800
+ And the llm limiter config is validated
+ And the request body is '{"messages":[{"role":"user","content":"hi"}]}'
+ When I run llm check at now 1700000000
+ Then check is allowed
+ And reserved equals estimated_total 801
diff --git a/spec/unit/llm_limiter_spec.lua b/spec/unit/llm_limiter_spec.lua
index aecf8e1..362b3ae 100644
--- a/spec/unit/llm_limiter_spec.lua
+++ b/spec/unit/llm_limiter_spec.lua
@@ -129,6 +129,11 @@ runner:given("^the request body is empty$", function(ctx)
ctx.request_context.body = ""
end)
+runner:given("^the request body is '([^']+)'$", function(ctx, body)
+ ctx.request_context = ctx.request_context or {}
+ ctx.request_context.body = body
+end)
+
runner:given("^the request body has (%d+) prompt characters in messages$", function(ctx, chars)
local char_count = tonumber(chars)
local content = string.rep("a", char_count)
diff --git a/spec/unit/supply_chain_pins_spec.lua b/spec/unit/supply_chain_pins_spec.lua
index 63190bf..f9b36bf 100644
--- a/spec/unit/supply_chain_pins_spec.lua
+++ b/spec/unit/supply_chain_pins_spec.lua
@@ -30,6 +30,11 @@ describe("supply chain pinning", function()
assert.is_falsy(runtime_dockerfile:match("RUN opm get anjia0532/lua%-resty%-maxminddb%s*$"))
end)
+ it("pins lua-resty-http to a concrete OPM version in the CLI Dockerfile", function()
+ assert.is_truthy(cli_dockerfile:match("RUN opm get ledgetech/lua%-resty%-http=0%.17%.1"))
+ assert.is_falsy(cli_dockerfile:match("RUN opm get ledgetech/lua%-resty%-http%s*$"))
+ end)
+
it("pins both CodeQL upload-sarif actions to the resolved commit SHA", function()
local _, count = ci_workflow:gsub(
"github/codeql%-action/upload%-sarif@cb06a0a8527b2c6970741b3a0baa15231dc74a4c",
diff --git a/src/fairvisor/llm_limiter.lua b/src/fairvisor/llm_limiter.lua
index 3b29bc0..ab3d3d4 100644
--- a/src/fairvisor/llm_limiter.lua
+++ b/src/fairvisor/llm_limiter.lua
@@ -189,23 +189,32 @@ local function _simple_word_estimate(request_context)
local array_end = array_start and string_find(body, "]", array_start, true)
if array_start and array_end then
local segment = string_sub(body, array_start, array_end)
- local marker = "\"content\":\""
- local marker_len = #marker
local position = 1
local char_count = 0
while true do
- local start_pos = string_find(segment, marker, position, true)
- if not start_pos then
+ -- Find "content" key
+ local key_start = string_find(segment, "\"content\"", position, true)
+ if not key_start then
break
end
- local content_start = start_pos + marker_len
- local content_end = string_find(segment, "\"", content_start, true)
- if not content_end then
- break
+
+ -- Look for value start: : "..."
+ -- pattern: ^%s*:%s*"
+ local val_marker_start, val_marker_end = string_find(segment, "^%s*:%s*\"", key_start + 9)
+
+ if not val_marker_start then
+ -- False positive (e.g. key was in a string), skip it
+ position = key_start + 7
+ else
+ local content_start = val_marker_end + 1
+ local content_end = string_find(segment, "\"", content_start, true)
+ if not content_end then
+ break
+ end
+ char_count = char_count + (content_end - content_start)
+ position = content_end + 1
end
- char_count = char_count + (content_end - content_start)
- position = content_end + 1
end
return ceil(char_count / 4)
@@ -215,6 +224,21 @@ local function _simple_word_estimate(request_context)
return ceil(#body / 4)
end
+local function _extract_max_tokens(body)
+ if type(body) ~= "string" or body == "" then
+ return nil
+ end
+ -- simple regex scan for "max_tokens": 123
+ -- or "max_completion_tokens": 123
+ local _, _, val = string_find(body, '"max_tokens"%s*:%s*(%d+)')
+ if val then return tonumber(val) end
+
+ _, _, val = string_find(body, '"max_completion_tokens"%s*:%s*(%d+)')
+ if val then return tonumber(val) end
+
+ return nil
+end
+
local function _check_tpd_budget(dict, key, config, cost, now)
local ttl = _seconds_until_midnight_utc(now)
local new_total, incr_err = dict:incr(key, cost, 0, ttl)
@@ -365,6 +389,11 @@ function _M.check(dict, key, config, request_context, now)
local max_completion = config.default_max_completion
if request_context and type(request_context.max_tokens) == "number" and request_context.max_tokens > 0 then
max_completion = request_context.max_tokens
+ elseif request_context and request_context.body then
+ local extracted = _extract_max_tokens(request_context.body)
+ if extracted and extracted > 0 then
+ max_completion = extracted
+ end
end
if config.max_completion_tokens and max_completion > config.max_completion_tokens then
max_completion = config.max_completion_tokens
diff --git a/src/fairvisor/rule_engine.lua b/src/fairvisor/rule_engine.lua
index 8808fdb..06dcbf9 100644
--- a/src/fairvisor/rule_engine.lua
+++ b/src/fairvisor/rule_engine.lua
@@ -8,6 +8,9 @@ local utils = require("fairvisor.utils")
local _M = {}
+local function _log_info(...)
+ if ngx and ngx.log then ngx.log(ngx.INFO, ...) end
+end
local function _log_warn(...)
if ngx and ngx.log then ngx.log(ngx.WARN, ...) end
end
@@ -469,12 +472,12 @@ end
local function _maybe_log_override_state(flags)
if _last_global_shadow_active ~= flags.global_shadow_active then
_last_global_shadow_active = flags.global_shadow_active
- _log_warn("evaluate global_shadow_state active=", tostring(flags.global_shadow_active))
+ _log_info("evaluate global_shadow_state active=", tostring(flags.global_shadow_active))
end
if _last_kill_switch_override_active ~= flags.kill_switch_override_active then
_last_kill_switch_override_active = flags.kill_switch_override_active
- _log_warn("evaluate kill_switch_override_state active=", tostring(flags.kill_switch_override_active))
+ _log_info("evaluate kill_switch_override_state active=", tostring(flags.kill_switch_override_active))
end
end