0-draft · kanywst · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -54,6 +54,26 @@ jobs:
       - name: zig build test-unit
         run: zig build test-unit
 
+  bench:
+    name: bench (smoke)
+    runs-on: ubuntu-latest
+    # No `needs: build` -- `zig build bench` rebuilds the wasm via
+    # the install step, so depending on the artifact job adds wait
+    # time without saving work.
+    steps:
+      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - uses: mlugg/setup-zig@d1434d08867e3ee9daa34448df10607b98908d29 # v2.2.1
+        with:
+          version: 0.16.0
+
+      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: 22
+
+      - name: zig build bench
+        run: zig build bench
+
   test-node:
     name: test (node)
     runs-on: ubuntu-latest

diff --git a/bench/README.md b/bench/README.md
@@ -0,0 +1,49 @@
+# bench/
+
+Latency benchmarks for `zopa.wasm`. v1 covers the zopa side only.
+
+OPA WASM SDK, OPA HTTP sidecar, and Cedar comparisons are deferred
+until the conformance harness lands (see
+`docs/proposals/opa-conformance-harness.md`). Without conformance,
+"same answer" cannot be asserted, so a head-to-head latency number
+isn't honest.
+
+## Layout
+
+```text
+bench/
+  run.mjs           Node runner (drives evaluate via the same Node host as test/run.mjs)
+  fixtures/
+    01_static.json  fixture: literal allow:true
+    02_header_eq.json  fixture: input.method == "GET"
+  README.md
+```
+
+Each fixture is a JSON object with `name`, `input`, and `ast`. Where
+an equivalent exists it also carries the policy source in `rego` /
+`cedar`; the runner ignores those keys today, and they are kept for
+the cross-engine runs to be added later.
+
+## Running
+
+```bash
+zig build --release=small        # produces zig-out/bin/zopa.wasm
+zig build bench                  # runs the install step, then node bench/run.mjs
+```
+
+The runner loads each fixture, runs a 1,000-iteration warm-up, then
+times 10,000 calls to `evaluate` and prints p50 / p95 / p99 / mean
+latency in microseconds. If `evaluate` returns -1 during warm-up the
+run aborts, so an error path is never timed.
+
+## Output
+
+```text
+fixture                 |    p50 |    p95 |    p99 |   mean
+------------------------+--------+--------+--------+-------
+01_static               |  X.XX  |  X.XX  |  X.XX  |  X.XX
+02_header_eq            |  X.XX  |  X.XX  |  X.XX  |  X.XX
+```
+
+Numbers are wall-clock, single-process, CPU-bound. They are not a
+substitute for the proxy-wasm in-Envoy path (which adds
+serialisation + host-call overhead). For the in-Envoy story, see
+`examples/envoy/`.
diff --git a/bench/fixtures/01_static.json b/bench/fixtures/01_static.json
@@ -0,0 +1,7 @@
+{
+  "name":  "01_static",
+  "input": {},
+  "ast":   { "type": "value", "value": true },
+  "rego":  "package authz\n\ndefault allow = true\n",
+  "cedar": "permit(principal, action, resource);"
+}
diff --git a/bench/fixtures/02_header_eq.json b/bench/fixtures/02_header_eq.json
@@ -0,0 +1,12 @@
+{
+  "name":  "02_header_eq",
+  "input": { "method": "GET" },
+  "ast": {
+    "type": "compare",
+    "op":   "eq",
+    "left":  { "type": "ref",   "path": ["input", "method"] },
+    "right": { "type": "value", "value": "GET" }
+  },
+  "rego":  "package authz\n\nallow if input.method == \"GET\"\n",
+  "cedar": "permit(principal, action, resource) when { context.method == \"GET\" };"
+}
diff --git a/bench/run.mjs b/bench/run.mjs
@@ -0,0 +1,93 @@
+// Latency benchmark for zopa.wasm. v1 covers the zopa side only.
+// Drives the generic `evaluate(input, ast)` export. Other engines
+// (OPA, Cedar) are deferred per bench/README.md.
+
+import { readFileSync, readdirSync } from 'node:fs';
+import { join, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const HERE = dirname(fileURLToPath(import.meta.url));
+const DEFAULT_WASM = join(HERE, '..', 'zig-out', 'bin', 'zopa.wasm');
+const WASM_PATH = process.argv[2] ?? DEFAULT_WASM;
+
+const WARMUP = 1000;
+const ITERS = 10000;
+
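+// Minimal host-import stubs for isolated benchmarking. The log and
+// send-local-response stubs return 0; the three getters return 1
+// (Ok and NotFound respectively, assuming the proxy-wasm status
+// codes -- TODO confirm against the ABI). None of them performs real
+// proxy work, so they would mask real proxy errors if reached, but
+// the generic `evaluate` path doesn't call them, so an unexpected hit
+// means the harness has drifted from what the wasm expects.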
+const { instance } = await WebAssembly.instantiate(
+  readFileSync(WASM_PATH),
+  { env: {
+      proxy_log: () => 0,
+      proxy_get_buffer_bytes: () => 1,
+      proxy_get_header_map_pairs: () => 1,
+      proxy_get_header_map_value: () => 1,
+      proxy_send_local_response: () => 0,
+  }},
+);
+const { malloc, free, evaluate, memory } = instance.exports;
+
+const enc = new TextEncoder();
+function writeJson(obj) {
+  const bytes = enc.encode(JSON.stringify(obj));
+  const ptr = malloc(bytes.length);
+  if (ptr === 0) throw new Error('malloc failed');
+  new Uint8Array(memory.buffer, ptr, bytes.length).set(bytes);
+  return { ptr, len: bytes.length };
+}
+function freeBuf({ ptr }) { free(ptr); }
+
+function percentile(sorted, p) {
+  const idx = Math.min(sorted.length - 1, Math.floor(sorted.length * p));
+  return sorted[idx];
+}
+
+function runFixture(fix) {
+  const i = writeJson(fix.input);
+  const a = writeJson(fix.ast);
+  try {
+    // Warmup also validates the policy actually evaluates without
+    // error. evaluate() returns 1 (allow), 0 (deny), or -1 (parse /
+    // depth-cap / unknown-node failures). We bail on -1 so the
+    // benchmark never times an error path; allow + deny are both
+    // legitimate decisions to measure.
+    for (let k = 0; k < WARMUP; k++) {
+      const r = evaluate(i.ptr, i.len, a.ptr, a.len);
+      if (r === -1) throw new Error(`fixture ${fix.name}: evaluate returned -1`);
+    }
+    const samples = new Float64Array(ITERS);
+    for (let k = 0; k < ITERS; k++) {
+      const t0 = process.hrtime.bigint();
+      evaluate(i.ptr, i.len, a.ptr, a.len);
+      const t1 = process.hrtime.bigint();
+      samples[k] = Number(t1 - t0) / 1000;  // microseconds
+    }
+    samples.sort();
+    const mean = samples.reduce((s, x) => s + x, 0) / samples.length;
+    return {
+      p50:  percentile(samples, 0.50),
+      p95:  percentile(samples, 0.95),
+      p99:  percentile(samples, 0.99),
+      mean,
+    };
+  } finally {
+    freeBuf(i);
+    freeBuf(a);
+  }
+}
+
+const fixturesDir = join(HERE, 'fixtures');
+const fixtures = readdirSync(fixturesDir)
+  .filter(f => f.endsWith('.json'))
+  .sort()
+  .map(f => JSON.parse(readFileSync(join(fixturesDir, f), 'utf8')));
+
+console.log('fixture                 |    p50 |    p95 |    p99 |   mean');
+console.log('------------------------+--------+--------+--------+-------');
+for (const fix of fixtures) {
+  const r = runFixture(fix);
+  const fmt = (x) => x.toFixed(2).padStart(6);
+  console.log(`${fix.name.padEnd(24)}|${fmt(r.p50)}  |${fmt(r.p95)}  |${fmt(r.p99)}  |${fmt(r.mean)}`);
+}
diff --git a/build.zig b/build.zig
@@ -98,4 +98,15 @@ pub fn build(b: *std.Build) void {
     test_all_step.dependOn(&node_run.step);
     test_all_step.dependOn(&wasmtime_run.step);
     test_all_step.dependOn(&envoy_run.step);
+
+    // `zig build bench` -> Node-based latency benchmark of the
+    // `evaluate` hot path. zopa-only for v1; cross-engine numbers
+    // are deferred until OPA conformance lands. See bench/README.md.
+    const bench_run = b.addSystemCommand(&.{ "node", "bench/run.mjs" });
+    bench_run.step.dependOn(b.getInstallStep());
+    const bench_step = b.step(
+        "bench",
+        "Run zopa.wasm latency benchmark in Node.js",
+    );
+    bench_step.dependOn(&bench_run.step);
 }
diff --git a/docs/proposals/benchmark-harness.md b/docs/proposals/benchmark-harness.md
@@ -0,0 +1,118 @@
+# Benchmark harness vs OPA / Cedar
+
+Status: Proposed (draft PR, design doc only).
+Tracking: ROADMAP.md → Near term ("Compiled-policy benchmark").
+
+## Motivation
+
+The README claims zopa is "two orders of magnitude smaller" than OPA's
+WASM build. That's a statement about binary size only. We don't yet
+publish numbers for the things that actually matter at runtime:
+evaluation latency, memory floor, cold-start time, throughput under
+load.
+
+Without numbers, the project can't honestly recommend itself for a
+production proxy filter, and PRs that touch the eval hot path can
+regress without anyone noticing.
+
+## Goals
+
+1. A reproducible bench harness that runs:
+   - zopa (`evaluate` direct + via Envoy/proxy-wasm).
+   - OPA WASM SDK (one-shot `opa_eval`).
+   - OPA HTTP sidecar (out-of-process baseline, network hop included).
+   - Cedar via its native API (no proxy-wasm path; native baseline).
+1. Metrics:
+   - p50 / p95 / p99 latency per evaluation.
+   - Memory floor after warm-up (pages, RSS, wasm linear memory).
+   - Cold-start (instantiate + first eval).
+   - Throughput at saturation.
+1. A small fixture set of policies covering increasing complexity:
+   - Static `allow=true`.
+   - Single header equality.
+   - Nested `every` over `input.required_perms`.
+   - Worst-case: deeply nested AST near the recursion cap.
+1. CI job that runs a smoke version (low iteration counts) on every
+   PR, full bench on `main` post-merge, results checked into
+   `bench/results/`.
+
+## Non-goals
+
+- Comparing to non-CNCF authorization engines (Casbin, Oso, etc).
+  Optional later.
+- Microbenchmarking individual AST nodes. The bench measures the
+  user-facing path, not internal hot loops.
+- Beating OPA on every metric. We expect zopa to win on size and
+  cold-start, lose on Rego coverage. The bench should make that legible,
+  not hide it.
+
+## Design sketch
+
+### Layout
+
+```text
+bench/
+  Cargo.toml             # criterion runner (Rust; for OPA WASM + Cedar)
+  build.zig              # zopa direct path (zig benchmark target)
+  fixtures/
+    01_static.json       # input + AST + Rego + Cedar source
+    02_header_eq.json
+    03_every_perms.json
+    04_deep_nest.json
+  hosts/
+    zopa_direct.zig      # evaluate(input, ast) over wasmtime embed
+    opa_wasm.rs          # opa_eval via wasmtime-rust
+    opa_http.rs          # localhost OPA over reqwest
+    cedar_native.rs      # cedar-policy crate
+  results/
+    YYYY-MM-DD-<sha>.json
+    latest.md            # human-readable summary
+```
+
+### Fixture format
+
+Each fixture carries the same logical policy in three syntaxes:
+
+```json
+{
+  "name":   "header_eq",
+  "input":  { "method": "GET" },
+  "rego":   "package authz\nallow if input.method == \"GET\"\n",
+  "ast":    { "type": "module", "rules": [ ... ] },
+  "cedar":  "permit(principal, action == Action::\"GET\", resource);"
+}
+```
+
+The runner loads the fixture, hands each engine the form it expects,
+and asserts every engine returns the same decision before timing
+anything.
+
+### Metrics format
+
+JSON, one record per (host, fixture, metric) tuple. Aggregated into
+`latest.md` with a small Python script for the README.
+
+## API impact
+
+None. Bench code lives under `bench/` and never ships in the wasm
+artifact.
+
+## Test plan
+
+- Smoke run in CI on every PR (~5 seconds, low iteration counts) to
+  catch obvious regressions.
+- Full run nightly on `main`, results committed via a GitHub Action
+  with `[skip ci]`.
+- Compare-to-baseline check: fail the CI job if p95 regresses by more
+  than 20% vs the latest committed baseline.
+
+## Open questions
+
+- Where do we host the full nightly results? Inline markdown in the
+  repo is honest but noisy. A `gh-pages` site is more browseable but
+  is more infra to maintain.
+- Should we pin Envoy / OPA / Cedar versions in `bench/Cargo.lock` and
+  bump them deliberately, or follow latest? Pinning is more
+  reproducible; following latest catches upstream regressions.
+- Can the OPA HTTP host be skipped on PR runs and only included
+  nightly? Network-bound benches add variance.