diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 70f3b40..e8fc472 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -54,6 +54,26 @@ jobs: - name: zig build test-unit run: zig build test-unit + bench: + name: bench (smoke) + runs-on: ubuntu-latest + # No `needs: build` -- `zig build bench` rebuilds the wasm via + # the install step, so depending on the artifact job adds wait + # time without saving work. + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - uses: mlugg/setup-zig@d1434d08867e3ee9daa34448df10607b98908d29 # v2.2.1 + with: + version: 0.16.0 + + - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: 22 + + - name: zig build bench + run: zig build bench + test-node: name: test (node) runs-on: ubuntu-latest diff --git a/bench/README.md b/bench/README.md new file mode 100644 index 0000000..393e766 --- /dev/null +++ b/bench/README.md @@ -0,0 +1,49 @@ +# bench/ + +Latency benchmarks for `zopa.wasm`. v1 covers the zopa side only. + +OPA WASM SDK, OPA HTTP sidecar, and Cedar comparisons are deferred +until the conformance harness lands (see +`docs/proposals/opa-conformance-harness.md`). Without conformance, +"same answer" cannot be asserted, so a head-to-head latency number +isn't honest. + +## Layout + +```text +bench/ + run.mjs Node runner (drives evaluate via the same Node host as test/run.mjs) + fixtures/ + 01_static.json fixture: literal allow:true + 02_header_eq.json fixture: input.method == "GET" + README.md +``` + +Each fixture is a JSON object with `name`, `input`, `ast`, and (when +applicable) the source forms in `rego` / `cedar` for cross-engine +runs added later. 
+ +## Running + +```bash +zig build --release=small # produces zig-out/bin/zopa.wasm +zig build bench # invokes node bench/run.mjs +``` + +The runner loads each fixture, executes `evaluate` 10,000 iterations +after a 1,000-iteration warm-up, and prints p50 / p95 / p99 / mean +latency in microseconds. + +## Output + +```text +fixture | p50 | p95 | p99 | mean +------------------------+--------+--------+--------+------- +01_static | X.XX | X.XX | X.XX | X.XX +02_header_eq | X.XX | X.XX | X.XX | X.XX +``` + +Numbers are wall-clock, single-process, CPU-bound. They are not a +substitute for the proxy-wasm in-Envoy path (which adds +serialisation + host-call overhead). For the in-Envoy story, see +`examples/envoy/`. diff --git a/bench/fixtures/01_static.json b/bench/fixtures/01_static.json new file mode 100644 index 0000000..3296527 --- /dev/null +++ b/bench/fixtures/01_static.json @@ -0,0 +1,7 @@ +{ + "name": "01_static", + "input": {}, + "ast": { "type": "value", "value": true }, + "rego": "package authz\n\ndefault allow = true\n", + "cedar": "permit(principal, action, resource);" +} diff --git a/bench/fixtures/02_header_eq.json b/bench/fixtures/02_header_eq.json new file mode 100644 index 0000000..75352ab --- /dev/null +++ b/bench/fixtures/02_header_eq.json @@ -0,0 +1,12 @@ +{ + "name": "02_header_eq", + "input": { "method": "GET" }, + "ast": { + "type": "compare", + "op": "eq", + "left": { "type": "ref", "path": ["input", "method"] }, + "right": { "type": "value", "value": "GET" } + }, + "rego": "package authz\n\nallow if input.method == \"GET\"\n", + "cedar": "permit(principal, action, resource) when { context.method == \"GET\" };" +} diff --git a/bench/run.mjs b/bench/run.mjs new file mode 100644 index 0000000..d4b054e --- /dev/null +++ b/bench/run.mjs @@ -0,0 +1,93 @@ +// Latency benchmark for zopa.wasm. v1 covers the zopa side only. +// Drives the generic `evaluate(input, ast)` export. Other engines +// (OPA, Cedar) are deferred per bench/README.md. 
import { readFileSync, readdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const HERE = dirname(fileURLToPath(import.meta.url));
const DEFAULT_WASM = join(HERE, '..', 'zig-out', 'bin', 'zopa.wasm');
// Optional first CLI argument overrides which wasm artifact is benchmarked.
const WASM_PATH = process.argv[2] ?? DEFAULT_WASM;

const WARMUP = 1000;
const ITERS = 10000;

// Column widths of the printed table. They match the dashed rule below:
// 24 for the fixture name, then 8 / 8 / 8 / 7 for the numeric columns.
const NAME_WIDTH = 24;
const NUM_WIDTH = 6;
const COLUMN_HEADS = ['p50', 'p95', 'p99', 'mean'];
const RULE = '------------------------+--------+--------+--------+-------';

// Minimal stubs for isolated benchmarking. These always succeed and
// would mask real proxy errors if reached, but the generic `evaluate`
// path doesn't call them, so an unexpected hit means the harness has
// drifted from what the wasm expects.
const { instance } = await WebAssembly.instantiate(
  readFileSync(WASM_PATH),
  {
    env: {
      proxy_log: () => 0,
      proxy_get_buffer_bytes: () => 1,
      proxy_get_header_map_pairs: () => 1,
      proxy_get_header_map_value: () => 1,
      proxy_send_local_response: () => 0,
    },
  },
);
const { malloc, free, evaluate, memory } = instance.exports;

// Fail with a readable message instead of "x is not a function" when the
// artifact does not expose the exports this harness drives.
for (const [name, value] of Object.entries({ malloc, free, evaluate, memory })) {
  if (value === undefined) {
    throw new Error(`${WASM_PATH}: missing wasm export "${name}"`);
  }
}

const enc = new TextEncoder();

/**
 * Serialises `obj` to JSON and copies the UTF-8 bytes into wasm memory.
 *
 * @param {unknown} obj - Value to serialise.
 * @returns {{ptr: number, len: number}} Pointer and byte length of the copy;
 *   release it with `freeBuf`.
 * @throws {TypeError} If `obj` has no JSON representation (e.g. `undefined`,
 *   which is what a fixture with a missing field yields).
 * @throws {Error} If the wasm `malloc` export returns 0.
 */
function writeJson(obj) {
  const json = JSON.stringify(obj);
  if (json === undefined) {
    // Without this, `undefined` would be encoded as zero bytes and surface
    // as a misleading allocation or parse failure further down.
    throw new TypeError('writeJson: value has no JSON representation');
  }
  const bytes = enc.encode(json);
  const ptr = malloc(bytes.length);
  if (ptr === 0) throw new Error('malloc failed');
  // Build the view only after malloc: the view is created against whatever
  // `memory.buffer` is once the allocation has completed.
  new Uint8Array(memory.buffer, ptr, bytes.length).set(bytes);
  return { ptr, len: bytes.length };
}

/**
 * Releases a buffer previously returned by `writeJson`.
 *
 * @param {{ptr: number}} buf - Buffer descriptor.
 */
function freeBuf({ ptr }) {
  free(ptr);
}

/**
 * Picks the element at fraction `p` of an ascending-sorted sample set.
 *
 * @param {ArrayLike<number>} sorted - Samples sorted ascending.
 * @param {number} p - Fraction in [0, 1], e.g. 0.95.
 * @returns {number} The sample at index `floor(length * p)`, clamped to the
 *   last element.
 * @throws {RangeError} If `sorted` is empty.
 */
function percentile(sorted, p) {
  if (sorted.length === 0) {
    throw new RangeError('percentile: empty sample set');
  }
  const idx = Math.min(sorted.length - 1, Math.floor(sorted.length * p));
  return sorted[idx];
}

/**
 * Benchmarks one fixture: WARMUP untimed calls, then ITERS timed calls of
 * the wasm `evaluate` export.
 *
 * evaluate() returns 1 (allow), 0 (deny), or -1 (parse / depth-cap /
 * unknown-node failures). Allow and deny are both legitimate decisions to
 * measure; -1 aborts the run so an error path is never reported as a
 * latency number.
 *
 * @param {{name: string, input: unknown, ast: unknown}} fix - Parsed fixture.
 * @returns {{p50: number, p95: number, p99: number, mean: number}} Latencies
 *   in microseconds.
 * @throws {Error} If any warm-up or timed call returns -1.
 */
function runFixture(fix) {
  let inputBuf = null;
  let astBuf = null;
  try {
    // Allocate inside the try so the first buffer is still released when
    // the second allocation throws.
    inputBuf = writeJson(fix.input);
    astBuf = writeJson(fix.ast);

    for (let k = 0; k < WARMUP; k++) {
      const r = evaluate(inputBuf.ptr, inputBuf.len, astBuf.ptr, astBuf.len);
      if (r === -1) throw new Error(`fixture ${fix.name}: evaluate returned -1`);
    }

    const samples = new Float64Array(ITERS);
    for (let k = 0; k < ITERS; k++) {
      const t0 = process.hrtime.bigint();
      const r = evaluate(inputBuf.ptr, inputBuf.len, astBuf.ptr, astBuf.len);
      const t1 = process.hrtime.bigint();
      // Checked after t1 so the comparison stays outside the timed window.
      if (r === -1) {
        throw new Error(
          `fixture ${fix.name}: evaluate returned -1 on timed iteration ${k}`,
        );
      }
      samples[k] = Number(t1 - t0) / 1000; // nanoseconds -> microseconds
    }

    // Typed-array sort is numeric by default (unlike Array.prototype.sort),
    // so no comparator is needed here.
    samples.sort();
    const mean = samples.reduce((s, x) => s + x, 0) / samples.length;
    return {
      p50: percentile(samples, 0.50),
      p95: percentile(samples, 0.95),
      p99: percentile(samples, 0.99),
      mean,
    };
  } finally {
    if (inputBuf !== null) freeBuf(inputBuf);
    if (astBuf !== null) freeBuf(astBuf);
  }
}

/**
 * Formats one table line. Header and data rows share this function so they
 * cannot drift out of alignment with each other or with RULE.
 *
 * @param {string} name - Left-hand cell (fixture name or heading).
 * @param {string[]} cells - Remaining cells, already rendered as text.
 * @returns {string} The formatted line.
 */
function formatRow(name, cells) {
  const padded = cells.map((c) => ` ${c.padStart(NUM_WIDTH)}`);
  return `${name.padEnd(NAME_WIDTH)}|${padded.join(' |')}`;
}

const fixturesDir = join(HERE, 'fixtures');
// Sorted by file name so the numeric prefixes (01_, 02_, ...) fix the order.
const fixtures = readdirSync(fixturesDir)
  .filter((f) => f.endsWith('.json'))
  .sort()
  .map((f) => JSON.parse(readFileSync(join(fixturesDir, f), 'utf8')));

console.log(formatRow('fixture', COLUMN_HEADS));
console.log(RULE);
for (const fix of fixtures) {
  const r = runFixture(fix);
  const cells = [r.p50, r.p95, r.p99, r.mean].map((x) => x.toFixed(2));
  console.log(formatRow(fix.name, cells));
}
+ const bench_run = b.addSystemCommand(&.{ "node", "bench/run.mjs" }); + bench_run.step.dependOn(b.getInstallStep()); + const bench_step = b.step( + "bench", + "Run zopa.wasm latency benchmark in Node.js", + ); + bench_step.dependOn(&bench_run.step); } diff --git a/docs/proposals/benchmark-harness.md b/docs/proposals/benchmark-harness.md new file mode 100644 index 0000000..b73d6ee --- /dev/null +++ b/docs/proposals/benchmark-harness.md @@ -0,0 +1,118 @@ +# Benchmark harness vs OPA / Cedar + +Status: Proposed (draft PR, design doc only). +Tracking: ROADMAP.md → Near term ("Compiled-policy benchmark"). + +## Motivation + +The README claims zopa is "two orders of magnitude smaller" than OPA's +WASM build. That's a statement about binary size only. We don't yet +publish numbers for the things that actually matter at runtime: +evaluation latency, memory floor, cold-start time, throughput under +load. + +Without numbers, the project can't honestly recommend itself for a +production proxy filter, and PRs that touch the eval hot path can +regress without anyone noticing. + +## Goals + +1. A reproducible bench harness that runs: + - zopa (`evaluate` direct + via Envoy/proxy-wasm). + - OPA WASM SDK (one-shot `opa_eval`). + - OPA HTTP sidecar (out-of-process baseline, network hop included). + - Cedar via its native API (no proxy-wasm path; native baseline). +1. Metrics: + - p50 / p95 / p99 latency per evaluation. + - Memory floor after warm-up (pages, RSS, wasm linear memory). + - Cold-start (instantiate + first eval). + - Throughput at saturation. +1. A small fixture set of policies covering increasing complexity: + - Static `allow=true`. + - Single header equality. + - Nested `every` over `input.required_perms`. + - Worst-case: deeply nested AST near the recursion cap. +1. CI job that runs a smoke version (low iteration counts) on every + PR, full bench on `main` post-merge, results checked into + `bench/results/`. 
+ +## Non-goals + +- Comparing to non-CNCF authorization engines (Casbin, Oso, etc). + Optional later. +- Microbenchmarking individual AST nodes. The bench measures the + user-facing path, not internal hot loops. +- Beating OPA on every metric. We expect zopa to win on size and + cold-start, lose on Rego coverage. The bench should make that legible, + not hide it. + +## Design sketch + +### Layout + +```text +bench/ + Cargo.toml # criterion runner (Rust; for OPA WASM + Cedar) + build.zig # zopa direct path (zig benchmark target) + fixtures/ + 01_static.json # input + AST + Rego + Cedar source + 02_header_eq.json + 03_every_perms.json + 04_deep_nest.json + hosts/ + zopa_direct.zig # evaluate(input, ast) over wasmtime embed + opa_wasm.rs # opa_eval via wasmtime-rust + opa_http.rs # localhost OPA over reqwest + cedar_native.rs # cedar-policy crate + results/ + YYYY-MM-DD-.json + latest.md # human-readable summary +``` + +### Fixture format + +Each fixture carries the same logical policy in three syntaxes: + +```json +{ + "name": "header_eq", + "input": { "method": "GET" }, + "rego": "package authz\nallow if input.method == \"GET\"\n", + "ast": { "type": "module", "rules": [ ... ] }, + "cedar": "permit(principal, action == Action::\"GET\", resource);" +} +``` + +The runner loads the fixture, hands each engine the form it expects, +and asserts every engine returns the same decision before timing +anything. + +### Metrics format + +JSON, one record per (host, fixture, metric) tuple. Aggregated into +`latest.md` with a small Python script for the README. + +## API impact + +None. Bench code lives under `bench/` and never ships in the wasm +artifact. + +## Test plan + +- Smoke run in CI on every PR (~5 seconds, low iteration counts) to + catch obvious regressions. +- Full run nightly on `main`, results committed via a GitHub Action + with `[skip ci]`. +- Compare-to-baseline check: fail the CI job if p95 regresses by more + than 20% vs the latest committed baseline. 
+ +## Open questions + +- Where do we host the full nightly results? Inline markdown in the + repo is honest but noisy. A `gh-pages` site is more browseable but + needs more infrastructure to maintain. +- Should we pin Envoy / OPA / Cedar versions in `bench/Cargo.lock` and + bump them deliberately, or follow latest? Pinning is more + reproducible; following latest catches upstream regressions. +- Can the OPA HTTP host be skipped on PR runs and only included + nightly? Network-bound benches add variance.