From 3e6f2e9c77c3822af618464bf83a88ba9ea54658 Mon Sep 17 00:00:00 2001 From: Ralf Anton Beier Date: Tue, 23 Jun 2026 22:04:24 +0200 Subject: [PATCH] chore(runtime): add interpreter-throughput benchmark + baseline (measure-first for perf work) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes the tree-walker's throughput a measured number so speed work can be gated (synth discipline: a regression is a number). Self-contained criterion bench (wat-built modules, no filesystem fixture — the old, orphaned kiln-component/benches/execution_benchmarks.rs read a missing test_add.wasm): - compute_loop: arithmetic hot path (dispatch + operand stack + locals) - memory_loop: same loop + i64.store/i64.load per iter (the per-access lock path) Baseline (release, Apple silicon): compute ~14.3 M iter/s (~70 ns/iter); memory ~5.9 M iter/s (~169 ns/iter). The ~99 ns/iter gap for 2 memory ops quantifies the double-mutex memory-access overhead — the optimization signal. Next: a CI regression gate (criterion --save-baseline + compare) and then the memory-access fix, measured against this. Trace: SM-PERF-001 --- kiln-runtime/Cargo.toml | 8 ++ .../benches/interpreter_throughput.rs | 105 ++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 kiln-runtime/benches/interpreter_throughput.rs diff --git a/kiln-runtime/Cargo.toml b/kiln-runtime/Cargo.toml index f3e9a058..5469e4d4 100644 --- a/kiln-runtime/Cargo.toml +++ b/kiln-runtime/Cargo.toml @@ -239,3 +239,11 @@ kani-verifier = [] [dev-dependencies] serial_test = "3.4" wat = "1.244" +criterion = { version = "0.8", features = ["html_reports"] } + +# Interpreter instruction-throughput baseline (compute loop + memory loop). +# The memory loop exercises the per-access lock path so optimizations there +# (e.g. the double-mutex memory access) show a measurable delta. See #358. 
+[[bench]]
+name = "interpreter_throughput"
+harness = false
diff --git a/kiln-runtime/benches/interpreter_throughput.rs b/kiln-runtime/benches/interpreter_throughput.rs
new file mode 100644
index 00000000..aed87b49
--- /dev/null
+++ b/kiln-runtime/benches/interpreter_throughput.rs
@@ -0,0 +1,136 @@
+//! Interpreter instruction-throughput baseline.
+//!
+//! Makes the tree-walker's throughput a *measured number* so speed work can be
+//! gated (the synth discipline: a regression is a number, not a vibe). Two
+//! workloads, both built in-process via `wat` (no filesystem fixture):
+//!
+//! - `compute_loop` — a tight arithmetic loop (`i*i` accumulate). Exercises the
+//!   instruction-dispatch + operand-stack + local get/set hot path.
+//! - `memory_loop` — the same loop with an `i64.store` + `i64.load` per
+//!   iteration. Exercises the per-access memory path (today a double `Mutex`),
+//!   so an optimization there shows up as a delta against this baseline.
+//!
+//! Throughput is reported per loop *iteration*; multiply by the per-iteration
+//! instruction count (~8 compute / ~14 memory) for instructions/sec.
+//!
+//! Baseline (2026-06, rivet 0.17 era, Apple-silicon dev box, release build):
+//!   compute_loop  ~14.0 ms / 200k iters  →  ~14.3 M iter/s  (~70 ns/iter)
+//!   memory_loop   ~33.8 ms / 200k iters  →  ~5.9 M iter/s   (~169 ns/iter)
+//! The ~99 ns/iter gap (only ~6 extra instructions) is dominated by the
+//! per-access memory locking; a store+load costs ~47 ns of lock overhead.
+//! That gap is the regression/optimization signal for the memory-access work.
+
+use std::hint::black_box;
+
+use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use kiln_foundation::values::Value;
+use kiln_runtime::engine::{CapabilityAwareEngine, CapabilityEngine, EnginePreset};
+
+/// Loop trip count per `execute` call. Kept modest because the tree-walker is
+/// reference-grade (SM-PERF-001) — large enough to dominate call/setup, small
+/// enough to keep the bench wall-clock sane.
+const N: i32 = 200_000;
+
+/// Assembles the `compute_loop` module from inline WAT.
+///
+/// The export takes a trip count `n`, adds the zero-extended 32-bit product
+/// `i * i` to an `i64` accumulator for each `i` below `n`, and returns the sum.
+///
+/// # Panics
+///
+/// Panics if the inline WAT does not parse.
+fn compute_wasm() -> Vec<u8> {
+    wat::parse_str(
+        r#"(module
+  (func (export "compute_loop") (param $n i32) (result i64)
+    (local $i i32) (local $acc i64)
+    (block $done
+      (loop $l
+        (br_if $done (i32.ge_s (local.get $i) (local.get $n)))
+        (local.set $acc
+          (i64.add (local.get $acc)
+            (i64.extend_i32_u (i32.mul (local.get $i) (local.get $i)))))
+        (local.set $i (i32.add (local.get $i) (i32.const 1)))
+        (br $l)))
+    (local.get $acc)))"#,
+    )
+    .expect("compute wat parses")
+}
+
+/// Assembles the `memory_loop` module from inline WAT.
+///
+/// Same counted loop, but each iteration stores `i` as an `i64` at byte offset
+/// `(i << 3) & 0x7ff8` of a one-page memory and adds the value loaded back from
+/// that offset to the accumulator: one `i64.store` and one `i64.load` per trip.
+///
+/// # Panics
+///
+/// Panics if the inline WAT does not parse.
+fn memory_wasm() -> Vec<u8> {
+    wat::parse_str(
+        r#"(module
+  (memory 1)
+  (func (export "memory_loop") (param $n i32) (result i64)
+    (local $i i32) (local $acc i64) (local $addr i32)
+    (block $done
+      (loop $l
+        (br_if $done (i32.ge_s (local.get $i) (local.get $n)))
+        (local.set $addr
+          (i32.and (i32.shl (local.get $i) (i32.const 3)) (i32.const 0x7ff8)))
+        (i64.store (local.get $addr) (i64.extend_i32_u (local.get $i)))
+        (local.set $acc (i64.add (local.get $acc) (i64.load (local.get $addr))))
+        (local.set $i (i32.add (local.get $i) (i32.const 1)))
+        (br $l)))
+    (local.get $acc)))"#,
+    )
+    .expect("memory wat parses")
+}
+
+/// Registers criterion benchmark `name`, which runs `export` over `N` trips.
+///
+/// The engine is built and `wasm` is loaded and instantiated once, outside the
+/// measured loop; only the `execute` call is timed. Fuel is set to `u64::MAX`,
+/// presumably so the fuel bound does not trip mid-measurement. Nothing is
+/// returned: the timed closure is handed straight to criterion.
+///
+/// # Panics
+///
+/// Panics if engine creation, module loading, instantiation, or any execution
+/// of `export` fails.
+fn bench_workload(c: &mut Criterion, name: &str, wasm: &[u8], export: &'static str) {
+    let mut engine =
+        CapabilityAwareEngine::with_preset(EnginePreset::QM).expect("engine");
+    engine.set_fuel(u64::MAX);
+    let module = engine.load_module(wasm).expect("load");
+    let instance = engine.instantiate(module).expect("instantiate");
+
+    // Sanity: the workload runs and returns a value before we measure it.
+    let _ = engine
+        .execute(instance, export, &[Value::I32(N)])
+        .expect("workload executes");
+
+    let mut group = c.benchmark_group("interpreter_throughput");
+    // One element per loop trip, so criterion reports iterations per second.
+    group.throughput(Throughput::Elements(N as u64));
+    // Each sample is a full N-trip run; keep the sample count small.
+    group.sample_size(10);
+    group.bench_function(name, |b| {
+        b.iter(|| {
+            // black_box on input and result discourages eliding the call.
+            let r = engine
+                .execute(instance, export, &[Value::I32(black_box(N))])
+                .expect("execute");
+            black_box(r);
+        });
+    });
+    group.finish();
+}
+
+/// Criterion entry point: benchmarks the compute workload, then the memory one.
+fn benches(c: &mut Criterion) {
+    bench_workload(c, "compute_loop", &compute_wasm(), "compute_loop");
+    bench_workload(c, "memory_loop", &memory_wasm(), "memory_loop");
+}
+
+criterion_group!(throughput, benches);
+criterion_main!(throughput);