From 14abc0ee9031d83056d56447a0c4dabcf3f17d81 Mon Sep 17 00:00:00 2001
From: proboscis <nameissoap@gmail.com>
Date: Sat, 14 Mar 2026 16:14:26 +0900
Subject: [PATCH] Add doeff-vm benchmark baselines

---
 .gitignore                                 |   4 +
 Makefile                                   |  23 ++-
 benchmarks/README.md                       |  17 ++
 benchmarks/benchmark_runner.py             | 180 +++++++++++++++------
 benchmarks/pyvm_workloads.py               | 171 ++++++++++++++++++++
 packages/doeff-vm/Cargo.toml               |   7 +-
 packages/doeff-vm/benches/pyvm_baseline.rs | 106 ++++++++++++
 tests/test_benchmark_runner.py             |  47 ++++++
 8 files changed, 506 insertions(+), 49 deletions(-)
 create mode 100644 benchmarks/README.md
 create mode 100644 benchmarks/pyvm_workloads.py
 create mode 100644 packages/doeff-vm/benches/pyvm_baseline.rs
 create mode 100644 tests/test_benchmark_runner.py

diff --git a/.gitignore b/.gitignore
index 6091ec9f..6fdf0370 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,10 @@ dmypy.json
 # Rust build artifacts
 rust/**/target/
 packages/**/target/
+packages/*/Cargo.lock
+
+# Benchmark artifacts
+benchmarks/results/
 
 # Local deploy keys
 proboscis_doeff_deploy
diff --git a/Makefile b/Makefile
index 33f1ce13..e38622a9 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@
 
 .PHONY: help install sync lint lint-ruff lint-pyright lint-semgrep lint-semgrep-docs lint-doeff lint-packages \
         test test-unit test-e2e test-packages test-all test-spec-audit-sa002 format check pre-commit-install clean \
-        install-opencode-spec-gap-tdd
+        bench-python bench-vm install-opencode-spec-gap-tdd
 
 # Default target
 help:
@@ -33,6 +33,10 @@ help:
 	@echo "  make test-all          Run ALL tests (core + packages)"
 	@echo "  make test-spec-audit-sa002 Run SA-002 pytest + semgrep checks"
 	@echo ""
+	@echo "Benchmarks:"
+	@echo "  make bench-python      Run public Python benchmark runner"
+	@echo "  make bench-vm          Run criterion benchmark for doeff-vm"
+	@echo ""
 	@echo "Formatting:"
 	@echo "  make format            Format code with ruff"
 	@echo "  make check             Run format check without modifying files"
@@ -161,6 +165,23 @@ test-spec-audit-sa002:
 		exit 1; \
 	fi
 
+# Deferred (=) so the `uv run` probes execute only when bench-vm expands them, not on every make.
+BENCH_VM_BASE_PYTHON = $(shell uv run python -c "import sys; print(sys._base_executable)")
+BENCH_VM_PYTHON_HOME = $(shell uv run python -c "import sys; print(sys.base_prefix)")
+BENCH_VM_SITE_PACKAGES = $(shell uv run python -c "import site; print(site.getsitepackages()[0])")
+BENCH_PYTHON_ARGS ?=
+BENCH_VM_ARGS ?=
+
+bench-python:
+	uv run python benchmarks/benchmark_runner.py $(BENCH_PYTHON_ARGS)
+
+bench-vm:
+	cd packages/doeff-vm && env \
+		PYO3_PYTHON=$(BENCH_VM_BASE_PYTHON) PYTHONHOME=$(BENCH_VM_PYTHON_HOME) \
+		DOEFF_BENCH_SITE_PACKAGES=$(BENCH_VM_SITE_PACKAGES) \
+		LD_LIBRARY_PATH=$(BENCH_VM_PYTHON_HOME)/lib \
+		cargo bench --bench pyvm_baseline -- $(BENCH_VM_ARGS)
+
 # =============================================================================
 # Formatting
 # =============================================================================
diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 00000000..b74634c0
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,17 @@
+# Benchmarks
+
+Python benchmark artifacts:
+
+- Run `make bench-python` to execute the public `doeff.run(...)` benchmark suite.
+- Results are written to `benchmarks/results/doeff_vm_benchmark_results.json` and
+  `benchmarks/results/doeff_vm_benchmark_results.csv`.
+
+Rust criterion artifacts:
+
+- Run `make bench-vm` to execute the `criterion` baseline suite for `packages/doeff-vm`.
+- Criterion reports are written under `packages/doeff-vm/target/criterion/`.
+
+Useful overrides:
+
+- `make bench-python BENCH_PYTHON_ARGS="--runs 500 --iterations 50"`
+- `make bench-vm BENCH_VM_ARGS="--sample-size 20 --measurement-time 1 --warm-up-time 1 --noplot"`
diff --git a/benchmarks/benchmark_runner.py b/benchmarks/benchmark_runner.py
index 78737006..14dec2cc 100644
--- a/benchmarks/benchmark_runner.py
+++ b/benchmarks/benchmark_runner.py
@@ -1,84 +1,170 @@
-"""Micro-benchmarks for the doeff interpreter.
+"""Benchmark runner for the public doeff Python API backed by doeff-vm.
 
 Usage
 -----
-    uv run python benchmarks/benchmark_runner.py --runs 500
+    uv run python benchmarks/benchmark_runner.py --runs 500 --iterations 25
 """
 
 from __future__ import annotations
 
 import argparse
+import csv
+import json
 import statistics
 import time
-from collections.abc import Iterable
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+
+from benchmarks.pyvm_workloads import build_public_benchmark_cases
+
+
+@dataclass(frozen=True)
+class BenchmarkMeasurement:
+    name: str  # case label copied from the benchmark case
+    runner: str  # runner identifier copied from the benchmark case
+    workload: str  # workload identifier copied from the benchmark case
+    runs: int  # number of timed invocations
+    workload_iterations: int  # iteration count the cases were built with
+    expected_value: int  # value every invocation is checked against
+    min_ms: float  # fastest single timed invocation, in milliseconds
+    max_ms: float  # slowest single timed invocation, in milliseconds
+    mean_ms: float  # arithmetic mean of the timed invocations, in milliseconds
+    median_ms: float  # median of the timed invocations, in milliseconds
+
+
+@dataclass(frozen=True)
+class BenchmarkReport:
+    generated_at: str
+    runs: int
+    workload_iterations: int
+    results: list[BenchmarkMeasurement]
+
+
+def _measure_case(case, *, runs: int, workload_iterations: int) -> BenchmarkMeasurement:
+    observed = case.invoke()
+    if observed != case.expected_value:
+        raise AssertionError(
+            f"{case.name} returned {observed!r}, expected {case.expected_value!r}"
+        )
 
-from doeff import default_handlers, do, run
-from doeff.effects import Ask, Put, Tell
+    timings: list[float] = []
+    last_value = observed
+    for _ in range(runs):
+        start = time.perf_counter()
+        last_value = case.invoke()
+        elapsed = (time.perf_counter() - start) * 1000.0
+        timings.append(elapsed)
 
+    if last_value != case.expected_value:
+        raise AssertionError(
+            f"{case.name} returned {last_value!r} after timing, expected {case.expected_value!r}"
+        )
 
-@do
-def _stateful_workload(iterations: int) -> int:
-    value = yield Ask("seed")
-    total = value
-    for index in range(iterations):
-        yield Tell(f"iteration:{index}")
-        yield Put("counter", index)
-        total += index
-    return total
+    return BenchmarkMeasurement(
+        name=case.name,
+        runner=case.runner,
+        workload=case.workload,
+        runs=runs,
+        workload_iterations=workload_iterations,
+        expected_value=case.expected_value,
+        min_ms=min(timings),
+        max_ms=max(timings),
+        mean_ms=statistics.mean(timings),
+        median_ms=statistics.median(timings),
+    )
 
 
-def _run_once(workload_iterations: int) -> None:
-    run(
-        _stateful_workload(workload_iterations),
-        handlers=default_handlers(),
-        env={"seed": 1},
+def run_benchmarks(*, runs: int, workload_iterations: int) -> BenchmarkReport:
+    cases = build_public_benchmark_cases(workload_iterations)
+    results = [
+        _measure_case(case, runs=runs, workload_iterations=workload_iterations) for case in cases
+    ]
+    return BenchmarkReport(
+        generated_at=datetime.now(timezone.utc).isoformat(),
+        runs=runs,
+        workload_iterations=workload_iterations,
+        results=results,
     )
 
 
-def benchmark(runs: int, *, workload_iterations: int) -> dict[str, float]:
-    timings: list[float] = []
+def write_report(report: BenchmarkReport, output_dir: Path) -> dict[str, Path]:
+    output_dir.mkdir(parents=True, exist_ok=True)
 
-    for _ in range(runs):
-        start = time.perf_counter()
-        _run_once(workload_iterations)
-        elapsed = (time.perf_counter() - start) * 1000.0
-        timings.append(elapsed)
+    json_path = output_dir / "doeff_vm_benchmark_results.json"
+    csv_path = output_dir / "doeff_vm_benchmark_results.csv"
 
-    return {
-        "runs": runs,
-        "workload_iterations": workload_iterations,
-        "min_ms": min(timings),
-        "max_ms": max(timings),
-        "mean_ms": statistics.mean(timings),
-        "median_ms": statistics.median(timings),
+    json_payload = {
+        "metadata": {
+            "generated_at": report.generated_at,
+            "runs": report.runs,
+            "workload_iterations": report.workload_iterations,
+        },
+        "results": [asdict(result) for result in report.results],
     }
+    json_path.write_text(json.dumps(json_payload, indent=2, sort_keys=True) + "\n")
+
+    with csv_path.open("w", newline="") as handle:
+        writer = csv.DictWriter(
+            handle,
+            fieldnames=[
+                "name",
+                "runner",
+                "workload",
+                "runs",
+                "workload_iterations",
+                "expected_value",
+                "min_ms",
+                "max_ms",
+                "mean_ms",
+                "median_ms",
+            ],
+        )
+        writer.writeheader()
+        for result in report.results:
+            writer.writerow(asdict(result))
+
+    return {"json": json_path, "csv": csv_path}
 
 
-def format_report(results: Iterable[tuple[str, dict[str, float]]]) -> str:
-    lines = ["doeff benchmark results:"]
-    for label, stats in results:
-        lines.append(f"  {label}:")
+def format_report(report: BenchmarkReport, *, output_paths: dict[str, Path] | None = None) -> str:
+    lines = ["doeff-vm benchmark results:"]
+    for result in report.results:
         lines.append(
-            "    runs={runs} iterations={workload_iterations} | min={min_ms:.2f}ms "
-            "median={median_ms:.2f}ms mean={mean_ms:.2f}ms max={max_ms:.2f}ms".format(**stats)
+            "  {name}: runs={runs} iterations={workload_iterations} | "
+            "min={min_ms:.2f}ms median={median_ms:.2f}ms "
+            "mean={mean_ms:.2f}ms max={max_ms:.2f}ms".format(**asdict(result))
         )
+    if output_paths is not None:
+        lines.append(f"  json={output_paths['json']}")
+        lines.append(f"  csv={output_paths['csv']}")
     return "\n".join(lines)
 
 
-def main() -> None:
-    parser = argparse.ArgumentParser(description="Benchmark doeff interpreter execution")
-    parser.add_argument("--runs", type=int, default=100, help="Number of interpreter executions")
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Benchmark doeff-vm through the public Python API")
+    parser.add_argument("--runs", type=int, default=100, help="Number of executions per workload")
     parser.add_argument(
         "--iterations",
         type=int,
         default=25,
-        help="Inner loop iterations in the workload",
+        help="Inner loop iterations for stateful workloads",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("benchmarks/results"),
+        help="Directory for JSON and CSV benchmark artifacts",
     )
-    args = parser.parse_args()
+    return parser.parse_args()
 
-    stats = benchmark(args.runs, workload_iterations=args.iterations)
-    print(format_report([("stateful_workload", stats)]))
+
+def main() -> None:
+    args = parse_args()
+    report = run_benchmarks(runs=args.runs, workload_iterations=args.iterations)
+    output_paths = write_report(report, args.output_dir)
+    print(format_report(report, output_paths=output_paths))
 
 
-if __name__ == "main":  # pragma: no cover - CLI script
+if __name__ == "__main__":  # pragma: no cover - CLI script
     main()
diff --git a/benchmarks/pyvm_workloads.py b/benchmarks/pyvm_workloads.py
new file mode 100644
index 00000000..1bfd45be
--- /dev/null
+++ b/benchmarks/pyvm_workloads.py
@@ -0,0 +1,171 @@
+"""Shared workload definitions for doeff-vm benchmarks."""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import doeff_vm
+
+from doeff import Program, default_handlers, do, run
+from doeff.effects import Ask, Get, Put, Tell
+
+
+@dataclass(frozen=True)
+class CallableBenchmarkCase:
+    name: str  # "<runner>:<workload>" label, e.g. "public_run:pure"
+    runner: str  # execution path under test ("public_run", "module_run", "pyvm_fresh")
+    workload: str  # WorkloadSpec.workload key ("pure", "state", "state_writer")
+    expected_value: int  # result that invoke() is expected to return
+    invoke: Callable[[], int]  # zero-argument callable that executes the workload once
+
+
+@dataclass(frozen=True)
+class WorkloadSpec:
+    workload: str
+    expected_value: int
+    env: dict[object, object]
+    program_factory: Callable[[], Program[int]]
+
+
+def _wrap_handlers(program: Program[int], *handlers: object) -> Program[int]:
+    wrapped = program
+    for handler in reversed(handlers):
+        wrapped = doeff_vm.WithHandler(handler, wrapped)
+    return wrapped
+
+
+@do
+def _pure_program() -> Program[int]:
+    return 42
+    yield  # unreachable; the bare yield makes this a generator function (presumably what @do expects)
+
+
+@do
+def _state_program(iterations: int) -> Program[int]:
+    yield Put("counter", 0)
+    total = 0
+    for _ in range(iterations):
+        current = yield Get("counter")
+        total += current
+        yield Put("counter", current + 1)
+    return total
+
+
+@do
+def _state_writer_program(iterations: int) -> Program[int]:
+    seed = yield Ask("seed")
+    yield Put("counter", seed)
+    total = 0
+    for index in range(iterations):
+        current = yield Get("counter")
+        yield Tell(f"iteration:{index}")
+        total += current
+        yield Put("counter", current + 1)
+    return total
+
+
+def build_workload_specs(iterations: int) -> list[WorkloadSpec]:
+    return [
+        WorkloadSpec(
+            workload="pure",
+            expected_value=42,
+            env={},
+            program_factory=_pure_program,
+        ),
+        WorkloadSpec(
+            workload="state",
+            expected_value=iterations * (iterations - 1) // 2,  # 0 + 1 + ... + (iterations - 1)
+            env={},
+            program_factory=lambda: _state_program(iterations),
+        ),
+        WorkloadSpec(
+            workload="state_writer",
+            expected_value=iterations + (iterations * (iterations - 1) // 2),  # 1 + 2 + ... + iterations
+            env={"seed": 1},  # counter starts at this seed, hence the extra `iterations` above
+            program_factory=lambda: _state_writer_program(iterations),
+        ),
+    ]
+
+
+def build_public_benchmark_cases(iterations: int) -> list[CallableBenchmarkCase]:
+    cases: list[CallableBenchmarkCase] = []
+    for spec in build_workload_specs(iterations):
+        env = dict(spec.env)
+
+        def invoke(
+            program_factory: Callable[[], Program[int]] = spec.program_factory,  # bound per iteration
+            env_values: dict[object, object] = env,  # bound per iteration, not looked up at call time
+        ) -> int:
+            return run(
+                program_factory(),  # a new program object is built for every invocation
+                handlers=default_handlers(),
+                env=dict(env_values),  # each run receives its own copy of the env mapping
+            ).value
+
+        cases.append(
+            CallableBenchmarkCase(
+                name=f"public_run:{spec.workload}",
+                runner="public_run",
+                workload=spec.workload,
+                expected_value=spec.expected_value,
+                invoke=invoke,
+            )
+        )
+    return cases
+
+
+def build_raw_vm_benchmark_cases(iterations: int) -> list[CallableBenchmarkCase]:
+    cases: list[CallableBenchmarkCase] = []
+    for spec in build_workload_specs(iterations):
+        env = dict(spec.env)
+
+        def module_invoke(
+            program_factory: Callable[[], Program[int]] = spec.program_factory,
+            env_values: dict[object, object] = env,
+        ) -> int:
+            wrapped = _wrap_handlers(program_factory(), *default_handlers())
+            return doeff_vm.run(wrapped, env=dict(env_values)).value
+
+        def pyvm_invoke(
+            program_factory: Callable[[], Program[int]] = spec.program_factory,
+            env_values: dict[object, object] = env,
+        ) -> int:
+            vm = doeff_vm.PyVM()
+            for key, value in env_values.items():
+                vm.put_env(key, value)
+            return vm.run(_wrap_handlers(program_factory(), *default_handlers()))
+
+        cases.extend(
+            [
+                CallableBenchmarkCase(
+                    name=f"module_run:{spec.workload}",
+                    runner="module_run",
+                    workload=spec.workload,
+                    expected_value=spec.expected_value,
+                    invoke=module_invoke,
+                ),
+                CallableBenchmarkCase(
+                    name=f"pyvm_fresh:{spec.workload}",
+                    runner="pyvm_fresh",
+                    workload=spec.workload,
+                    expected_value=spec.expected_value,
+                    invoke=pyvm_invoke,
+                ),
+            ]
+        )
+    return cases
+
+
+def benchmark_case_names(iterations: int, *, include_raw_vm: bool = False) -> list[str]:
+    cases = build_public_benchmark_cases(iterations)
+    if include_raw_vm:
+        cases.extend(build_raw_vm_benchmark_cases(iterations))
+    return [case.name for case in cases]
+
+
+def benchmark_case_map(iterations: int, *, include_raw_vm: bool = False) -> dict[str, Callable[[], int]]:
+    cases = build_public_benchmark_cases(iterations)
+    if include_raw_vm:
+        cases.extend(build_raw_vm_benchmark_cases(iterations))
+    return {case.name: case.invoke for case in cases}
diff --git a/packages/doeff-vm/Cargo.toml b/packages/doeff-vm/Cargo.toml
index 6a65c721..0587e21a 100644
--- a/packages/doeff-vm/Cargo.toml
+++ b/packages/doeff-vm/Cargo.toml
@@ -10,7 +10,7 @@ keywords = ["algebraic-effects", "vm", "python", "continuations"]
 
 [lib]
 name = "doeff_vm"
-crate-type = ["cdylib"]
+crate-type = ["cdylib", "rlib"]
 
 [features]
 default = []
@@ -22,8 +22,13 @@ doeff-vm-core = { path = "../doeff-vm-core", features = ["python_bridge"] }
 doeff-core-effects = { path = "../doeff-core-effects" }
 
 [dev-dependencies]
+criterion = { version = "0.5", features = ["html_reports"] }
 pyo3 = { version = "0.28", features = ["auto-initialize", "py-clone"] }
 
+[[bench]]
+name = "pyvm_baseline"
+harness = false
+
 [profile.release]
 opt-level = 3
 lto = true
diff --git a/packages/doeff-vm/benches/pyvm_baseline.rs b/packages/doeff-vm/benches/pyvm_baseline.rs
new file mode 100644
index 00000000..11a27832
--- /dev/null
+++ b/packages/doeff-vm/benches/pyvm_baseline.rs
@@ -0,0 +1,106 @@
+use std::path::PathBuf;
+use std::sync::Once;
+use std::time::Duration;
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use doeff_vm::pyvm::doeff_vm;
+use pyo3::append_to_inittab;
+use pyo3::prelude::*;
+use pyo3::types::{PyDict, PyList};
+
+#[derive(Clone)]
+struct BenchmarkCase {
+    name: String,
+    runner: String,
+    workload: String,
+    invoke: Py<PyAny>,
+}
+
+fn repo_root() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("../..")
+        .canonicalize()
+        .expect("repo root should resolve")
+}
+
+fn venv_site_packages() -> PathBuf {
+    std::env::var_os("DOEFF_BENCH_SITE_PACKAGES")
+        .map(PathBuf::from)
+        .expect("DOEFF_BENCH_SITE_PACKAGES should be set")
+        .canonicalize()
+        .expect("venv site-packages should resolve")
+}
+
+fn initialize_python() {
+    static INIT: Once = Once::new();
+    INIT.call_once(|| {
+        append_to_inittab!(doeff_vm);
+        Python::initialize();
+    });
+}
+
+fn load_cases(iterations: usize) -> PyResult<Vec<BenchmarkCase>> {
+    initialize_python(); // one-time interpreter setup, guarded by a `Once`
+    Python::attach(|py| {
+        let sys = py.import("sys")?;
+        let sys_path_obj = sys.getattr("path")?;
+        let sys_path = sys_path_obj.cast::<PyList>()?;
+        let repo_root = repo_root();
+        let repo_root_str = repo_root.to_string_lossy().to_string();
+        let site_packages = venv_site_packages();
+        let site_packages_str = site_packages.to_string_lossy().to_string();
+        sys_path.insert(0, repo_root_str.as_str())?; // lets `benchmarks.pyvm_workloads` be imported below
+        sys_path.insert(0, site_packages_str.as_str())?; // inserted last, so it precedes the repo root
+
+        let modules_obj = sys.getattr("modules")?;
+        let modules = modules_obj.cast::<PyDict>()?;
+        let doeff_vm_module = py.import("doeff_vm")?; // the module registered via append_to_inittab!
+        doeff_vm_module.setattr("doeff_vm", doeff_vm_module.clone())?; // module becomes its own attribute
+        modules.set_item("doeff_vm.doeff_vm", doeff_vm_module)?; // NOTE(review): presumably the dotted name Python code imports; confirm
+
+        let workload_module = py.import("benchmarks.pyvm_workloads")?;
+        let cases = workload_module.call_method1("build_raw_vm_benchmark_cases", (iterations,))?;
+
+        let mut loaded = Vec::new();
+        for item in cases.try_iter()? {
+            let item = item?;
+            loaded.push(BenchmarkCase {
+                name: item.getattr("name")?.extract()?,
+                runner: item.getattr("runner")?.extract()?,
+                workload: item.getattr("workload")?.extract()?,
+                invoke: item.getattr("invoke")?.unbind(), // stored as Py<PyAny> so it can outlive this closure
+            });
+        }
+        Ok(loaded)
+    })
+}
+
+/// Registers one criterion benchmark per raw-VM case in the `doeff_vm_baseline` group.
+fn benchmark_pyvm_baseline(c: &mut Criterion) {
+    let cases = load_cases(25).expect("criterion workload cases should load");
+    let mut group = c.benchmark_group("doeff_vm_baseline");
+    for BenchmarkCase { name, runner, workload, invoke } in cases {
+        let label = BenchmarkId::new(runner, workload);
+        group.bench_function(label, move |b| {
+            b.iter(|| {
+                Python::attach(|py| {
+                    let result = invoke
+                        .bind(py)
+                        .call0()
+                        .unwrap_or_else(|err| panic!("{name} failed: {err}"));
+                    black_box(result);
+                });
+            });
+        });
+    }
+    group.finish();
+}
+
+// Defaults live in the group config (not on the BenchmarkGroup) so CLI flags can override them.
+criterion_group! {
+    name = benches;
+    config = Criterion::default().warm_up_time(Duration::from_millis(500))
+        .measurement_time(Duration::from_secs(3)).sample_size(20);
+    targets = benchmark_pyvm_baseline
+}
+criterion_main!(benches);
diff --git a/tests/test_benchmark_runner.py b/tests/test_benchmark_runner.py
new file mode 100644
index 00000000..3269b413
--- /dev/null
+++ b/tests/test_benchmark_runner.py
@@ -0,0 +1,47 @@
+import csv
+import json
+from pathlib import Path
+
+from benchmarks.benchmark_runner import format_report, run_benchmarks, write_report
+
+
+def test_run_benchmarks_writes_json_and_csv_outputs(tmp_path: Path) -> None:
+    report = run_benchmarks(runs=2, workload_iterations=3)
+    output_paths = write_report(report, tmp_path)
+
+    json_path = output_paths["json"]
+    csv_path = output_paths["csv"]
+
+    assert json_path.exists()
+    assert csv_path.exists()
+
+    payload = json.loads(json_path.read_text())
+    assert payload["metadata"]["runs"] == 2
+    assert payload["metadata"]["workload_iterations"] == 3
+    assert {entry["name"] for entry in payload["results"]} == {
+        "public_run:pure",
+        "public_run:state",
+        "public_run:state_writer",
+    }
+
+    with csv_path.open(newline="") as handle:
+        rows = list(csv.DictReader(handle))
+
+    assert len(rows) == 3
+    assert {row["name"] for row in rows} == {
+        "public_run:pure",
+        "public_run:state",
+        "public_run:state_writer",
+    }
+
+
+def test_format_report_mentions_generated_artifacts(tmp_path: Path) -> None:
+    report = run_benchmarks(runs=1, workload_iterations=2)
+    output_paths = write_report(report, tmp_path)
+
+    summary = format_report(report, output_paths=output_paths)
+
+    assert "doeff-vm benchmark results:" in summary
+    assert "public_run:pure" in summary
+    assert str(output_paths["json"]) in summary
+    assert str(output_paths["csv"]) in summary