From 14abc0ee9031d83056d56447a0c4dabcf3f17d81 Mon Sep 17 00:00:00 2001 From: proboscis Date: Sat, 14 Mar 2026 16:14:26 +0900 Subject: [PATCH] Add doeff-vm benchmark baselines --- .gitignore | 4 + Makefile | 23 ++- benchmarks/README.md | 17 ++ benchmarks/benchmark_runner.py | 180 +++++++++++++++------ benchmarks/pyvm_workloads.py | 171 ++++++++++++++++++++ packages/doeff-vm/Cargo.toml | 7 +- packages/doeff-vm/benches/pyvm_baseline.rs | 106 ++++++++++++ tests/test_benchmark_runner.py | 47 ++++++ 8 files changed, 506 insertions(+), 49 deletions(-) create mode 100644 benchmarks/README.md create mode 100644 benchmarks/pyvm_workloads.py create mode 100644 packages/doeff-vm/benches/pyvm_baseline.rs create mode 100644 tests/test_benchmark_runner.py diff --git a/.gitignore b/.gitignore index 6091ec9f..6fdf0370 100644 --- a/.gitignore +++ b/.gitignore @@ -68,6 +68,10 @@ dmypy.json # Rust build artifacts rust/**/target/ packages/**/target/ +packages/*/Cargo.lock + +# Benchmark artifacts +benchmarks/results/ # Local deploy keys proboscis_doeff_deploy diff --git a/Makefile b/Makefile index 33f1ce13..e38622a9 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ .PHONY: help install sync lint lint-ruff lint-pyright lint-semgrep lint-semgrep-docs lint-doeff lint-packages \ test test-unit test-e2e test-packages test-all test-spec-audit-sa002 format check pre-commit-install clean \ - install-opencode-spec-gap-tdd + bench-python bench-vm install-opencode-spec-gap-tdd # Default target help: @@ -33,6 +33,10 @@ help: @echo " make test-all Run ALL tests (core + packages)" @echo " make test-spec-audit-sa002 Run SA-002 pytest + semgrep checks" @echo "" + @echo "Benchmarks:" + @echo " make bench-python Run public Python benchmark runner" + @echo " make bench-vm Run criterion benchmark for doeff-vm" + @echo "" @echo "Formatting:" @echo " make format Format code with ruff" @echo " make check Run format check without modifying files" @@ -161,6 +165,23 @@ test-spec-audit-sa002: exit 1; \ fi 
+BENCH_VM_BASE_PYTHON := $(shell uv run python -c "import sys; print(sys._base_executable)") +BENCH_VM_PYTHON_HOME := $(shell uv run python -c "import sys; print(sys.base_prefix)") +BENCH_VM_SITE_PACKAGES := $(shell uv run python -c "import site; print(site.getsitepackages()[0])") +BENCH_PYTHON_ARGS ?= +BENCH_VM_ARGS ?= + +bench-python: + uv run python benchmarks/benchmark_runner.py $(BENCH_PYTHON_ARGS) + +bench-vm: + cd packages/doeff-vm && env \ + PYO3_PYTHON=$(BENCH_VM_BASE_PYTHON) \ + PYTHONHOME=$(BENCH_VM_PYTHON_HOME) \ + DOEFF_BENCH_SITE_PACKAGES=$(BENCH_VM_SITE_PACKAGES) \ + LD_LIBRARY_PATH=$(BENCH_VM_PYTHON_HOME)/lib \ + cargo bench --bench pyvm_baseline -- $(BENCH_VM_ARGS) + # ============================================================================= # Formatting # ============================================================================= diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 00000000..b74634c0 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,17 @@ +# Benchmarks + +Python benchmark artifacts: + +- Run `make bench-python` to execute the public `doeff.run(...)` benchmark suite. +- Results are written to `benchmarks/results/doeff_vm_benchmark_results.json` and + `benchmarks/results/doeff_vm_benchmark_results.csv`. + +Rust criterion artifacts: + +- Run `make bench-vm` to execute the `criterion` baseline suite for `packages/doeff-vm`. +- Criterion reports are written under `packages/doeff-vm/target/criterion/`. + +Useful overrides: + +- `make bench-python BENCH_PYTHON_ARGS="--runs 500 --iterations 50"` +- `make bench-vm BENCH_VM_ARGS="--sample-size 20 --measurement-time 1 --warm-up-time 1 --noplot"` diff --git a/benchmarks/benchmark_runner.py b/benchmarks/benchmark_runner.py index 78737006..14dec2cc 100644 --- a/benchmarks/benchmark_runner.py +++ b/benchmarks/benchmark_runner.py @@ -1,84 +1,170 @@ -"""Micro-benchmarks for the doeff interpreter. 
+"""Benchmark runner for the public doeff Python API backed by doeff-vm. Usage ----- - uv run python benchmarks/benchmark_runner.py --runs 500 + uv run python benchmarks/benchmark_runner.py --runs 500 --iterations 25 """ from __future__ import annotations import argparse +import csv +import json import statistics import time -from collections.abc import Iterable +from dataclasses import asdict, dataclass +from datetime import datetime, timezone +from pathlib import Path + +from benchmarks.pyvm_workloads import build_public_benchmark_cases + + +@dataclass(frozen=True) +class BenchmarkMeasurement: + name: str + runner: str + workload: str + runs: int + workload_iterations: int + expected_value: int + min_ms: float + max_ms: float + mean_ms: float + median_ms: float + + +@dataclass(frozen=True) +class BenchmarkReport: + generated_at: str + runs: int + workload_iterations: int + results: list[BenchmarkMeasurement] + + +def _measure_case(case, *, runs: int, workload_iterations: int) -> BenchmarkMeasurement: + observed = case.invoke() + if observed != case.expected_value: + raise AssertionError( + f"{case.name} returned {observed!r}, expected {case.expected_value!r}" + ) -from doeff import default_handlers, do, run -from doeff.effects import Ask, Put, Tell + timings: list[float] = [] + last_value = observed + for _ in range(runs): + start = time.perf_counter() + last_value = case.invoke() + elapsed = (time.perf_counter() - start) * 1000.0 + timings.append(elapsed) + if last_value != case.expected_value: + raise AssertionError( + f"{case.name} returned {last_value!r} after timing, expected {case.expected_value!r}" + ) -@do -def _stateful_workload(iterations: int) -> int: - value = yield Ask("seed") - total = value - for index in range(iterations): - yield Tell(f"iteration:{index}") - yield Put("counter", index) - total += index - return total + return BenchmarkMeasurement( + name=case.name, + runner=case.runner, + workload=case.workload, + runs=runs, + 
workload_iterations=workload_iterations, + expected_value=case.expected_value, + min_ms=min(timings), + max_ms=max(timings), + mean_ms=statistics.mean(timings), + median_ms=statistics.median(timings), + ) -def _run_once(workload_iterations: int) -> None: - run( - _stateful_workload(workload_iterations), - handlers=default_handlers(), - env={"seed": 1}, +def run_benchmarks(*, runs: int, workload_iterations: int) -> BenchmarkReport: + cases = build_public_benchmark_cases(workload_iterations) + results = [ + _measure_case(case, runs=runs, workload_iterations=workload_iterations) for case in cases + ] + return BenchmarkReport( + generated_at=datetime.now(timezone.utc).isoformat(), + runs=runs, + workload_iterations=workload_iterations, + results=results, ) -def benchmark(runs: int, *, workload_iterations: int) -> dict[str, float]: - timings: list[float] = [] +def write_report(report: BenchmarkReport, output_dir: Path) -> dict[str, Path]: + output_dir.mkdir(parents=True, exist_ok=True) - for _ in range(runs): - start = time.perf_counter() - _run_once(workload_iterations) - elapsed = (time.perf_counter() - start) * 1000.0 - timings.append(elapsed) + json_path = output_dir / "doeff_vm_benchmark_results.json" + csv_path = output_dir / "doeff_vm_benchmark_results.csv" - return { - "runs": runs, - "workload_iterations": workload_iterations, - "min_ms": min(timings), - "max_ms": max(timings), - "mean_ms": statistics.mean(timings), - "median_ms": statistics.median(timings), + json_payload = { + "metadata": { + "generated_at": report.generated_at, + "runs": report.runs, + "workload_iterations": report.workload_iterations, + }, + "results": [asdict(result) for result in report.results], } + json_path.write_text(json.dumps(json_payload, indent=2, sort_keys=True) + "\n") + + with csv_path.open("w", newline="") as handle: + writer = csv.DictWriter( + handle, + fieldnames=[ + "name", + "runner", + "workload", + "runs", + "workload_iterations", + "expected_value", + "min_ms", + 
"max_ms", + "mean_ms", + "median_ms", + ], + ) + writer.writeheader() + for result in report.results: + writer.writerow(asdict(result)) + + return {"json": json_path, "csv": csv_path} -def format_report(results: Iterable[tuple[str, dict[str, float]]]) -> str: - lines = ["doeff benchmark results:"] - for label, stats in results: - lines.append(f" {label}:") +def format_report(report: BenchmarkReport, *, output_paths: dict[str, Path] | None = None) -> str: + lines = ["doeff-vm benchmark results:"] + for result in report.results: lines.append( - " runs={runs} iterations={workload_iterations} | min={min_ms:.2f}ms " - "median={median_ms:.2f}ms mean={mean_ms:.2f}ms max={max_ms:.2f}ms".format(**stats) + " {name}: runs={runs} iterations={workload_iterations} | " + "min={min_ms:.2f}ms median={median_ms:.2f}ms " + "mean={mean_ms:.2f}ms max={max_ms:.2f}ms".format(**asdict(result)) ) + if output_paths is not None: + lines.append(f" json={output_paths['json']}") + lines.append(f" csv={output_paths['csv']}") return "\n".join(lines) -def main() -> None: - parser = argparse.ArgumentParser(description="Benchmark doeff interpreter execution") - parser.add_argument("--runs", type=int, default=100, help="Number of interpreter executions") +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Benchmark doeff-vm through the public Python API") + parser.add_argument("--runs", type=int, default=100, help="Number of executions per workload") parser.add_argument( "--iterations", type=int, default=25, - help="Inner loop iterations in the workload", + help="Inner loop iterations for stateful workloads", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("benchmarks/results"), + help="Directory for JSON and CSV benchmark artifacts", ) - args = parser.parse_args() + return parser.parse_args() - stats = benchmark(args.runs, workload_iterations=args.iterations) - print(format_report([("stateful_workload", stats)])) + +def main() -> None: + 
args = parse_args() + report = run_benchmarks(runs=args.runs, workload_iterations=args.iterations) + output_paths = write_report(report, args.output_dir) + print(format_report(report, output_paths=output_paths)) -if __name__ == "main": # pragma: no cover - CLI script +if __name__ == "__main__": # pragma: no cover - CLI script main() diff --git a/benchmarks/pyvm_workloads.py b/benchmarks/pyvm_workloads.py new file mode 100644 index 00000000..1bfd45be --- /dev/null +++ b/benchmarks/pyvm_workloads.py @@ -0,0 +1,171 @@ +"""Shared workload definitions for doeff-vm benchmarks.""" + +from __future__ import annotations + +from collections.abc import Callable +from dataclasses import dataclass + +import doeff_vm + +from doeff import Program, default_handlers, do, run +from doeff.effects import Ask, Get, Put, Tell + + +@dataclass(frozen=True) +class CallableBenchmarkCase: + name: str + runner: str + workload: str + expected_value: int + invoke: Callable[[], int] + + +@dataclass(frozen=True) +class WorkloadSpec: + workload: str + expected_value: int + env: dict[object, object] + program_factory: Callable[[], Program[int]] + + +def _wrap_handlers(program: Program[int], *handlers: object) -> Program[int]: + wrapped = program + for handler in reversed(handlers): + wrapped = doeff_vm.WithHandler(handler, wrapped) + return wrapped + + +@do +def _pure_program() -> Program[int]: + return 42 + yield + + +@do +def _state_program(iterations: int) -> Program[int]: + yield Put("counter", 0) + total = 0 + for _ in range(iterations): + current = yield Get("counter") + total += current + yield Put("counter", current + 1) + return total + + +@do +def _state_writer_program(iterations: int) -> Program[int]: + seed = yield Ask("seed") + yield Put("counter", seed) + total = 0 + for index in range(iterations): + current = yield Get("counter") + yield Tell(f"iteration:{index}") + total += current + yield Put("counter", current + 1) + return total + + +def build_workload_specs(iterations: int) -> 
list[WorkloadSpec]: + return [ + WorkloadSpec( + workload="pure", + expected_value=42, + env={}, + program_factory=_pure_program, + ), + WorkloadSpec( + workload="state", + expected_value=iterations * (iterations - 1) // 2, + env={}, + program_factory=lambda: _state_program(iterations), + ), + WorkloadSpec( + workload="state_writer", + expected_value=iterations + (iterations * (iterations - 1) // 2), + env={"seed": 1}, + program_factory=lambda: _state_writer_program(iterations), + ), + ] + + +def build_public_benchmark_cases(iterations: int) -> list[CallableBenchmarkCase]: + cases: list[CallableBenchmarkCase] = [] + for spec in build_workload_specs(iterations): + env = dict(spec.env) + + def invoke( + program_factory: Callable[[], Program[int]] = spec.program_factory, + env_values: dict[object, object] = env, + ) -> int: + return run( + program_factory(), + handlers=default_handlers(), + env=dict(env_values), + ).value + + cases.append( + CallableBenchmarkCase( + name=f"public_run:{spec.workload}", + runner="public_run", + workload=spec.workload, + expected_value=spec.expected_value, + invoke=invoke, + ) + ) + return cases + + +def build_raw_vm_benchmark_cases(iterations: int) -> list[CallableBenchmarkCase]: + cases: list[CallableBenchmarkCase] = [] + for spec in build_workload_specs(iterations): + env = dict(spec.env) + + def module_invoke( + program_factory: Callable[[], Program[int]] = spec.program_factory, + env_values: dict[object, object] = env, + ) -> int: + wrapped = _wrap_handlers(program_factory(), *default_handlers()) + return doeff_vm.run(wrapped, env=dict(env_values)).value + + def pyvm_invoke( + program_factory: Callable[[], Program[int]] = spec.program_factory, + env_values: dict[object, object] = env, + ) -> int: + vm = doeff_vm.PyVM() + for key, value in env_values.items(): + vm.put_env(key, value) + return vm.run(_wrap_handlers(program_factory(), *default_handlers())) + + cases.extend( + [ + CallableBenchmarkCase( + 
name=f"module_run:{spec.workload}", + runner="module_run", + workload=spec.workload, + expected_value=spec.expected_value, + invoke=module_invoke, + ), + CallableBenchmarkCase( + name=f"pyvm_fresh:{spec.workload}", + runner="pyvm_fresh", + workload=spec.workload, + expected_value=spec.expected_value, + invoke=pyvm_invoke, + ), + ] + ) + return cases + + +def benchmark_case_names(iterations: int, *, include_raw_vm: bool = False) -> list[str]: + cases = build_public_benchmark_cases(iterations) + if include_raw_vm: + cases.extend(build_raw_vm_benchmark_cases(iterations)) + return [case.name for case in cases] + + +def benchmark_case_map(iterations: int, *, include_raw_vm: bool = False) -> dict[str, Callable[[], int]]: + cases = build_public_benchmark_cases(iterations) + if include_raw_vm: + cases.extend(build_raw_vm_benchmark_cases(iterations)) + return {case.name: case.invoke for case in cases} diff --git a/packages/doeff-vm/Cargo.toml b/packages/doeff-vm/Cargo.toml index 6a65c721..0587e21a 100644 --- a/packages/doeff-vm/Cargo.toml +++ b/packages/doeff-vm/Cargo.toml @@ -10,7 +10,7 @@ keywords = ["algebraic-effects", "vm", "python", "continuations"] [lib] name = "doeff_vm" -crate-type = ["cdylib"] +crate-type = ["cdylib", "rlib"] [features] default = [] @@ -22,8 +22,13 @@ doeff-vm-core = { path = "../doeff-vm-core", features = ["python_bridge"] } doeff-core-effects = { path = "../doeff-core-effects" } [dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } pyo3 = { version = "0.28", features = ["auto-initialize", "py-clone"] } +[[bench]] +name = "pyvm_baseline" +harness = false + [profile.release] opt-level = 3 lto = true diff --git a/packages/doeff-vm/benches/pyvm_baseline.rs b/packages/doeff-vm/benches/pyvm_baseline.rs new file mode 100644 index 00000000..11a27832 --- /dev/null +++ b/packages/doeff-vm/benches/pyvm_baseline.rs @@ -0,0 +1,106 @@ +use std::path::PathBuf; +use std::sync::Once; +use std::time::Duration; + +use 
criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use doeff_vm::pyvm::doeff_vm; +use pyo3::append_to_inittab; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyList}; + +#[derive(Clone)] +struct BenchmarkCase { + name: String, + runner: String, + workload: String, + invoke: Py<PyAny>, +} + +fn repo_root() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../..") + .canonicalize() + .expect("repo root should resolve") +} + +fn venv_site_packages() -> PathBuf { + std::env::var_os("DOEFF_BENCH_SITE_PACKAGES") + .map(PathBuf::from) + .expect("DOEFF_BENCH_SITE_PACKAGES should be set") + .canonicalize() + .expect("venv site-packages should resolve") +} + +fn initialize_python() { + static INIT: Once = Once::new(); + INIT.call_once(|| { + append_to_inittab!(doeff_vm); + Python::initialize(); + }); +} + +fn load_cases(iterations: usize) -> PyResult<Vec<BenchmarkCase>> { + initialize_python(); + Python::attach(|py| { + let sys = py.import("sys")?; + let sys_path_obj = sys.getattr("path")?; + let sys_path = sys_path_obj.cast::<PyList>()?; + let repo_root = repo_root(); + let repo_root_str = repo_root.to_string_lossy().to_string(); + let site_packages = venv_site_packages(); + let site_packages_str = site_packages.to_string_lossy().to_string(); + sys_path.insert(0, repo_root_str.as_str())?; + sys_path.insert(0, site_packages_str.as_str())?; + + let modules_obj = sys.getattr("modules")?; + let modules = modules_obj.cast::<PyDict>()?; + let doeff_vm_module = py.import("doeff_vm")?; + doeff_vm_module.setattr("doeff_vm", doeff_vm_module.clone())?; + modules.set_item("doeff_vm.doeff_vm", doeff_vm_module)?; + + let workload_module = py.import("benchmarks.pyvm_workloads")?; + let cases = workload_module.call_method1("build_raw_vm_benchmark_cases", (iterations,))?; + + let mut loaded = Vec::new(); + for item in cases.try_iter()? 
{ + let item = item?; + loaded.push(BenchmarkCase { + name: item.getattr("name")?.extract()?, + runner: item.getattr("runner")?.extract()?, + workload: item.getattr("workload")?.extract()?, + invoke: item.getattr("invoke")?.unbind(), + }); + } + Ok(loaded) + }) +} + +fn benchmark_pyvm_baseline(c: &mut Criterion) { + let cases = load_cases(25).expect("criterion workload cases should load"); + let mut group = c.benchmark_group("doeff_vm_baseline"); + group.warm_up_time(Duration::from_millis(500)); + group.measurement_time(Duration::from_secs(3)); + group.sample_size(20); + + for case in cases { + let label = BenchmarkId::new(case.runner.clone(), case.workload.clone()); + let invoke = case.invoke; + let name = case.name.clone(); + group.bench_function(label, move |b| { + b.iter(|| { + Python::attach(|py| { + let result = invoke + .bind(py) + .call0() + .unwrap_or_else(|err| panic!("{name} failed: {err}")); + black_box(result); + }); + }); + }); + } + + group.finish(); +} + +criterion_group!(benches, benchmark_pyvm_baseline); +criterion_main!(benches); diff --git a/tests/test_benchmark_runner.py b/tests/test_benchmark_runner.py new file mode 100644 index 00000000..3269b413 --- /dev/null +++ b/tests/test_benchmark_runner.py @@ -0,0 +1,47 @@ +import csv +import json +from pathlib import Path + +from benchmarks.benchmark_runner import format_report, run_benchmarks, write_report + + +def test_run_benchmarks_writes_json_and_csv_outputs(tmp_path: Path) -> None: + report = run_benchmarks(runs=2, workload_iterations=3) + output_paths = write_report(report, tmp_path) + + json_path = output_paths["json"] + csv_path = output_paths["csv"] + + assert json_path.exists() + assert csv_path.exists() + + payload = json.loads(json_path.read_text()) + assert payload["metadata"]["runs"] == 2 + assert payload["metadata"]["workload_iterations"] == 3 + assert {entry["name"] for entry in payload["results"]} == { + "public_run:pure", + "public_run:state", + "public_run:state_writer", + } + + 
with csv_path.open(newline="") as handle: + rows = list(csv.DictReader(handle)) + + assert len(rows) == 3 + assert {row["name"] for row in rows} == { + "public_run:pure", + "public_run:state", + "public_run:state_writer", + } + + +def test_format_report_mentions_generated_artifacts(tmp_path: Path) -> None: + report = run_benchmarks(runs=1, workload_iterations=2) + output_paths = write_report(report, tmp_path) + + summary = format_report(report, output_paths=output_paths) + + assert "doeff-vm benchmark results:" in summary + assert "public_run:pure" in summary + assert str(output_paths["json"]) in summary + assert str(output_paths["csv"]) in summary