tradingexpert · tradingexpert · May 19, 2026 · May 19, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -199,3 +199,6 @@ Core tests should prove behavior, not implementation details:
 
 If a change touches matching, venue, OMS, or replay ordering, run the
 solo-equivalence tests before merging.
+
+For performance work, keep direct execution-engine throughput separate from
+full audited replay throughput. See `docs/benchmarks.md`.
diff --git a/README.md b/README.md
@@ -80,8 +80,12 @@ The projects serve different workflows.
 The pure Python engine is still the reference implementation, because it is the
 clearest place to inspect queue behavior and prove equivalence. Packaged wheels
 include the compiled `CppMatchingEngine`; ordinary `Replay(...)` runs prefer it
-because it preserves the same public contract while avoiding the Python hot
-loop. Source checkouts build the extension during normal installation:
+because it is the compiled implementation that the project intends to keep
+equivalent to the Python reference and to scale over time. The direct C++
+batch-ingest path is already substantially faster for callers that own the
+event loop; ordinary audited `Replay(...)` currently remains event-by-event
+so that it can record per-event valuation marks. Source checkouts build the
+extension during normal installation:
 
 ```bash
 python -m pip install -e ".[dev]"
@@ -275,6 +279,7 @@ Planned next milestones:
 - Connectors: `docs/connectors.md`
 - Releasing: `docs/releasing.md`
 - Engineering standards: `docs/engineering-standards.md`
+- Benchmarks: `docs/benchmarks.md`
 - Example: `examples/canonical.py`
 - Schema reference: `docs/schema.md`
 - AI agent guide: `AGENTS.md`

diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
@@ -0,0 +1 @@
+"""Small public benchmark scripts for ordersim."""
diff --git a/benchmarks/engine_throughput.py b/benchmarks/engine_throughput.py
@@ -0,0 +1,166 @@
+"""Measure direct execution-engine event throughput."""
+
+import argparse
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass
+from decimal import Decimal
+from statistics import median
+from time import perf_counter
+
+from benchmarks.workloads import build_mixed_mbo_workload
+from ordersim import (
+    CompiledEventColumns,
+    CppMatchingEngine,
+    MatchingEngine,
+    MBOEvent,
+    cpp_execution_engine_available,
+)
+from ordersim.sim import ExecutionEngine
+
+TICK_SIZE = Decimal("0.10")
+
+
+@dataclass(frozen=True, slots=True)
+class BenchmarkResult:
+    """One measured direct-engine path.
+
+    Immutable record of a single benchmark path: its label, how many events
+    one run applied, and the median wall-clock duration of the timed runs.
+    """
+
+    # Label printed in the first column of the report.
+    path_name: str
+    # Number of events applied by one run of the measured path.
+    event_count: int
+    # Median elapsed time of the measured runs, in seconds.
+    median_seconds: float
+
+    @property
+    def events_per_second(self) -> float:
+        """Return median event throughput.
+
+        A very small workload can finish within the timer's resolution and
+        produce a median of ``0.0`` seconds. Dividing would raise
+        ``ZeroDivisionError`` and abort the whole report, so that case is
+        reported as ``inf`` when events were processed and ``0.0`` when
+        there were none.
+        """
+
+        if self.median_seconds <= 0.0:
+            return float("inf") if self.event_count > 0 else 0.0
+        return self.event_count / self.median_seconds
+
+
+def run_scalar(engine: ExecutionEngine, events: Sequence[MBOEvent]) -> None:
+    """Feed every event to ``engine.apply_event``, one call per event.
+
+    The attribute lookup is deliberately left inside the loop so the measured
+    cost stays that of calling the scalar API once per event.
+    """
+
+    for mbo_event in events:
+        engine.apply_event(mbo_event)
+
+
+def run_batch(engine: CppMatchingEngine, columns: CompiledEventColumns) -> None:
+    """Hand one compiled slice to ``engine.apply_events_batch`` in a single call.
+
+    The slice is requested as ``columns.slice(0, len(columns.ts_ns))``.
+    NOTE(review): presumably this covers every compiled event - confirm the
+    bound semantics of ``CompiledEventColumns.slice``.
+    """
+
+    row_count = len(columns.ts_ns)
+    batch = columns.slice(0, row_count)
+    engine.apply_events_batch(batch)
+
+
+def measure(
+    path_name: str,
+    runner: Callable[[], None],
+    *,
+    event_count: int,
+    repeats: int,
+    warmups: int,
+) -> BenchmarkResult:
+    """Time ``runner`` repeatedly and report the median elapsed seconds.
+
+    Args:
+        path_name: Label stored on the returned result.
+        runner: Zero-argument callable that performs one complete run.
+        event_count: Number of events one run applies; stored on the result.
+        repeats: Number of timed runs; must be positive.
+        warmups: Number of untimed runs executed first; must not be negative.
+
+    Returns:
+        A ``BenchmarkResult`` holding the median of the timed runs.
+
+    Raises:
+        ValueError: If ``repeats`` is not positive or ``warmups`` is negative.
+    """
+
+    if repeats <= 0:
+        raise ValueError("repeats must be positive")
+    if warmups < 0:
+        raise ValueError("warmups must be non-negative")
+
+    def timed_run() -> float:
+        # Only the runner call sits between the two clock reads.
+        begin = perf_counter()
+        runner()
+        return perf_counter() - begin
+
+    # Warmup runs execute the same callable but their durations are discarded.
+    for _ in range(warmups):
+        runner()
+
+    samples = [timed_run() for _ in range(repeats)]
+    return BenchmarkResult(
+        path_name=path_name,
+        event_count=event_count,
+        median_seconds=median(samples),
+    )
+
+
+def format_result(result: BenchmarkResult) -> str:
+    """Build the fixed-width report row for one direct-engine result.
+
+    Columns, in order: path label, event count, median seconds, events/s.
+    """
+
+    cells = (
+        f"{result.path_name:<28}",
+        f"{result.event_count:>10,} events  ",
+        f"{result.median_seconds:>8.4f} s  ",
+        f"{result.events_per_second:>12,.0f} events/s",
+    )
+    return "".join(cells)
+
+
+def main(argv: list[str] | None = None) -> None:
+    """Run direct-engine throughput benchmarks from the command line.
+
+    Measures the Python ``MatchingEngine`` through the scalar API and, when
+    the compiled engine is available, ``CppMatchingEngine`` through both the
+    scalar and the batch API. Prints one row per path followed by the C++
+    speedups relative to the Python path.
+
+    Args:
+        argv: Optional argument list. ``None`` lets argparse read the
+            process arguments, which is what the script entry point uses.
+    """
+
+    parser = argparse.ArgumentParser(
+        description="Measure direct execution-engine throughput."
+    )
+    parser.add_argument(
+        "--cycles",
+        type=int,
+        default=20_000,
+        help="number of six-event mixed MBO cycles to generate",
+    )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=5,
+        help="number of measured runs per path",
+    )
+    parser.add_argument(
+        "--warmups",
+        type=int,
+        default=1,
+        help="number of discarded warmup runs per path",
+    )
+    args = parser.parse_args(argv)
+
+    # Reject unusable counts at the CLI boundary, before any workload is
+    # built: an empty workload leaves the speedup ratios undefined, and bad
+    # repeat/warmup counts would otherwise surface as a traceback from
+    # measure() only after the setup work has been done.
+    if args.cycles <= 0:
+        parser.error("--cycles must be positive")
+    if args.repeats <= 0:
+        parser.error("--repeats must be positive")
+    if args.warmups < 0:
+        parser.error("--warmups must be non-negative")
+
+    events = build_mixed_mbo_workload(args.cycles)
+    columns = CompiledEventColumns.from_events(events, tick_size=TICK_SIZE)
+    event_count = len(events)
+
+    def timed(path_name: str, runner: Callable[[], None]) -> BenchmarkResult:
+        # Every path shares the same event count and repeat/warmup settings.
+        return measure(
+            path_name,
+            runner,
+            event_count=event_count,
+            repeats=args.repeats,
+            warmups=args.warmups,
+        )
+
+    # Each runner constructs a fresh engine so no state carries over between
+    # runs; engine construction is therefore part of the timed interval.
+    python_scalar = timed(
+        "MatchingEngine scalar",
+        lambda: run_scalar(MatchingEngine(), events),
+    )
+
+    print("Direct execution-engine throughput")
+    print("----------------------------------")
+    print(format_result(python_scalar))
+
+    if not cpp_execution_engine_available():
+        print("CppMatchingEngine unavailable; compiled paths were skipped.")
+        return
+
+    cpp_scalar = timed(
+        "CppMatchingEngine scalar",
+        lambda: run_scalar(CppMatchingEngine(tick_size=TICK_SIZE), events),
+    )
+    cpp_batch = timed(
+        "CppMatchingEngine batch",
+        lambda: run_batch(CppMatchingEngine(tick_size=TICK_SIZE), columns),
+    )
+
+    for result in (cpp_scalar, cpp_batch):
+        print(format_result(result))
+
+    python_eps = python_scalar.events_per_second
+    scalar_speedup = cpp_scalar.events_per_second / python_eps
+    batch_speedup = cpp_batch.events_per_second / python_eps
+    print(f"scalar C++ speedup vs Python  {scalar_speedup:>7.2f}x")
+    print(f"batch C++ speedup vs Python   {batch_speedup:>7.2f}x")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/replay_throughput.py b/benchmarks/replay_throughput.py
@@ -0,0 +1,166 @@
+"""Measure full audited replay throughput."""
+
+import argparse
+from collections.abc import Callable
+from dataclasses import dataclass
+from decimal import Decimal
+from statistics import median
+from time import perf_counter
+
+from benchmarks.workloads import build_mixed_mbo_workload
+from ordersim import (
+    InstrumentSpec,
+    MatchingEngine,
+    MBOEvent,
+    Replay,
+    cpp_execution_engine_available,
+)
+
+
+@dataclass(frozen=True, slots=True)
+class BenchmarkResult:
+    """One measured replay path.
+
+    Immutable record of a single replay benchmark: its label, how many
+    events one run replayed, and the median wall-clock duration of the
+    timed runs.
+    """
+
+    # Label printed in the first column of the report.
+    path_name: str
+    # Number of events in the replayed workload.
+    event_count: int
+    # Median elapsed time of the measured runs, in seconds.
+    median_seconds: float
+
+    @property
+    def events_per_second(self) -> float:
+        """Return median replay throughput.
+
+        A median of ``0.0`` seconds (a run shorter than the timer's
+        resolution) would make the division raise ``ZeroDivisionError`` and
+        abort the report, so it is reported as ``inf`` when events were
+        replayed and ``0.0`` when there were none.
+        """
+
+        if self.median_seconds <= 0.0:
+            return float("inf") if self.event_count > 0 else 0.0
+        return self.event_count / self.median_seconds
+
+
+def gc_spec() -> InstrumentSpec:
+    """Build the ``GC`` instrument definition used by the replay benchmark."""
+
+    tick = Decimal("0.10")
+    point_value = Decimal("100")
+    return InstrumentSpec(symbol="GC", tick_size=tick, point_value=point_value)
+
+
+def advance_to_end(last_ts_ns: int) -> Callable:
+    """Build the smallest strategy that consumes the full replay.
+
+    Args:
+        last_ts_ns: Timestamp passed to ``gateway.advance_to``; the caller in
+            this file supplies the ``ts_ns`` of the final workload event.
+
+    Returns:
+        A one-argument callable that takes a gateway and returns ``None``.
+    """
+
+    def strategy(gateway) -> None:
+        # The only action is a single advance to the captured timestamp.
+        gateway.advance_to(last_ts_ns)
+
+    return strategy
+
+
+def run_replay(
+    events: tuple[MBOEvent, ...],
+    *,
+    execution_engine_factory=None,
+) -> None:
+    """Build a fresh ``Replay`` over ``events`` and run it to the last event.
+
+    Args:
+        events: Workload to replay; the ``ts_ns`` of its final element is the
+            timestamp the strategy advances to.
+        execution_engine_factory: Forwarded unchanged to ``Replay``; the
+            default of ``None`` is passed through as-is.
+    """
+
+    session = Replay(
+        data=events,
+        instrument=gc_spec(),
+        execution_engine_factory=execution_engine_factory,
+    )
+    # The final timestamp is read after the replay is built, as before.
+    strategy = advance_to_end(events[-1].ts_ns)
+    session.run(strategy)
+
+
+def measure(
+    path_name: str,
+    runner: Callable[[], None],
+    *,
+    event_count: int,
+    repeats: int,
+    warmups: int,
+) -> BenchmarkResult:
+    """Time one replay runner repeatedly and report the median elapsed seconds.
+
+    Args:
+        path_name: Label stored on the returned result.
+        runner: Zero-argument callable that performs one complete replay.
+        event_count: Number of events one run replays; stored on the result.
+        repeats: Number of timed runs; must be positive.
+        warmups: Number of untimed runs executed first; must not be negative.
+
+    Returns:
+        A ``BenchmarkResult`` holding the median of the timed runs.
+
+    Raises:
+        ValueError: If ``repeats`` is not positive or ``warmups`` is negative.
+    """
+
+    if repeats <= 0:
+        raise ValueError("repeats must be positive")
+    if warmups < 0:
+        raise ValueError("warmups must be non-negative")
+
+    def timed_run() -> float:
+        # Only the runner call sits between the two clock reads.
+        begin = perf_counter()
+        runner()
+        return perf_counter() - begin
+
+    # Warmup runs execute the same callable but their durations are discarded.
+    for _ in range(warmups):
+        runner()
+
+    samples = [timed_run() for _ in range(repeats)]
+    return BenchmarkResult(
+        path_name=path_name,
+        event_count=event_count,
+        median_seconds=median(samples),
+    )
+
+
+def format_result(result: BenchmarkResult) -> str:
+    """Build the fixed-width report row for one replay result.
+
+    Columns, in order: path label, event count, median seconds, events/s.
+    """
+
+    cells = (
+        f"{result.path_name:<28}",
+        f"{result.event_count:>10,} events  ",
+        f"{result.median_seconds:>8.4f} s  ",
+        f"{result.events_per_second:>12,.0f} events/s",
+    )
+    return "".join(cells)
+
+
+def main(argv: list[str] | None = None) -> None:
+    """Run replay throughput benchmarks from the command line.
+
+    Measures ``Replay`` once with the Python ``MatchingEngine`` passed
+    explicitly and once with whatever engine ``Replay`` selects by default,
+    then prints both rows and the ratio between them.
+
+    Args:
+        argv: Optional argument list. ``None`` lets argparse read the
+            process arguments, which is what the script entry point uses.
+    """
+
+    parser = argparse.ArgumentParser(
+        description="Measure full audited replay throughput."
+    )
+    parser.add_argument(
+        "--cycles",
+        type=int,
+        default=20_000,
+        help="number of six-event mixed MBO cycles to generate",
+    )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=5,
+        help="number of measured runs per path",
+    )
+    parser.add_argument(
+        "--warmups",
+        type=int,
+        default=1,
+        help="number of discarded warmup runs per path",
+    )
+    args = parser.parse_args(argv)
+
+    # Reject unusable counts at the CLI boundary: run_replay() indexes the
+    # final event, so an empty workload would fail with an IndexError, and
+    # bad repeat/warmup counts would otherwise surface as a traceback from
+    # measure() only after the workload has been built.
+    if args.cycles <= 0:
+        parser.error("--cycles must be positive")
+    if args.repeats <= 0:
+        parser.error("--repeats must be positive")
+    if args.warmups < 0:
+        parser.error("--warmups must be non-negative")
+
+    events = build_mixed_mbo_workload(args.cycles)
+    event_count = len(events)
+
+    def timed(path_name: str, runner: Callable[[], None]) -> BenchmarkResult:
+        # Both paths share the same event count and repeat/warmup settings.
+        return measure(
+            path_name,
+            runner,
+            event_count=event_count,
+            repeats=args.repeats,
+            warmups=args.warmups,
+        )
+
+    # Each runner builds a new Replay, so replay construction is part of the
+    # timed interval for both paths.
+    python_result = timed(
+        "Replay + Python engine",
+        lambda: run_replay(events, execution_engine_factory=MatchingEngine),
+    )
+    default_result = timed(
+        "Replay + default engine",
+        lambda: run_replay(events),
+    )
+
+    print("Full audited replay throughput")
+    print("------------------------------")
+    print(format_result(python_result))
+    print(format_result(default_result))
+    print(
+        "default engine speedup vs Python"
+        f"  {default_result.events_per_second / python_result.events_per_second:>7.2f}x"
+    )
+    if not cpp_execution_engine_available():
+        print("CppMatchingEngine unavailable; the default path used Python.")
+
+
+if __name__ == "__main__":
+    main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Small public benchmark scripts for ordersim."""