diff --git a/AGENTS.md b/AGENTS.md
index 7eabc28..c4d5cfc 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -199,3 +199,6 @@ Core tests should prove behavior, not implementation details:
 
 If a change touches matching, venue, OMS, or replay ordering, run the
 solo-equivalence tests before merging.
+
+For performance work, keep direct execution-engine throughput separate from
+full audited replay throughput. See `docs/benchmarks.md`.
diff --git a/README.md b/README.md
index ce8cd0c..c714e3e 100644
--- a/README.md
+++ b/README.md
@@ -80,8 +80,12 @@ The projects serve different workflows.
 The pure Python engine is still the reference implementation, because it is the
 clearest place to inspect queue behavior and prove equivalence. Packaged wheels
 include the compiled `CppMatchingEngine`; ordinary `Replay(...)` runs prefer it
-because it preserves the same public contract while avoiding the Python hot
-loop. Source checkouts build the extension during normal installation:
+because it is the compiled implementation the project intends to keep
+equivalent to the Python reference and to scale over time. The direct C++
+batch-ingest path is already substantially faster for callers that own the
+event loop; ordinary audited `Replay(...)` currently remains event-by-event
+so it can record per-event valuation marks. Source checkouts build the
+extension during normal installation:
 
 ```bash
 python -m pip install -e ".[dev]"
@@ -275,6 +279,7 @@ Planned next milestones:
 - Connectors: `docs/connectors.md`
 - Releasing: `docs/releasing.md`
 - Engineering standards: `docs/engineering-standards.md`
+- Benchmarks: `docs/benchmarks.md`
 - Example: `examples/canonical.py`
 - Schema reference: `docs/schema.md`
 - AI agent guide: `AGENTS.md`
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 0000000..ee12a8e
--- /dev/null
+++ b/benchmarks/__init__.py
@@ -0,0 +1 @@
+"""Small public benchmark scripts for ordersim."""
diff --git a/benchmarks/engine_throughput.py b/benchmarks/engine_throughput.py
new file mode 100644
index 0000000..96ba813
--- /dev/null
+++ b/benchmarks/engine_throughput.py
@@ -0,0 +1,166 @@
+"""Measure direct execution-engine event throughput."""
+
+import argparse
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass
+from decimal import Decimal
+from statistics import median
+from time import perf_counter
+
+from benchmarks.workloads import build_mixed_mbo_workload
+from ordersim import (
+    CompiledEventColumns,
+    CppMatchingEngine,
+    MatchingEngine,
+    MBOEvent,
+    cpp_execution_engine_available,
+)
+from ordersim.sim import ExecutionEngine
+
+TICK_SIZE = Decimal("0.10")  # shared by the compiled columns and C++ engines
+
+
+@dataclass(frozen=True, slots=True)
+class BenchmarkResult:
+    """Median timing for one measured direct-engine path."""
+
+    path_name: str  # label shown in the printed results row
+    event_count: int  # events applied during each measured run
+    median_seconds: float  # median elapsed seconds across measured runs
+
+    @property
+    def events_per_second(self) -> float:
+        """Return median event throughput, or ``inf`` if no time was measured."""
+        elapsed = self.median_seconds  # may be 0.0 when a run beats timer resolution
+        return self.event_count / elapsed if elapsed > 0 else float("inf")
+
+
+def run_scalar(engine: ExecutionEngine, events: Sequence[MBOEvent]) -> None:
+    """Apply every event, in order, through the engine's ``apply_event``."""
+    # One Python-level call per event; ``run_batch`` covers the batched C++ path.
+    for event in events:
+        engine.apply_event(event)
+
+
+def run_batch(engine: CppMatchingEngine, columns: CompiledEventColumns) -> None:
+    """Apply rows ``0..len(columns.ts_ns)`` of the columns in one batch call."""
+    # Presumably one ts_ns entry per event, i.e. the whole workload; TODO confirm.
+    engine.apply_events_batch(columns.slice(0, len(columns.ts_ns)))
+
+
+def measure(
+    path_name: str,
+    runner: Callable[[], None],
+    *,
+    event_count: int,
+    repeats: int,
+    warmups: int,
+) -> BenchmarkResult:
+    """Time ``repeats`` calls of ``runner`` and return their median as a result."""
+    # ``warmups`` untimed calls run first; invalid counts raise ValueError.
+    if repeats <= 0:
+        raise ValueError("repeats must be positive")
+    if warmups < 0:
+        raise ValueError("warmups must be non-negative")
+
+    for _ in range(warmups):  # discarded runs; nothing is timed here
+        runner()
+
+    timings: list[float] = []
+    for _ in range(repeats):
+        started = perf_counter()
+        runner()  # everything the runner does, including its own setup, is timed
+        timings.append(perf_counter() - started)
+
+    return BenchmarkResult(
+        path_name=path_name,
+        event_count=event_count,  # reported as given; not derived from the runner
+        median_seconds=median(timings),
+    )
+
+
+def format_result(result: BenchmarkResult) -> str:
+    """Format a direct-engine result as one fixed-width terminal line."""
+
+    # Columns: path label, event count, median seconds, events per second.
+    label = f"{result.path_name:<28}"
+    count = f"{result.event_count:>10,} events  "
+    elapsed = f"{result.median_seconds:>8.4f} s  "
+    rate = f"{result.events_per_second:>12,.0f} events/s"
+    return label + count + elapsed + rate
+
+
+def main() -> None:
+    """Parse CLI options, time each available engine path, and print the results."""
+    # Every runner builds a fresh engine, so construction is part of each timed run.
+    parser = argparse.ArgumentParser(
+        description="Measure direct execution-engine throughput."
+    )
+    parser.add_argument(
+        "--cycles",
+        type=int,
+        default=20_000,
+        help="number of six-event mixed MBO cycles to generate",
+    )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=5,
+        help="number of measured runs per path",
+    )
+    parser.add_argument(
+        "--warmups",
+        type=int,
+        default=1,
+        help="number of discarded warmup runs per path",
+    )
+    args = parser.parse_args()
+    # The workload and its compiled columns are built once, before any timing.
+    events = build_mixed_mbo_workload(args.cycles)
+    columns = CompiledEventColumns.from_events(events, tick_size=TICK_SIZE)
+    results = [
+        measure(
+            "MatchingEngine scalar",
+            lambda: run_scalar(MatchingEngine(), events),
+            event_count=len(events),
+            repeats=args.repeats,
+            warmups=args.warmups,
+        )
+    ]
+
+    print("Direct execution-engine throughput")
+    print("----------------------------------")
+    print(format_result(results[0]))
+    # The Python row is already printed; compiled rows need the optional extension.
+    if not cpp_execution_engine_available():
+        print("CppMatchingEngine unavailable; compiled paths were skipped.")
+        return
+
+    cpp_scalar = measure(
+        "CppMatchingEngine scalar",
+        lambda: run_scalar(CppMatchingEngine(tick_size=TICK_SIZE), events),
+        event_count=len(events),
+        repeats=args.repeats,
+        warmups=args.warmups,
+    )
+    cpp_batch = measure(
+        "CppMatchingEngine batch",
+        lambda: run_batch(CppMatchingEngine(tick_size=TICK_SIZE), columns),
+        event_count=len(events),
+        repeats=args.repeats,
+        warmups=args.warmups,
+    )
+    results.extend((cpp_scalar, cpp_batch))
+
+    for result in results[1:]:
+        print(format_result(result))
+    # Ratios of events/s: each compiled path against the Python scalar baseline.
+    python_eps = results[0].events_per_second
+    scalar_speedup = cpp_scalar.events_per_second / python_eps
+    batch_speedup = cpp_batch.events_per_second / python_eps
+    print(f"scalar C++ speedup vs Python  {scalar_speedup:>7.2f}x")
+    print(f"batch C++ speedup vs Python   {batch_speedup:>7.2f}x")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/replay_throughput.py b/benchmarks/replay_throughput.py
new file mode 100644
index 0000000..9b96d21
--- /dev/null
+++ b/benchmarks/replay_throughput.py
@@ -0,0 +1,166 @@
+"""Measure full audited replay throughput."""
+
+import argparse
+from collections.abc import Callable
+from dataclasses import dataclass
+from decimal import Decimal
+from statistics import median
+from time import perf_counter
+
+from benchmarks.workloads import build_mixed_mbo_workload
+from ordersim import (
+    InstrumentSpec,
+    MatchingEngine,
+    MBOEvent,
+    Replay,
+    cpp_execution_engine_available,
+)
+
+
+@dataclass(frozen=True, slots=True)
+class BenchmarkResult:
+    """Median timing for one measured replay path."""
+
+    path_name: str  # label shown in the printed results row
+    event_count: int  # events in the workload each replay run consumes
+    median_seconds: float  # median elapsed seconds across measured runs
+
+    @property
+    def events_per_second(self) -> float:
+        """Return median replay throughput, or ``inf`` if no time was measured."""
+        elapsed = self.median_seconds  # may be 0.0 when a run beats timer resolution
+        return self.event_count / elapsed if elapsed > 0 else float("inf")
+
+
+def gc_spec() -> InstrumentSpec:
+    """Build the "GC" instrument spec shared by every replay benchmark run."""
+
+    # Decimal literals keep the tick size and point value exact.
+    tick_size = Decimal("0.10")
+    point_value = Decimal("100")
+    spec = InstrumentSpec(symbol="GC", tick_size=tick_size, point_value=point_value)
+    return spec
+
+
+def advance_to_end(last_ts_ns: int) -> Callable[..., None]:
+    """Return a strategy that makes one ``gateway.advance_to(last_ts_ns)`` call."""
+
+    def strategy(gateway) -> None:
+        gateway.advance_to(last_ts_ns)
+
+    return strategy
+
+
+def run_replay(
+    events: tuple[MBOEvent, ...],
+    *,
+    execution_engine_factory=None,
+) -> None:
+    """Build a ``Replay`` over ``events`` and run it to the last event's timestamp."""
+    # ``events`` must be non-empty; the engine factory is forwarded unchanged.
+    replay = Replay(
+        data=events,
+        instrument=gc_spec(),
+        execution_engine_factory=execution_engine_factory,
+    )
+    replay.run(advance_to_end(events[-1].ts_ns))
+
+
+def measure(
+    path_name: str,
+    runner: Callable[[], None],
+    *,
+    event_count: int,
+    repeats: int,
+    warmups: int,
+) -> BenchmarkResult:
+    """Time ``repeats`` calls of ``runner`` and return their median as a result."""
+    # ``warmups`` untimed calls run first; invalid counts raise ValueError.
+    if repeats <= 0:
+        raise ValueError("repeats must be positive")
+    if warmups < 0:
+        raise ValueError("warmups must be non-negative")
+
+    for _ in range(warmups):  # discarded runs; nothing is timed here
+        runner()
+
+    timings: list[float] = []
+    for _ in range(repeats):
+        started = perf_counter()
+        runner()  # the whole runner call, including its own setup, is timed
+        timings.append(perf_counter() - started)
+
+    return BenchmarkResult(
+        path_name=path_name,
+        event_count=event_count,  # reported as given; not derived from the runner
+        median_seconds=median(timings),
+    )
+
+
+def format_result(result: BenchmarkResult) -> str:
+    """Format a replay result as one fixed-width terminal line."""
+
+    # Columns: path label, event count, median seconds, events per second.
+    label = f"{result.path_name:<28}"
+    count = f"{result.event_count:>10,} events  "
+    elapsed = f"{result.median_seconds:>8.4f} s  "
+    rate = f"{result.events_per_second:>12,.0f} events/s"
+    return label + count + elapsed + rate
+
+
+def main() -> None:
+    """Parse CLI options, time both replay paths, and print the comparison."""
+    # Each runner call constructs a new Replay and runs it to the final event.
+    parser = argparse.ArgumentParser(
+        description="Measure full audited replay throughput."
+    )
+    parser.add_argument(
+        "--cycles",
+        type=int,
+        default=20_000,
+        help="number of six-event mixed MBO cycles to generate",
+    )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=5,
+        help="number of measured runs per path",
+    )
+    parser.add_argument(
+        "--warmups",
+        type=int,
+        default=1,
+        help="number of discarded warmup runs per path",
+    )
+    args = parser.parse_args()
+    # One workload is shared by both paths so their timings are comparable.
+    events = build_mixed_mbo_workload(args.cycles)
+    python_result = measure(
+        "Replay + Python engine",
+        lambda: run_replay(events, execution_engine_factory=MatchingEngine),
+        event_count=len(events),
+        repeats=args.repeats,
+        warmups=args.warmups,
+    )
+    default_result = measure(
+        "Replay + default engine",
+        lambda: run_replay(events),
+        event_count=len(events),
+        repeats=args.repeats,
+        warmups=args.warmups,
+    )
+    # If the C++ engine is missing, both rows used Python; the last line says so.
+    print("Full audited replay throughput")
+    print("------------------------------")
+    print(format_result(python_result))
+    print(format_result(default_result))
+    print(
+        "default engine speedup vs Python"
+        f"  {default_result.events_per_second / python_result.events_per_second:>7.2f}x"
+    )
+    if not cpp_execution_engine_available():
+        print("CppMatchingEngine unavailable; the default path used Python.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/workloads.py b/benchmarks/workloads.py
new file mode 100644
index 0000000..68fbb34
--- /dev/null
+++ b/benchmarks/workloads.py
@@ -0,0 +1,72 @@
+"""Deterministic public workloads shared by benchmark scripts."""
+
+from decimal import Decimal
+
+from ordersim import MBOEvent
+
+
+def build_mixed_mbo_workload(cycles: int) -> tuple[MBOEvent, ...]:
+    """Return ``cycles`` six-event cycles that each end with an empty visible book."""
+    # Raises ValueError unless ``cycles`` is positive.
+    if cycles <= 0:
+        raise ValueError("cycles must be positive")
+    # Timestamps start at 1 and rise by one per event, so they strictly increase.
+    events: list[MBOEvent] = []
+    ts_ns = 1
+    for cycle in range(cycles):
+        bid_order_id = 2 * cycle + 1  # odd ids: one fresh bid order per cycle
+        ask_order_id = 2 * cycle + 2  # even ids: one fresh ask order per cycle
+        events.extend(
+            (
+                MBOEvent(
+                    ts_ns=ts_ns,
+                    action="add",
+                    side="bid",
+                    price=Decimal("100.0"),
+                    size=5,
+                    order_id=bid_order_id,
+                ),
+                MBOEvent(
+                    ts_ns=ts_ns + 1,
+                    action="add",
+                    side="ask",
+                    price=Decimal("101.0"),
+                    size=5,
+                    order_id=ask_order_id,
+                ),
+                MBOEvent(
+                    ts_ns=ts_ns + 2,
+                    action="modify",
+                    side="bid",
+                    price=Decimal("100.0"),
+                    size=4,  # presumably the bid's new size (was 5) - TODO confirm
+                    order_id=bid_order_id,
+                ),
+                MBOEvent(
+                    ts_ns=ts_ns + 3,
+                    action="trade",
+                    side="bid",
+                    price=Decimal("100.0"),
+                    size=2,  # presumably fills 2 of the bid's 4 - TODO confirm
+                    order_id=bid_order_id,
+                ),
+                MBOEvent(
+                    ts_ns=ts_ns + 4,
+                    action="cancel",
+                    side="ask",
+                    price=Decimal("101.0"),
+                    size=5,  # the ask's full added size
+                    order_id=ask_order_id,
+                ),
+                MBOEvent(
+                    ts_ns=ts_ns + 5,
+                    action="cancel",
+                    side="bid",
+                    price=Decimal("100.0"),
+                    size=2,  # presumably what the trade left resting - TODO confirm
+                    order_id=bid_order_id,
+                ),
+            )
+        )
+        ts_ns += 6  # six events were emitted for this cycle
+    return tuple(events)
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
new file mode 100644
index 0000000..7ccbcab
--- /dev/null
+++ b/docs/benchmarks.md
@@ -0,0 +1,92 @@
+# Benchmarks
+
+Benchmarks answer narrow performance questions without changing replay
+semantics. They are machine-dependent comparisons, not universal throughput
+promises.
+
+## Shared Workload
+
+The public benchmark scripts use the same deterministic mixed MBO workload.
+Each six-event cycle performs:
+
+1. bid add;
+2. ask add;
+3. bid modify;
+4. bid trade;
+5. ask cancel;
+6. bid cancel.
+
+Each cycle ends with an empty visible book, so no resting orders carry over
+into the next cycle.
+
+## Direct Engine Throughput
+
+Run:
+
+```bash
+python -m benchmarks.engine_throughput
+```
+
+This benchmark measures event ingestion through the execution engine itself:
+
+| Path | What it measures |
+|---|---|
+| `MatchingEngine` scalar | `apply_event(MBOEvent)` on the Python reference engine |
+| `CppMatchingEngine` scalar | `apply_event(MBOEvent)` one event at a time |
+| `CppMatchingEngine` batch | `apply_events_batch(...)` over precompiled primitive columns |
+
+The batch result excludes `CompiledEventColumns.from_events(...)` construction.
+That conversion is meant to happen once before repeated compiled-engine runs;
+including it would answer a different question.
+
+## Full Replay Throughput
+
+Run:
+
+```bash
+python -m benchmarks.replay_throughput
+```
+
+This benchmark measures the ordinary audited workflow:
+
+```text
+Replay(...) construction + strategy advance_to(end) + result assembly
+```
+
+It compares the explicit Python reference engine with the default engine chosen
+by `Replay(...)`. Full replay is slower than direct engine ingestion because it
+also performs the work that makes `ordersim` inspectable:
+
+- event-by-event replay advancement;
+- per-event valuation marks when both sides of the book exist;
+- fill-ledger and equity-curve assembly;
+- strategy-facing gateway calls.
+
+## Interpreting Results
+
+Direct engine throughput answers, "how quickly can the engine consume already
+normalized events?"
+
+Full replay throughput answers, "how quickly can the normal audited research
+workflow produce a `ReplayResult`?"
+
+Both numbers matter. They should not be collapsed into one claim.
+
+## What This Exposes
+
+The intended next performance step is boundary-batched replay. In that design,
+the compiled engine can advance independently through market-data events until
+the next point where Python must observe or decide:
+
+- the strategy asks to advance only up to a timestamp;
+- a passive fill occurs and strategy logic may need to react;
+- a new order or cancel instruction reaches simulated venue time;
+- replay needs a configured inspection or valuation mark.
+
+That keeps the C++ path useful beyond direct engine benchmarks without giving
+up the auditability of `ReplayResult`. It also keeps fill-connected strategies
+honest: Python should regain control when execution state changes in a way the
+strategy can observe.
+
+CI should keep benchmark code runnable; it should not enforce fixed speed
+thresholds across hardware.
diff --git a/docs/execution-engines.md b/docs/execution-engines.md
index 985516a..3e9a9ab 100644
--- a/docs/execution-engines.md
+++ b/docs/execution-engines.md
@@ -6,8 +6,8 @@
 - the Python engine is the readable reference used to inspect behavior and
   prove equivalence.
 
-That split is intentional. It keeps the hot path fast without making the model
-opaque.
+That split is intentional. It keeps a readable reference model while allowing
+compiled paths to scale without changing public semantics.
 
 Execution engines consume normalized `MBOEvent` rows and strategy order intents.
 They do not read vendor data directly.
@@ -32,10 +32,13 @@ plain and inspectable. Public behavior should be judged against it.
 ## Default Selection
 
 `Replay(...)` prefers `CppMatchingEngine` when the compiled extension is
-available. Packaged wheels are expected to include that extension. Compiled
-replay is the normal useful default once behavioral equivalence has been proven:
-users get the same fills and order log without paying Python-loop cost on every
-run.
+available. Packaged wheels are expected to include that extension. The compiled
+engine is the normal default once behavioral equivalence has been proven, so
+users exercise the implementation the project intends to scale over time.
+
+Ordinary audited `Replay(...)` currently still applies one event at a time so it
+can record per-event valuation marks and assemble its result. The compiled
+batch path is a separate direct-engine API for callers who own the event loop.
 
 If the extension is unavailable, `Replay(...)` falls back to `MatchingEngine`.
 Pass `execution_engine_factory=MatchingEngine` when the Python version is the
@@ -126,6 +129,10 @@ low-level engine regressions before Python enters the picture; the replay
 equivalence suite is still required because native tests alone cannot prove the
 public API remains identical.
 
+For performance measurements, see `docs/benchmarks.md`. Direct engine
+throughput and full replay throughput are intentionally measured separately
+because they answer different questions.
+
 ## Equivalence Harness
 
 Compiled or alternative execution engines must prove replay equivalence against
diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py
new file mode 100644
index 0000000..055ad5f
--- /dev/null
+++ b/tests/test_benchmarks.py
@@ -0,0 +1,34 @@
+from decimal import Decimal
+
+import pytest
+
+from benchmarks.replay_throughput import advance_to_end, gc_spec, run_replay
+from benchmarks.workloads import build_mixed_mbo_workload
+from ordersim import MatchingEngine
+
+
+def test_mixed_benchmark_workload_is_balanced() -> None:
+    events = build_mixed_mbo_workload(cycles=3)
+    engine = MatchingEngine()
+    # Feed the whole workload to the Python reference engine, one event at a time.
+    for event in events:
+        engine.apply_event(event)
+    # 3 cycles x 6 events; (None, None) presumably means no best bid or ask remains.
+    assert len(events) == 18
+    assert engine.book_top() == (None, None)
+
+
+def test_mixed_benchmark_workload_rejects_empty_runs() -> None:
+    with pytest.raises(ValueError, match="cycles must be positive"):
+        build_mixed_mbo_workload(cycles=0)  # zero is not a positive cycle count
+
+
+def test_replay_benchmark_uses_a_complete_replay_run() -> None:
+    events = build_mixed_mbo_workload(cycles=1)
+    strategy = advance_to_end(events[-1].ts_ns)
+    # NOTE(review): this strategy is never run; run_replay builds its own internally.
+    assert gc_spec().tick_size == Decimal("0.10")
+    strategy_name = strategy.__name__
+    run_replay(events, execution_engine_factory=MatchingEngine)
+    # run_replay returns None, so the replay itself is only checked for not raising.
+    assert strategy_name == "strategy"