diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f02225..68aa6d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: - uses: actions/checkout@v4 - name: Install build dependencies - run: sudo apt-get update && sudo apt-get install -y libopenslide-dev + run: sudo apt-get update && sudo apt-get install -y libopenslide-dev protobuf-compiler - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable @@ -95,7 +95,7 @@ jobs: uses: actions/checkout@v4 - name: Install build dependencies - run: sudo apt-get update && sudo apt-get install -y libopenslide-dev + run: sudo apt-get update && sudo apt-get install -y libopenslide-dev protobuf-compiler - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable diff --git a/.github/workflows/perf.yml b/.github/workflows/perf.yml index 34918f9..956c163 100644 --- a/.github/workflows/perf.yml +++ b/.github/workflows/perf.yml @@ -3,25 +3,28 @@ name: Performance Tests on: workflow_dispatch: inputs: - run_full_load_test: - description: 'Run extended load tests (5 sessions, 20 followers, 5 minutes)' + benchmark_tier: + description: 'Benchmark tier to run' required: false - default: 'false' - type: boolean - pull_request: - branches: [main] - paths: - - 'server/**' - - 'bench/**' - - '.github/workflows/perf.yml' + default: 'smoke' + type: choice + options: + - smoke + - standard + - stress + # pull_request: + # branches: [main] + # paths: + # - 'server/**' + # - 'bench/**' + # - '.github/workflows/perf.yml' env: CARGO_TERM_COLOR: always jobs: - # Quick non-regression test on every PR - regression-test: - name: Performance Regression Test + benchmark: + name: Performance Benchmark runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -29,7 +32,7 @@ jobs: - name: Install build dependencies run: | sudo apt-get update - sudo apt-get install -y libopenslide-dev python3 + sudo apt-get install -y libopenslide-dev protobuf-compiler - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable @@ -46,12 +49,6 @@ jobs: key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock') }} restore-keys: ${{ runner.os }}-cargo- - - name: Install oha (HTTP load testing tool) - run: | - if ! 
command -v oha &> /dev/null; then - cargo install oha - fi - - name: Build server and tests (release) run: | cargo build --release @@ -83,49 +80,43 @@ jobs: # Verify health curl -s http://127.0.0.1:8080/health - - name: Run WebSocket regression test + - name: Determine benchmark tier + id: tier + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "tier=${{ github.event.inputs.benchmark_tier }}" >> $GITHUB_OUTPUT + else + echo "tier=smoke" >> $GITHUB_OUTPUT + fi + + - name: Run benchmark run: | cd server - cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_results.txt - timeout-minutes: 5 + cargo test --test perf_tests bench_${{ steps.tier.outputs.tier }} --release -- --ignored --nocapture 2>&1 | tee /tmp/benchmark_results.txt + timeout-minutes: 10 - - name: Check WebSocket performance budgets + - name: Check benchmark results run: | - echo "=== WebSocket Test Results ===" - cat /tmp/ws_results.txt + echo "=== Benchmark Results ===" + cat /tmp/benchmark_results.txt # Check if test passed - if grep -q "Overall: PASS" /tmp/ws_results.txt; then - echo "✅ WebSocket performance within budget" + if grep -q "OVERALL: PASS" /tmp/benchmark_results.txt; then + echo "✅ Benchmark passed" else - echo "❌ WebSocket performance exceeded budget" + echo "❌ Benchmark failed" exit 1 fi - - name: Run HTTP tile stress test (quick) - run: | - ./bench/load_tests/scenarios/tile_stress.sh \ - --quick \ - --output bench/load_tests/results/tile_current.json 2>&1 | tee /tmp/tile_results.txt - timeout-minutes: 5 - - - name: Compare HTTP tile performance to baseline + - name: Extract JSON results + if: always() run: | - echo "=== HTTP Tile Performance ===" - - # Run comparison (--ci mode exits 1 on regression) - python3 ./bench/scripts/compare_baseline.py \ - --current bench/load_tests/results/tile_current.json \ - --baseline bench/baselines/tile_baseline.json \ - --threshold 20 \ - --markdown | tee /tmp/comparison.md - - # Also run with CI mode to get exit code - python3 ./bench/scripts/compare_baseline.py \ - --current bench/load_tests/results/tile_current.json \ - --baseline bench/baselines/tile_baseline.json \ - --threshold 20 \ - --ci + # Extract JSON line for machine parsing + grep "^JSON:" /tmp/benchmark_results.txt | sed 's/^JSON: //' > bench/load_tests/results/benchmark.json || true + if [ -f bench/load_tests/results/benchmark.json ]; then + echo "=== JSON Results ===" + cat bench/load_tests/results/benchmark.json + fi - name: Collect server metrics if: always() @@ -140,119 +131,8 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: benchmark-results + name: benchmark-results-${{ steps.tier.outputs.tier }} path: | bench/load_tests/results/ - /tmp/ws_results.txt - /tmp/tile_results.txt - /tmp/comparison.md + /tmp/benchmark_results.txt retention-days: 30 - - # Extended load test (manual trigger only) - extended-load-test: - name: Extended Load Tests - runs-on: ubuntu-latest - if: github.event_name == 'workflow_dispatch' && github.event.inputs.run_full_load_test == 'true' - steps: - - uses: actions/checkout@v4 - - - name: Install build dependencies - run: | - sudo apt-get update - sudo apt-get install -y libopenslide-dev python3 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - - - name: Cache Cargo - uses: actions/cache@v4 - with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - target/ - key: ${{ runner.os }}-cargo-perf-${{ 
hashFiles('**/Cargo.lock') }} - restore-keys: ${{ runner.os }}-cargo- - - - name: Install oha - run: cargo install oha - - - name: Build server and tests (release) - run: | - cargo build --release - cargo test --test perf_tests --no-run --release - - - name: Create test directories - run: | - mkdir -p /tmp/pathcollab/slides - mkdir -p bench/load_tests/results - - - name: Start server in background - run: | - HOST=127.0.0.1 \ - PORT=8080 \ - SLIDES_DIR=/tmp/pathcollab/slides \ - RUST_LOG=warn \ - ./target/release/pathcollab & - - for i in {1..30}; do - if curl -s http://127.0.0.1:8080/health > /dev/null 2>&1; then - echo "Server is ready!" - break - fi - sleep 1 - done - - - name: Run standard WebSocket load test - run: | - cd server - cargo test --test perf_tests test_fanout_standard --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_standard.txt - timeout-minutes: 10 - - - name: Run extended WebSocket load test - run: | - cd server - cargo test --test perf_tests test_fanout_extended --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_extended.txt - timeout-minutes: 15 - - - name: Run HTTP tile ramp test - run: | - ./bench/load_tests/scenarios/ramp_test.sh \ - --start 1 \ - --end 50 \ - --step 5 \ - --stage-duration 10 \ - --output bench/load_tests/results 2>&1 | tee /tmp/ramp_results.txt - timeout-minutes: 20 - - - name: Run HTTP tile standard test - run: | - ./bench/load_tests/scenarios/tile_stress.sh \ - --concurrent 20 \ - --duration 60 \ - --output bench/load_tests/results/tile_extended.json 2>&1 | tee /tmp/tile_extended.txt - timeout-minutes: 10 - - - name: Generate performance report - if: always() - run: | - python3 ./bench/scripts/generate_report.py \ - --input-dir bench/load_tests/results \ - --output bench/load_tests/results/REPORT.md || true - - echo "=== Performance Report ===" - cat bench/load_tests/results/REPORT.md || echo "Report generation failed" - - - name: Upload extended results - if: always() - uses: actions/upload-artifact@v4 - with: - name: extended-benchmark-results - path: | - bench/load_tests/results/ - /tmp/ws_*.txt - /tmp/tile_*.txt - /tmp/ramp_results.txt - retention-days: 90 diff --git a/AGENTS.md b/AGENTS.md index 80e6ae1..580dbd7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -316,26 +316,32 @@ cd web && bun test cargo test # 4. Quick perf check (if touching hot paths) -./bench/load_tests/scenarios/tile_stress.sh --quick +cd server && cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture ``` ### Performance Testing +The benchmark system runs 3 iterations with warm-up and compares against stored baselines. 
+ ```bash -# Quick performance check -SLIDES_DIR=/data/wsi_slides DEMO_ENABLED=true cargo run --release -./bench/load_tests/scenarios/tile_stress.sh --quick -python3 ./bench/scripts/compare_baseline.py \ - --current bench/load_tests/results/tile_current.json \ - --baseline bench/baselines/tile_baseline.json - -# Full benchmark suite (before major changes) -./bench/scripts/run_all.sh --compare-baseline - -# Save new baseline after confirmed improvements -./bench/scripts/run_all.sh --save-baseline +# Start the server first +SLIDES_DIR=~/Documents/tcga_slides cargo run --release & + +# Quick smoke test (~30s) - runs on every PR +cd server && cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture + +# Standard test (~2min) - PR merge gate +cd server && cargo test --test perf_tests bench_standard --release -- --ignored --nocapture + +# Full stress test (~4min) - before releases +cd server && cargo test --test perf_tests bench_stress --release -- --ignored --nocapture + +# Save current results as baseline +SAVE_BASELINE=1 cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture ``` +Baselines are stored in `.benchmark-baseline.json`. The system detects regressions >15% automatically. + ### Live Metrics ```bash diff --git a/Cargo.lock b/Cargo.lock index 4af8dd3..d5af51b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,6 +29,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anyhow" version = "1.0.100" @@ -277,6 +286,20 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chrono" +version = "0.4.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-link", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -836,6 +859,30 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "icu_collections" version = "2.1.1" @@ -1329,6 +1376,7 @@ dependencies = [ "async-trait", "axum", "bytes", + "chrono", "dashmap", "flate2", "futures-util", @@ -2614,12 +2662,65 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + 
"windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.52.0" diff --git a/bench/README.md b/bench/README.md deleted file mode 100644 index 5043c77..0000000 --- a/bench/README.md +++ /dev/null @@ -1,366 +0,0 @@ -# PathCollab Benchmark Suite - -Comprehensive profiling and load testing infrastructure for the PathCollab collaborative slide viewer server. - -## Quick Start - -```bash -# Install dependencies -cargo install oha # HTTP load testing tool - -# Run quick benchmark (5 connections, 10 seconds) -./bench/load_tests/scenarios/tile_stress.sh --quick - -# Run full benchmark suite -./bench/scripts/run_all.sh - -# Run with baseline comparison (fails CI if P99 regresses >10%) -./bench/scripts/run_all.sh --compare-baseline -``` - -## Prerequisites - -### Required - -- **Rust toolchain** (stable, for building server and Criterion benchmarks) -- **Running PathCollab server** with slides available - -### Optional (for full suite) - -- **oha**: HTTP load testing tool - ```bash - cargo install oha - ``` -- **Python 3.6+**: For baseline comparison and report generation -- **jq**: For parsing JSON results in shell scripts - -## Directory Structure - -``` -bench/ -├── README.md # This file -├── load_tests/ -│ ├── scenarios/ -│ │ ├── tile_stress.sh # HTTP tile endpoint stress test -│ │ ├── overlay_stress.sh # HTTP cell overlay endpoint stress test -│ │ ├── ramp_test.sh # Gradual load increase to find breaking point -│ │ └── combined_load.sh # HTTP + WebSocket simultaneous load -│ └── results/ # Test output (.gitignored) -├── baselines/ -│ ├── tile_baseline.json # HTTP tile performance baseline -│ └── websocket_baseline.json # WebSocket performance baseline -└── scripts/ - ├── run_all.sh # Orchestrate full benchmark suite - ├── compare_baseline.py # Compare results to baseline - └── generate_report.py # Generate markdown report - -server/benches/ # Criterion micro-benchmarks -├── tile_encoding.rs # JPEG encoding, image resize -├── spatial_index.rs # R-tree query performance -└── message_serialization.rs # JSON serialization for WebSocket -``` - -## Running Benchmarks - -### 1. 
HTTP Tile Load Tests - -Stress test the tile serving endpoint: - -```bash -# Quick test (5 connections, 10 seconds) -./bench/load_tests/scenarios/tile_stress.sh --quick - -# Standard test (10 connections, 30 seconds) -./bench/load_tests/scenarios/tile_stress.sh - -# Custom configuration -./bench/load_tests/scenarios/tile_stress.sh \ - --url http://localhost:8080 \ - --concurrent 20 \ - --duration 60 \ - --output results/tile_test.json - -# Find breaking point with ramp test -./bench/load_tests/scenarios/ramp_test.sh \ - --start 1 \ - --end 100 \ - --step 10 -``` - -### 2. Cell Overlay Load Tests - -Stress test the cell overlay endpoint: - -```bash -# Quick test (5 connections, 10 seconds) -./bench/load_tests/scenarios/overlay_stress.sh --quick - -# Standard test (10 connections, 30 seconds) -./bench/load_tests/scenarios/overlay_stress.sh - -# Custom configuration -./bench/load_tests/scenarios/overlay_stress.sh \ - --url http://localhost:8080 \ - --concurrent 20 \ - --duration 60 \ - --viewport-size 1024 \ - --output results/overlay_test.json -``` - -### 3. WebSocket Load Tests - -Test session broadcasting under load: - -```bash -cd server - -# Quick test (1 session, 3 followers, 3 seconds) -cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture - -# Standard test (5 sessions, 20 followers, 30 seconds) -cargo test --test perf_tests test_fanout_standard --release -- --ignored --nocapture - -# Extended test (5 minutes) -cargo test --test perf_tests test_fanout_extended --release -- --ignored --nocapture -``` - -### 4. Combined Load Test - -Simulate realistic production load with both HTTP and WebSocket traffic: - -```bash -./bench/load_tests/scenarios/combined_load.sh \ - --tile-concurrent 10 \ - --ws-sessions 3 \ - --ws-followers 10 \ - --duration 30 -``` - -### 5. 
Full Benchmark Suite - -Run everything with a single command: - -```bash -# Full suite with report generation -./bench/scripts/run_all.sh - -# Quick mode -./bench/scripts/run_all.sh --quick - -# Skip specific phases -./bench/scripts/run_all.sh --skip-micro --skip-websocket - -# Compare to baseline -./bench/scripts/run_all.sh --compare-baseline - -# Save new baseline -./bench/scripts/run_all.sh --save-baseline -``` - -## Performance Budgets - -These are the target latencies for production use: - -| Metric | Budget | Description | -|--------|--------|-------------| -| Tile P99 | < 100ms | HTTP tile serving latency | -| Overlay P99 | < 100ms | HTTP cell overlay query latency | -| Cursor P99 | < 100ms | WebSocket cursor broadcast | -| Viewport P99 | < 150ms | WebSocket viewport broadcast | -| Message Handling | < 10ms | Server-side message processing | - -## Baseline Management - -### Creating a Baseline - -```bash -# Run benchmarks and save as baseline -./bench/scripts/run_all.sh --save-baseline - -# Or manually from results -./bench/scripts/compare_baseline.py \ - --save-baseline bench/load_tests/results/latest/tile_stress.json \ - --output bench/baselines/tile_baseline.json \ - --description "Baseline after performance optimization" -``` - -### Comparing to Baseline - -```bash -# Compare and output to terminal -./bench/scripts/compare_baseline.py \ - --current bench/load_tests/results/latest/tile_stress.json \ - --baseline bench/baselines/tile_baseline.json - -# Markdown output (for PR comments) -./bench/scripts/compare_baseline.py \ - --current results.json \ - --baseline baseline.json \ - --markdown - -# CI mode (exit code 1 on regression) -./bench/scripts/compare_baseline.py \ - --current results.json \ - --baseline baseline.json \ - --threshold 10 \ - --ci -``` - -## CI Integration - -### GitHub Actions - -The existing `.github/workflows/perf.yml` can be extended: - -```yaml -- name: Run benchmark suite - run: | - ./bench/scripts/run_all.sh \ - --quick \ - --compare-baseline \ - 2>&1 | tee benchmark_output.txt - -- name: Check for regressions - run: | - if grep -q "FAILED" benchmark_output.txt; then - echo "Performance regression detected!" 
- exit 1 - fi -``` - -### Exit Codes - -All scripts follow Unix conventions: -- `0`: Success / no regressions -- `1`: Failure / regression detected -- `2`: Configuration or dependency error - -## Interpreting Results - -### HTTP Tile Benchmarks - -``` -Throughput: 450 req/s # Higher is better -P50 latency: 8.5ms # Median response time -P95 latency: 25.3ms # 95th percentile -P99 latency: 48.2ms # 99th percentile (main target) -Success rate: 100% # Should be 100% -``` - -**What "good" looks like:** -- P99 < 100ms for tile serving -- Success rate > 99% -- Throughput scales linearly with concurrency up to CPU saturation - -### Cell Overlay Benchmarks - -``` -Throughput: 800 req/s # Higher is better (faster than tiles) -P50 latency: 3.2ms # Median response time -P95 latency: 12.1ms # 95th percentile -P99 latency: 28.5ms # 99th percentile (main target) -Success rate: 100% # Should be 100% -``` - -**What "good" looks like:** -- P99 < 100ms for cell overlay queries -- Success rate > 99% -- Should be faster than tile serving (no JPEG encoding overhead) - -### WebSocket Benchmarks - -``` -Messages sent: 9000 -Messages received: 180000 # ~20x sent (fan-out to followers) -Cursor P99: 45ms # Broadcast latency -Viewport P99: 62ms # Slightly larger messages -``` - -**What "good" looks like:** -- Cursor P99 < 100ms -- Viewport P99 < 150ms -- No message drops (received ≈ sent × followers) - -### Micro-benchmarks - -``` -jpeg_encoding/256x256/85 time: [1.2345 ms 1.2456 ms 1.2567 ms] -``` - -- **Low/Mid/High**: Confidence interval for timing -- Compare to previous runs to detect regressions -- HTML reports in `target/criterion/` show trends over time - -## Troubleshooting - -### "oha not found" - -```bash -cargo install oha -``` - -### "Server not responding" - -Ensure the server is running: -```bash -cd server && cargo run --release -``` - -Or specify a different URL: -```bash -./bench/load_tests/scenarios/tile_stress.sh --url http://localhost:9090 -``` - -### "No slides found" - -The tile tests require at least one slide in the server's slides directory: -```bash -# Check configured slides directory in .env or environment -ls $SLIDES_DIR - -# Place WSI files (.svs, .ndpi, .tiff, etc.) in the slides directory -``` - -### Benchmark results vary widely - -- Ensure no other CPU-intensive processes are running -- Run multiple iterations and compare medians -- For Criterion benchmarks, the tool handles statistical analysis automatically -- For load tests, use longer durations for more stable results - -### WebSocket tests timeout - -Check that: -1. Server is compiled in release mode (`cargo build --release`) -2. No firewall blocking WebSocket connections -3. Sufficient file descriptors (`ulimit -n`) - -## Adding New Benchmarks - -### New Load Test Scenario - -1. Create script in `bench/load_tests/scenarios/` -2. Follow the pattern of existing scripts (argument parsing, colors, etc.) -3. Output JSON for machine parsing -4. Add to `run_all.sh` if appropriate - -## Server Metrics - -The server exposes Prometheus metrics at `/metrics/prometheus`: - -```bash -# Key metrics for benchmarking -curl -s http://localhost:8080/metrics/prometheus | grep pathcollab - -# Tile serving -pathcollab_tile_requests_total -pathcollab_tile_duration_seconds -pathcollab_tile_phase_duration_seconds{phase="read|resize|encode"} - -# WebSocket -pathcollab_ws_messages_total -pathcollab_ws_message_duration_seconds -pathcollab_ws_broadcast_duration_seconds -``` - -These can be scraped during load tests for detailed analysis. 
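For the metrics-scraping note that closes the README above, a minimal sketch of sampling the Prometheus endpoint while a load test runs; the endpoint URL and `pathcollab` metric prefix come from the README text, while the output path, sample count, and 5-second interval are illustrative assumptions, not part of this change set:

```bash
#!/usr/bin/env bash
# Sample PathCollab Prometheus metrics while a load test runs.
# Assumes the server is listening on localhost:8080 (as in the README above);
# the output file, 12 samples, and 5s interval are arbitrary illustrative choices.
OUT=/tmp/pathcollab_metrics_during_load.txt
for i in $(seq 1 12); do   # roughly 60s of samples
  echo "=== sample $i ($(date -u +%H:%M:%S)) ===" >> "$OUT"
  curl -s http://localhost:8080/metrics/prometheus | grep '^pathcollab' >> "$OUT"
  sleep 5
done
```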
diff --git a/bench/baselines/tile_baseline.json b/bench/baselines/tile_baseline.json deleted file mode 100644 index e6ad512..0000000 --- a/bench/baselines/tile_baseline.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "created_at": "2026-01-18T18:02:29.793281Z", - "description": "Initial baseline from TCGA slides with 20 concurrent connections", - "metrics": { - "requests_per_sec": 44.65582630258221, - "success_rate": 100.0, - "p50_ms": 527.124199, - "p90_ms": 557.954244, - "p95_ms": 564.5460730000001, - "p99_ms": 578.905558 - }, - "raw_data": { - "summary": { - "successRate": 1.0, - "total": 30.007282609, - "slowest": 0.599721819, - "fastest": 0.027209147, - "average": 0.4501082052939393, - "requestsPerSec": 44.65582630258221, - "totalData": 5404080, - "sizePerRequest": 4094, - "sizePerSec": 180092.28194422275 - }, - "responseTimeHistogram": { - "0.027209147": 1, - "0.0844604142": 202, - "0.14171168139999998": 24, - "0.19896294859999997": 4, - "0.2562142158": 4, - "0.313465483": 1, - "0.37071675019999994": 3, - "0.42796801739999996": 2, - "0.48521928459999997": 8, - "0.5424705517999999": 682, - "0.599721819": 389 - }, - "latencyPercentiles": { - "p10": 0.068543644, - "p25": 0.505791205, - "p50": 0.527124199, - "p75": 0.545522472, - "p90": 0.557954244, - "p95": 0.564546073, - "p99": 0.578905558, - "p99.9": 0.59607328, - "p99.99": 0.599721819 - }, - "rps": { - "mean": 28874.193265230362, - "stddev": 964723.1848951668, - "max": 33333330.575321194, - "min": 19.983695702350346, - "percentiles": { - "p10": 25.598587777109657, - "p25": 28.246009523819943, - "p50": 31.68769218981417, - "p75": 155.03267856313718, - "p90": 500.63875246830406, - "p95": 917.303275139838, - "p99": 12072.337445898871, - "p99.9": 341880.34182995924, - "p99.99": 33333330.575321194 - } - }, - "details": { - "DNSDialup": { - "average": 0.0006074041, - "fastest": 0.000105827, - "slowest": 0.001841543 - }, - "DNSLookup": { - "average": 3.813065000000001e-05, - "fastest": 3.036e-06, - "slowest": 0.000243265 - } - }, - "statusCodeDistribution": { - "200": 1320 - }, - "errorDistribution": { - "aborted due to deadline": 20 - } - } -} \ No newline at end of file diff --git a/bench/baselines/websocket_baseline.json b/bench/baselines/websocket_baseline.json deleted file mode 100644 index a342f28..0000000 --- a/bench/baselines/websocket_baseline.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "created_at": "2026-01-18T00:00:00Z", - "description": "Initial baseline - placeholder for WebSocket performance", - "metrics": { - "cursor_p99_ms": 100, - "viewport_p99_ms": 150, - "message_handling_p99_ms": 10 - }, - "notes": "This is a placeholder baseline derived from the performance budgets in tests/load_tests/mod.rs" -} diff --git a/bench/load_tests/results/.gitignore b/bench/load_tests/results/.gitignore deleted file mode 100644 index b2fe286..0000000 --- a/bench/load_tests/results/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Ignore all benchmark results (raw data) -* - -# But track this .gitignore -!.gitignore - -# And track any README -!README.md diff --git a/bench/load_tests/scenarios/combined_load.sh b/bench/load_tests/scenarios/combined_load.sh deleted file mode 100755 index a4547f0..0000000 --- a/bench/load_tests/scenarios/combined_load.sh +++ /dev/null @@ -1,306 +0,0 @@ -#!/usr/bin/env bash -# -# combined_load.sh - Combined HTTP tile + WebSocket session load test -# -# This script simulates realistic production load by running: -# - HTTP tile requests (simulating viewport navigation) -# - WebSocket sessions with cursor/viewport updates (using 
Rust load tests) -# -# This captures the combined effect of both workloads on server performance. -# -# Prerequisites: -# - oha: cargo install oha -# - Built Rust server and tests -# -# Usage: -# ./combined_load.sh [OPTIONS] -# -# Options: -# -u, --url Base URL (default: http://127.0.0.1:8080) -# -s, --slide Slide ID (default: auto-detect) -# --tile-concurrent Concurrent tile requests (default: 10) -# --ws-sessions Number of WebSocket sessions (default: 3) -# --ws-followers Followers per session (default: 10) -# -d, --duration Test duration in seconds (default: 30) -# -o, --output Output directory (default: bench/load_tests/results) -# -h, --help Show this help message - -set -euo pipefail - -# Default configuration -BASE_URL="${BASE_URL:-http://127.0.0.1:8080}" -WS_URL="${WS_URL:-ws://127.0.0.1:8080/ws}" -SLIDE_ID="" -TILE_CONCURRENT=10 -WS_SESSIONS=3 -WS_FOLLOWERS=10 -DURATION=30 -OUTPUT_DIR="bench/load_tests/results" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -u|--url) - BASE_URL="$2" - WS_URL="ws://${2#http://}/ws" - WS_URL="${WS_URL/https:/wss:}" - shift 2 - ;; - -s|--slide) - SLIDE_ID="$2" - shift 2 - ;; - --tile-concurrent) - TILE_CONCURRENT="$2" - shift 2 - ;; - --ws-sessions) - WS_SESSIONS="$2" - shift 2 - ;; - --ws-followers) - WS_FOLLOWERS="$2" - shift 2 - ;; - -d|--duration) - DURATION="$2" - shift 2 - ;; - -o|--output) - OUTPUT_DIR="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Ensure we're in the project root -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" - -# Create output directory -mkdir -p "$OUTPUT_DIR" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) - -# Check for oha -if ! command -v oha &> /dev/null; then - log_error "oha is not installed. Install with: cargo install oha" - exit 1 -fi - -# Check server health -log_info "Checking server health at $BASE_URL..." -if ! curl -sf "$BASE_URL/health" > /dev/null 2>&1; then - log_error "Server not responding at $BASE_URL" - exit 1 -fi -log_success "Server is healthy" - -# Auto-detect slide -if [[ -z "$SLIDE_ID" ]]; then - SLIDES_JSON=$(curl -sf "$BASE_URL/api/slides" 2>/dev/null || echo "[]") - SLIDE_ID=$(echo "$SLIDES_JSON" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4 || echo "") - - if [[ -z "$SLIDE_ID" ]]; then - DEFAULT_JSON=$(curl -sf "$BASE_URL/api/slides/default" 2>/dev/null || echo "{}") - SLIDE_ID=$(echo "$DEFAULT_JSON" | grep -o '"slide_id":"[^"]*"' | cut -d'"' -f4 || echo "") - fi - - if [[ -z "$SLIDE_ID" ]]; then - log_error "No slides available. Place WSI files in the slides directory." 
- exit 1 - fi -fi -log_success "Using slide: $SLIDE_ID" - -# Get slide metadata -METADATA=$(curl -sf "$BASE_URL/api/slide/$SLIDE_ID" 2>/dev/null || echo "{}") -NUM_LEVELS=$(echo "$METADATA" | grep -o '"num_levels":[0-9]*' | cut -d':' -f2 || echo "10") -TEST_LEVEL=$((NUM_LEVELS / 2)) -[[ $TEST_LEVEL -lt 5 ]] && TEST_LEVEL=5 - -TEST_URL="$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/10/10" - -echo "" -echo "==========================================" -echo " Combined Load Test" -echo "==========================================" -echo " HTTP Base URL: $BASE_URL" -echo " WebSocket URL: $WS_URL" -echo " Slide: $SLIDE_ID" -echo " Tile concurrent: $TILE_CONCURRENT" -echo " WS sessions: $WS_SESSIONS" -echo " WS followers/sess: $WS_FOLLOWERS" -echo " Duration: ${DURATION}s" -echo "==========================================" -echo "" - -# Prepare output files -TILE_OUTPUT="$OUTPUT_DIR/combined_${TIMESTAMP}_tiles.json" -WS_OUTPUT="$OUTPUT_DIR/combined_${TIMESTAMP}_websocket.txt" -SUMMARY_FILE="$OUTPUT_DIR/combined_${TIMESTAMP}_summary.txt" - -# Collect initial metrics from server -log_info "Collecting baseline metrics..." -BASELINE_METRICS=$(curl -sf "$BASE_URL/metrics" 2>/dev/null || echo "{}") -BASELINE_CONNECTIONS=$(echo "$BASELINE_METRICS" | grep -o '"total_connections":[0-9]*' | cut -d':' -f2 || echo "0") - -# Start tile load test in background -log_info "Starting HTTP tile load test ($TILE_CONCURRENT concurrent)..." -oha -c "$TILE_CONCURRENT" -z "${DURATION}s" --json "$TEST_URL" > "$TILE_OUTPUT" 2>&1 & -TILE_PID=$! - -# Start WebSocket load test in background (using Rust tests) -log_info "Starting WebSocket load test ($WS_SESSIONS sessions, $WS_FOLLOWERS followers each)..." - -# Create a temporary test file for custom configuration -# We use environment variables to configure the Rust test -export LOAD_TEST_WS_URL="$WS_URL" -export LOAD_TEST_SESSIONS="$WS_SESSIONS" -export LOAD_TEST_FOLLOWERS="$WS_FOLLOWERS" -export LOAD_TEST_DURATION="$DURATION" - -# Run the Rust WebSocket test (if compiled) -if [[ -f "$PROJECT_ROOT/target/release/deps/perf_tests"* ]]; then - cd "$PROJECT_ROOT" - cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture > "$WS_OUTPUT" 2>&1 & - WS_PID=$! -else - log_warn "WebSocket tests not compiled (run: cargo build --release --tests)" - log_info "Running tile-only load test..." - WS_PID="" -fi - -# Wait for tests to complete -log_info "Tests running... waiting ${DURATION}s + buffer" - -# Monitor progress -ELAPSED=0 -while [[ $ELAPSED -lt $DURATION ]]; do - sleep 5 - ELAPSED=$((ELAPSED + 5)) - CURRENT_METRICS=$(curl -sf "$BASE_URL/metrics" 2>/dev/null || echo "{}") - CURRENT_CONNECTIONS=$(echo "$CURRENT_METRICS" | grep -o '"total_connections":[0-9]*' | cut -d':' -f2 || echo "?") - echo -e " [${ELAPSED}s/${DURATION}s] Active connections: $CURRENT_CONNECTIONS" -done - -# Wait for background jobs -log_info "Waiting for test completion..." 
-wait $TILE_PID || true -if [[ -n "${WS_PID:-}" ]]; then - wait $WS_PID || true -fi - -# Collect final metrics -FINAL_METRICS=$(curl -sf "$BASE_URL/metrics" 2>/dev/null || echo "{}") - -echo "" -echo "==========================================" -echo " Combined Test Results" -echo "==========================================" - -# Parse tile results -echo "" -echo "--- HTTP Tile Results ---" -if [[ -f "$TILE_OUTPUT" ]] && command -v jq &> /dev/null; then - TILE_RPS=$(jq -r '.summary.requestsPerSec // 0 | floor' "$TILE_OUTPUT") - TILE_P50=$(jq -r '(.latencyPercentiles.p50 // 0) * 1000 | floor' "$TILE_OUTPUT") - TILE_P95=$(jq -r '(.latencyPercentiles.p95 // 0) * 1000 | floor' "$TILE_OUTPUT") - TILE_P99=$(jq -r '(.latencyPercentiles.p99 // 0) * 1000 | floor' "$TILE_OUTPUT") - TILE_SUCCESS=$(jq -r '(.summary.successRate // 1) * 100 | floor' "$TILE_OUTPUT") - - echo " Throughput: $TILE_RPS req/s" - echo " P50 latency: ${TILE_P50}ms" - echo " P95 latency: ${TILE_P95}ms" - echo " P99 latency: ${TILE_P99}ms" - echo " Success rate: ${TILE_SUCCESS}%" -else - echo " (Results file not found or jq not available)" - TILE_RPS=0 - TILE_P99=0 -fi - -# Parse WebSocket results -echo "" -echo "--- WebSocket Results ---" -if [[ -f "$WS_OUTPUT" ]]; then - if grep -q "PASS" "$WS_OUTPUT"; then - echo " Status: PASS" - elif grep -q "FAIL" "$WS_OUTPUT"; then - echo " Status: FAIL" - fi - - # Extract P99 from output - WS_CURSOR_P99=$(grep "Cursor.*P99:" "$WS_OUTPUT" | grep -o '[0-9.]*ms' | head -1 || echo "N/A") - WS_VIEWPORT_P99=$(grep "Viewport.*P99:" "$WS_OUTPUT" | grep -o '[0-9.]*ms' | head -1 || echo "N/A") - WS_SENT=$(grep "Messages sent:" "$WS_OUTPUT" | grep -o '[0-9]*' || echo "N/A") - WS_RECV=$(grep "Messages received:" "$WS_OUTPUT" | grep -o '[0-9]*' || echo "N/A") - - echo " Cursor P99: $WS_CURSOR_P99" - echo " Viewport P99: $WS_VIEWPORT_P99" - echo " Messages sent: $WS_SENT" - echo " Messages recv: $WS_RECV" -else - echo " (WebSocket test not run)" -fi - -# Generate summary -{ - echo "Combined Load Test Summary" - echo "==========================" - echo "" - echo "Test Configuration:" - echo " Duration: ${DURATION}s" - echo " Tile concurrent: $TILE_CONCURRENT" - echo " WS sessions: $WS_SESSIONS × $WS_FOLLOWERS followers" - echo "" - echo "HTTP Tile Results:" - echo " Throughput: ${TILE_RPS:-N/A} req/s" - echo " P99 latency: ${TILE_P99:-N/A}ms" - echo "" - echo "WebSocket Results:" - echo " Cursor P99: ${WS_CURSOR_P99:-N/A}" - echo " Viewport P99: ${WS_VIEWPORT_P99:-N/A}" - echo "" - echo "Files:" - echo " Tile results: $TILE_OUTPUT" - echo " WebSocket results: $WS_OUTPUT" -} > "$SUMMARY_FILE" - -echo "" -log_success "Results saved to $OUTPUT_DIR" diff --git a/bench/load_tests/scenarios/overlay_stress.sh b/bench/load_tests/scenarios/overlay_stress.sh deleted file mode 100755 index afab1a0..0000000 --- a/bench/load_tests/scenarios/overlay_stress.sh +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env bash -# -# overlay_stress.sh - HTTP load test for cell overlay endpoints -# -# This script hammers the cell overlay endpoint to measure: -# - Latency percentiles (p50, p90, p95, p99, p99.9) -# - Throughput (requests/second) -# - Error rates -# -# Prerequisites: -# - oha: cargo install oha -# - Running PathCollab server with slides and overlays available -# -# Usage: -# ./overlay_stress.sh [OPTIONS] -# -# Options: -# -u, --url Base URL (default: http://127.0.0.1:8080) -# -s, --slide Slide ID to test (default: auto-detect from /api/slides) -# -c, --concurrent Concurrent connections (default: 10) -# -d, --duration 
Test duration in seconds (default: 30) -# -r, --rate Requests per second limit, 0=unlimited (default: 0) -# -v, --viewport-size Viewport size in pixels (default: 512) -# -o, --output Output file for JSON results (optional) -# -q, --quick Quick mode: 5 connections, 10 seconds -# -h, --help Show this help message - -set -euo pipefail - -# Default configuration -BASE_URL="${BASE_URL:-http://127.0.0.1:8080}" -SLIDE_ID="" -CONCURRENT=10 -DURATION=30 -RATE=0 -VIEWPORT_SIZE=512 -OUTPUT_FILE="" -QUICK_MODE=false - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -u|--url) - BASE_URL="$2" - shift 2 - ;; - -s|--slide) - SLIDE_ID="$2" - shift 2 - ;; - -c|--concurrent) - CONCURRENT="$2" - shift 2 - ;; - -d|--duration) - DURATION="$2" - shift 2 - ;; - -r|--rate) - RATE="$2" - shift 2 - ;; - -v|--viewport-size) - VIEWPORT_SIZE="$2" - shift 2 - ;; - -o|--output) - OUTPUT_FILE="$2" - shift 2 - ;; - -q|--quick) - QUICK_MODE=true - shift - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Quick mode overrides -if [[ "$QUICK_MODE" == "true" ]]; then - CONCURRENT=5 - DURATION=10 - log_info "Quick mode enabled: $CONCURRENT connections, ${DURATION}s duration" -fi - -# Check for oha -if ! command -v oha &> /dev/null; then - log_error "oha is not installed. Install with: cargo install oha" - exit 1 -fi - -# Check server health -log_info "Checking server health at $BASE_URL..." -if ! curl -sf "$BASE_URL/health" > /dev/null 2>&1; then - log_error "Server not responding at $BASE_URL" - exit 1 -fi -log_success "Server is healthy" - -# Auto-detect slide if not specified -if [[ -z "$SLIDE_ID" ]]; then - log_info "Auto-detecting slide ID..." - SLIDES_JSON=$(curl -sf "$BASE_URL/api/slides" 2>/dev/null || echo "[]") - SLIDE_ID=$(echo "$SLIDES_JSON" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4 || echo "") - - if [[ -z "$SLIDE_ID" ]]; then - # Try default slide endpoint - DEFAULT_JSON=$(curl -sf "$BASE_URL/api/slides/default" 2>/dev/null || echo "{}") - SLIDE_ID=$(echo "$DEFAULT_JSON" | grep -o '"slide_id":"[^"]*"' | cut -d'"' -f4 || echo "") - fi - - if [[ -z "$SLIDE_ID" ]]; then - log_error "No slides found. Ensure slides are configured or use --slide" - exit 1 - fi -fi -log_success "Using slide: $SLIDE_ID" - -# Check overlay availability with retry for loading state -log_info "Checking overlay availability..." -OVERLAY_READY=false -for i in {1..10}; do - OVERLAY_RESPONSE=$(curl -sf -w "\n%{http_code}" "$BASE_URL/api/slide/$SLIDE_ID/overlay/metadata" 2>/dev/null || echo -e "\n000") - HTTP_CODE=$(echo "$OVERLAY_RESPONSE" | tail -1) - - if [[ "$HTTP_CODE" == "200" ]]; then - OVERLAY_READY=true - break - elif [[ "$HTTP_CODE" == "202" ]]; then - log_info "Overlay still loading, waiting... 
(attempt $i/10)" - sleep 1 - elif [[ "$HTTP_CODE" == "404" ]]; then - log_error "No overlay available for slide $SLIDE_ID" - exit 1 - else - log_warn "Unexpected response code: $HTTP_CODE (attempt $i/10)" - sleep 1 - fi -done - -if [[ "$OVERLAY_READY" != "true" ]]; then - log_error "Overlay not ready after 10 attempts" - exit 1 -fi -log_success "Overlay is ready" - -# Get slide dimensions -log_info "Fetching slide metadata..." -METADATA=$(curl -sf "$BASE_URL/api/slide/$SLIDE_ID" 2>/dev/null || echo "{}") -WIDTH=$(echo "$METADATA" | grep -o '"width":[0-9]*' | cut -d':' -f2 || echo "10000") -HEIGHT=$(echo "$METADATA" | grep -o '"height":[0-9]*' | cut -d':' -f2 || echo "10000") - -log_info "Slide dimensions: ${WIDTH}x${HEIGHT}" - -# Calculate center and viewport regions -CENTER_X=$((WIDTH / 2)) -CENTER_Y=$((HEIGHT / 2)) - -echo "" -echo "==========================================" -echo " Overlay Stress Test Configuration" -echo "==========================================" -echo " URL: $BASE_URL" -echo " Slide: $SLIDE_ID" -echo " Viewport: ${VIEWPORT_SIZE}x${VIEWPORT_SIZE}" -echo " Concurrent: $CONCURRENT" -echo " Duration: ${DURATION}s" -echo " Rate limit: ${RATE:-unlimited} req/s" -echo "==========================================" -echo "" - -# Generate viewport region URLs file for reference (3x3 grid around center) -URLS_FILE=$(mktemp) -trap "rm -f $URLS_FILE" EXIT - -log_info "Generating viewport regions (3x3 grid around center)..." -for dx in -$VIEWPORT_SIZE 0 $VIEWPORT_SIZE; do - for dy in -$VIEWPORT_SIZE 0 $VIEWPORT_SIZE; do - x=$((CENTER_X + dx)) - y=$((CENTER_Y + dy)) - # Clamp to bounds - if [[ $x -lt 0 ]]; then x=0; fi - if [[ $y -lt 0 ]]; then y=0; fi - if [[ $x -gt $((WIDTH - VIEWPORT_SIZE)) ]]; then x=$((WIDTH - VIEWPORT_SIZE)); fi - if [[ $y -gt $((HEIGHT - VIEWPORT_SIZE)) ]]; then y=$((HEIGHT - VIEWPORT_SIZE)); fi - echo "$BASE_URL/api/slide/$SLIDE_ID/overlay/cells?x=$x&y=$y&width=$VIEWPORT_SIZE&height=$VIEWPORT_SIZE" >> "$URLS_FILE" - done -done - -log_info "Generated $(wc -l < "$URLS_FILE") viewport region URLs" - -# Build oha command -OHA_CMD="oha" -OHA_CMD="$OHA_CMD -c $CONCURRENT" -OHA_CMD="$OHA_CMD -z ${DURATION}s" -OHA_CMD="$OHA_CMD --no-tui" - -if [[ $RATE -gt 0 ]]; then - OHA_CMD="$OHA_CMD -q $RATE" -fi - -# Add JSON output if requested -if [[ -n "$OUTPUT_FILE" ]]; then - OHA_CMD="$OHA_CMD --output-format json -o $OUTPUT_FILE" -fi - -# Test a representative center region URL -# oha doesn't support URL files directly, so we test the center viewport -TEST_URL="$BASE_URL/api/slide/$SLIDE_ID/overlay/cells?x=$CENTER_X&y=$CENTER_Y&width=$VIEWPORT_SIZE&height=$VIEWPORT_SIZE" - -log_info "Testing overlay cells endpoint: $TEST_URL" -log_info "Starting load test..." 
-echo "" - -if [[ -n "$OUTPUT_FILE" ]]; then - $OHA_CMD "$TEST_URL" 2>&1 - log_success "Results saved to $OUTPUT_FILE" - - # Also print summary - echo "" - echo "==========================================" - echo " Results Summary (from JSON)" - echo "==========================================" - if command -v jq &> /dev/null && [[ -f "$OUTPUT_FILE" ]]; then - jq -r ' - "Duration: \(.summary.total | floor)s", - "Requests: \(.statusCodeDistribution | to_entries | map(.value) | add)", - "Successful: \(.summary.successRate * 100 | floor)%", - "Req/sec: \(.summary.requestsPerSec | floor)", - "", - "Latency:", - " P50: \(.latencyPercentiles.p50 * 1000 | floor)ms", - " P90: \(.latencyPercentiles.p90 * 1000 | floor)ms", - " P95: \(.latencyPercentiles.p95 * 1000 | floor)ms", - " P99: \(.latencyPercentiles.p99 * 1000 | floor)ms", - " P99.9: \(.latencyPercentiles."p99.9" * 1000 | floor)ms" - ' "$OUTPUT_FILE" 2>/dev/null || cat "$OUTPUT_FILE" - else - cat "$OUTPUT_FILE" 2>/dev/null || echo "(output file not available)" - fi -else - $OHA_CMD "$TEST_URL" -fi - -echo "" -log_success "Overlay stress test completed" diff --git a/bench/load_tests/scenarios/ramp_test.sh b/bench/load_tests/scenarios/ramp_test.sh deleted file mode 100755 index 6b35732..0000000 --- a/bench/load_tests/scenarios/ramp_test.sh +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env bash -# -# ramp_test.sh - Gradual load increase test to find breaking point -# -# This script increases concurrent connections gradually to identify: -# - Maximum sustainable throughput -# - Breaking point where latency degrades significantly -# - Error threshold (when errors start appearing) -# -# Prerequisites: -# - oha: cargo install oha -# - Running PathCollab server with slides available -# -# Usage: -# ./ramp_test.sh [OPTIONS] -# -# Options: -# -u, --url Base URL (default: http://127.0.0.1:8080) -# -s, --slide Slide ID to test (default: auto-detect) -# --start Starting concurrent connections (default: 1) -# --end Maximum concurrent connections (default: 100) -# --step Concurrency increase per stage (default: 10) -# --stage-duration Duration per stage in seconds (default: 10) -# -o, --output Output directory for results (default: bench/load_tests/results) -# -h, --help Show this help message - -set -euo pipefail - -# Default configuration -BASE_URL="${BASE_URL:-http://127.0.0.1:8080}" -SLIDE_ID="" -START_CONCURRENCY=1 -END_CONCURRENCY=100 -STEP=10 -STAGE_DURATION=10 -OUTPUT_DIR="bench/load_tests/results" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -log_stage() { - echo -e "${CYAN}[STAGE]${NC} $1" -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -u|--url) - BASE_URL="$2" - shift 2 - ;; - -s|--slide) - SLIDE_ID="$2" - shift 2 - ;; - --start) - START_CONCURRENCY="$2" - shift 2 - ;; - --end) - END_CONCURRENCY="$2" - shift 2 - ;; - --step) - STEP="$2" - shift 2 - ;; - --stage-duration) - STAGE_DURATION="$2" - shift 2 - ;; - -o|--output) - OUTPUT_DIR="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Check for oha -if ! command -v oha &> /dev/null; then - log_error "oha is not installed. 
Install with: cargo install oha" - exit 1 -fi - -# Create output directory -mkdir -p "$OUTPUT_DIR" - -# Check server health -log_info "Checking server health at $BASE_URL..." -if ! curl -sf "$BASE_URL/health" > /dev/null 2>&1; then - log_error "Server not responding at $BASE_URL" - exit 1 -fi -log_success "Server is healthy" - -# Auto-detect slide if not specified -if [[ -z "$SLIDE_ID" ]]; then - SLIDES_JSON=$(curl -sf "$BASE_URL/api/slides" 2>/dev/null || echo "[]") - SLIDE_ID=$(echo "$SLIDES_JSON" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4 || echo "") - - if [[ -z "$SLIDE_ID" ]]; then - DEFAULT_JSON=$(curl -sf "$BASE_URL/api/slides/default" 2>/dev/null || echo "{}") - SLIDE_ID=$(echo "$DEFAULT_JSON" | grep -o '"slide_id":"[^"]*"' | cut -d'"' -f4 || echo "") - fi - - if [[ -z "$SLIDE_ID" ]]; then - log_error "No slides available. Place WSI files in the slides directory." - exit 1 - fi -fi -log_success "Using slide: $SLIDE_ID" - -# Get slide metadata -METADATA=$(curl -sf "$BASE_URL/api/slide/$SLIDE_ID" 2>/dev/null || echo "{}") -NUM_LEVELS=$(echo "$METADATA" | grep -o '"num_levels":[0-9]*' | cut -d':' -f2 || echo "10") -TEST_LEVEL=$((NUM_LEVELS / 2)) -[[ $TEST_LEVEL -lt 5 ]] && TEST_LEVEL=5 - -TEST_URL="$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/10/10" - -# Prepare results file -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -RESULTS_FILE="$OUTPUT_DIR/ramp_${TIMESTAMP}.csv" -SUMMARY_FILE="$OUTPUT_DIR/ramp_${TIMESTAMP}_summary.txt" - -echo "" -echo "==========================================" -echo " Ramp-Up Load Test" -echo "==========================================" -echo " URL: $BASE_URL" -echo " Slide: $SLIDE_ID" -echo " Level: $TEST_LEVEL" -echo " Start: $START_CONCURRENCY connections" -echo " End: $END_CONCURRENCY connections" -echo " Step: +$STEP per stage" -echo " Stage duration: ${STAGE_DURATION}s" -echo " Output: $RESULTS_FILE" -echo "==========================================" -echo "" - -# CSV header -echo "concurrency,requests,success_rate,rps,p50_ms,p90_ms,p95_ms,p99_ms,errors" > "$RESULTS_FILE" - -# Track best performance -BEST_RPS=0 -BEST_CONCURRENCY=0 -BREAKING_POINT=0 - -# Run stages -CURRENT=$START_CONCURRENCY -STAGE=1 - -while [[ $CURRENT -le $END_CONCURRENCY ]]; do - log_stage "Stage $STAGE: $CURRENT concurrent connections" - - # Run oha and capture JSON output - STAGE_OUTPUT=$(oha -c "$CURRENT" -z "${STAGE_DURATION}s" --json "$TEST_URL" 2>/dev/null || echo "{}") - - # Parse results (using grep/sed for portability, jq if available) - if command -v jq &> /dev/null; then - REQUESTS=$(echo "$STAGE_OUTPUT" | jq -r '.summary.total // 0') - SUCCESS_RATE=$(echo "$STAGE_OUTPUT" | jq -r '(.summary.successRate // 1) * 100 | floor') - RPS=$(echo "$STAGE_OUTPUT" | jq -r '.summary.requestsPerSec // 0 | floor') - P50=$(echo "$STAGE_OUTPUT" | jq -r '(.latencyPercentiles.p50 // 0) * 1000 | floor') - P90=$(echo "$STAGE_OUTPUT" | jq -r '(.latencyPercentiles.p90 // 0) * 1000 | floor') - P95=$(echo "$STAGE_OUTPUT" | jq -r '(.latencyPercentiles.p95 // 0) * 1000 | floor') - P99=$(echo "$STAGE_OUTPUT" | jq -r '(.latencyPercentiles.p99 // 0) * 1000 | floor') - ERRORS=$(echo "$STAGE_OUTPUT" | jq -r '.statusCodeDistribution | to_entries | map(select(.key | startswith("5") or startswith("4"))) | map(.value) | add // 0') - else - # Fallback parsing - REQUESTS=$(echo "$STAGE_OUTPUT" | grep -o '"total":[0-9]*' | cut -d':' -f2 || echo "0") - SUCCESS_RATE="100" - RPS=$(echo "$STAGE_OUTPUT" | grep -o '"requestsPerSec":[0-9.]*' | cut -d':' -f2 | cut -d'.' 
-f1 || echo "0") - P50="0" - P90="0" - P95="0" - P99="0" - ERRORS="0" - fi - - # Record to CSV - echo "$CURRENT,$REQUESTS,$SUCCESS_RATE,$RPS,$P50,$P90,$P95,$P99,$ERRORS" >> "$RESULTS_FILE" - - # Print stage summary - echo " Requests: $REQUESTS | RPS: $RPS | P99: ${P99}ms | Success: ${SUCCESS_RATE}%" - - # Track best RPS - if [[ $RPS -gt $BEST_RPS ]]; then - BEST_RPS=$RPS - BEST_CONCURRENCY=$CURRENT - fi - - # Detect breaking point (P99 > 500ms or success rate drops) - if [[ $P99 -gt 500 || $SUCCESS_RATE -lt 95 ]]; then - if [[ $BREAKING_POINT -eq 0 ]]; then - BREAKING_POINT=$CURRENT - log_warn "Performance degradation detected at $CURRENT connections" - fi - fi - - # Next stage - CURRENT=$((CURRENT + STEP)) - STAGE=$((STAGE + 1)) - - # Brief pause between stages - sleep 1 -done - -echo "" -echo "==========================================" -echo " Ramp-Up Test Complete" -echo "==========================================" - -# Generate summary -{ - echo "Ramp-Up Load Test Summary" - echo "=========================" - echo "" - echo "Test Parameters:" - echo " URL: $BASE_URL" - echo " Slide: $SLIDE_ID" - echo " Duration per stage: ${STAGE_DURATION}s" - echo "" - echo "Results:" - echo " Best throughput: $BEST_RPS req/s at $BEST_CONCURRENCY connections" - if [[ $BREAKING_POINT -gt 0 ]]; then - echo " Breaking point: $BREAKING_POINT connections" - else - echo " Breaking point: Not reached (max: $END_CONCURRENCY)" - fi - echo "" - echo "Full results: $RESULTS_FILE" -} | tee "$SUMMARY_FILE" - -echo "" -log_success "Results saved to $OUTPUT_DIR" diff --git a/bench/load_tests/scenarios/tile_stress.sh b/bench/load_tests/scenarios/tile_stress.sh deleted file mode 100755 index 67c4ec7..0000000 --- a/bench/load_tests/scenarios/tile_stress.sh +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env bash -# -# tile_stress.sh - HTTP load test for tile serving endpoints -# -# This script hammers the tile serving endpoint to measure: -# - Latency percentiles (p50, p90, p95, p99, p99.9) -# - Throughput (requests/second) -# - Error rates -# -# Prerequisites: -# - oha: cargo install oha -# - Running PathCollab server with slides available -# -# Usage: -# ./tile_stress.sh [OPTIONS] -# -# Options: -# -u, --url Base URL (default: http://127.0.0.1:8080) -# -s, --slide Slide ID to test (default: auto-detect from /api/slides) -# -c, --concurrent Concurrent connections (default: 10) -# -d, --duration Test duration in seconds (default: 30) -# -r, --rate Requests per second limit, 0=unlimited (default: 0) -# -o, --output Output file for JSON results (optional) -# -q, --quick Quick mode: 5 connections, 10 seconds -# -h, --help Show this help message - -set -euo pipefail - -# Default configuration -BASE_URL="${BASE_URL:-http://127.0.0.1:8080}" -SLIDE_ID="" -CONCURRENT=10 -DURATION=30 -RATE=0 -OUTPUT_FILE="" -QUICK_MODE=false - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -u|--url) - BASE_URL="$2" - shift 2 - ;; - -s|--slide) - SLIDE_ID="$2" - shift 2 - ;; - -c|--concurrent) - CONCURRENT="$2" - shift 2 - ;; - -d|--duration) - DURATION="$2" - shift 2 - ;; - -r|--rate) - RATE="$2" - shift 2 - ;; - -o|--output) - 
OUTPUT_FILE="$2" - shift 2 - ;; - -q|--quick) - QUICK_MODE=true - shift - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Quick mode overrides -if [[ "$QUICK_MODE" == "true" ]]; then - CONCURRENT=5 - DURATION=10 - log_info "Quick mode enabled: $CONCURRENT connections, ${DURATION}s duration" -fi - -# Check for oha -if ! command -v oha &> /dev/null; then - log_error "oha is not installed. Install with: cargo install oha" - exit 1 -fi - -# Check server health -log_info "Checking server health at $BASE_URL..." -if ! curl -sf "$BASE_URL/health" > /dev/null 2>&1; then - log_error "Server not responding at $BASE_URL" - exit 1 -fi -log_success "Server is healthy" - -# Auto-detect slide if not specified -if [[ -z "$SLIDE_ID" ]]; then - log_info "Auto-detecting slide ID..." - SLIDES_JSON=$(curl -sf "$BASE_URL/api/slides" 2>/dev/null || echo "[]") - SLIDE_ID=$(echo "$SLIDES_JSON" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4 || echo "") - - if [[ -z "$SLIDE_ID" ]]; then - # Try default slide endpoint - DEFAULT_JSON=$(curl -sf "$BASE_URL/api/slides/default" 2>/dev/null || echo "{}") - SLIDE_ID=$(echo "$DEFAULT_JSON" | grep -o '"slide_id":"[^"]*"' | cut -d'"' -f4 || echo "") - fi - - if [[ -z "$SLIDE_ID" ]]; then - log_error "No slides found. Ensure slides are configured or use --slide" - exit 1 - fi -fi -log_success "Using slide: $SLIDE_ID" - -# Get slide metadata to determine valid tile coordinates -log_info "Fetching slide metadata..." -METADATA=$(curl -sf "$BASE_URL/api/slide/$SLIDE_ID" 2>/dev/null || echo "{}") -NUM_LEVELS=$(echo "$METADATA" | grep -o '"num_levels":[0-9]*' | cut -d':' -f2 || echo "10") -TILE_SIZE=$(echo "$METADATA" | grep -o '"tile_size":[0-9]*' | cut -d':' -f2 || echo "256") -WIDTH=$(echo "$METADATA" | grep -o '"width":[0-9]*' | cut -d':' -f2 || echo "10000") -HEIGHT=$(echo "$METADATA" | grep -o '"height":[0-9]*' | cut -d':' -f2 || echo "10000") - -# Calculate a level that has meaningful tiles (around 10-50 tiles across) -# DZI: level 0 = 1x1, level (N-1) = full resolution -# At level L, width = original_width / 2^(N-1-L) -# We want level where width / tile_size gives us ~20 tiles -# Test at level (NUM_LEVELS - 4) which is 1/8th of full resolution -TEST_LEVEL=$((NUM_LEVELS - 4)) -if [[ $TEST_LEVEL -lt 8 ]]; then - TEST_LEVEL=8 -fi -if [[ $TEST_LEVEL -ge $NUM_LEVELS ]]; then - TEST_LEVEL=$((NUM_LEVELS - 1)) -fi - -# Calculate tiles at this level -SCALE_FACTOR=$((1 << (NUM_LEVELS - 1 - TEST_LEVEL))) -LEVEL_WIDTH=$((WIDTH / SCALE_FACTOR)) -LEVEL_HEIGHT=$((HEIGHT / SCALE_FACTOR)) -MAX_TILE_X=$(( (LEVEL_WIDTH + TILE_SIZE - 1) / TILE_SIZE - 1 )) -MAX_TILE_Y=$(( (LEVEL_HEIGHT + TILE_SIZE - 1) / TILE_SIZE - 1 )) - -log_info "Slide: ${WIDTH}x${HEIGHT}, $NUM_LEVELS levels" -log_info "Testing at level $TEST_LEVEL (${LEVEL_WIDTH}x${LEVEL_HEIGHT}px, tiles: 0-${MAX_TILE_X} x 0-${MAX_TILE_Y})" - -# Build tile URL template -# We'll test a range of tile coordinates to simulate viewport panning -TILE_URL="$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/{x}/{y}" - -echo "" -echo "==========================================" -echo " Tile Stress Test Configuration" -echo "==========================================" -echo " URL: $BASE_URL" -echo " Slide: $SLIDE_ID" -echo " Level: $TEST_LEVEL" -echo " Concurrent: $CONCURRENT" -echo " Duration: ${DURATION}s" -echo " Rate limit: ${RATE:-unlimited} req/s" -echo "==========================================" -echo "" - -# Generate tile URLs file for oha (simulate viewport panning) -URLS_FILE=$(mktemp) 
-trap "rm -f $URLS_FILE" EXIT - -# Generate a grid of tile coordinates from center of slide -CENTER_X=$((MAX_TILE_X / 2)) -CENTER_Y=$((MAX_TILE_Y / 2)) -START_X=$((CENTER_X > 5 ? CENTER_X - 5 : 0)) -START_Y=$((CENTER_Y > 5 ? CENTER_Y - 5 : 0)) -END_X=$((START_X + 9 < MAX_TILE_X ? START_X + 9 : MAX_TILE_X)) -END_Y=$((START_Y + 9 < MAX_TILE_Y ? START_Y + 9 : MAX_TILE_Y)) - -for x in $(seq $START_X $END_X); do - for y in $(seq $START_Y $END_Y); do - echo "$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/$x/$y" >> "$URLS_FILE" - done -done - -log_info "Generated $(wc -l < "$URLS_FILE") tile URLs (tiles $START_X-$END_X x $START_Y-$END_Y)" -log_info "Starting load test..." -echo "" - -# Build oha command -OHA_CMD="oha" -OHA_CMD="$OHA_CMD -c $CONCURRENT" -OHA_CMD="$OHA_CMD -z ${DURATION}s" -OHA_CMD="$OHA_CMD --no-tui" - -if [[ $RATE -gt 0 ]]; then - OHA_CMD="$OHA_CMD -q $RATE" -fi - -# Add JSON output if requested -if [[ -n "$OUTPUT_FILE" ]]; then - OHA_CMD="$OHA_CMD --output-format json -o $OUTPUT_FILE" -fi - -# Run the load test with URL file -# oha doesn't support URL files directly, so we use a workaround with random selection -# Instead, we'll test a single representative tile URL at the center -TEST_TILE_URL="$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/$CENTER_X/$CENTER_Y" - -log_info "Testing tile: $TEST_TILE_URL" - -if [[ -n "$OUTPUT_FILE" ]]; then - $OHA_CMD "$TEST_TILE_URL" 2>&1 - log_success "Results saved to $OUTPUT_FILE" - - # Also print summary - echo "" - echo "==========================================" - echo " Results Summary (from JSON)" - echo "==========================================" - if command -v jq &> /dev/null && [[ -f "$OUTPUT_FILE" ]]; then - jq -r ' - "Duration: \(.summary.total | floor)s", - "Requests: \(.statusCodeDistribution | to_entries | map(.value) | add)", - "Successful: \(.summary.successRate * 100 | floor)%", - "Req/sec: \(.summary.requestsPerSec | floor)", - "", - "Latency:", - " P50: \(.latencyPercentiles.p50 * 1000 | floor)ms", - " P90: \(.latencyPercentiles.p90 * 1000 | floor)ms", - " P95: \(.latencyPercentiles.p95 * 1000 | floor)ms", - " P99: \(.latencyPercentiles.p99 * 1000 | floor)ms", - " P99.9: \(.latencyPercentiles."p99.9" * 1000 | floor)ms" - ' "$OUTPUT_FILE" 2>/dev/null || cat "$OUTPUT_FILE" - else - cat "$OUTPUT_FILE" 2>/dev/null || echo "(output file not available)" - fi -else - $OHA_CMD "$TEST_TILE_URL" -fi - -echo "" -log_success "Tile stress test completed" diff --git a/bench/scripts/compare_baseline.py b/bench/scripts/compare_baseline.py deleted file mode 100755 index ee132fc..0000000 --- a/bench/scripts/compare_baseline.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python3 -""" -compare_baseline.py - Compare benchmark results against baseline - -This script compares current benchmark results to a saved baseline and: -- Reports percentage changes for key metrics -- Fails with exit code 1 if P99 regresses by more than threshold -- Generates a markdown summary suitable for PR comments - -Usage: - ./compare_baseline.py --current results.json --baseline baseline.json - ./compare_baseline.py --current results.json --baseline baseline.json --threshold 10 - ./compare_baseline.py --save-baseline results.json --output baselines/tile_baseline.json - -Examples: - # Compare current run to baseline - ./compare_baseline.py -c bench/load_tests/results/latest.json -b bench/baselines/tile_baseline.json - - # Save new baseline - ./compare_baseline.py --save-baseline bench/load_tests/results/latest.json -o bench/baselines/tile_baseline.json -""" - 
-import argparse -import json -import sys -from pathlib import Path -from datetime import datetime -from typing import Dict, Any, Optional, Tuple - -# ANSI colors for terminal output -class Colors: - RED = '\033[0;31m' - GREEN = '\033[0;32m' - YELLOW = '\033[1;33m' - BLUE = '\033[0;34m' - NC = '\033[0m' # No Color - - -def load_json(path: Path) -> Dict[str, Any]: - """Load and parse a JSON file.""" - with open(path) as f: - return json.load(f) - - -def save_json(data: Dict[str, Any], path: Path) -> None: - """Save data as JSON file.""" - path.parent.mkdir(parents=True, exist_ok=True) - with open(path, 'w') as f: - json.dump(data, f, indent=2) - print(f"{Colors.GREEN}[OK]{Colors.NC} Saved baseline to {path}") - - -def extract_metrics(data: Dict[str, Any]) -> Dict[str, float]: - """ - Extract key metrics from benchmark results. - - Supports both oha JSON output and custom summary format. - """ - metrics = {} - - # oha format - if 'summary' in data: - summary = data['summary'] - metrics['requests_per_sec'] = summary.get('requestsPerSec', 0) - metrics['success_rate'] = summary.get('successRate', 1.0) * 100 - - if 'latencyPercentiles' in data: - lat = data['latencyPercentiles'] - # oha returns latency in seconds, convert to ms - metrics['p50_ms'] = lat.get('p50', 0) * 1000 - metrics['p90_ms'] = lat.get('p90', 0) * 1000 - metrics['p95_ms'] = lat.get('p95', 0) * 1000 - metrics['p99_ms'] = lat.get('p99', 0) * 1000 - if 'p999' in lat: - metrics['p999_ms'] = lat.get('p999', 0) * 1000 - - # Alternative: latencyDistribution format - if 'latencyDistribution' in data and 'percentiles' in data['latencyDistribution']: - lat = data['latencyDistribution']['percentiles'] - metrics['p50_ms'] = lat.get('p50', 0) * 1000 - metrics['p90_ms'] = lat.get('p90', 0) * 1000 - metrics['p95_ms'] = lat.get('p95', 0) * 1000 - metrics['p99_ms'] = lat.get('p99', 0) * 1000 - - # Custom baseline format (already in correct units) - if 'metrics' in data: - metrics.update(data['metrics']) - - return metrics - - -def compare_metrics( - current: Dict[str, float], - baseline: Dict[str, float], - threshold_pct: float = 10.0 -) -> Tuple[bool, str, str]: - """ - Compare current metrics to baseline. 
- - Returns: - (passed, terminal_output, markdown_output) - """ - passed = True - terminal_lines = [] - md_lines = ["| Metric | Baseline | Current | Change | Status |", - "|--------|----------|---------|--------|--------|"] - - # Metrics where lower is better (latencies) - lower_is_better = {'p50_ms', 'p90_ms', 'p95_ms', 'p99_ms', 'p999_ms'} - # Metrics where higher is better (throughput) - higher_is_better = {'requests_per_sec', 'success_rate'} - - for metric in sorted(set(current.keys()) | set(baseline.keys())): - curr_val = current.get(metric, 0) - base_val = baseline.get(metric, 0) - - if base_val == 0: - change_pct = 0 if curr_val == 0 else float('inf') - else: - change_pct = ((curr_val - base_val) / base_val) * 100 - - # Determine if this is a regression - is_regression = False - if metric in lower_is_better and change_pct > threshold_pct: - is_regression = True - elif metric in higher_is_better and change_pct < -threshold_pct: - is_regression = True - - # Format values - if metric.endswith('_ms'): - base_str = f"{base_val:.1f}ms" - curr_str = f"{curr_val:.1f}ms" - elif metric == 'success_rate': - base_str = f"{base_val:.1f}%" - curr_str = f"{curr_val:.1f}%" - else: - base_str = f"{base_val:.1f}" - curr_str = f"{curr_val:.1f}" - - # Format change - if change_pct == float('inf'): - change_str = "N/A" - else: - sign = "+" if change_pct > 0 else "" - change_str = f"{sign}{change_pct:.1f}%" - - # Status - if is_regression: - status = f"{Colors.RED}REGRESSED{Colors.NC}" - status_md = "🔴 REGRESSED" - if metric == 'p99_ms': - passed = False # Only fail on P99 regression - elif abs(change_pct) < 5: - status = f"{Colors.GREEN}OK{Colors.NC}" - status_md = "✅ OK" - elif metric in lower_is_better and change_pct < 0: - status = f"{Colors.GREEN}IMPROVED{Colors.NC}" - status_md = "🟢 IMPROVED" - elif metric in higher_is_better and change_pct > 0: - status = f"{Colors.GREEN}IMPROVED{Colors.NC}" - status_md = "🟢 IMPROVED" - else: - status = f"{Colors.YELLOW}CHANGED{Colors.NC}" - status_md = "🟡 CHANGED" - - terminal_lines.append( - f" {metric:20} {base_str:>12} → {curr_str:>12} ({change_str:>8}) {status}" - ) - md_lines.append( - f"| {metric} | {base_str} | {curr_str} | {change_str} | {status_md} |" - ) - - terminal_output = "\n".join(terminal_lines) - markdown_output = "\n".join(md_lines) - - return passed, terminal_output, markdown_output - - -def create_baseline(results: Dict[str, Any], description: str = "") -> Dict[str, Any]: - """Create a baseline document from results.""" - metrics = extract_metrics(results) - return { - "created_at": datetime.utcnow().isoformat() + "Z", - "description": description, - "metrics": metrics, - "raw_data": results - } - - -def main(): - parser = argparse.ArgumentParser( - description="Compare benchmark results against baseline", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__ - ) - - parser.add_argument( - "-c", "--current", - type=Path, - help="Current results JSON file" - ) - parser.add_argument( - "-b", "--baseline", - type=Path, - help="Baseline JSON file to compare against" - ) - parser.add_argument( - "-t", "--threshold", - type=float, - default=10.0, - help="Regression threshold percentage (default: 10)" - ) - parser.add_argument( - "--save-baseline", - type=Path, - help="Save results as new baseline" - ) - parser.add_argument( - "-o", "--output", - type=Path, - help="Output path for baseline (with --save-baseline)" - ) - parser.add_argument( - "-d", "--description", - default="", - help="Description for baseline (with 
--save-baseline)" - ) - parser.add_argument( - "--markdown", - action="store_true", - help="Output comparison as markdown table" - ) - parser.add_argument( - "--ci", - action="store_true", - help="CI mode: minimal output, exit code indicates pass/fail" - ) - - args = parser.parse_args() - - # Save baseline mode - if args.save_baseline: - if not args.output: - print(f"{Colors.RED}[ERROR]{Colors.NC} --output required with --save-baseline") - sys.exit(1) - - results = load_json(args.save_baseline) - baseline = create_baseline(results, args.description) - save_json(baseline, args.output) - sys.exit(0) - - # Comparison mode - if not args.current or not args.baseline: - parser.print_help() - sys.exit(1) - - if not args.current.exists(): - print(f"{Colors.RED}[ERROR]{Colors.NC} Current results not found: {args.current}") - sys.exit(1) - - if not args.baseline.exists(): - print(f"{Colors.YELLOW}[WARN]{Colors.NC} Baseline not found: {args.baseline}") - print("Run with --save-baseline to create initial baseline") - sys.exit(0) - - # Load and compare - current_data = load_json(args.current) - baseline_data = load_json(args.baseline) - - current_metrics = extract_metrics(current_data) - baseline_metrics = extract_metrics(baseline_data) - - passed, terminal_output, markdown_output = compare_metrics( - current_metrics, - baseline_metrics, - args.threshold - ) - - # Output - if args.markdown: - print("## Benchmark Comparison\n") - print(markdown_output) - print() - if passed: - print("**Result: ✅ PASSED** - No significant regressions detected") - else: - print("**Result: ❌ FAILED** - P99 latency regression exceeds threshold") - elif args.ci: - if not passed: - print(f"FAILED: P99 regression exceeds {args.threshold}% threshold") - else: - print() - print("=" * 60) - print(" Benchmark Comparison") - print("=" * 60) - print() - print(f" Baseline: {args.baseline}") - print(f" Current: {args.current}") - print(f" Threshold: {args.threshold}%") - print() - print(terminal_output) - print() - if passed: - print(f"{Colors.GREEN}PASSED{Colors.NC}: No significant regressions detected") - else: - print(f"{Colors.RED}FAILED{Colors.NC}: P99 latency regression exceeds {args.threshold}% threshold") - print() - - sys.exit(0 if passed else 1) - - -if __name__ == "__main__": - main() diff --git a/bench/scripts/generate_report.py b/bench/scripts/generate_report.py deleted file mode 100755 index 494b5ea..0000000 --- a/bench/scripts/generate_report.py +++ /dev/null @@ -1,334 +0,0 @@ -#!/usr/bin/env python3 -""" -generate_report.py - Generate markdown benchmark report - -This script aggregates results from all benchmark phases and produces -a comprehensive markdown report suitable for: -- PR comments -- Documentation -- Historical tracking - -Usage: - ./generate_report.py --input-dir bench/load_tests/results/run_YYYYMMDD_HHMMSS --output REPORT.md -""" - -import argparse -import json -import re -import sys -from datetime import datetime -from pathlib import Path -from typing import Dict, Any, Optional, List - - -def load_json_safe(path: Path) -> Optional[Dict[str, Any]]: - """Load JSON file, returning None on error.""" - try: - with open(path) as f: - return json.load(f) - except (FileNotFoundError, json.JSONDecodeError): - return None - - -def load_text_safe(path: Path) -> Optional[str]: - """Load text file, returning None on error.""" - try: - with open(path) as f: - return f.read() - except FileNotFoundError: - return None - - -def parse_criterion_output(text: str) -> List[Dict[str, Any]]: - """Parse Criterion benchmark 
output for key metrics.""" - results = [] - - # Pattern: "benchmark_name time: [123.45 µs 125.67 µs 127.89 µs]" - pattern = r'(\S+)\s+time:\s+\[(\d+\.?\d*)\s*(\w+)\s+(\d+\.?\d*)\s*(\w+)\s+(\d+\.?\d*)\s*(\w+)\]' - - for match in re.finditer(pattern, text): - name = match.group(1) - low = float(match.group(2)) - low_unit = match.group(3) - mid = float(match.group(4)) - mid_unit = match.group(5) - high = float(match.group(6)) - high_unit = match.group(7) - - # Normalize to microseconds - def to_us(val, unit): - if unit == 'ns': - return val / 1000 - elif unit == 'µs' or unit == 'us': - return val - elif unit == 'ms': - return val * 1000 - elif unit == 's': - return val * 1_000_000 - return val - - results.append({ - 'name': name, - 'low_us': to_us(low, low_unit), - 'mid_us': to_us(mid, mid_unit), - 'high_us': to_us(high, high_unit), - }) - - return results - - -def parse_websocket_output(text: str) -> Dict[str, Any]: - """Parse WebSocket load test output.""" - result = { - 'passed': 'PASS' in text, - 'messages_sent': 0, - 'messages_received': 0, - 'cursor_p99': None, - 'viewport_p99': None, - } - - # Extract metrics - if match := re.search(r'Messages sent:\s*(\d+)', text): - result['messages_sent'] = int(match.group(1)) - if match := re.search(r'Messages received:\s*(\d+)', text): - result['messages_received'] = int(match.group(1)) - if match := re.search(r'Cursor.*P99:\s*([\d.]+\w+)', text): - result['cursor_p99'] = match.group(1) - if match := re.search(r'Viewport.*P99:\s*([\d.]+\w+)', text): - result['viewport_p99'] = match.group(1) - - return result - - -def format_duration(us: float) -> str: - """Format duration in appropriate units.""" - if us < 1: - return f"{us * 1000:.2f}ns" - elif us < 1000: - return f"{us:.2f}µs" - elif us < 1_000_000: - return f"{us / 1000:.2f}ms" - else: - return f"{us / 1_000_000:.2f}s" - - -def generate_report(input_dir: Path) -> str: - """Generate markdown report from benchmark results.""" - - lines = [] - lines.append("# PathCollab Benchmark Report") - lines.append("") - lines.append(f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC") - lines.append(f"**Run directory:** `{input_dir.name}`") - lines.append("") - - # Table of Contents - lines.append("## Table of Contents") - lines.append("- [Summary](#summary)") - lines.append("- [HTTP Tile Performance](#http-tile-performance)") - lines.append("- [WebSocket Performance](#websocket-performance)") - lines.append("- [Micro-benchmarks](#micro-benchmarks)") - lines.append("- [Server Metrics](#server-metrics)") - lines.append("") - - # Summary - lines.append("## Summary") - lines.append("") - - tile_data = load_json_safe(input_dir / "tile_stress.json") - ws_text = load_text_safe(input_dir / "websocket_load.txt") - ws_data = parse_websocket_output(ws_text) if ws_text else {} - - summary_items = [] - - if tile_data: - rps = tile_data.get('summary', {}).get('requestsPerSec', 0) - p99 = tile_data.get('latencyPercentiles', {}).get('p99', 0) * 1000 - success = tile_data.get('summary', {}).get('successRate', 1) * 100 - summary_items.append(f"- **Tile serving:** {rps:.0f} req/s, P99: {p99:.1f}ms, Success: {success:.1f}%") - tile_status = "✅ PASS" if p99 < 100 else "❌ FAIL (P99 > 100ms)" - else: - tile_status = "⚠️ No data" - summary_items.append("- **Tile serving:** No data collected") - - if ws_data.get('passed'): - summary_items.append(f"- **WebSocket:** P99 cursor: {ws_data.get('cursor_p99', 'N/A')}, P99 viewport: {ws_data.get('viewport_p99', 'N/A')}") - ws_status = "✅ PASS" - elif ws_text: - 
ws_status = "❌ FAIL" - summary_items.append("- **WebSocket:** Test failed") - else: - ws_status = "⚠️ No data" - summary_items.append("- **WebSocket:** No data collected") - - lines.append("| Component | Status |") - lines.append("|-----------|--------|") - lines.append(f"| HTTP Tile Serving | {tile_status} |") - lines.append(f"| WebSocket Broadcasting | {ws_status} |") - lines.append("") - lines.extend(summary_items) - lines.append("") - - # HTTP Tile Performance - lines.append("## HTTP Tile Performance") - lines.append("") - - if tile_data: - summary = tile_data.get('summary', {}) - latency = tile_data.get('latencyPercentiles', {}) - - lines.append("### Throughput") - lines.append("") - lines.append(f"- **Requests/sec:** {summary.get('requestsPerSec', 0):.1f}") - lines.append(f"- **Total requests:** {summary.get('total', 0)}") - lines.append(f"- **Success rate:** {summary.get('successRate', 1) * 100:.1f}%") - lines.append("") - - lines.append("### Latency Distribution") - lines.append("") - lines.append("| Percentile | Latency |") - lines.append("|------------|---------|") - for p in ['p50', 'p75', 'p90', 'p95', 'p99', 'p999']: - val = latency.get(p, 0) * 1000 # to ms - lines.append(f"| {p.upper()} | {val:.2f}ms |") - lines.append("") - - # Status codes - status_dist = tile_data.get('statusCodeDistribution', {}) - if status_dist: - lines.append("### Status Codes") - lines.append("") - lines.append("| Code | Count |") - lines.append("|------|-------|") - for code, count in sorted(status_dist.items()): - lines.append(f"| {code} | {count} |") - lines.append("") - else: - lines.append("*No HTTP tile performance data available.*") - lines.append("") - - # WebSocket Performance - lines.append("## WebSocket Performance") - lines.append("") - - if ws_text: - lines.append("### Results") - lines.append("") - lines.append(f"- **Status:** {'PASS' if ws_data.get('passed') else 'FAIL'}") - lines.append(f"- **Messages sent:** {ws_data.get('messages_sent', 'N/A')}") - lines.append(f"- **Messages received:** {ws_data.get('messages_received', 'N/A')}") - lines.append(f"- **Cursor P99:** {ws_data.get('cursor_p99', 'N/A')}") - lines.append(f"- **Viewport P99:** {ws_data.get('viewport_p99', 'N/A')}") - lines.append("") - - # Include raw output excerpt - lines.append("
") - lines.append("Raw Output") - lines.append("") - lines.append("```") - # Include just the results section - if "=== Load Test Results ===" in ws_text: - start = ws_text.find("=== Load Test Results ===") - lines.append(ws_text[start:start + 1500]) - else: - lines.append(ws_text[:1500]) - lines.append("```") - lines.append("
") - lines.append("") - else: - lines.append("*No WebSocket performance data available.*") - lines.append("") - - # Micro-benchmarks - lines.append("## Micro-benchmarks") - lines.append("") - - micro_text = load_text_safe(input_dir / "micro_benchmarks.txt") - if micro_text: - benchmarks = parse_criterion_output(micro_text) - - if benchmarks: - # Group by benchmark file - groups = {} - for b in benchmarks: - # Extract group from name like "jpeg_encoding/256x256/85" - parts = b['name'].split('/') - group = parts[0] if parts else 'other' - if group not in groups: - groups[group] = [] - groups[group].append(b) - - for group_name, items in sorted(groups.items()): - lines.append(f"### {group_name.replace('_', ' ').title()}") - lines.append("") - lines.append("| Benchmark | Time (median) | Range |") - lines.append("|-----------|---------------|-------|") - for b in items: - name = '/'.join(b['name'].split('/')[1:]) or b['name'] - lines.append(f"| {name} | {format_duration(b['mid_us'])} | {format_duration(b['low_us'])} - {format_duration(b['high_us'])} |") - lines.append("") - else: - lines.append("*Could not parse benchmark results.*") - lines.append("") - else: - lines.append("*No micro-benchmark data available.*") - lines.append("") - - # Server Metrics - lines.append("## Server Metrics") - lines.append("") - - metrics_data = load_json_safe(input_dir / "server_metrics.json") - if metrics_data: - lines.append("| Metric | Value |") - lines.append("|--------|-------|") - for key, value in sorted(metrics_data.items()): - lines.append(f"| {key} | {value} |") - lines.append("") - else: - lines.append("*No server metrics available.*") - lines.append("") - - # Footer - lines.append("---") - lines.append("") - lines.append("*Report generated by `bench/scripts/generate_report.py`*") - - return "\n".join(lines) - - -def main(): - parser = argparse.ArgumentParser( - description="Generate markdown benchmark report" - ) - parser.add_argument( - "--input-dir", - type=Path, - required=True, - help="Directory containing benchmark results" - ) - parser.add_argument( - "--output", - type=Path, - help="Output markdown file (default: stdout)" - ) - - args = parser.parse_args() - - if not args.input_dir.exists(): - print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr) - sys.exit(1) - - report = generate_report(args.input_dir) - - if args.output: - args.output.parent.mkdir(parents=True, exist_ok=True) - with open(args.output, 'w') as f: - f.write(report) - print(f"Report saved to: {args.output}") - else: - print(report) - - -if __name__ == "__main__": - main() diff --git a/bench/scripts/run_all.sh b/bench/scripts/run_all.sh deleted file mode 100755 index f705bda..0000000 --- a/bench/scripts/run_all.sh +++ /dev/null @@ -1,340 +0,0 @@ -#!/usr/bin/env bash -# -# run_all.sh - Orchestrate the complete benchmark suite -# -# This script runs all benchmarks in sequence and generates a comprehensive report. -# It handles server startup (optional), warmup, test execution, and cleanup. 
-# -# Usage: -# ./run_all.sh [OPTIONS] -# -# Options: -# --server-cmd CMD Command to start the server (default: auto-detect) -# --server-url URL Server URL (default: http://127.0.0.1:8080) -# --skip-micro Skip Criterion micro-benchmarks -# --skip-load Skip HTTP load tests -# --skip-websocket Skip WebSocket load tests -# --quick Quick mode: shorter durations, fewer iterations -# --compare-baseline Compare results to baseline and fail on regression -# --save-baseline Save results as new baseline -# -o, --output Output directory (default: bench/load_tests/results) -# -h, --help Show this help message - -set -euo pipefail - -# Script directory -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" -BENCH_DIR="$PROJECT_ROOT/bench" - -# Default configuration -SERVER_CMD="" -SERVER_URL="${SERVER_URL:-http://127.0.0.1:8080}" -SKIP_MICRO=false -SKIP_LOAD=false -SKIP_WEBSOCKET=false -QUICK_MODE=false -COMPARE_BASELINE=false -SAVE_BASELINE=false -OUTPUT_DIR="$BENCH_DIR/load_tests/results" -SERVER_PID="" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -BOLD='\033[1m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_header() { - echo "" - echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════════${NC}" - echo -e "${BOLD}${CYAN} $1${NC}" - echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════════${NC}" - echo "" -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]]; then - log_info "Stopping server (PID: $SERVER_PID)..." - kill "$SERVER_PID" 2>/dev/null || true - wait "$SERVER_PID" 2>/dev/null || true - fi -} - -trap cleanup EXIT - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - --server-cmd) - SERVER_CMD="$2" - shift 2 - ;; - --server-url) - SERVER_URL="$2" - shift 2 - ;; - --skip-micro) - SKIP_MICRO=true - shift - ;; - --skip-load) - SKIP_LOAD=true - shift - ;; - --skip-websocket) - SKIP_WEBSOCKET=true - shift - ;; - --quick) - QUICK_MODE=true - shift - ;; - --compare-baseline) - COMPARE_BASELINE=true - shift - ;; - --save-baseline) - SAVE_BASELINE=true - shift - ;; - -o|--output) - OUTPUT_DIR="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Create output directory -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -RUN_DIR="$OUTPUT_DIR/run_$TIMESTAMP" -mkdir -p "$RUN_DIR" - -log_header "PathCollab Benchmark Suite" - -echo "Configuration:" -echo " Project root: $PROJECT_ROOT" -echo " Server URL: $SERVER_URL" -echo " Output: $RUN_DIR" -echo " Quick mode: $QUICK_MODE" -echo " Skip micro: $SKIP_MICRO" -echo " Skip load: $SKIP_LOAD" -echo " Skip WebSocket: $SKIP_WEBSOCKET" -echo "" - -# Check if server is running, or start it -log_info "Checking server status..." -if curl -sf "$SERVER_URL/health" > /dev/null 2>&1; then - log_success "Server is already running at $SERVER_URL" -else - if [[ -n "$SERVER_CMD" ]]; then - log_info "Starting server with: $SERVER_CMD" - $SERVER_CMD & - SERVER_PID=$! 
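The EXIT trap above guarantees the spawned server is stopped even if a later phase fails. The same guarantee in Rust test-harness code is usually expressed as a drop guard; a rough sketch under that assumption (the type and function names here are hypothetical):

```rust
// Sketch: kill the spawned server process when the guard goes out of scope,
// mirroring the `trap cleanup EXIT` used in the script.
use std::process::{Child, Command};

struct ServerGuard(Child);

impl Drop for ServerGuard {
    fn drop(&mut self) {
        let _ = self.0.kill();
        let _ = self.0.wait();
    }
}

fn start_server(cmd: &str, args: &[&str]) -> std::io::Result<ServerGuard> {
    Command::new(cmd).args(args).spawn().map(ServerGuard)
}
```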
- - # Wait for server to be ready - for i in {1..30}; do - if curl -sf "$SERVER_URL/health" > /dev/null 2>&1; then - log_success "Server is ready" - break - fi - if [[ $i -eq 30 ]]; then - log_error "Server failed to start within 30 seconds" - exit 1 - fi - sleep 1 - done - else - log_error "Server not running at $SERVER_URL" - log_info "Either start the server manually or use --server-cmd" - exit 1 - fi -fi - -# Warmup -log_header "Warmup Phase" -log_info "Sending warmup requests..." -for i in {1..10}; do - curl -sf "$SERVER_URL/health" > /dev/null 2>&1 || true - curl -sf "$SERVER_URL/api/slides" > /dev/null 2>&1 || true -done -log_success "Warmup complete" - -# Track overall results -LOAD_PASSED=true -WS_PASSED=true - -# Phase 1: HTTP load tests -if [[ "$SKIP_LOAD" != "true" ]]; then - log_header "Phase 1: HTTP Load Tests" - - cd "$PROJECT_ROOT" - - if ! command -v oha &> /dev/null; then - log_warn "oha not installed, skipping HTTP load tests" - log_info "Install with: cargo install oha" - else - # Tile stress test - log_info "Running tile stress test..." - if [[ "$QUICK_MODE" == "true" ]]; then - bash "$BENCH_DIR/load_tests/scenarios/tile_stress.sh" \ - --url "$SERVER_URL" \ - --quick \ - --output "$RUN_DIR/tile_stress.json" 2>&1 | tee "$RUN_DIR/tile_stress.txt" || LOAD_PASSED=false - else - bash "$BENCH_DIR/load_tests/scenarios/tile_stress.sh" \ - --url "$SERVER_URL" \ - --concurrent 20 \ - --duration 30 \ - --output "$RUN_DIR/tile_stress.json" 2>&1 | tee "$RUN_DIR/tile_stress.txt" || LOAD_PASSED=false - fi - - # Overlay stress test - log_info "Running overlay stress test..." - if [[ "$QUICK_MODE" == "true" ]]; then - bash "$BENCH_DIR/load_tests/scenarios/overlay_stress.sh" \ - --url "$SERVER_URL" \ - --quick \ - --output "$RUN_DIR/overlay_stress.json" 2>&1 | tee "$RUN_DIR/overlay_stress.txt" || LOAD_PASSED=false - else - bash "$BENCH_DIR/load_tests/scenarios/overlay_stress.sh" \ - --url "$SERVER_URL" \ - --concurrent 20 \ - --duration 30 \ - --output "$RUN_DIR/overlay_stress.json" 2>&1 | tee "$RUN_DIR/overlay_stress.txt" || LOAD_PASSED=false - fi - - if [[ "$LOAD_PASSED" == "true" ]]; then - log_success "HTTP load tests complete" - else - log_warn "HTTP load tests had issues" - fi - fi -else - log_info "Skipping HTTP load tests (--skip-load)" -fi - -# Phase 2: WebSocket load tests -if [[ "$SKIP_WEBSOCKET" != "true" ]]; then - log_header "Phase 2: WebSocket Load Tests" - - cd "$PROJECT_ROOT/server" - - log_info "Running WebSocket load tests..." - if [[ "$QUICK_MODE" == "true" ]]; then - cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture 2>&1 | tee "$RUN_DIR/websocket_load.txt" || WS_PASSED=false - else - cargo test --test perf_tests test_fanout_standard --release -- --ignored --nocapture 2>&1 | tee "$RUN_DIR/websocket_load.txt" || WS_PASSED=false - fi - - if [[ "$WS_PASSED" == "true" ]]; then - log_success "WebSocket load tests complete" - else - log_warn "WebSocket load tests had issues" - fi -else - log_info "Skipping WebSocket load tests (--skip-websocket)" -fi - -# Phase 3: Collect metrics -log_header "Phase 3: Collecting Metrics" - -log_info "Fetching server metrics..." 
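The readiness loop above polls `/health` once per second for up to 30 attempts before giving up. A comparable sketch in Rust, assuming the `reqwest` and `tokio` crates (the function signature is illustrative):

```rust
use std::time::Duration;

/// Poll `{base_url}/health` once per second until it answers or we run out of attempts.
async fn wait_for_server(base_url: &str, attempts: u32) -> Result<(), String> {
    let client = reqwest::Client::new();
    for _ in 0..attempts {
        if let Ok(resp) = client.get(format!("{base_url}/health")).send().await {
            if resp.status().is_success() {
                return Ok(());
            }
        }
        tokio::time::sleep(Duration::from_secs(1)).await;
    }
    Err(format!("server did not become healthy after {attempts} attempts"))
}
```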
-curl -sf "$SERVER_URL/metrics" > "$RUN_DIR/server_metrics.json" 2>/dev/null || true -curl -sf "$SERVER_URL/metrics/prometheus" > "$RUN_DIR/prometheus_metrics.txt" 2>/dev/null || true -log_success "Metrics collected" - -# Phase 4: Generate report -log_header "Phase 4: Generating Report" - -python3 "$BENCH_DIR/scripts/generate_report.py" \ - --input-dir "$RUN_DIR" \ - --output "$RUN_DIR/REPORT.md" 2>&1 || log_warn "Report generation had issues" - -if [[ -f "$RUN_DIR/REPORT.md" ]]; then - log_success "Report generated: $RUN_DIR/REPORT.md" -fi - -# Phase 5: Baseline comparison (if requested) -if [[ "$COMPARE_BASELINE" == "true" ]] && [[ -f "$RUN_DIR/tile_stress.json" ]]; then - log_header "Phase 5: Baseline Comparison" - - BASELINE_FILE="$BENCH_DIR/baselines/tile_baseline.json" - - if [[ -f "$BASELINE_FILE" ]]; then - python3 "$BENCH_DIR/scripts/compare_baseline.py" \ - --current "$RUN_DIR/tile_stress.json" \ - --baseline "$BASELINE_FILE" \ - --threshold 10 2>&1 | tee "$RUN_DIR/baseline_comparison.txt" - - if [[ ${PIPESTATUS[0]} -ne 0 ]]; then - LOAD_PASSED=false - fi - else - log_warn "No baseline found at $BASELINE_FILE" - log_info "Create baseline with: --save-baseline" - fi -fi - -# Save baseline (if requested) -if [[ "$SAVE_BASELINE" == "true" ]] && [[ -f "$RUN_DIR/tile_stress.json" ]]; then - log_info "Saving new baseline..." - python3 "$BENCH_DIR/scripts/compare_baseline.py" \ - --save-baseline "$RUN_DIR/tile_stress.json" \ - --output "$BENCH_DIR/baselines/tile_baseline.json" \ - --description "Baseline from run $TIMESTAMP" -fi - -# Summary -log_header "Summary" - -echo "Results saved to: $RUN_DIR" -echo "" -echo "Test Results:" -echo " HTTP load tests: $([ "$LOAD_PASSED" == "true" ] && echo "✅ PASS" || echo "❌ FAIL")" -echo " WebSocket tests: $([ "$WS_PASSED" == "true" ] && echo "✅ PASS" || echo "⚠️ ISSUES")" -echo "" - -# Create symlink to latest run -ln -sfn "run_$TIMESTAMP" "$OUTPUT_DIR/latest" -echo "Latest results linked: $OUTPUT_DIR/latest" - -# Exit with appropriate code -if [[ "$LOAD_PASSED" == "true" ]] && [[ "$WS_PASSED" == "true" ]]; then - log_success "All benchmarks passed!" 
- exit 0 -else - log_error "Some benchmarks failed" - exit 1 -fi diff --git a/server/.benchmark-baseline.json b/server/.benchmark-baseline.json new file mode 100644 index 0000000..76086b7 --- /dev/null +++ b/server/.benchmark-baseline.json @@ -0,0 +1,12 @@ +{ + "SMOKE": { + "tier": "SMOKE", + "timestamp": "2026-01-22T11:21:45.573874+00:00", + "tile_p99_ms": 0.6844583333333333, + "overlay_p99_ms": 0.653153, + "cursor_p99_ms": null, + "viewport_p99_ms": null, + "error_rate_pct": 0.0, + "throughput": 105.66382458464953 + } +} \ No newline at end of file diff --git a/server/Cargo.toml b/server/Cargo.toml index b980bab..68dfcc7 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -60,3 +60,4 @@ prost-build = "0.13" [dev-dependencies] tokio-tungstenite = "0.26" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } +chrono = { version = "0.4", features = ["serde"] } diff --git a/server/src/session/manager.rs b/server/src/session/manager.rs index 4b7617b..068f5a5 100644 --- a/server/src/session/manager.rs +++ b/server/src/session/manager.rs @@ -11,7 +11,7 @@ use metrics::{counter, histogram}; use std::collections::HashMap; use std::time::Instant; use thiserror::Error; -use tracing::{debug, error, info, warn}; +use tracing::{debug, info, warn}; use uuid::Uuid; /// Session manager errors diff --git a/server/src/slide/cache.rs b/server/src/slide/cache.rs index 96a342d..85b0275 100644 --- a/server/src/slide/cache.rs +++ b/server/src/slide/cache.rs @@ -121,7 +121,7 @@ impl SlideCache { // Probabilistic LRU update: only update every N accesses // This dramatically reduces write lock contention under load let count = self.access_counter.fetch_add(1, Ordering::Relaxed); - if count % LRU_UPDATE_FREQUENCY == 0 { + if count.is_multiple_of(LRU_UPDATE_FREQUENCY) { // Drop read lock before taking write lock drop(slides); // Update LRU order (best effort - may race but that's OK) diff --git a/server/src/slide/tile_cache.rs b/server/src/slide/tile_cache.rs index 60c6004..5c201d6 100644 --- a/server/src/slide/tile_cache.rs +++ b/server/src/slide/tile_cache.rs @@ -106,7 +106,7 @@ impl TileCache { counter!("pathcollab_tile_cache_hits_total").increment(1); // Update hit rate gauge periodically (every 100 hits) - if hits % 100 == 0 { + if hits.is_multiple_of(100) { self.update_hit_rate_gauge(); } } else { diff --git a/server/tests/load_tests/benchmark.rs b/server/tests/load_tests/benchmark.rs new file mode 100644 index 0000000..18c4f7a --- /dev/null +++ b/server/tests/load_tests/benchmark.rs @@ -0,0 +1,570 @@ +//! Benchmark runner with warm-up, multiple iterations, and baseline comparison +//! +//! Provides a production-grade benchmark system that: +//! - Runs a warm-up phase to prime caches and connection pools +//! - Executes multiple iterations for statistical significance +//! 
- Compares against stored baseline and detects regressions + +use super::BenchmarkTier; +use super::scenarios::{ComprehensiveStressConfig, ComprehensiveStressScenario}; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; +use std::time::Duration; + +/// Configuration for benchmark runs +#[derive(Debug, Clone)] +pub struct BenchmarkRunConfig { + /// Benchmark tier + pub tier: BenchmarkTier, + /// Number of iterations to run (default: 3) + pub iterations: usize, + /// Warm-up duration before measuring (default: 3s for smoke, 5s for others) + pub warmup_duration: Duration, + /// Path to baseline file (default: .benchmark-baseline.json in project root) + pub baseline_path: PathBuf, + /// Regression threshold as percentage (default: 15%) + pub regression_threshold_pct: f64, +} + +impl BenchmarkRunConfig { + pub fn for_tier(tier: BenchmarkTier) -> Self { + let (iterations, warmup) = match tier { + BenchmarkTier::Smoke => (3, Duration::from_secs(2)), + BenchmarkTier::Standard => (3, Duration::from_secs(5)), + BenchmarkTier::Stress => (3, Duration::from_secs(5)), + }; + + Self { + tier, + iterations, + warmup_duration: warmup, + baseline_path: PathBuf::from(".benchmark-baseline.json"), + regression_threshold_pct: 15.0, + } + } +} + +/// Metrics extracted from a single benchmark run +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BenchmarkMetrics { + pub tile_p99_ms: Option, + pub overlay_p99_ms: Option, + pub cursor_p99_ms: Option, + pub viewport_p99_ms: Option, + pub error_rate: f64, + pub throughput: f64, +} + +impl BenchmarkMetrics { + /// Extract metrics from comprehensive stress results + pub fn from_results( + results: &super::scenarios::comprehensive::ComprehensiveStressResults, + ) -> Self { + let throughput = if results.duration.as_secs_f64() > 0.0 { + (results.ws_messages_sent + results.http_requests_sent) as f64 + / results.duration.as_secs_f64() + } else { + 0.0 + }; + + Self { + tile_p99_ms: results + .tile_latencies + .p99() + .map(|d| d.as_secs_f64() * 1000.0), + overlay_p99_ms: results + .overlay_latencies + .p99() + .map(|d| d.as_secs_f64() * 1000.0), + cursor_p99_ms: results + .cursor_latencies + .p99() + .map(|d| d.as_secs_f64() * 1000.0), + viewport_p99_ms: results + .viewport_latencies + .p99() + .map(|d| d.as_secs_f64() * 1000.0), + error_rate: results.error_rate(), + throughput, + } + } +} + +/// Statistical summary of a metric across iterations +#[derive(Debug, Clone)] +pub struct MetricStats { + pub mean: f64, + pub stddev: f64, +} + +impl MetricStats { + pub fn from_samples(samples: &[f64]) -> Option { + if samples.is_empty() { + return None; + } + + let n = samples.len() as f64; + let mean = samples.iter().sum::() / n; + + let variance = if samples.len() > 1 { + samples.iter().map(|x| (x - mean).powi(2)).sum::() / (n - 1.0) + } else { + 0.0 + }; + let stddev = variance.sqrt(); + + Some(Self { mean, stddev }) + } + + /// Format as "mean ± stddev" + pub fn format(&self) -> String { + if self.stddev < 0.1 { + format!("{:.1}ms", self.mean) + } else { + format!("{:.1}ms ± {:.1}ms", self.mean, self.stddev) + } + } +} + +/// Aggregated results from multiple benchmark iterations +#[derive(Debug)] +pub struct BenchmarkReport { + pub tier: BenchmarkTier, + pub iterations: usize, + pub warmup_duration: Duration, + pub tile_p99: Option, + pub overlay_p99: Option, + pub cursor_p99: Option, + pub viewport_p99: Option, + pub error_rate: MetricStats, + pub throughput: MetricStats, + pub all_passed: bool, +} + +impl BenchmarkReport { + /// Aggregate metrics from 
multiple runs + pub fn from_metrics( + tier: BenchmarkTier, + warmup_duration: Duration, + metrics: Vec, + all_passed: bool, + ) -> Self { + let iterations = metrics.len(); + + let tile_samples: Vec = metrics.iter().filter_map(|m| m.tile_p99_ms).collect(); + let overlay_samples: Vec = metrics.iter().filter_map(|m| m.overlay_p99_ms).collect(); + let cursor_samples: Vec = metrics.iter().filter_map(|m| m.cursor_p99_ms).collect(); + let viewport_samples: Vec = metrics.iter().filter_map(|m| m.viewport_p99_ms).collect(); + let error_samples: Vec = metrics.iter().map(|m| m.error_rate * 100.0).collect(); + let throughput_samples: Vec = metrics.iter().map(|m| m.throughput).collect(); + + Self { + tier, + iterations, + warmup_duration, + tile_p99: MetricStats::from_samples(&tile_samples), + overlay_p99: MetricStats::from_samples(&overlay_samples), + cursor_p99: MetricStats::from_samples(&cursor_samples), + viewport_p99: MetricStats::from_samples(&viewport_samples), + error_rate: MetricStats::from_samples(&error_samples).unwrap(), + throughput: MetricStats::from_samples(&throughput_samples).unwrap(), + all_passed, + } + } + + /// Convert to baseline format for storage + pub fn to_baseline(&self) -> Baseline { + Baseline { + tier: self.tier.name().to_string(), + timestamp: chrono::Utc::now().to_rfc3339(), + tile_p99_ms: self.tile_p99.as_ref().map(|s| s.mean), + overlay_p99_ms: self.overlay_p99.as_ref().map(|s| s.mean), + cursor_p99_ms: self.cursor_p99.as_ref().map(|s| s.mean), + viewport_p99_ms: self.viewport_p99.as_ref().map(|s| s.mean), + error_rate_pct: self.error_rate.mean, + throughput: self.throughput.mean, + } + } +} + +/// Stored baseline for comparison +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Baseline { + pub tier: String, + pub timestamp: String, + pub tile_p99_ms: Option, + pub overlay_p99_ms: Option, + pub cursor_p99_ms: Option, + pub viewport_p99_ms: Option, + pub error_rate_pct: f64, + pub throughput: f64, +} + +impl Baseline { + /// Load baseline from file + pub fn load(path: &PathBuf, tier: &str) -> Option { + let content = std::fs::read_to_string(path).ok()?; + let baselines: std::collections::HashMap = + serde_json::from_str(&content).ok()?; + baselines.get(tier).cloned() + } + + /// Save baseline to file (preserves other tiers) + pub fn save(&self, path: &PathBuf) -> std::io::Result<()> { + let mut baselines: std::collections::HashMap = + std::fs::read_to_string(path) + .ok() + .and_then(|c| serde_json::from_str(&c).ok()) + .unwrap_or_default(); + + baselines.insert(self.tier.clone(), self.clone()); + + let json = serde_json::to_string_pretty(&baselines)?; + std::fs::write(path, json) + } +} + +/// Comparison result between current run and baseline +#[derive(Debug)] +pub struct Comparison { + pub metric_name: &'static str, + pub current: Option, + pub baseline: Option, + pub change_pct: Option, + pub is_regression: bool, + pub higher_is_worse: bool, // true for latency/error, false for throughput +} + +impl Comparison { + fn new( + metric_name: &'static str, + current: Option, + baseline: Option, + threshold_pct: f64, + higher_is_worse: bool, + ) -> Self { + let change_pct = match (current, baseline) { + (Some(c), Some(b)) if b > 0.0 => Some((c - b) / b * 100.0), + _ => None, + }; + + let is_regression = change_pct + .map(|pct| { + if higher_is_worse { + pct > threshold_pct + } else { + pct < -threshold_pct + } + }) + .unwrap_or(false); + + Self { + metric_name, + current, + baseline, + change_pct, + is_regression, + higher_is_worse, + } + } + + fn 
format_value(&self, value: Option) -> String { + match value { + Some(v) => { + if self.metric_name.contains("P99") { + format!("{:.1}ms", v) + } else if self.metric_name == "Error Rate" { + format!("{:.2}%", v) + } else { + format!("{:.1}", v) + } + } + None => "N/A".to_string(), + } + } + + fn format_change(&self) -> String { + match self.change_pct { + Some(pct) => { + let sign = if pct >= 0.0 { "+" } else { "" }; + let status = if self.is_regression { + "[REGRESSION]" + } else if pct.abs() < 5.0 { + "[OK]" + } else if (self.higher_is_worse && pct < 0.0) + || (!self.higher_is_worse && pct > 0.0) + { + "[IMPROVED]" + } else { + "[WARNING]" + }; + format!("({}{:.1}%) {}", sign, pct, status) + } + None => "".to_string(), + } + } +} + +/// Benchmark runner that handles warm-up, iterations, and comparison +pub struct BenchmarkRunner { + config: BenchmarkRunConfig, +} + +impl BenchmarkRunner { + pub fn new(config: BenchmarkRunConfig) -> Self { + Self { config } + } + + /// Run the full benchmark with warm-up, iterations, and comparison + pub async fn run(&self) -> Result> { + let stress_config = ComprehensiveStressConfig::for_tier(self.config.tier); + + println!(); + println!("═══════════════════════════════════════════════════════════════"); + println!( + " BENCHMARK: {} ({} iterations)", + self.config.tier.name(), + self.config.iterations + ); + println!("═══════════════════════════════════════════════════════════════"); + + // Run warm-up phase + if self.config.warmup_duration > Duration::ZERO { + println!(); + println!( + " ─── Warm-up ({:.0}s) ───────────────────────────────────────────", + self.config.warmup_duration.as_secs_f64() + ); + + let warmup_config = ComprehensiveStressConfig { + duration: self.config.warmup_duration, + ..stress_config.clone() + }; + let warmup_scenario = ComprehensiveStressScenario::new(warmup_config); + let _ = warmup_scenario.run().await?; + println!(" Warm-up complete, starting measured iterations..."); + } + + // Run iterations + let mut metrics = Vec::new(); + let mut all_passed = true; + + for i in 0..self.config.iterations { + println!(); + println!( + " ─── Iteration {}/{} ─────────────────────────────────────────────", + i + 1, + self.config.iterations + ); + + let scenario = ComprehensiveStressScenario::new(stress_config.clone()); + let results = scenario.run().await?; + + let passed = results.meets_budgets(); + if !passed { + all_passed = false; + } + + let m = BenchmarkMetrics::from_results(&results); + println!( + " Tile P99: {:.1}ms | Error: {:.2}% | Throughput: {:.0} ops/s | {}", + m.tile_p99_ms.unwrap_or(0.0), + m.error_rate * 100.0, + m.throughput, + if passed { "PASS" } else { "FAIL" } + ); + + metrics.push(m); + } + + // Generate report + let report = BenchmarkReport::from_metrics( + self.config.tier, + self.config.warmup_duration, + metrics, + all_passed, + ); + + // Load baseline and compare + let baseline = Baseline::load(&self.config.baseline_path, self.config.tier.name()); + let comparisons = self.compare(&report, &baseline); + + // Print comparison + self.print_comparison(&report, &baseline, &comparisons); + + // Check for regressions + let has_regression = comparisons.iter().any(|c| c.is_regression); + + Ok(BenchmarkResult { + report, + has_regression, + all_passed, + }) + } + + fn compare(&self, report: &BenchmarkReport, baseline: &Option) -> Vec { + let threshold = self.config.regression_threshold_pct; + let baseline = baseline.as_ref(); + + vec![ + Comparison::new( + "Tile P99", + report.tile_p99.as_ref().map(|s| s.mean), + 
baseline.and_then(|b| b.tile_p99_ms), + threshold, + true, + ), + Comparison::new( + "Overlay P99", + report.overlay_p99.as_ref().map(|s| s.mean), + baseline.and_then(|b| b.overlay_p99_ms), + threshold, + true, + ), + Comparison::new( + "Error Rate", + Some(report.error_rate.mean), + baseline.map(|b| b.error_rate_pct), + threshold, + true, + ), + Comparison::new( + "Throughput", + Some(report.throughput.mean), + baseline.map(|b| b.throughput), + threshold, + false, + ), + ] + } + + #[allow(clippy::print_literal)] + fn print_comparison( + &self, + report: &BenchmarkReport, + baseline: &Option, + comparisons: &[Comparison], + ) { + println!(); + println!("═══════════════════════════════════════════════════════════════"); + println!( + " RESULTS: {} ({} iterations, {:.0}s warm-up)", + self.config.tier.name(), + report.iterations, + report.warmup_duration.as_secs_f64() + ); + println!("═══════════════════════════════════════════════════════════════"); + println!(); + + if baseline.is_some() { + println!(" ─── Comparison vs Baseline ──────────────────────────────────"); + println!(); + println!( + " {:12} {:>14} {:>14} {}", + "Metric", "Current", "Baseline", "Change" + ); + println!( + " {:12} {:>14} {:>14} {}", + "──────", "───────", "────────", "──────" + ); + + for c in comparisons { + if c.current.is_some() || c.baseline.is_some() { + println!( + " {:12} {:>14} {:>14} {}", + c.metric_name, + c.format_value(c.current), + c.format_value(c.baseline), + c.format_change() + ); + } + } + } else { + println!(" ─── Results (no baseline) ───────────────────────────────────"); + println!(); + if let Some(ref stats) = report.tile_p99 { + println!(" Tile P99: {}", stats.format()); + } + if let Some(ref stats) = report.overlay_p99 { + println!(" Overlay P99: {}", stats.format()); + } + println!( + " Error Rate: {:.2}% ± {:.2}%", + report.error_rate.mean, report.error_rate.stddev + ); + println!( + " Throughput: {:.0} ± {:.0} ops/s", + report.throughput.mean, report.throughput.stddev + ); + println!(); + println!(" (Run again to establish baseline, or use --save-baseline)"); + } + + println!(); + println!("═══════════════════════════════════════════════════════════════"); + + let has_regression = comparisons.iter().any(|c| c.is_regression); + let overall = if !report.all_passed { + "FAIL (budget exceeded)" + } else if has_regression { + "FAIL (regression detected)" + } else { + "PASS" + }; + println!(" OVERALL: {}", overall); + println!("═══════════════════════════════════════════════════════════════"); + println!(); + } + + /// Save current results as the new baseline + pub fn save_baseline(&self, report: &BenchmarkReport) -> std::io::Result<()> { + let baseline = report.to_baseline(); + baseline.save(&self.config.baseline_path)?; + println!( + "Baseline saved to {:?} for tier {}", + self.config.baseline_path, + self.config.tier.name() + ); + Ok(()) + } +} + +/// Full benchmark result +pub struct BenchmarkResult { + pub report: BenchmarkReport, + pub has_regression: bool, + pub all_passed: bool, +} + +impl BenchmarkResult { + /// Returns true if benchmark passed (no budget violations and no regressions) + pub fn passed(&self) -> bool { + self.all_passed && !self.has_regression + } + + /// Generate JSON output for CI + pub fn to_json(&self) -> String { + let tile_p99 = self.report.tile_p99.as_ref().map(|s| s.mean); + let overlay_p99 = self.report.overlay_p99.as_ref().map(|s| s.mean); + + let tile_str = tile_p99 + .map(|v| format!("{:.2}", v)) + .unwrap_or_else(|| "null".to_string()); + let overlay_str = 
overlay_p99 + .map(|v| format!("{:.2}", v)) + .unwrap_or_else(|| "null".to_string()); + + format!( + r#"{{"passed":{},"tier":"{}","iterations":{},"warmup_secs":{:.0},"tile_p99_ms":{},"overlay_p99_ms":{},"error_rate_pct":{:.2},"throughput":{:.1},"has_regression":{}}}"#, + self.passed(), + self.report.tier.name(), + self.report.iterations, + self.report.warmup_duration.as_secs_f64(), + tile_str, + overlay_str, + self.report.error_rate.mean, + self.report.throughput.mean, + self.has_regression + ) + } +} diff --git a/server/tests/load_tests/client.rs b/server/tests/load_tests/client.rs index e53f02d..388b78e 100644 --- a/server/tests/load_tests/client.rs +++ b/server/tests/load_tests/client.rs @@ -7,11 +7,9 @@ use futures_util::{SinkExt, StreamExt}; use serde::{Deserialize, Serialize}; -use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::{Duration, Instant}; +use std::time::Duration; use tokio::net::TcpStream; -use tokio::sync::mpsc; use tokio_tungstenite::{MaybeTlsStream, WebSocketStream, connect_async, tungstenite::Message}; /// Client message types (mirror of server protocol) @@ -89,8 +87,6 @@ pub enum ServerMessage { pub struct LoadTestClient { ws: WebSocketStream>, seq: AtomicU64, - /// Timestamps of sent messages for latency calculation - pending_acks: Arc>>, /// Session info after join/create pub session_id: Option, pub join_secret: Option, @@ -104,7 +100,6 @@ impl LoadTestClient { Ok(Self { ws, seq: AtomicU64::new(1), - pending_acks: Arc::new(tokio::sync::RwLock::new(std::collections::HashMap::new())), session_id: None, join_secret: None, presenter_key: None, @@ -116,7 +111,7 @@ impl LoadTestClient { self.seq.fetch_add(1, Ordering::SeqCst) } - /// Send a message and track for latency measurement + /// Send a message and return the sequence number pub async fn send( &mut self, msg: ClientMessage, @@ -130,12 +125,6 @@ impl LoadTestClient { ClientMessage::Ping { seq } => *seq, }; - // Track send time for latency calculation - { - let mut pending = self.pending_acks.write().await; - pending.insert(seq, Instant::now()); - } - let json = serde_json::to_string(&msg)?; self.ws.send(Message::Text(json.into())).await?; Ok(seq) @@ -274,84 +263,8 @@ impl LoadTestClient { } } -/// Spawn a client that sends updates at specified rates -pub async fn spawn_update_client( - mut client: LoadTestClient, - cursor_hz: u32, - viewport_hz: u32, - duration: Duration, - results_tx: mpsc::Sender, -) { - let cursor_interval = if cursor_hz > 0 { - Duration::from_secs_f64(1.0 / cursor_hz as f64) - } else { - Duration::from_secs(3600) // Effectively disabled - }; - - let viewport_interval = if viewport_hz > 0 { - Duration::from_secs_f64(1.0 / viewport_hz as f64) - } else { - Duration::from_secs(3600) - }; - - let start = Instant::now(); - let mut cursor_ticker = tokio::time::interval(cursor_interval); - let mut viewport_ticker = tokio::time::interval(viewport_interval); - let mut x = 0.5f64; - let mut y = 0.5f64; - - loop { - if start.elapsed() >= duration { - break; - } - - tokio::select! 
{ - _ = cursor_ticker.tick() => { - // Simulate cursor movement - x = (x + 0.001).min(1.0); - y = (y + 0.001).min(1.0); - if x >= 1.0 { x = 0.0; } - if y >= 1.0 { y = 0.0; } - - match client.send_cursor(x, y).await { - Ok(_) => { - let _ = results_tx.send(ClientEvent::MessageSent).await; - } - Err(_) => { - let _ = results_tx.send(ClientEvent::Error).await; - } - } - } - _ = viewport_ticker.tick() => { - match client.send_viewport(0.5, 0.5, 1.0).await { - Ok(_) => { - let _ = results_tx.send(ClientEvent::MessageSent).await; - } - Err(_) => { - let _ = results_tx.send(ClientEvent::Error).await; - } - } - } - } - } - - let _ = client.close().await; -} - -/// Events from client tasks -#[derive(Debug)] -pub enum ClientEvent { - MessageSent, - MessageReceived { - latency: Option, - msg_type: &'static str, - }, - Error, -} - /// Slide info returned from the API #[derive(Debug, Clone, Deserialize)] -#[allow(dead_code)] pub struct SlideInfo { pub id: String, pub name: String, diff --git a/server/tests/load_tests/mod.rs b/server/tests/load_tests/mod.rs index f8942a3..80305c7 100644 --- a/server/tests/load_tests/mod.rs +++ b/server/tests/load_tests/mod.rs @@ -1,59 +1,49 @@ //! Load testing module for PathCollab //! -//! This module provides load testing infrastructure to validate -//! that PathCollab can handle activity spikes with 20 followers -//! per session at 30Hz cursor + 10Hz viewport updates. +//! Provides a unified benchmark system with three tiers: +//! - **Smoke**: Quick CI validation on every push (<30s) +//! - **Standard**: PR merge gate (~2min) +//! - **Stress**: Manual/release testing (~5min) +//! +//! ## Running Benchmarks +//! +//! ```bash +//! # Smoke test (CI) +//! cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture +//! +//! # Standard test (PR gate) +//! cargo test --test perf_tests bench_standard --release -- --ignored --nocapture +//! +//! # Stress test (release) +//! cargo test --test perf_tests bench_stress --release -- --ignored --nocapture +//! 
``` #![allow(clippy::collapsible_if)] +pub mod benchmark; pub mod client; pub mod scenarios; use std::time::Duration; -/// Performance budget thresholds -pub mod budgets { - use std::time::Duration; - - /// Maximum acceptable P99 cursor broadcast latency - pub const CURSOR_P99_MAX: Duration = Duration::from_millis(100); - - /// Maximum acceptable P99 viewport broadcast latency - pub const VIEWPORT_P99_MAX: Duration = Duration::from_millis(150); - - /// Maximum acceptable message handling time - pub const MESSAGE_HANDLING_MAX: Duration = Duration::from_millis(10); -} - -/// Load test configuration -#[derive(Debug, Clone)] -pub struct LoadTestConfig { - /// Number of sessions to create - pub num_sessions: usize, - /// Number of followers per session - pub followers_per_session: usize, - /// Cursor update rate (Hz) - pub cursor_hz: u32, - /// Viewport update rate (Hz) - pub viewport_hz: u32, - /// Test duration - pub duration: Duration, - /// Server WebSocket URL - pub ws_url: String, - /// Server HTTP base URL (for fetching slide info) - pub http_url: String, +/// Benchmark tier for different testing scenarios +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BenchmarkTier { + /// Quick CI validation: 5 sessions, 10 users, 10s + Smoke, + /// PR merge gate: 25 sessions, 50 users, 30s + Standard, + /// Manual/release testing: 100 sessions, 200 users, 60s + Stress, } -impl Default for LoadTestConfig { - fn default() -> Self { - Self { - num_sessions: 5, - followers_per_session: 20, - cursor_hz: 30, - viewport_hz: 10, - duration: Duration::from_secs(60), - ws_url: "ws://127.0.0.1:8080/ws".to_string(), - http_url: "http://127.0.0.1:8080".to_string(), +impl BenchmarkTier { + /// Get the tier name for display + pub fn name(&self) -> &'static str { + match self { + BenchmarkTier::Smoke => "SMOKE", + BenchmarkTier::Standard => "STANDARD", + BenchmarkTier::Stress => "STRESS", } } } @@ -76,7 +66,7 @@ impl LatencyStats { } /// Calculate percentile (0-100) - pub fn percentile(&self, p: f64) -> Option { + fn percentile(&self, p: f64) -> Option { if self.samples.is_empty() { return None; } @@ -88,141 +78,8 @@ impl LatencyStats { Some(sorted[idx.min(sorted.len() - 1)]) } - /// Calculate P50 (median) - pub fn p50(&self) -> Option { - self.percentile(50.0) - } - - /// Calculate P95 - pub fn p95(&self) -> Option { - self.percentile(95.0) - } - /// Calculate P99 pub fn p99(&self) -> Option { self.percentile(99.0) } } - -/// Load test results -#[derive(Debug)] -pub struct LoadTestResults { - /// Cursor broadcast latencies - pub cursor_latencies: LatencyStats, - /// Viewport broadcast latencies - pub viewport_latencies: LatencyStats, - /// Message handling latencies - pub message_latencies: LatencyStats, - /// Total messages sent - pub messages_sent: u64, - /// Total messages received - pub messages_received: u64, - /// Connection errors - pub connection_errors: u64, - /// Test duration - pub duration: Duration, -} - -impl LoadTestResults { - pub fn new() -> Self { - Self { - cursor_latencies: LatencyStats::new(), - viewport_latencies: LatencyStats::new(), - message_latencies: LatencyStats::new(), - messages_sent: 0, - messages_received: 0, - connection_errors: 0, - duration: Duration::ZERO, - } - } - - /// Check if results meet performance budgets - pub fn meets_budgets(&self) -> bool { - let cursor_ok = self - .cursor_latencies - .p99() - .map(|p| p <= budgets::CURSOR_P99_MAX) - .unwrap_or(true); - - let viewport_ok = self - .viewport_latencies - .p99() - .map(|p| p <= budgets::VIEWPORT_P99_MAX) - 
.unwrap_or(true); - - let message_ok = self - .message_latencies - .p99() - .map(|p| p <= budgets::MESSAGE_HANDLING_MAX) - .unwrap_or(true); - - cursor_ok && viewport_ok && message_ok - } - - /// Generate a summary report - pub fn report(&self) -> String { - let mut report = String::new(); - report.push_str("=== Load Test Results ===\n\n"); - - report.push_str(&format!("Duration: {:.2}s\n", self.duration.as_secs_f64())); - report.push_str(&format!("Messages sent: {}\n", self.messages_sent)); - report.push_str(&format!("Messages received: {}\n", self.messages_received)); - report.push_str(&format!( - "Connection errors: {}\n\n", - self.connection_errors - )); - - report.push_str("Cursor Latencies:\n"); - if let Some(p50) = self.cursor_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.cursor_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.cursor_latencies.p99() { - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budgets::CURSOR_P99_MAX, - if p99 <= budgets::CURSOR_P99_MAX { - "OK" - } else { - "EXCEEDED" - } - )); - } - - report.push_str("\nViewport Latencies:\n"); - if let Some(p50) = self.viewport_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.viewport_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.viewport_latencies.p99() { - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budgets::VIEWPORT_P99_MAX, - if p99 <= budgets::VIEWPORT_P99_MAX { - "OK" - } else { - "EXCEEDED" - } - )); - } - - report.push_str(&format!( - "\nOverall: {}\n", - if self.meets_budgets() { "PASS" } else { "FAIL" } - )); - - report - } -} - -impl Default for LoadTestResults { - fn default() -> Self { - Self::new() - } -} diff --git a/server/tests/load_tests/scenarios/comprehensive.rs b/server/tests/load_tests/scenarios/comprehensive.rs index dd808c1..01c06b2 100644 --- a/server/tests/load_tests/scenarios/comprehensive.rs +++ b/server/tests/load_tests/scenarios/comprehensive.rs @@ -1,15 +1,24 @@ //! Comprehensive stress test scenario //! -//! Simulates 1000 concurrent users (500 sessions × 2 users each) hitting all server routes: +//! Simulates concurrent users hitting all server routes: //! - WebSocket sessions with cursor/viewport updates //! - HTTP tile requests //! - HTTP overlay requests (cell and tissue) //! - Metadata endpoints //! //! This tests the server's ability to handle realistic production-like load. +//! +//! ## Benchmark Tiers +//! +//! | Tier | Sessions | Users | Duration | +//! |----------|----------|-------|----------| +//! | Smoke | 5 | 10 | 10s | +//! | Standard | 25 | 50 | 30s | +//! 
| Stress | 100 | 200 | 60s | #![allow(clippy::collapsible_if)] +use super::super::BenchmarkTier; use super::super::LatencyStats; use super::super::client::{LoadTestClient, ServerMessage, fetch_first_slide}; use reqwest::Client; @@ -54,6 +63,41 @@ impl Default for ComprehensiveStressConfig { } } +impl ComprehensiveStressConfig { + /// Create configuration for a specific benchmark tier + pub fn for_tier(tier: BenchmarkTier) -> Self { + match tier { + BenchmarkTier::Smoke => Self { + num_sessions: 5, // 10 users + duration: Duration::from_secs(10), + cursor_hz: 10, + viewport_hz: 5, + tile_request_hz: 2, + overlay_request_hz: 1, + ..Default::default() + }, + BenchmarkTier::Standard => Self { + num_sessions: 25, // 50 users + duration: Duration::from_secs(30), + cursor_hz: 30, + viewport_hz: 10, + tile_request_hz: 5, + overlay_request_hz: 2, + ..Default::default() + }, + BenchmarkTier::Stress => Self { + num_sessions: 100, // 200 users + duration: Duration::from_secs(60), + cursor_hz: 30, + viewport_hz: 10, + tile_request_hz: 5, + overlay_request_hz: 2, + ..Default::default() + }, + } + } +} + /// Extended results for comprehensive stress test #[derive(Debug)] pub struct ComprehensiveStressResults { @@ -81,6 +125,22 @@ pub struct ComprehensiveStressResults { pub duration: Duration, } +/// Performance budgets for benchmarks +pub mod budgets { + use std::time::Duration; + + /// Maximum acceptable P99 cursor broadcast latency + pub const CURSOR_P99_MAX: Duration = Duration::from_millis(100); + /// Maximum acceptable P99 viewport broadcast latency + pub const VIEWPORT_P99_MAX: Duration = Duration::from_millis(150); + /// Maximum acceptable P99 tile serving latency + pub const TILE_P99_MAX: Duration = Duration::from_millis(500); + /// Maximum acceptable P99 overlay latency + pub const OVERLAY_P99_MAX: Duration = Duration::from_millis(1000); + /// Maximum acceptable error rate + pub const ERROR_RATE_MAX: f64 = 0.01; // 1% +} + impl ComprehensiveStressResults { pub fn new() -> Self { Self { @@ -100,176 +160,60 @@ impl ComprehensiveStressResults { } } + /// Calculate error rate as a fraction (0.0 to 1.0) + pub fn error_rate(&self) -> f64 { + let total_requests = self.http_requests_sent + self.ws_messages_sent; + let total_errors = self.http_requests_failed + self.ws_connection_errors; + if total_requests > 0 { + total_errors as f64 / total_requests as f64 + } else { + 0.0 + } + } + + /// Minimum samples required to consider a latency measurement valid + const MIN_LATENCY_SAMPLES: usize = 10; + /// Check if results meet performance budgets pub fn meets_budgets(&self) -> bool { // WebSocket latency budgets + // Note: The server doesn't send Acks for cursor/viewport updates (fire-and-forget + // for performance), so latency samples may be empty. That's OK - we check if + // we have samples, and only fail if samples exceed budget. 
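+        // Example: in a smoke run where no Acks come back, cursor_latencies has zero
+        // samples, p99() returns None, and unwrap_or(true) lets the check pass rather
+        // than failing the whole benchmark on missing data.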
let cursor_ok = self .cursor_latencies .p99() - .map(|p| p <= Duration::from_millis(100)) - .unwrap_or(true); + .map(|p| p <= budgets::CURSOR_P99_MAX) + .unwrap_or(true); // OK if no samples (server doesn't Ack cursor updates) let viewport_ok = self .viewport_latencies .p99() - .map(|p| p <= Duration::from_millis(150)) - .unwrap_or(true); - - // HTTP latency budgets - let tile_ok = self - .tile_latencies - .p99() - .map(|p| p <= Duration::from_millis(500)) - .unwrap_or(true); + .map(|p| p <= budgets::VIEWPORT_P99_MAX) + .unwrap_or(true); // OK if no samples (server doesn't Ack viewport updates) + + // HTTP latency budgets - require samples if we had successful requests + let tile_ok = if self.http_requests_success > 0 { + self.tile_latencies + .p99() + .map(|p| p <= budgets::TILE_P99_MAX) + .unwrap_or_else(|| self.tile_latencies.samples.len() >= Self::MIN_LATENCY_SAMPLES) + } else { + true + }; + // Overlay is optional - many test setups don't have overlay data let overlay_ok = self .overlay_latencies .p99() - .map(|p| p <= Duration::from_millis(1000)) - .unwrap_or(true); + .map(|p| p <= budgets::OVERLAY_P99_MAX) + .unwrap_or(true); // OK if no overlay data - // Error rate budget: < 1% - let total_requests = self.http_requests_sent + self.ws_messages_sent; - let total_errors = self.http_requests_failed + self.ws_connection_errors; - let error_rate_ok = if total_requests > 0 { - (total_errors as f64 / total_requests as f64) < 0.01 - } else { - true - }; + // Error rate budget + let error_rate_ok = self.error_rate() < budgets::ERROR_RATE_MAX; cursor_ok && viewport_ok && tile_ok && overlay_ok && error_rate_ok } - - /// Generate a summary report - pub fn report(&self) -> String { - let mut report = String::new(); - report.push_str("=== Comprehensive Stress Test Results ===\n\n"); - - report.push_str(&format!("Duration: {:.2}s\n", self.duration.as_secs_f64())); - report.push_str(&format!( - "Total users: {} (sessions: {}, joined: {})\n", - self.sessions_created + self.sessions_joined, - self.sessions_created, - self.sessions_joined - )); - - report.push_str("\n--- WebSocket Stats ---\n"); - report.push_str(&format!("Messages sent: {}\n", self.ws_messages_sent)); - report.push_str(&format!( - "Messages received: {}\n", - self.ws_messages_received - )); - report.push_str(&format!( - "Connection errors: {}\n", - self.ws_connection_errors - )); - - let ws_throughput = self.ws_messages_sent as f64 / self.duration.as_secs_f64(); - report.push_str(&format!("WS throughput: {:.1} msg/s\n", ws_throughput)); - - report.push_str("\n--- HTTP Stats ---\n"); - report.push_str(&format!("Requests sent: {}\n", self.http_requests_sent)); - report.push_str(&format!( - "Requests success: {}\n", - self.http_requests_success - )); - report.push_str(&format!("Requests failed: {}\n", self.http_requests_failed)); - - let http_throughput = self.http_requests_sent as f64 / self.duration.as_secs_f64(); - report.push_str(&format!("HTTP throughput: {:.1} req/s\n", http_throughput)); - - let total_throughput = ws_throughput + http_throughput; - report.push_str(&format!( - "\nTotal throughput: {:.1} ops/s\n", - total_throughput - )); - - report.push_str("\n--- Latencies ---\n"); - - report.push_str("\nCursor (WS) Latencies:\n"); - if let Some(p50) = self.cursor_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.cursor_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.cursor_latencies.p99() { - let budget = Duration::from_millis(100); 
- report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budget, - if p99 <= budget { "OK" } else { "EXCEEDED" } - )); - } - - report.push_str("\nViewport (WS) Latencies:\n"); - if let Some(p50) = self.viewport_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.viewport_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.viewport_latencies.p99() { - let budget = Duration::from_millis(150); - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budget, - if p99 <= budget { "OK" } else { "EXCEEDED" } - )); - } - - report.push_str("\nTile (HTTP) Latencies:\n"); - if let Some(p50) = self.tile_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.tile_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.tile_latencies.p99() { - let budget = Duration::from_millis(500); - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budget, - if p99 <= budget { "OK" } else { "EXCEEDED" } - )); - } - - report.push_str("\nOverlay (HTTP) Latencies:\n"); - if let Some(p50) = self.overlay_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.overlay_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.overlay_latencies.p99() { - let budget = Duration::from_millis(1000); - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budget, - if p99 <= budget { "OK" } else { "EXCEEDED" } - )); - } - - let error_rate = if self.http_requests_sent + self.ws_messages_sent > 0 { - (self.http_requests_failed + self.ws_connection_errors) as f64 - / (self.http_requests_sent + self.ws_messages_sent) as f64 - * 100.0 - } else { - 0.0 - }; - report.push_str(&format!("\nError rate: {:.3}% (budget: <1%)\n", error_rate)); - - report.push_str(&format!( - "\nOverall: {}\n", - if self.meets_budgets() { "PASS" } else { "FAIL" } - )); - - report - } } impl Default for ComprehensiveStressResults { @@ -280,15 +224,11 @@ impl Default for ComprehensiveStressResults { /// Event types for comprehensive test #[derive(Debug)] -#[allow(dead_code)] pub enum ComprehensiveEvent { - WsMessageSent, - WsMessageReceived { msg_type: &'static str }, - WsError, + WsCursorAck { latency: Duration }, + WsViewportAck { latency: Duration }, HttpTileRequest { latency: Duration, success: bool }, HttpOverlayRequest { latency: Duration, success: bool }, - SessionCreated, - SessionJoined, } /// Comprehensive stress test scenario @@ -378,6 +318,8 @@ impl ComprehensiveStressScenario { true, // is_presenter http_client.clone(), slide.id.clone(), + slide.width, + slide.height, tx.clone(), ws_sent.clone(), ws_recv.clone(), @@ -411,6 +353,8 @@ impl ComprehensiveStressScenario { false, // is_presenter http_client.clone(), slide.id.clone(), + slide.width, + slide.height, tx.clone(), ws_sent.clone(), ws_recv.clone(), @@ -431,6 +375,8 @@ impl ComprehensiveStressScenario { drop(tx); // Collect events + let mut cursor_latencies = LatencyStats::new(); + let mut viewport_latencies = LatencyStats::new(); let mut tile_latencies = LatencyStats::new(); let mut overlay_latencies = LatencyStats::new(); @@ -440,6 +386,12 @@ impl ComprehensiveStressScenario { while collect_start.elapsed() < collect_duration { match tokio::time::timeout(Duration::from_millis(100), rx.recv()).await { Ok(Some(event)) => match event { + ComprehensiveEvent::WsCursorAck { 
latency } => {
+                        cursor_latencies.record(latency);
+                    }
+                    ComprehensiveEvent::WsViewportAck { latency } => {
+                        viewport_latencies.record(latency);
+                    }
                     ComprehensiveEvent::HttpTileRequest {
                         latency,
                         success: true,
@@ -474,6 +426,8 @@ impl ComprehensiveStressScenario {
         results.http_requests_failed = http_failed.load(Ordering::SeqCst);
         results.sessions_created = sessions_created.load(Ordering::SeqCst);
         results.sessions_joined = sessions_joined.load(Ordering::SeqCst);
+        results.cursor_latencies = cursor_latencies;
+        results.viewport_latencies = viewport_latencies;
         results.tile_latencies = tile_latencies;
         results.overlay_latencies = overlay_latencies;
         results.duration = start.elapsed();
@@ -482,12 +436,15 @@ impl ComprehensiveStressScenario {
     }
 
     /// Spawn a user task that does both WebSocket and HTTP operations
+    #[allow(clippy::too_many_arguments)]
     fn spawn_user_task(
         &self,
         mut client: LoadTestClient,
         is_presenter: bool,
         http_client: Client,
         slide_id: String,
+        slide_width: u64,
+        slide_height: u64,
         tx: mpsc::Sender<ComprehensiveEvent>,
         ws_sent: Arc<AtomicU64>,
         ws_recv: Arc<AtomicU64>,
@@ -511,6 +468,19 @@ impl ComprehensiveStressScenario {
         let overlay_hz = self.config.overlay_request_hz;
         let http_url = self.config.http_url.clone();
 
+        // Calculate valid tile range based on slide dimensions
+        // DZI convention: max_level = ceil(log2(max(width, height)))
+        // At level N, dimensions are width/2^(max_level-N) x height/2^(max_level-N)
+        let tile_size = 256u64;
+        let max_level = (slide_width.max(slide_height) as f64).log2().ceil() as u32;
+        // Use a level 3 below max to keep the tile grid manageable for testing
+        let test_level = max_level.saturating_sub(3);
+        let level_scale = 1u64 << (max_level - test_level);
+        let level_width = slide_width / level_scale.max(1);
+        let level_height = slide_height / level_scale.max(1);
+        let max_tile_x = level_width.div_ceil(tile_size).max(1) as u32;
+        let max_tile_y = level_height.div_ceil(tile_size).max(1) as u32;
+
         tokio::spawn(async move {
             let cursor_interval = if cursor_hz > 0 {
                 Duration::from_secs_f64(1.0 / cursor_hz as f64)
@@ -548,6 +518,11 @@ impl ComprehensiveStressScenario {
             let mut tile_x = 0u32;
             let mut tile_y = 0u32;
 
+            // Track pending operations for latency measurement
+            // Key: seq number, Value: (send_time, is_cursor)
+            let mut pending_ws: std::collections::HashMap<u64, (Instant, bool)> =
+                std::collections::HashMap::new();
+
             loop {
                 if start.elapsed() >= duration {
                     break;
                 }
@@ -561,45 +536,46 @@
                         if x >= 1.0 { x = 0.0; }
                         if y >= 1.0 { y = 0.0; }
 
-                        match client.send_cursor(x * 100000.0, y * 100000.0).await {
-                            Ok(_) => {
+                        let send_time = Instant::now();
+                        match client.send_cursor(x * slide_width as f64, y * slide_height as f64).await {
+                            Ok(seq) => {
                                 ws_sent.fetch_add(1, Ordering::SeqCst);
-                                let _ = tx.send(ComprehensiveEvent::WsMessageSent).await;
+                                pending_ws.insert(seq, (send_time, true)); // true = cursor
                             }
                             Err(_) => {
                                 ws_errors.fetch_add(1, Ordering::SeqCst);
-                                let _ = tx.send(ComprehensiveEvent::WsError).await;
                             }
                         }
                     }
 
                     // Presenter sends viewport updates
                     _ = viewport_ticker.tick(), if is_presenter => {
+                        let send_time = Instant::now();
                         match client.send_viewport(0.5, 0.5, 1.0).await {
-                            Ok(_) => {
+                            Ok(seq) => {
                                 ws_sent.fetch_add(1, Ordering::SeqCst);
-                                let _ = tx.send(ComprehensiveEvent::WsMessageSent).await;
+                                pending_ws.insert(seq, (send_time, false)); // false = viewport
                             }
                             Err(_) => {
                                 ws_errors.fetch_add(1, Ordering::SeqCst);
-                                let _ = tx.send(ComprehensiveEvent::WsError).await;
                             }
                         }
                     }
 
-                    // Both users request tiles
+                    // Both users request tiles - use valid coordinates
                     _ =
tile_ticker.tick() => { http_sent.fetch_add(1, Ordering::SeqCst); - let level = 5; let url = format!( "{}/api/slide/{}/tile/{}/{}/{}", - http_url, slide_id, level, tile_x, tile_y + http_url, slide_id, test_level, tile_x % max_tile_x, tile_y % max_tile_y ); let req_start = Instant::now(); match http_client.get(&url).send().await { Ok(resp) => { let latency = req_start.elapsed(); + // 200 = success, 404 = tile doesn't exist but server responded correctly + // Both count as successful server responses for latency measurement if resp.status().is_success() || resp.status().as_u16() == 404 { http_success.fetch_add(1, Ordering::SeqCst); let _ = tx.send(ComprehensiveEvent::HttpTileRequest { @@ -608,6 +584,10 @@ impl ComprehensiveStressScenario { }).await; } else { http_failed.fetch_add(1, Ordering::SeqCst); + let _ = tx.send(ComprehensiveEvent::HttpTileRequest { + latency, + success: false, + }).await; } } Err(_) => { @@ -615,9 +595,9 @@ impl ComprehensiveStressScenario { } } - tile_x = (tile_x + 1) % 40; - if tile_x == 0 { - tile_y = (tile_y + 1) % 40; + tile_x = tile_x.wrapping_add(1); + if tile_x.is_multiple_of(max_tile_x) { + tile_y = tile_y.wrapping_add(1); } } @@ -626,16 +606,18 @@ impl ComprehensiveStressScenario { http_sent.fetch_add(1, Ordering::SeqCst); // Alternate between tissue tiles and cell queries - let is_tissue = tile_x % 2 == 0; + let is_tissue = tile_x.is_multiple_of(2); let url = if is_tissue { format!( "{}/api/slide/{}/overlay/tissue/{}/{}/{}", - http_url, slide_id, 3, tile_x % 20, tile_y % 20 + http_url, slide_id, test_level.saturating_sub(2), tile_x % max_tile_x, tile_y % max_tile_y ) } else { format!( "{}/api/slide/{}/overlay/cells?x={}&y={}&width=5000&height=5000", - http_url, slide_id, (tile_x as f64) * 1000.0, (tile_y as f64) * 1000.0 + http_url, slide_id, + ((tile_x % max_tile_x) as f64) * 256.0 * (level_scale as f64), + ((tile_y % max_tile_y) as f64) * 256.0 * (level_scale as f64) ) }; @@ -643,6 +625,7 @@ impl ComprehensiveStressScenario { match http_client.get(&url).send().await { Ok(resp) => { let latency = req_start.elapsed(); + // Overlays may legitimately 404 if no overlay data exists if resp.status().is_success() || resp.status().as_u16() == 404 { http_success.fetch_add(1, Ordering::SeqCst); let _ = tx.send(ComprehensiveEvent::HttpOverlayRequest { @@ -659,18 +642,23 @@ impl ComprehensiveStressScenario { } } - // Receive WebSocket messages (followers receive presence updates) + // Receive WebSocket messages - track Ack latencies _ = ws_recv_interval.tick() => { match client.recv_timeout(Duration::from_millis(10)).await { Ok(Some(msg)) => { ws_recv.fetch_add(1, Ordering::SeqCst); - let msg_type = match &msg { - ServerMessage::PresenceDelta { .. } => "presence", - ServerMessage::PresenterViewport { .. } => "viewport", - ServerMessage::Ack { .. } => "ack", - _ => "other", - }; - let _ = tx.send(ComprehensiveEvent::WsMessageReceived { msg_type }).await; + if let ServerMessage::Ack { ack_seq, status, .. 
} = &msg { + if status == "ok" { + if let Some((send_time, is_cursor)) = pending_ws.remove(ack_seq) { + let latency = send_time.elapsed(); + if is_cursor { + let _ = tx.send(ComprehensiveEvent::WsCursorAck { latency }).await; + } else { + let _ = tx.send(ComprehensiveEvent::WsViewportAck { latency }).await; + } + } + } + } } Ok(None) => {} Err(_) => { @@ -679,6 +667,9 @@ impl ComprehensiveStressScenario { } } } + + // Clean up old pending entries (older than 5 seconds - likely missed) + pending_ws.retain(|_, (time, _)| time.elapsed() < Duration::from_secs(5)); } let _ = client.close().await; @@ -686,54 +677,4 @@ impl ComprehensiveStressScenario { } } -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - #[ignore = "requires running server"] - async fn test_comprehensive_minimal() { - let config = ComprehensiveStressConfig { - num_sessions: 5, // 10 users - duration: Duration::from_secs(10), - cursor_hz: 10, - viewport_hz: 5, - tile_request_hz: 2, - overlay_request_hz: 1, - ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.ws_messages_sent > 0, "Should have sent WS messages"); - assert!( - results.http_requests_sent > 0, - "Should have sent HTTP requests" - ); - } - - #[tokio::test] - #[ignore = "requires running server - long running"] - async fn test_comprehensive_1000_users() { - let config = ComprehensiveStressConfig { - num_sessions: 500, // 1000 users - duration: Duration::from_secs(60), - cursor_hz: 30, - viewport_hz: 10, - tile_request_hz: 5, - overlay_request_hz: 2, - ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!( - results.meets_budgets(), - "Should meet performance budgets under 1000 user load" - ); - } -} +// Tests are in perf_tests.rs using the tier-based approach diff --git a/server/tests/load_tests/scenarios/fanout.rs b/server/tests/load_tests/scenarios/fanout.rs deleted file mode 100644 index 1b66597..0000000 --- a/server/tests/load_tests/scenarios/fanout.rs +++ /dev/null @@ -1,257 +0,0 @@ -//! Fan-out load test scenario -//! -//! Validates that PathCollab can handle N sessions with 20 followers each, -//! where the presenter sends 30Hz cursor updates and 10Hz viewport updates. -//! All followers should receive broadcasts with P99 < 100ms for cursors. - -#![allow(clippy::collapsible_if)] - -use super::super::client::{ - ClientEvent, LoadTestClient, ServerMessage, fetch_first_slide, spawn_update_client, -}; -use super::super::{LatencyStats, LoadTestConfig, LoadTestResults}; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::{Duration, Instant}; -use tokio::sync::mpsc; - -/// Fan-out load test scenario -pub struct FanOutScenario { - config: LoadTestConfig, -} - -impl FanOutScenario { - pub fn new(config: LoadTestConfig) -> Self { - Self { config } - } - - /// Run the fan-out scenario - /// - /// Creates N sessions, each with 1 presenter + 20 followers. - /// Presenter sends 30Hz cursor + 10Hz viewport updates. - /// Measures broadcast latency across all followers. 
- pub async fn run(&self) -> Result> { - let start = Instant::now(); - let mut results = LoadTestResults::new(); - - // Fetch available slide from server - let slide = fetch_first_slide(&self.config.http_url).await?; - println!("Using slide: {} ({})", slide.name, slide.id); - - // Channel for collecting events from all clients - let (tx, mut rx) = mpsc::channel::(10000); - - // Atomic counters for quick stats - let messages_sent = Arc::new(AtomicU64::new(0)); - let messages_received = Arc::new(AtomicU64::new(0)); - let connection_errors = Arc::new(AtomicU64::new(0)); - - let mut join_handles = Vec::new(); - - // Create sessions and spawn presenter + follower tasks - for session_idx in 0..self.config.num_sessions { - println!( - "Setting up session {}/{}", - session_idx + 1, - self.config.num_sessions - ); - - // Create presenter client - let presenter = match LoadTestClient::connect(&self.config.ws_url).await { - Ok(mut client) => { - // Create session with the discovered slide - if let Err(e) = client.create_session(&slide.id).await { - eprintln!("Failed to create session {}: {}", session_idx, e); - connection_errors.fetch_add(1, Ordering::SeqCst); - continue; - } - client - } - Err(e) => { - eprintln!("Failed to connect presenter {}: {}", session_idx, e); - connection_errors.fetch_add(1, Ordering::SeqCst); - continue; - } - }; - - let session_id = presenter.session_id.clone().unwrap(); - let join_secret = presenter.join_secret.clone().unwrap(); - - // Spawn presenter task (sends updates) - let presenter_tx = tx.clone(); - let cursor_hz = self.config.cursor_hz; - let viewport_hz = self.config.viewport_hz; - let duration = self.config.duration; - let handle = tokio::spawn(async move { - spawn_update_client(presenter, cursor_hz, viewport_hz, duration, presenter_tx) - .await; - }); - join_handles.push(handle); - - // Create follower clients - for follower_idx in 0..self.config.followers_per_session { - let follower_tx = tx.clone(); - let ws_url = self.config.ws_url.clone(); - let session_id = session_id.clone(); - let join_secret = join_secret.clone(); - let duration = self.config.duration; - let errors = connection_errors.clone(); - let recv_count = messages_received.clone(); - - let handle = tokio::spawn(async move { - // Connect and join session - let client = match LoadTestClient::connect(&ws_url).await { - Ok(mut c) => { - if let Err(e) = c.join_session(&session_id, &join_secret).await { - eprintln!("Follower {} failed to join: {}", follower_idx, e); - errors.fetch_add(1, Ordering::SeqCst); - return; - } - c - } - Err(e) => { - eprintln!("Follower {} failed to connect: {}", follower_idx, e); - errors.fetch_add(1, Ordering::SeqCst); - return; - } - }; - - // Receive messages for duration - let start = Instant::now(); - let mut ws = client; - while start.elapsed() < duration { - match ws.recv_timeout(Duration::from_millis(100)).await { - Ok(Some(msg)) => { - recv_count.fetch_add(1, Ordering::SeqCst); - // Track message type for latency if it's an Ack - let msg_type = match &msg { - ServerMessage::PresenceDelta { .. } => "presence", - ServerMessage::PresenterViewport { .. } => "viewport", - ServerMessage::Ack { .. 
} => "ack", - _ => "other", - }; - let _ = follower_tx - .send(ClientEvent::MessageReceived { - latency: None, // We track latency on presenter side - msg_type, - }) - .await; - } - Ok(None) => {} - Err(_) => { - let _ = follower_tx.send(ClientEvent::Error).await; - } - } - } - - let _ = ws.close().await; - }); - join_handles.push(handle); - } - - // Small delay between session setups to avoid thundering herd - tokio::time::sleep(Duration::from_millis(50)).await; - } - - // Drop the original sender so rx completes when all tasks are done - drop(tx); - - // Collect events from all clients - let mut cursor_latencies = LatencyStats::new(); - let mut viewport_latencies = LatencyStats::new(); - - // Process events as they come in (but don't block forever) - let collect_duration = self.config.duration + Duration::from_secs(5); - let collect_start = Instant::now(); - - while collect_start.elapsed() < collect_duration { - match tokio::time::timeout(Duration::from_millis(100), rx.recv()).await { - Ok(Some(event)) => match event { - ClientEvent::MessageSent => { - messages_sent.fetch_add(1, Ordering::SeqCst); - } - ClientEvent::MessageReceived { latency, msg_type } => { - // Note: messages_received is already incremented in the follower tasks - // via recv_count, so we don't increment here to avoid double-counting - if let Some(lat) = latency { - match msg_type { - "presence" | "cursor" => cursor_latencies.record(lat), - "viewport" => viewport_latencies.record(lat), - _ => {} - } - } - } - ClientEvent::Error => { - connection_errors.fetch_add(1, Ordering::SeqCst); - } - }, - Ok(None) => break, // Channel closed - Err(_) => {} // Timeout, continue - } - } - - // Wait for all tasks to complete - for handle in join_handles { - let _ = handle.await; - } - - results.cursor_latencies = cursor_latencies; - results.viewport_latencies = viewport_latencies; - results.messages_sent = messages_sent.load(Ordering::SeqCst); - results.messages_received = messages_received.load(Ordering::SeqCst); - results.connection_errors = connection_errors.load(Ordering::SeqCst); - results.duration = start.elapsed(); - - Ok(results) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // Note: These tests require a running server - // Run with: cargo test --test perf_tests -- --ignored - - #[tokio::test] - #[ignore = "requires running server"] - async fn test_fanout_single_session() { - let config = LoadTestConfig { - num_sessions: 1, - followers_per_session: 5, - cursor_hz: 10, - viewport_hz: 5, - duration: Duration::from_secs(5), - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.messages_sent > 0, "Should have sent messages"); - assert!( - results.messages_received > 0, - "Should have received messages" - ); - } - - #[tokio::test] - #[ignore = "requires running server"] - async fn test_fanout_full_load() { - let config = LoadTestConfig { - num_sessions: 5, - followers_per_session: 20, - cursor_hz: 30, - viewport_hz: 10, - duration: Duration::from_secs(60), - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.meets_budgets(), "Should meet performance budgets"); - } -} diff --git a/server/tests/load_tests/scenarios/mod.rs b/server/tests/load_tests/scenarios/mod.rs index 7fcb8a9..653cf0d 100644 --- 
a/server/tests/load_tests/scenarios/mod.rs +++ b/server/tests/load_tests/scenarios/mod.rs @@ -1,9 +1,10 @@ //! Load test scenarios +//! +//! Single comprehensive benchmark that tests all hot paths: +//! - WebSocket cursor/viewport broadcasts +//! - HTTP tile serving +//! - HTTP overlay requests pub mod comprehensive; -pub mod fanout; -pub mod overlay; pub use comprehensive::{ComprehensiveStressConfig, ComprehensiveStressScenario}; -pub use fanout::FanOutScenario; -pub use overlay::{OverlayStressConfig, OverlayStressScenario}; diff --git a/server/tests/load_tests/scenarios/overlay.rs b/server/tests/load_tests/scenarios/overlay.rs deleted file mode 100644 index 8840c3c..0000000 --- a/server/tests/load_tests/scenarios/overlay.rs +++ /dev/null @@ -1,402 +0,0 @@ -//! Overlay stress test scenario -//! -//! Validates that PathCollab can handle concurrent requests for: -//! - Tissue overlay tiles (GET /api/slide/:id/overlay/tissue/:level/:x/:y) -//! - Cell overlay queries (GET /api/slide/:id/overlay/cells?x=...&y=...&width=...&height=...) -//! - Overlay metadata endpoints -//! -//! This scenario focuses specifically on the HTTP overlay endpoints under load. - -#![allow(clippy::collapsible_if)] - -use super::super::client::fetch_first_slide; -use super::super::{LatencyStats, LoadTestResults}; -use reqwest::Client; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::{Duration, Instant}; -use tokio::sync::mpsc; - -/// Configuration for overlay stress test -#[derive(Debug, Clone)] -pub struct OverlayStressConfig { - /// Number of concurrent clients - pub num_clients: usize, - /// Test duration - pub duration: Duration, - /// Server base URL (e.g., "http://127.0.0.1:8080") - pub base_url: String, - /// Rate of tissue tile requests per client (Hz) - pub tissue_tile_hz: u32, - /// Rate of cell query requests per client (Hz) - pub cell_query_hz: u32, -} - -impl Default for OverlayStressConfig { - fn default() -> Self { - Self { - num_clients: 50, - duration: Duration::from_secs(30), - base_url: "http://127.0.0.1:8080".to_string(), - tissue_tile_hz: 10, - cell_query_hz: 2, - } - } -} - -/// Extended results for overlay stress test -#[derive(Debug)] -pub struct OverlayStressResults { - /// Base results - pub base: LoadTestResults, - /// Tissue tile request latencies - pub tissue_tile_latencies: LatencyStats, - /// Cell query latencies - pub cell_query_latencies: LatencyStats, - /// Metadata request latencies - pub metadata_latencies: LatencyStats, - /// Number of 404 responses (expected for non-existent tiles) - pub not_found_count: u64, - /// Number of successful requests - pub success_count: u64, -} - -impl OverlayStressResults { - pub fn new() -> Self { - Self { - base: LoadTestResults::new(), - tissue_tile_latencies: LatencyStats::new(), - cell_query_latencies: LatencyStats::new(), - metadata_latencies: LatencyStats::new(), - not_found_count: 0, - success_count: 0, - } - } - - /// Generate a summary report - pub fn report(&self) -> String { - let mut report = String::new(); - report.push_str("=== Overlay Stress Test Results ===\n\n"); - - report.push_str(&format!( - "Duration: {:.2}s\n", - self.base.duration.as_secs_f64() - )); - report.push_str(&format!("Total requests: {}\n", self.base.messages_sent)); - report.push_str(&format!("Successful: {}\n", self.success_count)); - report.push_str(&format!("Not found (404): {}\n", self.not_found_count)); - report.push_str(&format!("Errors: {}\n\n", self.base.connection_errors)); - - let throughput = self.base.messages_sent as f64 
/ self.base.duration.as_secs_f64(); - report.push_str(&format!("Throughput: {:.1} req/s\n\n", throughput)); - - report.push_str("Tissue Tile Latencies:\n"); - if let Some(p50) = self.tissue_tile_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.tissue_tile_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.tissue_tile_latencies.p99() { - report.push_str(&format!(" P99: {:?}\n", p99)); - } - - report.push_str("\nCell Query Latencies:\n"); - if let Some(p50) = self.cell_query_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.cell_query_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.cell_query_latencies.p99() { - report.push_str(&format!(" P99: {:?}\n", p99)); - } - - report.push_str("\nMetadata Latencies:\n"); - if let Some(p50) = self.metadata_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.metadata_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.metadata_latencies.p99() { - report.push_str(&format!(" P99: {:?}\n", p99)); - } - - report - } -} - -impl Default for OverlayStressResults { - fn default() -> Self { - Self::new() - } -} - -/// Event types from overlay client tasks -#[derive(Debug)] -#[allow(dead_code)] -pub enum OverlayEvent { - TissueTileRequest { latency: Duration, success: bool }, - CellQueryRequest { latency: Duration, success: bool }, - MetadataRequest { latency: Duration, success: bool }, - NotFound, - Error, -} - -/// Overlay stress test scenario -pub struct OverlayStressScenario { - config: OverlayStressConfig, -} - -impl OverlayStressScenario { - pub fn new(config: OverlayStressConfig) -> Self { - Self { config } - } - - /// Run the overlay stress test scenario - pub async fn run( - &self, - ) -> Result> { - let start = Instant::now(); - let mut results = OverlayStressResults::new(); - - // Fetch available slide from server - let slide = fetch_first_slide(&self.config.base_url).await?; - println!("Using slide: {} ({})", slide.name, slide.id); - - // Channel for collecting events - let (tx, mut rx) = mpsc::channel::(10000); - - // Atomic counters - let requests_sent = Arc::new(AtomicU64::new(0)); - let success_count = Arc::new(AtomicU64::new(0)); - let not_found_count = Arc::new(AtomicU64::new(0)); - let error_count = Arc::new(AtomicU64::new(0)); - - let mut join_handles = Vec::new(); - - // Create HTTP client with connection pooling - let http_client = Client::builder() - .pool_max_idle_per_host(100) - .timeout(Duration::from_secs(30)) - .build()?; - - println!( - "Starting overlay stress test with {} clients for {:?}", - self.config.num_clients, self.config.duration - ); - - // Spawn client tasks - for client_idx in 0..self.config.num_clients { - let client = http_client.clone(); - let tx = tx.clone(); - let base_url = self.config.base_url.clone(); - let slide_id = slide.id.clone(); - let duration = self.config.duration; - let tissue_hz = self.config.tissue_tile_hz; - let cell_hz = self.config.cell_query_hz; - let sent = requests_sent.clone(); - let success = success_count.clone(); - let not_found = not_found_count.clone(); - let errors = error_count.clone(); - - let handle = tokio::spawn(async move { - let tissue_interval = if tissue_hz > 0 { - Duration::from_secs_f64(1.0 / tissue_hz as f64) - } else { - Duration::from_secs(3600) - }; - - let cell_interval = if cell_hz > 0 { - 
Duration::from_secs_f64(1.0 / cell_hz as f64) - } else { - Duration::from_secs(3600) - }; - - let start = Instant::now(); - let mut tissue_ticker = tokio::time::interval(tissue_interval); - let mut cell_ticker = tokio::time::interval(cell_interval); - - // Vary tile coordinates to simulate realistic access patterns - let mut tile_x = client_idx as u32 % 10; - let mut tile_y = 0u32; - let level = 3; // Mid-level tiles - - loop { - if start.elapsed() >= duration { - break; - } - - tokio::select! { - _ = tissue_ticker.tick() => { - sent.fetch_add(1, Ordering::SeqCst); - - // Request tissue tile - let url = format!( - "{}/api/slide/{}/overlay/tissue/{}/{}/{}", - base_url, slide_id, level, tile_x, tile_y - ); - - let req_start = Instant::now(); - match client.get(&url).send().await { - Ok(resp) => { - let latency = req_start.elapsed(); - if resp.status().is_success() { - success.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::TissueTileRequest { - latency, - success: true, - }).await; - } else if resp.status().as_u16() == 404 { - not_found.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::NotFound).await; - } else { - errors.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::TissueTileRequest { - latency, - success: false, - }).await; - } - } - Err(_) => { - errors.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::Error).await; - } - } - - // Move to next tile - tile_x = (tile_x + 1) % 20; - if tile_x == 0 { - tile_y = (tile_y + 1) % 20; - } - } - _ = cell_ticker.tick() => { - sent.fetch_add(1, Ordering::SeqCst); - - // Request cells in region (varying region) - let region_x = (client_idx as f64 * 1000.0) % 50000.0; - let region_y = (client_idx as f64 * 500.0) % 50000.0; - let url = format!( - "{}/api/slide/{}/overlay/cells?x={}&y={}&width=5000&height=5000", - base_url, slide_id, region_x, region_y - ); - - let req_start = Instant::now(); - match client.get(&url).send().await { - Ok(resp) => { - let latency = req_start.elapsed(); - if resp.status().is_success() { - success.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::CellQueryRequest { - latency, - success: true, - }).await; - } else if resp.status().as_u16() == 404 { - not_found.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::NotFound).await; - } else { - errors.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::CellQueryRequest { - latency, - success: false, - }).await; - } - } - Err(_) => { - errors.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::Error).await; - } - } - } - } - } - }); - join_handles.push(handle); - - // Small stagger to avoid thundering herd - if client_idx % 10 == 9 { - tokio::time::sleep(Duration::from_millis(10)).await; - } - } - - // Drop the original sender - drop(tx); - - // Collect events - let mut tissue_latencies = LatencyStats::new(); - let mut cell_latencies = LatencyStats::new(); - let mut metadata_latencies = LatencyStats::new(); - - let collect_duration = self.config.duration + Duration::from_secs(5); - let collect_start = Instant::now(); - - while collect_start.elapsed() < collect_duration { - match tokio::time::timeout(Duration::from_millis(100), rx.recv()).await { - Ok(Some(event)) => match event { - OverlayEvent::TissueTileRequest { - latency, - success: true, - } => { - tissue_latencies.record(latency); - } - OverlayEvent::CellQueryRequest { - latency, - success: true, - } => { - cell_latencies.record(latency); - } - OverlayEvent::MetadataRequest { - latency, - success: true, - } => { - 
metadata_latencies.record(latency); - } - _ => {} - }, - Ok(None) => break, - Err(_) => {} - } - } - - // Wait for all tasks - for handle in join_handles { - let _ = handle.await; - } - - // Populate results - results.base.messages_sent = requests_sent.load(Ordering::SeqCst); - results.success_count = success_count.load(Ordering::SeqCst); - results.not_found_count = not_found_count.load(Ordering::SeqCst); - results.base.connection_errors = error_count.load(Ordering::SeqCst); - results.base.duration = start.elapsed(); - results.tissue_tile_latencies = tissue_latencies; - results.cell_query_latencies = cell_latencies; - results.metadata_latencies = metadata_latencies; - - Ok(results) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - #[ignore = "requires running server"] - async fn test_overlay_stress_minimal() { - let config = OverlayStressConfig { - num_clients: 5, - duration: Duration::from_secs(5), - tissue_tile_hz: 5, - cell_query_hz: 1, - ..Default::default() - }; - - let scenario = OverlayStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.base.messages_sent > 0, "Should have sent requests"); - } -} diff --git a/server/tests/perf_tests.rs b/server/tests/perf_tests.rs index 51a7d5d..b906c9f 100644 --- a/server/tests/perf_tests.rs +++ b/server/tests/perf_tests.rs @@ -1,305 +1,108 @@ -//! Load test entry point +//! Unified Benchmark Suite for PathCollab //! -//! Run with: cargo test --test perf_tests -- --ignored --nocapture -//! Or for quick test: cargo test --test perf_tests test_connection -- --ignored --nocapture +//! This module provides a three-tier benchmark system for validating +//! server performance under load. //! -//! Available tests: -//! - test_connection: Quick connectivity test -//! - test_create_session: Session creation test -//! - test_fanout_minimal: Quick fan-out test (1 session, 3 followers, 3s) -//! - test_fanout_standard: Standard fan-out (5 sessions, 20 followers, 30s) -//! - test_fanout_extended: Extended fan-out (5 sessions, 20 followers, 5min) -//! - test_overlay_stress_minimal: Quick overlay stress test (5 clients, 5s) -//! - test_overlay_stress_standard: Standard overlay stress (50 clients, 30s) -//! - test_comprehensive_minimal: Quick comprehensive test (10 users, 10s) -//! - test_comprehensive_100_users: 100 users stress test (50 sessions, 30s) -//! - test_comprehensive_1000_users: Full 1000 users stress test (500 sessions, 60s) +//! ## Features +//! +//! - **Warm-up phase**: Primes caches and connection pools before measuring +//! - **Multiple iterations**: Runs 3 times for statistical significance +//! - **Baseline comparison**: Compares against stored baseline, detects regressions +//! +//! ## Benchmark Tiers +//! +//! | Tier | Purpose | Duration | Config | +//! |------------|-------------------|----------|---------------------------| +//! | `smoke` | CI on every push | <30s | 5 sessions, 10 users, 10s | +//! | `standard` | PR merge gate | ~2min | 25 sessions, 50 users, 30s| +//! | `stress` | Manual/release | ~5min | 100 sessions, 200 users | +//! +//! ## Running Benchmarks +//! +//! ```bash +//! # Quick smoke test (CI) - 3 iterations with warm-up +//! cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture +//! +//! # Standard test (PR merge gate) +//! cargo test --test perf_tests bench_standard --release -- --ignored --nocapture +//! +//! # Full stress test (manual/release) +//! 
cargo test --test perf_tests bench_stress --release -- --ignored --nocapture +//! +//! # Save current results as baseline +//! SAVE_BASELINE=1 cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture +//! ``` +//! +//! ## Baseline Management +//! +//! Baselines are stored in `.benchmark-baseline.json`. Set `SAVE_BASELINE=1` to update. #![allow(clippy::collapsible_if)] mod load_tests; -use load_tests::scenarios::{ - ComprehensiveStressConfig, ComprehensiveStressScenario, FanOutScenario, OverlayStressConfig, - OverlayStressScenario, -}; -use load_tests::{LoadTestConfig, LoadTestResults}; -use std::time::Duration; +use load_tests::BenchmarkTier; +use load_tests::benchmark::{BenchmarkRunConfig, BenchmarkRunner}; -/// Quick connectivity test -#[tokio::test] -#[ignore = "requires running server"] -async fn test_connection() { - use load_tests::client::LoadTestClient; - - let url = "ws://127.0.0.1:8080/ws"; - let client: LoadTestClient = LoadTestClient::connect(url) - .await - .expect("Should connect to server"); - - println!("Connected successfully to {}", url); - client.close().await.expect("Should close cleanly"); -} - -/// Quick session creation test -#[tokio::test] -#[ignore = "requires running server"] -async fn test_create_session() { - use load_tests::client::{LoadTestClient, fetch_first_slide}; - - // Fetch available slide from server - let slide = fetch_first_slide("http://127.0.0.1:8080") - .await - .expect("Should have slides available"); - println!("Using slide: {} ({})", slide.name, slide.id); +/// Run a benchmark for the given tier with warm-up, iterations, and comparison +async fn run_benchmark(tier: BenchmarkTier) { + let config = BenchmarkRunConfig::for_tier(tier); + let runner = BenchmarkRunner::new(config.clone()); - let url = "ws://127.0.0.1:8080/ws"; - let mut client: LoadTestClient = LoadTestClient::connect(url) - .await - .expect("Should connect to server"); + let result = runner.run().await.expect("Benchmark should complete"); - client - .create_session(&slide.id) - .await - .expect("Should create session"); + // Print JSON for CI parsing + println!("JSON: {}", result.to_json()); - println!("Session created: {:?}", client.session_id); - assert!(client.session_id.is_some()); - assert!(client.join_secret.is_some()); - assert!(client.presenter_key.is_some()); - - client.close().await.expect("Should close cleanly"); -} - -/// Quick fan-out test with minimal load -#[tokio::test] -#[ignore = "requires running server"] -async fn test_fanout_minimal() { - let config = LoadTestConfig { - num_sessions: 1, - followers_per_session: 3, - cursor_hz: 10, - viewport_hz: 5, - duration: Duration::from_secs(3), - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results: LoadTestResults = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.messages_sent > 0, "Should have sent messages"); -} - -/// Standard fan-out test: 5 sessions, 20 followers each, 30 seconds -#[tokio::test] -#[ignore = "requires running server"] -async fn test_fanout_standard() { - let config = LoadTestConfig { - num_sessions: 5, - followers_per_session: 20, - cursor_hz: 30, - viewport_hz: 10, - duration: Duration::from_secs(30), - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results: LoadTestResults = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - - // Verify basic functionality - assert!(results.messages_sent > 0, "Should have 
sent messages"); - assert!( - results.messages_received > 0, - "Should have received messages" - ); - - // Check performance budgets - if !results.meets_budgets() { - println!("WARNING: Performance budgets exceeded!"); - // Don't fail the test yet, just warn + // Save baseline if requested + if std::env::var("SAVE_BASELINE").is_ok() { + runner + .save_baseline(&result.report) + .expect("Failed to save baseline"); } -} - -/// Extended fan-out test: 5 sessions, 20 followers each, 5 minutes -#[tokio::test] -#[ignore = "requires running server - long running"] -async fn test_fanout_extended() { - let config = LoadTestConfig { - num_sessions: 5, - followers_per_session: 20, - cursor_hz: 30, - viewport_hz: 10, - duration: Duration::from_secs(300), // 5 minutes - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results: LoadTestResults = scenario.run().await.expect("Scenario should complete"); - println!("{}", results.report()); - - // This is the primary performance validation + // Assert no regressions and budgets met assert!( - results.meets_budgets(), - "Should meet performance budgets under sustained load" + result.all_passed, + "Performance budgets not met for {} tier", + tier.name() ); -} - -/// Quick overlay stress test: 5 clients, 5 seconds -#[tokio::test] -#[ignore = "requires running server"] -async fn test_overlay_stress_minimal() { - let config = OverlayStressConfig { - num_clients: 5, - duration: Duration::from_secs(5), - tissue_tile_hz: 5, - cell_query_hz: 1, - ..Default::default() - }; - - let scenario = OverlayStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.base.messages_sent > 0, "Should have sent requests"); -} - -/// Standard overlay stress test: 50 clients, 30 seconds -#[tokio::test] -#[ignore = "requires running server"] -async fn test_overlay_stress_standard() { - let config = OverlayStressConfig { - num_clients: 50, - duration: Duration::from_secs(30), - tissue_tile_hz: 10, - cell_query_hz: 2, - ..Default::default() - }; - - let scenario = OverlayStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - - // Basic validation - ensure we actually did work - assert!(results.base.messages_sent > 0, "Should have sent requests"); - - // Most requests should succeed (allow for 404s on non-existent overlays) - let success_rate = (results.success_count + results.not_found_count) as f64 - / results.base.messages_sent as f64; assert!( - success_rate > 0.95, - "Success rate should be > 95%, was {:.1}%", - success_rate * 100.0 + !result.has_regression, + "Performance regression detected for {} tier", + tier.name() ); } -// ============================================================================ -// Comprehensive Stress Tests -// ============================================================================ - -/// Quick comprehensive test: 10 users (5 sessions), 10 seconds +/// Smoke benchmark: Quick CI validation on every push +/// +/// - Duration: ~30 seconds (2s warm-up + 3 × 10s iterations) +/// - Config: 5 sessions, 10 users +/// - Purpose: Fast feedback on obvious regressions #[tokio::test] #[ignore = "requires running server"] -async fn test_comprehensive_minimal() { - let config = ComprehensiveStressConfig { - num_sessions: 5, // 10 users - duration: Duration::from_secs(10), - cursor_hz: 10, - viewport_hz: 5, - tile_request_hz: 2, - overlay_request_hz: 1, 
- ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.ws_messages_sent > 0, "Should have sent WS messages"); - assert!( - results.http_requests_sent > 0, - "Should have sent HTTP requests" - ); +async fn bench_smoke() { + run_benchmark(BenchmarkTier::Smoke).await; } -/// 100 users comprehensive test: 50 sessions × 2 users, 30 seconds +/// Standard benchmark: PR merge gate +/// +/// - Duration: ~2 minutes (5s warm-up + 3 × 30s iterations) +/// - Config: 25 sessions, 50 users +/// - Purpose: Validate performance before merging PRs #[tokio::test] #[ignore = "requires running server"] -async fn test_comprehensive_100_users() { - let config = ComprehensiveStressConfig { - num_sessions: 50, // 100 users - duration: Duration::from_secs(30), - cursor_hz: 30, - viewport_hz: 10, - tile_request_hz: 5, - overlay_request_hz: 2, - ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - - // Basic validation - assert!(results.ws_messages_sent > 0, "Should have sent WS messages"); - assert!( - results.http_requests_sent > 0, - "Should have sent HTTP requests" - ); - - // Check we created and joined sessions successfully - assert!( - results.sessions_created >= 40, - "Should have created at least 40 sessions (got {})", - results.sessions_created - ); - assert!( - results.sessions_joined >= 40, - "Should have at least 40 followers (got {})", - results.sessions_joined - ); +async fn bench_standard() { + run_benchmark(BenchmarkTier::Standard).await; } -/// Full 1000 users stress test: 500 sessions × 2 users, 60 seconds -/// This is the primary performance validation for production readiness. +/// Stress benchmark: Manual/release testing +/// +/// - Duration: ~4 minutes (5s warm-up + 3 × 60s iterations) +/// - Config: 100 sessions, 200 users +/// - Purpose: Full stress test for releases #[tokio::test] #[ignore = "requires running server - long running"] -async fn test_comprehensive_1000_users() { - let config = ComprehensiveStressConfig { - num_sessions: 500, // 1000 users - duration: Duration::from_secs(60), - cursor_hz: 30, - viewport_hz: 10, - tile_request_hz: 5, - overlay_request_hz: 2, - ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - - // This is the primary performance validation - assert!( - results.meets_budgets(), - "Should meet performance budgets under 1000 user load" - ); - - // Verify we actually achieved the target load - assert!( - results.sessions_created >= 450, - "Should have created at least 450 sessions (got {})", - results.sessions_created - ); - assert!( - results.sessions_joined >= 450, - "Should have at least 450 followers (got {})", - results.sessions_joined - ); +async fn bench_stress() { + run_benchmark(BenchmarkTier::Stress).await; }
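
Note on the types referenced above: the new `perf_tests.rs` imports `BenchmarkTier`, `BenchmarkRunConfig`, and `BenchmarkRunner` from a `load_tests::benchmark` module that this diff does not show. The sketch below is only an illustration of what the tier plumbing might look like, assuming the warm-up and iteration numbers quoted in the module docs (2s/5s warm-up, 3 iterations of 10s/30s/60s); the names and fields are placeholders, not the actual implementation.

```rust
use std::time::Duration;

/// Benchmark tiers referenced by `bench_smoke` / `bench_standard` / `bench_stress`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BenchmarkTier {
    Smoke,
    Standard,
    Stress,
}

impl BenchmarkTier {
    /// Lower-case name used in test names and CI artifact names.
    pub fn name(&self) -> &'static str {
        match self {
            BenchmarkTier::Smoke => "smoke",
            BenchmarkTier::Standard => "standard",
            BenchmarkTier::Stress => "stress",
        }
    }
}

/// Per-tier run settings: warm-up, iteration count, and measured duration.
#[derive(Debug, Clone)]
pub struct BenchmarkRunConfig {
    pub tier: BenchmarkTier,
    pub warmup: Duration,
    pub iterations: usize,
    pub iteration_duration: Duration,
}

impl BenchmarkRunConfig {
    /// Map a tier to the numbers quoted in the module docs.
    pub fn for_tier(tier: BenchmarkTier) -> Self {
        let (warmup, iteration_duration) = match tier {
            BenchmarkTier::Smoke => (Duration::from_secs(2), Duration::from_secs(10)),
            BenchmarkTier::Standard => (Duration::from_secs(5), Duration::from_secs(30)),
            BenchmarkTier::Stress => (Duration::from_secs(5), Duration::from_secs(60)),
        };
        Self { tier, warmup, iterations: 3, iteration_duration }
    }
}
```

A runner built on top of this would presumably run the comprehensive scenario `iterations` times after the warm-up, aggregate the per-iteration reports, and compare the result against `.benchmark-baseline.json` before setting `all_passed` and `has_regression`.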
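
The tile-grid arithmetic added to `spawn_user_task` in `comprehensive.rs` can also be sanity-checked in isolation. This standalone sketch repeats the same math and walks through one hypothetical slide size (20000 × 15000 px with 256-px tiles; real dimensions come from the slide metadata, so the numbers here are illustrative only):

```rust
/// Standalone sketch of the tile-grid math used in `spawn_user_task`.
/// Returns (test_level, max_tile_x, max_tile_y) for a slide of the given size.
fn tile_grid(slide_width: u64, slide_height: u64, levels_below_max: u32) -> (u32, u32, u32) {
    let tile_size = 256u64;
    // DZI: the deepest level is ceil(log2(max(width, height))).
    let max_level = (slide_width.max(slide_height) as f64).log2().ceil() as u32;
    let test_level = max_level.saturating_sub(levels_below_max);
    let level_scale = 1u64 << (max_level - test_level);
    let level_width = slide_width / level_scale;
    let level_height = slide_height / level_scale;
    (
        test_level,
        level_width.div_ceil(tile_size).max(1) as u32,
        level_height.div_ceil(tile_size).max(1) as u32,
    )
}

fn main() {
    // Hypothetical 20000 x 15000 px slide:
    // max_level = ceil(log2(20000)) = 15, test_level = 12, scale = 8,
    // level size = 2500 x 1875, grid = 10 x 8 tiles for the request cycle.
    assert_eq!(tile_grid(20_000, 15_000, 3), (12, 10, 8));
}
```

Taking `tile_x % max_tile_x` and `tile_y % max_tile_y` against this grid is what keeps the stress test's tile URLs within the range the server can actually serve, instead of the fixed 40 × 40 sweep the old code used.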