Merged
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
- uses: actions/checkout@v4

- name: Install build dependencies
run: sudo apt-get update && sudo apt-get install -y libopenslide-dev
run: sudo apt-get update && sudo apt-get install -y libopenslide-dev protobuf-compiler

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable
@@ -95,7 +95,7 @@ jobs:
uses: actions/checkout@v4

- name: Install build dependencies
run: sudo apt-get update && sudo apt-get install -y libopenslide-dev
run: sudo apt-get update && sudo apt-get install -y libopenslide-dev protobuf-compiler

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable
210 changes: 45 additions & 165 deletions .github/workflows/perf.yml
@@ -3,33 +3,36 @@ name: Performance Tests
on:
workflow_dispatch:
inputs:
run_full_load_test:
description: 'Run extended load tests (5 sessions, 20 followers, 5 minutes)'
benchmark_tier:
description: 'Benchmark tier to run'
required: false
default: 'false'
type: boolean
pull_request:
branches: [main]
paths:
- 'server/**'
- 'bench/**'
- '.github/workflows/perf.yml'
default: 'smoke'
type: choice
options:
- smoke
- standard
- stress
# pull_request:
# branches: [main]
# paths:
# - 'server/**'
# - 'bench/**'
# - '.github/workflows/perf.yml'

env:
CARGO_TERM_COLOR: always

jobs:
# Quick non-regression test on every PR
regression-test:
name: Performance Regression Test
benchmark:
name: Performance Benchmark
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install build dependencies
run: |
sudo apt-get update
sudo apt-get install -y libopenslide-dev python3
sudo apt-get install -y libopenslide-dev protobuf-compiler

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable
@@ -46,12 +49,6 @@ jobs:
key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock') }}
restore-keys: ${{ runner.os }}-cargo-

- name: Install oha (HTTP load testing tool)
run: |
if ! command -v oha &> /dev/null; then
cargo install oha
fi

- name: Build server and tests (release)
run: |
cargo build --release
@@ -83,49 +80,43 @@ jobs:
# Verify health
curl -s http://127.0.0.1:8080/health

- name: Run WebSocket regression test
- name: Determine benchmark tier
id: tier
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "tier=${{ github.event.inputs.benchmark_tier }}" >> $GITHUB_OUTPUT
else
echo "tier=smoke" >> $GITHUB_OUTPUT
fi

- name: Run benchmark
run: |
cd server
cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_results.txt
timeout-minutes: 5
cargo test --test perf_tests bench_${{ steps.tier.outputs.tier }} --release -- --ignored --nocapture 2>&1 | tee /tmp/benchmark_results.txt
timeout-minutes: 10

- name: Check WebSocket performance budgets
- name: Check benchmark results
run: |
echo "=== WebSocket Test Results ==="
cat /tmp/ws_results.txt
echo "=== Benchmark Results ==="
cat /tmp/benchmark_results.txt

# Check if test passed
if grep -q "Overall: PASS" /tmp/ws_results.txt; then
echo "✅ WebSocket performance within budget"
if grep -q "OVERALL: PASS" /tmp/benchmark_results.txt; then
echo "✅ Benchmark passed"
else
echo "❌ WebSocket performance exceeded budget"
echo "❌ Benchmark failed"
exit 1
fi

- name: Run HTTP tile stress test (quick)
run: |
./bench/load_tests/scenarios/tile_stress.sh \
--quick \
--output bench/load_tests/results/tile_current.json 2>&1 | tee /tmp/tile_results.txt
timeout-minutes: 5

- name: Compare HTTP tile performance to baseline
- name: Extract JSON results
if: always()
run: |
echo "=== HTTP Tile Performance ==="

# Run comparison (--ci mode exits 1 on regression)
python3 ./bench/scripts/compare_baseline.py \
--current bench/load_tests/results/tile_current.json \
--baseline bench/baselines/tile_baseline.json \
--threshold 20 \
--markdown | tee /tmp/comparison.md

# Also run with CI mode to get exit code
python3 ./bench/scripts/compare_baseline.py \
--current bench/load_tests/results/tile_current.json \
--baseline bench/baselines/tile_baseline.json \
--threshold 20 \
--ci
# Extract JSON line for machine parsing
grep "^JSON:" /tmp/benchmark_results.txt | sed 's/^JSON: //' > bench/load_tests/results/benchmark.json || true
if [ -f bench/load_tests/results/benchmark.json ]; then
echo "=== JSON Results ==="
cat bench/load_tests/results/benchmark.json
fi

- name: Collect server metrics
if: always()
@@ -140,119 +131,8 @@ jobs:
if: always()
uses: actions/upload-artifact@v4
with:
name: benchmark-results
name: benchmark-results-${{ steps.tier.outputs.tier }}
path: |
bench/load_tests/results/
/tmp/ws_results.txt
/tmp/tile_results.txt
/tmp/comparison.md
/tmp/benchmark_results.txt
retention-days: 30

# Extended load test (manual trigger only)
extended-load-test:
name: Extended Load Tests
runs-on: ubuntu-latest
if: github.event_name == 'workflow_dispatch' && github.event.inputs.run_full_load_test == 'true'
steps:
- uses: actions/checkout@v4

- name: Install build dependencies
run: |
sudo apt-get update
sudo apt-get install -y libopenslide-dev python3

- name: Install Rust toolchain
uses: dtolnay/rust-toolchain@stable

- name: Cache Cargo
uses: actions/cache@v4
with:
path: |
~/.cargo/bin/
~/.cargo/registry/index/
~/.cargo/registry/cache/
~/.cargo/git/db/
target/
key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock') }}
restore-keys: ${{ runner.os }}-cargo-

- name: Install oha
run: cargo install oha

- name: Build server and tests (release)
run: |
cargo build --release
cargo test --test perf_tests --no-run --release

- name: Create test directories
run: |
mkdir -p /tmp/pathcollab/slides
mkdir -p bench/load_tests/results

- name: Start server in background
run: |
HOST=127.0.0.1 \
PORT=8080 \
SLIDES_DIR=/tmp/pathcollab/slides \
RUST_LOG=warn \
./target/release/pathcollab &

for i in {1..30}; do
if curl -s http://127.0.0.1:8080/health > /dev/null 2>&1; then
echo "Server is ready!"
break
fi
sleep 1
done

- name: Run standard WebSocket load test
run: |
cd server
cargo test --test perf_tests test_fanout_standard --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_standard.txt
timeout-minutes: 10

- name: Run extended WebSocket load test
run: |
cd server
cargo test --test perf_tests test_fanout_extended --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_extended.txt
timeout-minutes: 15

- name: Run HTTP tile ramp test
run: |
./bench/load_tests/scenarios/ramp_test.sh \
--start 1 \
--end 50 \
--step 5 \
--stage-duration 10 \
--output bench/load_tests/results 2>&1 | tee /tmp/ramp_results.txt
timeout-minutes: 20

- name: Run HTTP tile standard test
run: |
./bench/load_tests/scenarios/tile_stress.sh \
--concurrent 20 \
--duration 60 \
--output bench/load_tests/results/tile_extended.json 2>&1 | tee /tmp/tile_extended.txt
timeout-minutes: 10

- name: Generate performance report
if: always()
run: |
python3 ./bench/scripts/generate_report.py \
--input-dir bench/load_tests/results \
--output bench/load_tests/results/REPORT.md || true

echo "=== Performance Report ==="
cat bench/load_tests/results/REPORT.md || echo "Report generation failed"

- name: Upload extended results
if: always()
uses: actions/upload-artifact@v4
with:
name: extended-benchmark-results
path: |
bench/load_tests/results/
/tmp/ws_*.txt
/tmp/tile_*.txt
/tmp/ramp_results.txt
retention-days: 90
32 changes: 19 additions & 13 deletions AGENTS.md
@@ -316,26 +316,32 @@ cd web && bun test
cargo test

# 4. Quick perf check (if touching hot paths)
./bench/load_tests/scenarios/tile_stress.sh --quick
cd server && cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture
```

### Performance Testing

The benchmark system runs 3 iterations with a warm-up pass and compares the results against stored baselines.

```bash
# Quick performance check
SLIDES_DIR=/data/wsi_slides DEMO_ENABLED=true cargo run --release
./bench/load_tests/scenarios/tile_stress.sh --quick
python3 ./bench/scripts/compare_baseline.py \
--current bench/load_tests/results/tile_current.json \
--baseline bench/baselines/tile_baseline.json

# Full benchmark suite (before major changes)
./bench/scripts/run_all.sh --compare-baseline

# Save new baseline after confirmed improvements
./bench/scripts/run_all.sh --save-baseline
# Start the server first
SLIDES_DIR=~/Documents/tcga_slides cargo run --release &

# Quick smoke test (~30s) - runs on every PR
cd server && cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture

# Standard test (~2min) - PR merge gate
cd server && cargo test --test perf_tests bench_standard --release -- --ignored --nocapture

# Full stress test (~4min) - before releases
cd server && cargo test --test perf_tests bench_stress --release -- --ignored --nocapture

# Save current results as baseline
SAVE_BASELINE=1 cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture
```

Baselines are stored in `.benchmark-baseline.json`. The system detects regressions >15% automatically.
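The regression gate can be pictured as a simple percentage comparison. The sketch below is illustrative only; the function name, metric name, and threshold wiring are assumptions, not the actual `perf_tests` implementation:

```rust
// Hypothetical sketch of the >15% regression rule described above.
// Metric and function names are assumptions, not the repo's real code.
fn is_regression(current_ms: f64, baseline_ms: f64, threshold_pct: f64) -> bool {
    // A run regresses when the current metric exceeds the stored
    // baseline by more than the threshold percentage.
    current_ms > baseline_ms * (1.0 + threshold_pct / 100.0)
}

fn main() {
    let baseline_p95_ms = 40.0; // hypothetical value from .benchmark-baseline.json
    let current_p95_ms = 50.0;  // hypothetical value from the current run
    if is_regression(current_p95_ms, baseline_p95_ms, 15.0) {
        println!(
            "REGRESSION: p95 {current_p95_ms:.1}ms vs baseline {baseline_p95_ms:.1}ms"
        );
    }
}
```

With a 15% threshold, a 40ms baseline tolerates runs up to 46ms; anything above that fails the gate.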

### Live Metrics

```bash