diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f02225..68aa6d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: - uses: actions/checkout@v4 - name: Install build dependencies - run: sudo apt-get update && sudo apt-get install -y libopenslide-dev + run: sudo apt-get update && sudo apt-get install -y libopenslide-dev protobuf-compiler - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable @@ -95,7 +95,7 @@ jobs: uses: actions/checkout@v4 - name: Install build dependencies - run: sudo apt-get update && sudo apt-get install -y libopenslide-dev + run: sudo apt-get update && sudo apt-get install -y libopenslide-dev protobuf-compiler - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable diff --git a/.github/workflows/perf.yml b/.github/workflows/perf.yml index 34918f9..956c163 100644 --- a/.github/workflows/perf.yml +++ b/.github/workflows/perf.yml @@ -3,25 +3,28 @@ name: Performance Tests on: workflow_dispatch: inputs: - run_full_load_test: - description: 'Run extended load tests (5 sessions, 20 followers, 5 minutes)' + benchmark_tier: + description: 'Benchmark tier to run' required: false - default: 'false' - type: boolean - pull_request: - branches: [main] - paths: - - 'server/**' - - 'bench/**' - - '.github/workflows/perf.yml' + default: 'smoke' + type: choice + options: + - smoke + - standard + - stress + # pull_request: + # branches: [main] + # paths: + # - 'server/**' + # - 'bench/**' + # - '.github/workflows/perf.yml' env: CARGO_TERM_COLOR: always jobs: - # Quick non-regression test on every PR - regression-test: - name: Performance Regression Test + benchmark: + name: Performance Benchmark runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -29,7 +32,7 @@ jobs: - name: Install build dependencies run: | sudo apt-get update - sudo apt-get install -y libopenslide-dev python3 + sudo apt-get install -y libopenslide-dev protobuf-compiler - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable @@ -46,12 +49,6 @@ jobs: key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock') }} restore-keys: ${{ runner.os }}-cargo- - - name: Install oha (HTTP load testing tool) - run: | - if ! 
command -v oha &> /dev/null; then - cargo install oha - fi - - name: Build server and tests (release) run: | cargo build --release @@ -83,49 +80,43 @@ jobs: # Verify health curl -s http://127.0.0.1:8080/health - - name: Run WebSocket regression test + - name: Determine benchmark tier + id: tier + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "tier=${{ github.event.inputs.benchmark_tier }}" >> $GITHUB_OUTPUT + else + echo "tier=smoke" >> $GITHUB_OUTPUT + fi + + - name: Run benchmark run: | cd server - cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_results.txt - timeout-minutes: 5 + cargo test --test perf_tests bench_${{ steps.tier.outputs.tier }} --release -- --ignored --nocapture 2>&1 | tee /tmp/benchmark_results.txt + timeout-minutes: 10 - - name: Check WebSocket performance budgets + - name: Check benchmark results run: | - echo "=== WebSocket Test Results ===" - cat /tmp/ws_results.txt + echo "=== Benchmark Results ===" + cat /tmp/benchmark_results.txt # Check if test passed - if grep -q "Overall: PASS" /tmp/ws_results.txt; then - echo "✅ WebSocket performance within budget" + if grep -q "OVERALL: PASS" /tmp/benchmark_results.txt; then + echo "✅ Benchmark passed" else - echo "❌ WebSocket performance exceeded budget" + echo "❌ Benchmark failed" exit 1 fi - - name: Run HTTP tile stress test (quick) - run: | - ./bench/load_tests/scenarios/tile_stress.sh \ - --quick \ - --output bench/load_tests/results/tile_current.json 2>&1 | tee /tmp/tile_results.txt - timeout-minutes: 5 - - - name: Compare HTTP tile performance to baseline + - name: Extract JSON results + if: always() run: | - echo "=== HTTP Tile Performance ===" - - # Run comparison (--ci mode exits 1 on regression) - python3 ./bench/scripts/compare_baseline.py \ - --current bench/load_tests/results/tile_current.json \ - --baseline bench/baselines/tile_baseline.json \ - --threshold 20 \ - --markdown | tee /tmp/comparison.md - - # Also run with CI mode to get exit code - python3 ./bench/scripts/compare_baseline.py \ - --current bench/load_tests/results/tile_current.json \ - --baseline bench/baselines/tile_baseline.json \ - --threshold 20 \ - --ci + # Extract JSON line for machine parsing + grep "^JSON:" /tmp/benchmark_results.txt | sed 's/^JSON: //' > bench/load_tests/results/benchmark.json || true + if [ -f bench/load_tests/results/benchmark.json ]; then + echo "=== JSON Results ===" + cat bench/load_tests/results/benchmark.json + fi - name: Collect server metrics if: always() @@ -140,119 +131,8 @@ jobs: if: always() uses: actions/upload-artifact@v4 with: - name: benchmark-results + name: benchmark-results-${{ steps.tier.outputs.tier }} path: | bench/load_tests/results/ - /tmp/ws_results.txt - /tmp/tile_results.txt - /tmp/comparison.md + /tmp/benchmark_results.txt retention-days: 30 - - # Extended load test (manual trigger only) - extended-load-test: - name: Extended Load Tests - runs-on: ubuntu-latest - if: github.event_name == 'workflow_dispatch' && github.event.inputs.run_full_load_test == 'true' - steps: - - uses: actions/checkout@v4 - - - name: Install build dependencies - run: | - sudo apt-get update - sudo apt-get install -y libopenslide-dev python3 - - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@stable - - - name: Cache Cargo - uses: actions/cache@v4 - with: - path: | - ~/.cargo/bin/ - ~/.cargo/registry/index/ - ~/.cargo/registry/cache/ - ~/.cargo/git/db/ - target/ - key: ${{ runner.os }}-cargo-perf-${{ 
hashFiles('**/Cargo.lock') }} - restore-keys: ${{ runner.os }}-cargo- - - - name: Install oha - run: cargo install oha - - - name: Build server and tests (release) - run: | - cargo build --release - cargo test --test perf_tests --no-run --release - - - name: Create test directories - run: | - mkdir -p /tmp/pathcollab/slides - mkdir -p bench/load_tests/results - - - name: Start server in background - run: | - HOST=127.0.0.1 \ - PORT=8080 \ - SLIDES_DIR=/tmp/pathcollab/slides \ - RUST_LOG=warn \ - ./target/release/pathcollab & - - for i in {1..30}; do - if curl -s http://127.0.0.1:8080/health > /dev/null 2>&1; then - echo "Server is ready!" - break - fi - sleep 1 - done - - - name: Run standard WebSocket load test - run: | - cd server - cargo test --test perf_tests test_fanout_standard --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_standard.txt - timeout-minutes: 10 - - - name: Run extended WebSocket load test - run: | - cd server - cargo test --test perf_tests test_fanout_extended --release -- --ignored --nocapture 2>&1 | tee /tmp/ws_extended.txt - timeout-minutes: 15 - - - name: Run HTTP tile ramp test - run: | - ./bench/load_tests/scenarios/ramp_test.sh \ - --start 1 \ - --end 50 \ - --step 5 \ - --stage-duration 10 \ - --output bench/load_tests/results 2>&1 | tee /tmp/ramp_results.txt - timeout-minutes: 20 - - - name: Run HTTP tile standard test - run: | - ./bench/load_tests/scenarios/tile_stress.sh \ - --concurrent 20 \ - --duration 60 \ - --output bench/load_tests/results/tile_extended.json 2>&1 | tee /tmp/tile_extended.txt - timeout-minutes: 10 - - - name: Generate performance report - if: always() - run: | - python3 ./bench/scripts/generate_report.py \ - --input-dir bench/load_tests/results \ - --output bench/load_tests/results/REPORT.md || true - - echo "=== Performance Report ===" - cat bench/load_tests/results/REPORT.md || echo "Report generation failed" - - - name: Upload extended results - if: always() - uses: actions/upload-artifact@v4 - with: - name: extended-benchmark-results - path: | - bench/load_tests/results/ - /tmp/ws_*.txt - /tmp/tile_*.txt - /tmp/ramp_results.txt - retention-days: 90 diff --git a/AGENTS.md b/AGENTS.md index 80e6ae1..580dbd7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -316,26 +316,32 @@ cd web && bun test cargo test # 4. Quick perf check (if touching hot paths) -./bench/load_tests/scenarios/tile_stress.sh --quick +cd server && cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture ``` ### Performance Testing +The benchmark system runs 3 iterations with warm-up and compares against stored baselines. 
+ ```bash -# Quick performance check -SLIDES_DIR=/data/wsi_slides DEMO_ENABLED=true cargo run --release -./bench/load_tests/scenarios/tile_stress.sh --quick -python3 ./bench/scripts/compare_baseline.py \ - --current bench/load_tests/results/tile_current.json \ - --baseline bench/baselines/tile_baseline.json - -# Full benchmark suite (before major changes) -./bench/scripts/run_all.sh --compare-baseline - -# Save new baseline after confirmed improvements -./bench/scripts/run_all.sh --save-baseline +# Start the server first +SLIDES_DIR=~/Documents/tcga_slides cargo run --release & + +# Quick smoke test (~30s) - runs on every PR +cd server && cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture + +# Standard test (~2min) - PR merge gate +cd server && cargo test --test perf_tests bench_standard --release -- --ignored --nocapture + +# Full stress test (~4min) - before releases +cd server && cargo test --test perf_tests bench_stress --release -- --ignored --nocapture + +# Save current results as baseline +SAVE_BASELINE=1 cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture ``` +Baselines are stored in `.benchmark-baseline.json`. The system detects regressions >15% automatically. + ### Live Metrics ```bash diff --git a/Cargo.lock b/Cargo.lock index 4af8dd3..d5af51b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -29,6 +29,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anyhow" version = "1.0.100" @@ -277,6 +286,20 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chrono" +version = "0.4.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-link", +] + [[package]] name = "clang-sys" version = "1.8.1" @@ -836,6 +859,30 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "icu_collections" version = "2.1.1" @@ -1329,6 +1376,7 @@ dependencies = [ "async-trait", "axum", "bytes", + "chrono", "dashmap", "flate2", "futures-util", @@ -2614,12 +2662,65 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + 
"windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "windows-link" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.52.0" diff --git a/bench/README.md b/bench/README.md deleted file mode 100644 index 5043c77..0000000 --- a/bench/README.md +++ /dev/null @@ -1,366 +0,0 @@ -# PathCollab Benchmark Suite - -Comprehensive profiling and load testing infrastructure for the PathCollab collaborative slide viewer server. - -## Quick Start - -```bash -# Install dependencies -cargo install oha # HTTP load testing tool - -# Run quick benchmark (5 connections, 10 seconds) -./bench/load_tests/scenarios/tile_stress.sh --quick - -# Run full benchmark suite -./bench/scripts/run_all.sh - -# Run with baseline comparison (fails CI if P99 regresses >10%) -./bench/scripts/run_all.sh --compare-baseline -``` - -## Prerequisites - -### Required - -- **Rust toolchain** (stable, for building server and Criterion benchmarks) -- **Running PathCollab server** with slides available - -### Optional (for full suite) - -- **oha**: HTTP load testing tool - ```bash - cargo install oha - ``` -- **Python 3.6+**: For baseline comparison and report generation -- **jq**: For parsing JSON results in shell scripts - -## Directory Structure - -``` -bench/ -├── README.md # This file -├── load_tests/ -│ ├── scenarios/ -│ │ ├── tile_stress.sh # HTTP tile endpoint stress test -│ │ ├── overlay_stress.sh # HTTP cell overlay endpoint stress test -│ │ ├── ramp_test.sh # Gradual load increase to find breaking point -│ │ └── combined_load.sh # HTTP + WebSocket simultaneous load -│ └── results/ # Test output (.gitignored) -├── baselines/ -│ ├── tile_baseline.json # HTTP tile performance baseline -│ └── websocket_baseline.json # WebSocket performance baseline -└── scripts/ - ├── run_all.sh # Orchestrate full benchmark suite - ├── compare_baseline.py # Compare results to baseline - └── generate_report.py # Generate markdown report - -server/benches/ # Criterion micro-benchmarks -├── tile_encoding.rs # JPEG encoding, image resize -├── spatial_index.rs # R-tree query performance -└── message_serialization.rs # JSON serialization for WebSocket -``` - -## Running Benchmarks - -### 1. 
HTTP Tile Load Tests - -Stress test the tile serving endpoint: - -```bash -# Quick test (5 connections, 10 seconds) -./bench/load_tests/scenarios/tile_stress.sh --quick - -# Standard test (10 connections, 30 seconds) -./bench/load_tests/scenarios/tile_stress.sh - -# Custom configuration -./bench/load_tests/scenarios/tile_stress.sh \ - --url http://localhost:8080 \ - --concurrent 20 \ - --duration 60 \ - --output results/tile_test.json - -# Find breaking point with ramp test -./bench/load_tests/scenarios/ramp_test.sh \ - --start 1 \ - --end 100 \ - --step 10 -``` - -### 2. Cell Overlay Load Tests - -Stress test the cell overlay endpoint: - -```bash -# Quick test (5 connections, 10 seconds) -./bench/load_tests/scenarios/overlay_stress.sh --quick - -# Standard test (10 connections, 30 seconds) -./bench/load_tests/scenarios/overlay_stress.sh - -# Custom configuration -./bench/load_tests/scenarios/overlay_stress.sh \ - --url http://localhost:8080 \ - --concurrent 20 \ - --duration 60 \ - --viewport-size 1024 \ - --output results/overlay_test.json -``` - -### 3. WebSocket Load Tests - -Test session broadcasting under load: - -```bash -cd server - -# Quick test (1 session, 3 followers, 3 seconds) -cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture - -# Standard test (5 sessions, 20 followers, 30 seconds) -cargo test --test perf_tests test_fanout_standard --release -- --ignored --nocapture - -# Extended test (5 minutes) -cargo test --test perf_tests test_fanout_extended --release -- --ignored --nocapture -``` - -### 4. Combined Load Test - -Simulate realistic production load with both HTTP and WebSocket traffic: - -```bash -./bench/load_tests/scenarios/combined_load.sh \ - --tile-concurrent 10 \ - --ws-sessions 3 \ - --ws-followers 10 \ - --duration 30 -``` - -### 5. 
Full Benchmark Suite - -Run everything with a single command: - -```bash -# Full suite with report generation -./bench/scripts/run_all.sh - -# Quick mode -./bench/scripts/run_all.sh --quick - -# Skip specific phases -./bench/scripts/run_all.sh --skip-micro --skip-websocket - -# Compare to baseline -./bench/scripts/run_all.sh --compare-baseline - -# Save new baseline -./bench/scripts/run_all.sh --save-baseline -``` - -## Performance Budgets - -These are the target latencies for production use: - -| Metric | Budget | Description | -|--------|--------|-------------| -| Tile P99 | < 100ms | HTTP tile serving latency | -| Overlay P99 | < 100ms | HTTP cell overlay query latency | -| Cursor P99 | < 100ms | WebSocket cursor broadcast | -| Viewport P99 | < 150ms | WebSocket viewport broadcast | -| Message Handling | < 10ms | Server-side message processing | - -## Baseline Management - -### Creating a Baseline - -```bash -# Run benchmarks and save as baseline -./bench/scripts/run_all.sh --save-baseline - -# Or manually from results -./bench/scripts/compare_baseline.py \ - --save-baseline bench/load_tests/results/latest/tile_stress.json \ - --output bench/baselines/tile_baseline.json \ - --description "Baseline after performance optimization" -``` - -### Comparing to Baseline - -```bash -# Compare and output to terminal -./bench/scripts/compare_baseline.py \ - --current bench/load_tests/results/latest/tile_stress.json \ - --baseline bench/baselines/tile_baseline.json - -# Markdown output (for PR comments) -./bench/scripts/compare_baseline.py \ - --current results.json \ - --baseline baseline.json \ - --markdown - -# CI mode (exit code 1 on regression) -./bench/scripts/compare_baseline.py \ - --current results.json \ - --baseline baseline.json \ - --threshold 10 \ - --ci -``` - -## CI Integration - -### GitHub Actions - -The existing `.github/workflows/perf.yml` can be extended: - -```yaml -- name: Run benchmark suite - run: | - ./bench/scripts/run_all.sh \ - --quick \ - --compare-baseline \ - 2>&1 | tee benchmark_output.txt - -- name: Check for regressions - run: | - if grep -q "FAILED" benchmark_output.txt; then - echo "Performance regression detected!" 
- exit 1 - fi -``` - -### Exit Codes - -All scripts follow Unix conventions: -- `0`: Success / no regressions -- `1`: Failure / regression detected -- `2`: Configuration or dependency error - -## Interpreting Results - -### HTTP Tile Benchmarks - -``` -Throughput: 450 req/s # Higher is better -P50 latency: 8.5ms # Median response time -P95 latency: 25.3ms # 95th percentile -P99 latency: 48.2ms # 99th percentile (main target) -Success rate: 100% # Should be 100% -``` - -**What "good" looks like:** -- P99 < 100ms for tile serving -- Success rate > 99% -- Throughput scales linearly with concurrency up to CPU saturation - -### Cell Overlay Benchmarks - -``` -Throughput: 800 req/s # Higher is better (faster than tiles) -P50 latency: 3.2ms # Median response time -P95 latency: 12.1ms # 95th percentile -P99 latency: 28.5ms # 99th percentile (main target) -Success rate: 100% # Should be 100% -``` - -**What "good" looks like:** -- P99 < 100ms for cell overlay queries -- Success rate > 99% -- Should be faster than tile serving (no JPEG encoding overhead) - -### WebSocket Benchmarks - -``` -Messages sent: 9000 -Messages received: 180000 # ~20x sent (fan-out to followers) -Cursor P99: 45ms # Broadcast latency -Viewport P99: 62ms # Slightly larger messages -``` - -**What "good" looks like:** -- Cursor P99 < 100ms -- Viewport P99 < 150ms -- No message drops (received ≈ sent × followers) - -### Micro-benchmarks - -``` -jpeg_encoding/256x256/85 time: [1.2345 ms 1.2456 ms 1.2567 ms] -``` - -- **Low/Mid/High**: Confidence interval for timing -- Compare to previous runs to detect regressions -- HTML reports in `target/criterion/` show trends over time - -## Troubleshooting - -### "oha not found" - -```bash -cargo install oha -``` - -### "Server not responding" - -Ensure the server is running: -```bash -cd server && cargo run --release -``` - -Or specify a different URL: -```bash -./bench/load_tests/scenarios/tile_stress.sh --url http://localhost:9090 -``` - -### "No slides found" - -The tile tests require at least one slide in the server's slides directory: -```bash -# Check configured slides directory in .env or environment -ls $SLIDES_DIR - -# Place WSI files (.svs, .ndpi, .tiff, etc.) in the slides directory -``` - -### Benchmark results vary widely - -- Ensure no other CPU-intensive processes are running -- Run multiple iterations and compare medians -- For Criterion benchmarks, the tool handles statistical analysis automatically -- For load tests, use longer durations for more stable results - -### WebSocket tests timeout - -Check that: -1. Server is compiled in release mode (`cargo build --release`) -2. No firewall blocking WebSocket connections -3. Sufficient file descriptors (`ulimit -n`) - -## Adding New Benchmarks - -### New Load Test Scenario - -1. Create script in `bench/load_tests/scenarios/` -2. Follow the pattern of existing scripts (argument parsing, colors, etc.) -3. Output JSON for machine parsing -4. Add to `run_all.sh` if appropriate - -## Server Metrics - -The server exposes Prometheus metrics at `/metrics/prometheus`: - -```bash -# Key metrics for benchmarking -curl -s http://localhost:8080/metrics/prometheus | grep pathcollab - -# Tile serving -pathcollab_tile_requests_total -pathcollab_tile_duration_seconds -pathcollab_tile_phase_duration_seconds{phase="read|resize|encode"} - -# WebSocket -pathcollab_ws_messages_total -pathcollab_ws_message_duration_seconds -pathcollab_ws_broadcast_duration_seconds -``` - -These can be scraped during load tests for detailed analysis. 
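For the metrics-scraping note that closes the README above, a minimal sketch of sampling the Prometheus endpoint while a load test runs; the endpoint URL and `pathcollab` metric prefix come from the README text, while the output path, sample count, and 5-second interval are illustrative assumptions, not part of this change set:

```bash
#!/usr/bin/env bash
# Sample PathCollab Prometheus metrics while a load test runs.
# Assumes the server is listening on localhost:8080 (as in the README above);
# the output file, 12 samples, and 5s interval are arbitrary illustrative choices.
OUT=/tmp/pathcollab_metrics_during_load.txt
for i in $(seq 1 12); do   # roughly 60s of samples
  echo "=== sample $i ($(date -u +%H:%M:%S)) ===" >> "$OUT"
  curl -s http://localhost:8080/metrics/prometheus | grep '^pathcollab' >> "$OUT"
  sleep 5
done
```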
diff --git a/bench/baselines/tile_baseline.json b/bench/baselines/tile_baseline.json deleted file mode 100644 index e6ad512..0000000 --- a/bench/baselines/tile_baseline.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "created_at": "2026-01-18T18:02:29.793281Z", - "description": "Initial baseline from TCGA slides with 20 concurrent connections", - "metrics": { - "requests_per_sec": 44.65582630258221, - "success_rate": 100.0, - "p50_ms": 527.124199, - "p90_ms": 557.954244, - "p95_ms": 564.5460730000001, - "p99_ms": 578.905558 - }, - "raw_data": { - "summary": { - "successRate": 1.0, - "total": 30.007282609, - "slowest": 0.599721819, - "fastest": 0.027209147, - "average": 0.4501082052939393, - "requestsPerSec": 44.65582630258221, - "totalData": 5404080, - "sizePerRequest": 4094, - "sizePerSec": 180092.28194422275 - }, - "responseTimeHistogram": { - "0.027209147": 1, - "0.0844604142": 202, - "0.14171168139999998": 24, - "0.19896294859999997": 4, - "0.2562142158": 4, - "0.313465483": 1, - "0.37071675019999994": 3, - "0.42796801739999996": 2, - "0.48521928459999997": 8, - "0.5424705517999999": 682, - "0.599721819": 389 - }, - "latencyPercentiles": { - "p10": 0.068543644, - "p25": 0.505791205, - "p50": 0.527124199, - "p75": 0.545522472, - "p90": 0.557954244, - "p95": 0.564546073, - "p99": 0.578905558, - "p99.9": 0.59607328, - "p99.99": 0.599721819 - }, - "rps": { - "mean": 28874.193265230362, - "stddev": 964723.1848951668, - "max": 33333330.575321194, - "min": 19.983695702350346, - "percentiles": { - "p10": 25.598587777109657, - "p25": 28.246009523819943, - "p50": 31.68769218981417, - "p75": 155.03267856313718, - "p90": 500.63875246830406, - "p95": 917.303275139838, - "p99": 12072.337445898871, - "p99.9": 341880.34182995924, - "p99.99": 33333330.575321194 - } - }, - "details": { - "DNSDialup": { - "average": 0.0006074041, - "fastest": 0.000105827, - "slowest": 0.001841543 - }, - "DNSLookup": { - "average": 3.813065000000001e-05, - "fastest": 3.036e-06, - "slowest": 0.000243265 - } - }, - "statusCodeDistribution": { - "200": 1320 - }, - "errorDistribution": { - "aborted due to deadline": 20 - } - } -} \ No newline at end of file diff --git a/bench/baselines/websocket_baseline.json b/bench/baselines/websocket_baseline.json deleted file mode 100644 index a342f28..0000000 --- a/bench/baselines/websocket_baseline.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "created_at": "2026-01-18T00:00:00Z", - "description": "Initial baseline - placeholder for WebSocket performance", - "metrics": { - "cursor_p99_ms": 100, - "viewport_p99_ms": 150, - "message_handling_p99_ms": 10 - }, - "notes": "This is a placeholder baseline derived from the performance budgets in tests/load_tests/mod.rs" -} diff --git a/bench/load_tests/results/.gitignore b/bench/load_tests/results/.gitignore deleted file mode 100644 index b2fe286..0000000 --- a/bench/load_tests/results/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Ignore all benchmark results (raw data) -* - -# But track this .gitignore -!.gitignore - -# And track any README -!README.md diff --git a/bench/load_tests/scenarios/combined_load.sh b/bench/load_tests/scenarios/combined_load.sh deleted file mode 100755 index a4547f0..0000000 --- a/bench/load_tests/scenarios/combined_load.sh +++ /dev/null @@ -1,306 +0,0 @@ -#!/usr/bin/env bash -# -# combined_load.sh - Combined HTTP tile + WebSocket session load test -# -# This script simulates realistic production load by running: -# - HTTP tile requests (simulating viewport navigation) -# - WebSocket sessions with cursor/viewport updates (using 
Rust load tests) -# -# This captures the combined effect of both workloads on server performance. -# -# Prerequisites: -# - oha: cargo install oha -# - Built Rust server and tests -# -# Usage: -# ./combined_load.sh [OPTIONS] -# -# Options: -# -u, --url Base URL (default: http://127.0.0.1:8080) -# -s, --slide Slide ID (default: auto-detect) -# --tile-concurrent Concurrent tile requests (default: 10) -# --ws-sessions Number of WebSocket sessions (default: 3) -# --ws-followers Followers per session (default: 10) -# -d, --duration Test duration in seconds (default: 30) -# -o, --output Output directory (default: bench/load_tests/results) -# -h, --help Show this help message - -set -euo pipefail - -# Default configuration -BASE_URL="${BASE_URL:-http://127.0.0.1:8080}" -WS_URL="${WS_URL:-ws://127.0.0.1:8080/ws}" -SLIDE_ID="" -TILE_CONCURRENT=10 -WS_SESSIONS=3 -WS_FOLLOWERS=10 -DURATION=30 -OUTPUT_DIR="bench/load_tests/results" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -u|--url) - BASE_URL="$2" - WS_URL="ws://${2#http://}/ws" - WS_URL="${WS_URL/https:/wss:}" - shift 2 - ;; - -s|--slide) - SLIDE_ID="$2" - shift 2 - ;; - --tile-concurrent) - TILE_CONCURRENT="$2" - shift 2 - ;; - --ws-sessions) - WS_SESSIONS="$2" - shift 2 - ;; - --ws-followers) - WS_FOLLOWERS="$2" - shift 2 - ;; - -d|--duration) - DURATION="$2" - shift 2 - ;; - -o|--output) - OUTPUT_DIR="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Ensure we're in the project root -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" - -# Create output directory -mkdir -p "$OUTPUT_DIR" -TIMESTAMP=$(date +%Y%m%d_%H%M%S) - -# Check for oha -if ! command -v oha &> /dev/null; then - log_error "oha is not installed. Install with: cargo install oha" - exit 1 -fi - -# Check server health -log_info "Checking server health at $BASE_URL..." -if ! curl -sf "$BASE_URL/health" > /dev/null 2>&1; then - log_error "Server not responding at $BASE_URL" - exit 1 -fi -log_success "Server is healthy" - -# Auto-detect slide -if [[ -z "$SLIDE_ID" ]]; then - SLIDES_JSON=$(curl -sf "$BASE_URL/api/slides" 2>/dev/null || echo "[]") - SLIDE_ID=$(echo "$SLIDES_JSON" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4 || echo "") - - if [[ -z "$SLIDE_ID" ]]; then - DEFAULT_JSON=$(curl -sf "$BASE_URL/api/slides/default" 2>/dev/null || echo "{}") - SLIDE_ID=$(echo "$DEFAULT_JSON" | grep -o '"slide_id":"[^"]*"' | cut -d'"' -f4 || echo "") - fi - - if [[ -z "$SLIDE_ID" ]]; then - log_error "No slides available. Place WSI files in the slides directory." 
- exit 1 - fi -fi -log_success "Using slide: $SLIDE_ID" - -# Get slide metadata -METADATA=$(curl -sf "$BASE_URL/api/slide/$SLIDE_ID" 2>/dev/null || echo "{}") -NUM_LEVELS=$(echo "$METADATA" | grep -o '"num_levels":[0-9]*' | cut -d':' -f2 || echo "10") -TEST_LEVEL=$((NUM_LEVELS / 2)) -[[ $TEST_LEVEL -lt 5 ]] && TEST_LEVEL=5 - -TEST_URL="$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/10/10" - -echo "" -echo "==========================================" -echo " Combined Load Test" -echo "==========================================" -echo " HTTP Base URL: $BASE_URL" -echo " WebSocket URL: $WS_URL" -echo " Slide: $SLIDE_ID" -echo " Tile concurrent: $TILE_CONCURRENT" -echo " WS sessions: $WS_SESSIONS" -echo " WS followers/sess: $WS_FOLLOWERS" -echo " Duration: ${DURATION}s" -echo "==========================================" -echo "" - -# Prepare output files -TILE_OUTPUT="$OUTPUT_DIR/combined_${TIMESTAMP}_tiles.json" -WS_OUTPUT="$OUTPUT_DIR/combined_${TIMESTAMP}_websocket.txt" -SUMMARY_FILE="$OUTPUT_DIR/combined_${TIMESTAMP}_summary.txt" - -# Collect initial metrics from server -log_info "Collecting baseline metrics..." -BASELINE_METRICS=$(curl -sf "$BASE_URL/metrics" 2>/dev/null || echo "{}") -BASELINE_CONNECTIONS=$(echo "$BASELINE_METRICS" | grep -o '"total_connections":[0-9]*' | cut -d':' -f2 || echo "0") - -# Start tile load test in background -log_info "Starting HTTP tile load test ($TILE_CONCURRENT concurrent)..." -oha -c "$TILE_CONCURRENT" -z "${DURATION}s" --json "$TEST_URL" > "$TILE_OUTPUT" 2>&1 & -TILE_PID=$! - -# Start WebSocket load test in background (using Rust tests) -log_info "Starting WebSocket load test ($WS_SESSIONS sessions, $WS_FOLLOWERS followers each)..." - -# Create a temporary test file for custom configuration -# We use environment variables to configure the Rust test -export LOAD_TEST_WS_URL="$WS_URL" -export LOAD_TEST_SESSIONS="$WS_SESSIONS" -export LOAD_TEST_FOLLOWERS="$WS_FOLLOWERS" -export LOAD_TEST_DURATION="$DURATION" - -# Run the Rust WebSocket test (if compiled) -if [[ -f "$PROJECT_ROOT/target/release/deps/perf_tests"* ]]; then - cd "$PROJECT_ROOT" - cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture > "$WS_OUTPUT" 2>&1 & - WS_PID=$! -else - log_warn "WebSocket tests not compiled (run: cargo build --release --tests)" - log_info "Running tile-only load test..." - WS_PID="" -fi - -# Wait for tests to complete -log_info "Tests running... waiting ${DURATION}s + buffer" - -# Monitor progress -ELAPSED=0 -while [[ $ELAPSED -lt $DURATION ]]; do - sleep 5 - ELAPSED=$((ELAPSED + 5)) - CURRENT_METRICS=$(curl -sf "$BASE_URL/metrics" 2>/dev/null || echo "{}") - CURRENT_CONNECTIONS=$(echo "$CURRENT_METRICS" | grep -o '"total_connections":[0-9]*' | cut -d':' -f2 || echo "?") - echo -e " [${ELAPSED}s/${DURATION}s] Active connections: $CURRENT_CONNECTIONS" -done - -# Wait for background jobs -log_info "Waiting for test completion..." 
-wait $TILE_PID || true -if [[ -n "${WS_PID:-}" ]]; then - wait $WS_PID || true -fi - -# Collect final metrics -FINAL_METRICS=$(curl -sf "$BASE_URL/metrics" 2>/dev/null || echo "{}") - -echo "" -echo "==========================================" -echo " Combined Test Results" -echo "==========================================" - -# Parse tile results -echo "" -echo "--- HTTP Tile Results ---" -if [[ -f "$TILE_OUTPUT" ]] && command -v jq &> /dev/null; then - TILE_RPS=$(jq -r '.summary.requestsPerSec // 0 | floor' "$TILE_OUTPUT") - TILE_P50=$(jq -r '(.latencyPercentiles.p50 // 0) * 1000 | floor' "$TILE_OUTPUT") - TILE_P95=$(jq -r '(.latencyPercentiles.p95 // 0) * 1000 | floor' "$TILE_OUTPUT") - TILE_P99=$(jq -r '(.latencyPercentiles.p99 // 0) * 1000 | floor' "$TILE_OUTPUT") - TILE_SUCCESS=$(jq -r '(.summary.successRate // 1) * 100 | floor' "$TILE_OUTPUT") - - echo " Throughput: $TILE_RPS req/s" - echo " P50 latency: ${TILE_P50}ms" - echo " P95 latency: ${TILE_P95}ms" - echo " P99 latency: ${TILE_P99}ms" - echo " Success rate: ${TILE_SUCCESS}%" -else - echo " (Results file not found or jq not available)" - TILE_RPS=0 - TILE_P99=0 -fi - -# Parse WebSocket results -echo "" -echo "--- WebSocket Results ---" -if [[ -f "$WS_OUTPUT" ]]; then - if grep -q "PASS" "$WS_OUTPUT"; then - echo " Status: PASS" - elif grep -q "FAIL" "$WS_OUTPUT"; then - echo " Status: FAIL" - fi - - # Extract P99 from output - WS_CURSOR_P99=$(grep "Cursor.*P99:" "$WS_OUTPUT" | grep -o '[0-9.]*ms' | head -1 || echo "N/A") - WS_VIEWPORT_P99=$(grep "Viewport.*P99:" "$WS_OUTPUT" | grep -o '[0-9.]*ms' | head -1 || echo "N/A") - WS_SENT=$(grep "Messages sent:" "$WS_OUTPUT" | grep -o '[0-9]*' || echo "N/A") - WS_RECV=$(grep "Messages received:" "$WS_OUTPUT" | grep -o '[0-9]*' || echo "N/A") - - echo " Cursor P99: $WS_CURSOR_P99" - echo " Viewport P99: $WS_VIEWPORT_P99" - echo " Messages sent: $WS_SENT" - echo " Messages recv: $WS_RECV" -else - echo " (WebSocket test not run)" -fi - -# Generate summary -{ - echo "Combined Load Test Summary" - echo "==========================" - echo "" - echo "Test Configuration:" - echo " Duration: ${DURATION}s" - echo " Tile concurrent: $TILE_CONCURRENT" - echo " WS sessions: $WS_SESSIONS × $WS_FOLLOWERS followers" - echo "" - echo "HTTP Tile Results:" - echo " Throughput: ${TILE_RPS:-N/A} req/s" - echo " P99 latency: ${TILE_P99:-N/A}ms" - echo "" - echo "WebSocket Results:" - echo " Cursor P99: ${WS_CURSOR_P99:-N/A}" - echo " Viewport P99: ${WS_VIEWPORT_P99:-N/A}" - echo "" - echo "Files:" - echo " Tile results: $TILE_OUTPUT" - echo " WebSocket results: $WS_OUTPUT" -} > "$SUMMARY_FILE" - -echo "" -log_success "Results saved to $OUTPUT_DIR" diff --git a/bench/load_tests/scenarios/overlay_stress.sh b/bench/load_tests/scenarios/overlay_stress.sh deleted file mode 100755 index afab1a0..0000000 --- a/bench/load_tests/scenarios/overlay_stress.sh +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env bash -# -# overlay_stress.sh - HTTP load test for cell overlay endpoints -# -# This script hammers the cell overlay endpoint to measure: -# - Latency percentiles (p50, p90, p95, p99, p99.9) -# - Throughput (requests/second) -# - Error rates -# -# Prerequisites: -# - oha: cargo install oha -# - Running PathCollab server with slides and overlays available -# -# Usage: -# ./overlay_stress.sh [OPTIONS] -# -# Options: -# -u, --url Base URL (default: http://127.0.0.1:8080) -# -s, --slide Slide ID to test (default: auto-detect from /api/slides) -# -c, --concurrent Concurrent connections (default: 10) -# -d, --duration 
Test duration in seconds (default: 30) -# -r, --rate Requests per second limit, 0=unlimited (default: 0) -# -v, --viewport-size Viewport size in pixels (default: 512) -# -o, --output Output file for JSON results (optional) -# -q, --quick Quick mode: 5 connections, 10 seconds -# -h, --help Show this help message - -set -euo pipefail - -# Default configuration -BASE_URL="${BASE_URL:-http://127.0.0.1:8080}" -SLIDE_ID="" -CONCURRENT=10 -DURATION=30 -RATE=0 -VIEWPORT_SIZE=512 -OUTPUT_FILE="" -QUICK_MODE=false - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -u|--url) - BASE_URL="$2" - shift 2 - ;; - -s|--slide) - SLIDE_ID="$2" - shift 2 - ;; - -c|--concurrent) - CONCURRENT="$2" - shift 2 - ;; - -d|--duration) - DURATION="$2" - shift 2 - ;; - -r|--rate) - RATE="$2" - shift 2 - ;; - -v|--viewport-size) - VIEWPORT_SIZE="$2" - shift 2 - ;; - -o|--output) - OUTPUT_FILE="$2" - shift 2 - ;; - -q|--quick) - QUICK_MODE=true - shift - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Quick mode overrides -if [[ "$QUICK_MODE" == "true" ]]; then - CONCURRENT=5 - DURATION=10 - log_info "Quick mode enabled: $CONCURRENT connections, ${DURATION}s duration" -fi - -# Check for oha -if ! command -v oha &> /dev/null; then - log_error "oha is not installed. Install with: cargo install oha" - exit 1 -fi - -# Check server health -log_info "Checking server health at $BASE_URL..." -if ! curl -sf "$BASE_URL/health" > /dev/null 2>&1; then - log_error "Server not responding at $BASE_URL" - exit 1 -fi -log_success "Server is healthy" - -# Auto-detect slide if not specified -if [[ -z "$SLIDE_ID" ]]; then - log_info "Auto-detecting slide ID..." - SLIDES_JSON=$(curl -sf "$BASE_URL/api/slides" 2>/dev/null || echo "[]") - SLIDE_ID=$(echo "$SLIDES_JSON" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4 || echo "") - - if [[ -z "$SLIDE_ID" ]]; then - # Try default slide endpoint - DEFAULT_JSON=$(curl -sf "$BASE_URL/api/slides/default" 2>/dev/null || echo "{}") - SLIDE_ID=$(echo "$DEFAULT_JSON" | grep -o '"slide_id":"[^"]*"' | cut -d'"' -f4 || echo "") - fi - - if [[ -z "$SLIDE_ID" ]]; then - log_error "No slides found. Ensure slides are configured or use --slide" - exit 1 - fi -fi -log_success "Using slide: $SLIDE_ID" - -# Check overlay availability with retry for loading state -log_info "Checking overlay availability..." -OVERLAY_READY=false -for i in {1..10}; do - OVERLAY_RESPONSE=$(curl -sf -w "\n%{http_code}" "$BASE_URL/api/slide/$SLIDE_ID/overlay/metadata" 2>/dev/null || echo -e "\n000") - HTTP_CODE=$(echo "$OVERLAY_RESPONSE" | tail -1) - - if [[ "$HTTP_CODE" == "200" ]]; then - OVERLAY_READY=true - break - elif [[ "$HTTP_CODE" == "202" ]]; then - log_info "Overlay still loading, waiting... 
(attempt $i/10)" - sleep 1 - elif [[ "$HTTP_CODE" == "404" ]]; then - log_error "No overlay available for slide $SLIDE_ID" - exit 1 - else - log_warn "Unexpected response code: $HTTP_CODE (attempt $i/10)" - sleep 1 - fi -done - -if [[ "$OVERLAY_READY" != "true" ]]; then - log_error "Overlay not ready after 10 attempts" - exit 1 -fi -log_success "Overlay is ready" - -# Get slide dimensions -log_info "Fetching slide metadata..." -METADATA=$(curl -sf "$BASE_URL/api/slide/$SLIDE_ID" 2>/dev/null || echo "{}") -WIDTH=$(echo "$METADATA" | grep -o '"width":[0-9]*' | cut -d':' -f2 || echo "10000") -HEIGHT=$(echo "$METADATA" | grep -o '"height":[0-9]*' | cut -d':' -f2 || echo "10000") - -log_info "Slide dimensions: ${WIDTH}x${HEIGHT}" - -# Calculate center and viewport regions -CENTER_X=$((WIDTH / 2)) -CENTER_Y=$((HEIGHT / 2)) - -echo "" -echo "==========================================" -echo " Overlay Stress Test Configuration" -echo "==========================================" -echo " URL: $BASE_URL" -echo " Slide: $SLIDE_ID" -echo " Viewport: ${VIEWPORT_SIZE}x${VIEWPORT_SIZE}" -echo " Concurrent: $CONCURRENT" -echo " Duration: ${DURATION}s" -echo " Rate limit: ${RATE:-unlimited} req/s" -echo "==========================================" -echo "" - -# Generate viewport region URLs file for reference (3x3 grid around center) -URLS_FILE=$(mktemp) -trap "rm -f $URLS_FILE" EXIT - -log_info "Generating viewport regions (3x3 grid around center)..." -for dx in -$VIEWPORT_SIZE 0 $VIEWPORT_SIZE; do - for dy in -$VIEWPORT_SIZE 0 $VIEWPORT_SIZE; do - x=$((CENTER_X + dx)) - y=$((CENTER_Y + dy)) - # Clamp to bounds - if [[ $x -lt 0 ]]; then x=0; fi - if [[ $y -lt 0 ]]; then y=0; fi - if [[ $x -gt $((WIDTH - VIEWPORT_SIZE)) ]]; then x=$((WIDTH - VIEWPORT_SIZE)); fi - if [[ $y -gt $((HEIGHT - VIEWPORT_SIZE)) ]]; then y=$((HEIGHT - VIEWPORT_SIZE)); fi - echo "$BASE_URL/api/slide/$SLIDE_ID/overlay/cells?x=$x&y=$y&width=$VIEWPORT_SIZE&height=$VIEWPORT_SIZE" >> "$URLS_FILE" - done -done - -log_info "Generated $(wc -l < "$URLS_FILE") viewport region URLs" - -# Build oha command -OHA_CMD="oha" -OHA_CMD="$OHA_CMD -c $CONCURRENT" -OHA_CMD="$OHA_CMD -z ${DURATION}s" -OHA_CMD="$OHA_CMD --no-tui" - -if [[ $RATE -gt 0 ]]; then - OHA_CMD="$OHA_CMD -q $RATE" -fi - -# Add JSON output if requested -if [[ -n "$OUTPUT_FILE" ]]; then - OHA_CMD="$OHA_CMD --output-format json -o $OUTPUT_FILE" -fi - -# Test a representative center region URL -# oha doesn't support URL files directly, so we test the center viewport -TEST_URL="$BASE_URL/api/slide/$SLIDE_ID/overlay/cells?x=$CENTER_X&y=$CENTER_Y&width=$VIEWPORT_SIZE&height=$VIEWPORT_SIZE" - -log_info "Testing overlay cells endpoint: $TEST_URL" -log_info "Starting load test..." 
-echo "" - -if [[ -n "$OUTPUT_FILE" ]]; then - $OHA_CMD "$TEST_URL" 2>&1 - log_success "Results saved to $OUTPUT_FILE" - - # Also print summary - echo "" - echo "==========================================" - echo " Results Summary (from JSON)" - echo "==========================================" - if command -v jq &> /dev/null && [[ -f "$OUTPUT_FILE" ]]; then - jq -r ' - "Duration: \(.summary.total | floor)s", - "Requests: \(.statusCodeDistribution | to_entries | map(.value) | add)", - "Successful: \(.summary.successRate * 100 | floor)%", - "Req/sec: \(.summary.requestsPerSec | floor)", - "", - "Latency:", - " P50: \(.latencyPercentiles.p50 * 1000 | floor)ms", - " P90: \(.latencyPercentiles.p90 * 1000 | floor)ms", - " P95: \(.latencyPercentiles.p95 * 1000 | floor)ms", - " P99: \(.latencyPercentiles.p99 * 1000 | floor)ms", - " P99.9: \(.latencyPercentiles."p99.9" * 1000 | floor)ms" - ' "$OUTPUT_FILE" 2>/dev/null || cat "$OUTPUT_FILE" - else - cat "$OUTPUT_FILE" 2>/dev/null || echo "(output file not available)" - fi -else - $OHA_CMD "$TEST_URL" -fi - -echo "" -log_success "Overlay stress test completed" diff --git a/bench/load_tests/scenarios/ramp_test.sh b/bench/load_tests/scenarios/ramp_test.sh deleted file mode 100755 index 6b35732..0000000 --- a/bench/load_tests/scenarios/ramp_test.sh +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env bash -# -# ramp_test.sh - Gradual load increase test to find breaking point -# -# This script increases concurrent connections gradually to identify: -# - Maximum sustainable throughput -# - Breaking point where latency degrades significantly -# - Error threshold (when errors start appearing) -# -# Prerequisites: -# - oha: cargo install oha -# - Running PathCollab server with slides available -# -# Usage: -# ./ramp_test.sh [OPTIONS] -# -# Options: -# -u, --url Base URL (default: http://127.0.0.1:8080) -# -s, --slide Slide ID to test (default: auto-detect) -# --start Starting concurrent connections (default: 1) -# --end Maximum concurrent connections (default: 100) -# --step Concurrency increase per stage (default: 10) -# --stage-duration Duration per stage in seconds (default: 10) -# -o, --output Output directory for results (default: bench/load_tests/results) -# -h, --help Show this help message - -set -euo pipefail - -# Default configuration -BASE_URL="${BASE_URL:-http://127.0.0.1:8080}" -SLIDE_ID="" -START_CONCURRENCY=1 -END_CONCURRENCY=100 -STEP=10 -STAGE_DURATION=10 -OUTPUT_DIR="bench/load_tests/results" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -log_stage() { - echo -e "${CYAN}[STAGE]${NC} $1" -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -u|--url) - BASE_URL="$2" - shift 2 - ;; - -s|--slide) - SLIDE_ID="$2" - shift 2 - ;; - --start) - START_CONCURRENCY="$2" - shift 2 - ;; - --end) - END_CONCURRENCY="$2" - shift 2 - ;; - --step) - STEP="$2" - shift 2 - ;; - --stage-duration) - STAGE_DURATION="$2" - shift 2 - ;; - -o|--output) - OUTPUT_DIR="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Check for oha -if ! command -v oha &> /dev/null; then - log_error "oha is not installed. 
Install with: cargo install oha" - exit 1 -fi - -# Create output directory -mkdir -p "$OUTPUT_DIR" - -# Check server health -log_info "Checking server health at $BASE_URL..." -if ! curl -sf "$BASE_URL/health" > /dev/null 2>&1; then - log_error "Server not responding at $BASE_URL" - exit 1 -fi -log_success "Server is healthy" - -# Auto-detect slide if not specified -if [[ -z "$SLIDE_ID" ]]; then - SLIDES_JSON=$(curl -sf "$BASE_URL/api/slides" 2>/dev/null || echo "[]") - SLIDE_ID=$(echo "$SLIDES_JSON" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4 || echo "") - - if [[ -z "$SLIDE_ID" ]]; then - DEFAULT_JSON=$(curl -sf "$BASE_URL/api/slides/default" 2>/dev/null || echo "{}") - SLIDE_ID=$(echo "$DEFAULT_JSON" | grep -o '"slide_id":"[^"]*"' | cut -d'"' -f4 || echo "") - fi - - if [[ -z "$SLIDE_ID" ]]; then - log_error "No slides available. Place WSI files in the slides directory." - exit 1 - fi -fi -log_success "Using slide: $SLIDE_ID" - -# Get slide metadata -METADATA=$(curl -sf "$BASE_URL/api/slide/$SLIDE_ID" 2>/dev/null || echo "{}") -NUM_LEVELS=$(echo "$METADATA" | grep -o '"num_levels":[0-9]*' | cut -d':' -f2 || echo "10") -TEST_LEVEL=$((NUM_LEVELS / 2)) -[[ $TEST_LEVEL -lt 5 ]] && TEST_LEVEL=5 - -TEST_URL="$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/10/10" - -# Prepare results file -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -RESULTS_FILE="$OUTPUT_DIR/ramp_${TIMESTAMP}.csv" -SUMMARY_FILE="$OUTPUT_DIR/ramp_${TIMESTAMP}_summary.txt" - -echo "" -echo "==========================================" -echo " Ramp-Up Load Test" -echo "==========================================" -echo " URL: $BASE_URL" -echo " Slide: $SLIDE_ID" -echo " Level: $TEST_LEVEL" -echo " Start: $START_CONCURRENCY connections" -echo " End: $END_CONCURRENCY connections" -echo " Step: +$STEP per stage" -echo " Stage duration: ${STAGE_DURATION}s" -echo " Output: $RESULTS_FILE" -echo "==========================================" -echo "" - -# CSV header -echo "concurrency,requests,success_rate,rps,p50_ms,p90_ms,p95_ms,p99_ms,errors" > "$RESULTS_FILE" - -# Track best performance -BEST_RPS=0 -BEST_CONCURRENCY=0 -BREAKING_POINT=0 - -# Run stages -CURRENT=$START_CONCURRENCY -STAGE=1 - -while [[ $CURRENT -le $END_CONCURRENCY ]]; do - log_stage "Stage $STAGE: $CURRENT concurrent connections" - - # Run oha and capture JSON output - STAGE_OUTPUT=$(oha -c "$CURRENT" -z "${STAGE_DURATION}s" --json "$TEST_URL" 2>/dev/null || echo "{}") - - # Parse results (using grep/sed for portability, jq if available) - if command -v jq &> /dev/null; then - REQUESTS=$(echo "$STAGE_OUTPUT" | jq -r '.summary.total // 0') - SUCCESS_RATE=$(echo "$STAGE_OUTPUT" | jq -r '(.summary.successRate // 1) * 100 | floor') - RPS=$(echo "$STAGE_OUTPUT" | jq -r '.summary.requestsPerSec // 0 | floor') - P50=$(echo "$STAGE_OUTPUT" | jq -r '(.latencyPercentiles.p50 // 0) * 1000 | floor') - P90=$(echo "$STAGE_OUTPUT" | jq -r '(.latencyPercentiles.p90 // 0) * 1000 | floor') - P95=$(echo "$STAGE_OUTPUT" | jq -r '(.latencyPercentiles.p95 // 0) * 1000 | floor') - P99=$(echo "$STAGE_OUTPUT" | jq -r '(.latencyPercentiles.p99 // 0) * 1000 | floor') - ERRORS=$(echo "$STAGE_OUTPUT" | jq -r '.statusCodeDistribution | to_entries | map(select(.key | startswith("5") or startswith("4"))) | map(.value) | add // 0') - else - # Fallback parsing - REQUESTS=$(echo "$STAGE_OUTPUT" | grep -o '"total":[0-9]*' | cut -d':' -f2 || echo "0") - SUCCESS_RATE="100" - RPS=$(echo "$STAGE_OUTPUT" | grep -o '"requestsPerSec":[0-9.]*' | cut -d':' -f2 | cut -d'.' 
-f1 || echo "0") - P50="0" - P90="0" - P95="0" - P99="0" - ERRORS="0" - fi - - # Record to CSV - echo "$CURRENT,$REQUESTS,$SUCCESS_RATE,$RPS,$P50,$P90,$P95,$P99,$ERRORS" >> "$RESULTS_FILE" - - # Print stage summary - echo " Requests: $REQUESTS | RPS: $RPS | P99: ${P99}ms | Success: ${SUCCESS_RATE}%" - - # Track best RPS - if [[ $RPS -gt $BEST_RPS ]]; then - BEST_RPS=$RPS - BEST_CONCURRENCY=$CURRENT - fi - - # Detect breaking point (P99 > 500ms or success rate drops) - if [[ $P99 -gt 500 || $SUCCESS_RATE -lt 95 ]]; then - if [[ $BREAKING_POINT -eq 0 ]]; then - BREAKING_POINT=$CURRENT - log_warn "Performance degradation detected at $CURRENT connections" - fi - fi - - # Next stage - CURRENT=$((CURRENT + STEP)) - STAGE=$((STAGE + 1)) - - # Brief pause between stages - sleep 1 -done - -echo "" -echo "==========================================" -echo " Ramp-Up Test Complete" -echo "==========================================" - -# Generate summary -{ - echo "Ramp-Up Load Test Summary" - echo "=========================" - echo "" - echo "Test Parameters:" - echo " URL: $BASE_URL" - echo " Slide: $SLIDE_ID" - echo " Duration per stage: ${STAGE_DURATION}s" - echo "" - echo "Results:" - echo " Best throughput: $BEST_RPS req/s at $BEST_CONCURRENCY connections" - if [[ $BREAKING_POINT -gt 0 ]]; then - echo " Breaking point: $BREAKING_POINT connections" - else - echo " Breaking point: Not reached (max: $END_CONCURRENCY)" - fi - echo "" - echo "Full results: $RESULTS_FILE" -} | tee "$SUMMARY_FILE" - -echo "" -log_success "Results saved to $OUTPUT_DIR" diff --git a/bench/load_tests/scenarios/tile_stress.sh b/bench/load_tests/scenarios/tile_stress.sh deleted file mode 100755 index 67c4ec7..0000000 --- a/bench/load_tests/scenarios/tile_stress.sh +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env bash -# -# tile_stress.sh - HTTP load test for tile serving endpoints -# -# This script hammers the tile serving endpoint to measure: -# - Latency percentiles (p50, p90, p95, p99, p99.9) -# - Throughput (requests/second) -# - Error rates -# -# Prerequisites: -# - oha: cargo install oha -# - Running PathCollab server with slides available -# -# Usage: -# ./tile_stress.sh [OPTIONS] -# -# Options: -# -u, --url Base URL (default: http://127.0.0.1:8080) -# -s, --slide Slide ID to test (default: auto-detect from /api/slides) -# -c, --concurrent Concurrent connections (default: 10) -# -d, --duration Test duration in seconds (default: 30) -# -r, --rate Requests per second limit, 0=unlimited (default: 0) -# -o, --output Output file for JSON results (optional) -# -q, --quick Quick mode: 5 connections, 10 seconds -# -h, --help Show this help message - -set -euo pipefail - -# Default configuration -BASE_URL="${BASE_URL:-http://127.0.0.1:8080}" -SLIDE_ID="" -CONCURRENT=10 -DURATION=30 -RATE=0 -OUTPUT_FILE="" -QUICK_MODE=false - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - -u|--url) - BASE_URL="$2" - shift 2 - ;; - -s|--slide) - SLIDE_ID="$2" - shift 2 - ;; - -c|--concurrent) - CONCURRENT="$2" - shift 2 - ;; - -d|--duration) - DURATION="$2" - shift 2 - ;; - -r|--rate) - RATE="$2" - shift 2 - ;; - -o|--output) - 
OUTPUT_FILE="$2" - shift 2 - ;; - -q|--quick) - QUICK_MODE=true - shift - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Quick mode overrides -if [[ "$QUICK_MODE" == "true" ]]; then - CONCURRENT=5 - DURATION=10 - log_info "Quick mode enabled: $CONCURRENT connections, ${DURATION}s duration" -fi - -# Check for oha -if ! command -v oha &> /dev/null; then - log_error "oha is not installed. Install with: cargo install oha" - exit 1 -fi - -# Check server health -log_info "Checking server health at $BASE_URL..." -if ! curl -sf "$BASE_URL/health" > /dev/null 2>&1; then - log_error "Server not responding at $BASE_URL" - exit 1 -fi -log_success "Server is healthy" - -# Auto-detect slide if not specified -if [[ -z "$SLIDE_ID" ]]; then - log_info "Auto-detecting slide ID..." - SLIDES_JSON=$(curl -sf "$BASE_URL/api/slides" 2>/dev/null || echo "[]") - SLIDE_ID=$(echo "$SLIDES_JSON" | grep -o '"id":"[^"]*"' | head -1 | cut -d'"' -f4 || echo "") - - if [[ -z "$SLIDE_ID" ]]; then - # Try default slide endpoint - DEFAULT_JSON=$(curl -sf "$BASE_URL/api/slides/default" 2>/dev/null || echo "{}") - SLIDE_ID=$(echo "$DEFAULT_JSON" | grep -o '"slide_id":"[^"]*"' | cut -d'"' -f4 || echo "") - fi - - if [[ -z "$SLIDE_ID" ]]; then - log_error "No slides found. Ensure slides are configured or use --slide" - exit 1 - fi -fi -log_success "Using slide: $SLIDE_ID" - -# Get slide metadata to determine valid tile coordinates -log_info "Fetching slide metadata..." -METADATA=$(curl -sf "$BASE_URL/api/slide/$SLIDE_ID" 2>/dev/null || echo "{}") -NUM_LEVELS=$(echo "$METADATA" | grep -o '"num_levels":[0-9]*' | cut -d':' -f2 || echo "10") -TILE_SIZE=$(echo "$METADATA" | grep -o '"tile_size":[0-9]*' | cut -d':' -f2 || echo "256") -WIDTH=$(echo "$METADATA" | grep -o '"width":[0-9]*' | cut -d':' -f2 || echo "10000") -HEIGHT=$(echo "$METADATA" | grep -o '"height":[0-9]*' | cut -d':' -f2 || echo "10000") - -# Calculate a level that has meaningful tiles (around 10-50 tiles across) -# DZI: level 0 = 1x1, level (N-1) = full resolution -# At level L, width = original_width / 2^(N-1-L) -# We want level where width / tile_size gives us ~20 tiles -# Test at level (NUM_LEVELS - 4) which is 1/8th of full resolution -TEST_LEVEL=$((NUM_LEVELS - 4)) -if [[ $TEST_LEVEL -lt 8 ]]; then - TEST_LEVEL=8 -fi -if [[ $TEST_LEVEL -ge $NUM_LEVELS ]]; then - TEST_LEVEL=$((NUM_LEVELS - 1)) -fi - -# Calculate tiles at this level -SCALE_FACTOR=$((1 << (NUM_LEVELS - 1 - TEST_LEVEL))) -LEVEL_WIDTH=$((WIDTH / SCALE_FACTOR)) -LEVEL_HEIGHT=$((HEIGHT / SCALE_FACTOR)) -MAX_TILE_X=$(( (LEVEL_WIDTH + TILE_SIZE - 1) / TILE_SIZE - 1 )) -MAX_TILE_Y=$(( (LEVEL_HEIGHT + TILE_SIZE - 1) / TILE_SIZE - 1 )) - -log_info "Slide: ${WIDTH}x${HEIGHT}, $NUM_LEVELS levels" -log_info "Testing at level $TEST_LEVEL (${LEVEL_WIDTH}x${LEVEL_HEIGHT}px, tiles: 0-${MAX_TILE_X} x 0-${MAX_TILE_Y})" - -# Build tile URL template -# We'll test a range of tile coordinates to simulate viewport panning -TILE_URL="$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/{x}/{y}" - -echo "" -echo "==========================================" -echo " Tile Stress Test Configuration" -echo "==========================================" -echo " URL: $BASE_URL" -echo " Slide: $SLIDE_ID" -echo " Level: $TEST_LEVEL" -echo " Concurrent: $CONCURRENT" -echo " Duration: ${DURATION}s" -echo " Rate limit: ${RATE:-unlimited} req/s" -echo "==========================================" -echo "" - -# Generate tile URLs file for oha (simulate viewport panning) -URLS_FILE=$(mktemp) 
-trap "rm -f $URLS_FILE" EXIT - -# Generate a grid of tile coordinates from center of slide -CENTER_X=$((MAX_TILE_X / 2)) -CENTER_Y=$((MAX_TILE_Y / 2)) -START_X=$((CENTER_X > 5 ? CENTER_X - 5 : 0)) -START_Y=$((CENTER_Y > 5 ? CENTER_Y - 5 : 0)) -END_X=$((START_X + 9 < MAX_TILE_X ? START_X + 9 : MAX_TILE_X)) -END_Y=$((START_Y + 9 < MAX_TILE_Y ? START_Y + 9 : MAX_TILE_Y)) - -for x in $(seq $START_X $END_X); do - for y in $(seq $START_Y $END_Y); do - echo "$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/$x/$y" >> "$URLS_FILE" - done -done - -log_info "Generated $(wc -l < "$URLS_FILE") tile URLs (tiles $START_X-$END_X x $START_Y-$END_Y)" -log_info "Starting load test..." -echo "" - -# Build oha command -OHA_CMD="oha" -OHA_CMD="$OHA_CMD -c $CONCURRENT" -OHA_CMD="$OHA_CMD -z ${DURATION}s" -OHA_CMD="$OHA_CMD --no-tui" - -if [[ $RATE -gt 0 ]]; then - OHA_CMD="$OHA_CMD -q $RATE" -fi - -# Add JSON output if requested -if [[ -n "$OUTPUT_FILE" ]]; then - OHA_CMD="$OHA_CMD --output-format json -o $OUTPUT_FILE" -fi - -# Run the load test with URL file -# oha doesn't support URL files directly, so we use a workaround with random selection -# Instead, we'll test a single representative tile URL at the center -TEST_TILE_URL="$BASE_URL/api/slide/$SLIDE_ID/tile/$TEST_LEVEL/$CENTER_X/$CENTER_Y" - -log_info "Testing tile: $TEST_TILE_URL" - -if [[ -n "$OUTPUT_FILE" ]]; then - $OHA_CMD "$TEST_TILE_URL" 2>&1 - log_success "Results saved to $OUTPUT_FILE" - - # Also print summary - echo "" - echo "==========================================" - echo " Results Summary (from JSON)" - echo "==========================================" - if command -v jq &> /dev/null && [[ -f "$OUTPUT_FILE" ]]; then - jq -r ' - "Duration: \(.summary.total | floor)s", - "Requests: \(.statusCodeDistribution | to_entries | map(.value) | add)", - "Successful: \(.summary.successRate * 100 | floor)%", - "Req/sec: \(.summary.requestsPerSec | floor)", - "", - "Latency:", - " P50: \(.latencyPercentiles.p50 * 1000 | floor)ms", - " P90: \(.latencyPercentiles.p90 * 1000 | floor)ms", - " P95: \(.latencyPercentiles.p95 * 1000 | floor)ms", - " P99: \(.latencyPercentiles.p99 * 1000 | floor)ms", - " P99.9: \(.latencyPercentiles."p99.9" * 1000 | floor)ms" - ' "$OUTPUT_FILE" 2>/dev/null || cat "$OUTPUT_FILE" - else - cat "$OUTPUT_FILE" 2>/dev/null || echo "(output file not available)" - fi -else - $OHA_CMD "$TEST_TILE_URL" -fi - -echo "" -log_success "Tile stress test completed" diff --git a/bench/scripts/compare_baseline.py b/bench/scripts/compare_baseline.py deleted file mode 100755 index ee132fc..0000000 --- a/bench/scripts/compare_baseline.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python3 -""" -compare_baseline.py - Compare benchmark results against baseline - -This script compares current benchmark results to a saved baseline and: -- Reports percentage changes for key metrics -- Fails with exit code 1 if P99 regresses by more than threshold -- Generates a markdown summary suitable for PR comments - -Usage: - ./compare_baseline.py --current results.json --baseline baseline.json - ./compare_baseline.py --current results.json --baseline baseline.json --threshold 10 - ./compare_baseline.py --save-baseline results.json --output baselines/tile_baseline.json - -Examples: - # Compare current run to baseline - ./compare_baseline.py -c bench/load_tests/results/latest.json -b bench/baselines/tile_baseline.json - - # Save new baseline - ./compare_baseline.py --save-baseline bench/load_tests/results/latest.json -o bench/baselines/tile_baseline.json -""" - 
-import argparse -import json -import sys -from pathlib import Path -from datetime import datetime -from typing import Dict, Any, Optional, Tuple - -# ANSI colors for terminal output -class Colors: - RED = '\033[0;31m' - GREEN = '\033[0;32m' - YELLOW = '\033[1;33m' - BLUE = '\033[0;34m' - NC = '\033[0m' # No Color - - -def load_json(path: Path) -> Dict[str, Any]: - """Load and parse a JSON file.""" - with open(path) as f: - return json.load(f) - - -def save_json(data: Dict[str, Any], path: Path) -> None: - """Save data as JSON file.""" - path.parent.mkdir(parents=True, exist_ok=True) - with open(path, 'w') as f: - json.dump(data, f, indent=2) - print(f"{Colors.GREEN}[OK]{Colors.NC} Saved baseline to {path}") - - -def extract_metrics(data: Dict[str, Any]) -> Dict[str, float]: - """ - Extract key metrics from benchmark results. - - Supports both oha JSON output and custom summary format. - """ - metrics = {} - - # oha format - if 'summary' in data: - summary = data['summary'] - metrics['requests_per_sec'] = summary.get('requestsPerSec', 0) - metrics['success_rate'] = summary.get('successRate', 1.0) * 100 - - if 'latencyPercentiles' in data: - lat = data['latencyPercentiles'] - # oha returns latency in seconds, convert to ms - metrics['p50_ms'] = lat.get('p50', 0) * 1000 - metrics['p90_ms'] = lat.get('p90', 0) * 1000 - metrics['p95_ms'] = lat.get('p95', 0) * 1000 - metrics['p99_ms'] = lat.get('p99', 0) * 1000 - if 'p999' in lat: - metrics['p999_ms'] = lat.get('p999', 0) * 1000 - - # Alternative: latencyDistribution format - if 'latencyDistribution' in data and 'percentiles' in data['latencyDistribution']: - lat = data['latencyDistribution']['percentiles'] - metrics['p50_ms'] = lat.get('p50', 0) * 1000 - metrics['p90_ms'] = lat.get('p90', 0) * 1000 - metrics['p95_ms'] = lat.get('p95', 0) * 1000 - metrics['p99_ms'] = lat.get('p99', 0) * 1000 - - # Custom baseline format (already in correct units) - if 'metrics' in data: - metrics.update(data['metrics']) - - return metrics - - -def compare_metrics( - current: Dict[str, float], - baseline: Dict[str, float], - threshold_pct: float = 10.0 -) -> Tuple[bool, str, str]: - """ - Compare current metrics to baseline. 
- - Returns: - (passed, terminal_output, markdown_output) - """ - passed = True - terminal_lines = [] - md_lines = ["| Metric | Baseline | Current | Change | Status |", - "|--------|----------|---------|--------|--------|"] - - # Metrics where lower is better (latencies) - lower_is_better = {'p50_ms', 'p90_ms', 'p95_ms', 'p99_ms', 'p999_ms'} - # Metrics where higher is better (throughput) - higher_is_better = {'requests_per_sec', 'success_rate'} - - for metric in sorted(set(current.keys()) | set(baseline.keys())): - curr_val = current.get(metric, 0) - base_val = baseline.get(metric, 0) - - if base_val == 0: - change_pct = 0 if curr_val == 0 else float('inf') - else: - change_pct = ((curr_val - base_val) / base_val) * 100 - - # Determine if this is a regression - is_regression = False - if metric in lower_is_better and change_pct > threshold_pct: - is_regression = True - elif metric in higher_is_better and change_pct < -threshold_pct: - is_regression = True - - # Format values - if metric.endswith('_ms'): - base_str = f"{base_val:.1f}ms" - curr_str = f"{curr_val:.1f}ms" - elif metric == 'success_rate': - base_str = f"{base_val:.1f}%" - curr_str = f"{curr_val:.1f}%" - else: - base_str = f"{base_val:.1f}" - curr_str = f"{curr_val:.1f}" - - # Format change - if change_pct == float('inf'): - change_str = "N/A" - else: - sign = "+" if change_pct > 0 else "" - change_str = f"{sign}{change_pct:.1f}%" - - # Status - if is_regression: - status = f"{Colors.RED}REGRESSED{Colors.NC}" - status_md = "🔴 REGRESSED" - if metric == 'p99_ms': - passed = False # Only fail on P99 regression - elif abs(change_pct) < 5: - status = f"{Colors.GREEN}OK{Colors.NC}" - status_md = "✅ OK" - elif metric in lower_is_better and change_pct < 0: - status = f"{Colors.GREEN}IMPROVED{Colors.NC}" - status_md = "🟢 IMPROVED" - elif metric in higher_is_better and change_pct > 0: - status = f"{Colors.GREEN}IMPROVED{Colors.NC}" - status_md = "🟢 IMPROVED" - else: - status = f"{Colors.YELLOW}CHANGED{Colors.NC}" - status_md = "🟡 CHANGED" - - terminal_lines.append( - f" {metric:20} {base_str:>12} → {curr_str:>12} ({change_str:>8}) {status}" - ) - md_lines.append( - f"| {metric} | {base_str} | {curr_str} | {change_str} | {status_md} |" - ) - - terminal_output = "\n".join(terminal_lines) - markdown_output = "\n".join(md_lines) - - return passed, terminal_output, markdown_output - - -def create_baseline(results: Dict[str, Any], description: str = "") -> Dict[str, Any]: - """Create a baseline document from results.""" - metrics = extract_metrics(results) - return { - "created_at": datetime.utcnow().isoformat() + "Z", - "description": description, - "metrics": metrics, - "raw_data": results - } - - -def main(): - parser = argparse.ArgumentParser( - description="Compare benchmark results against baseline", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__ - ) - - parser.add_argument( - "-c", "--current", - type=Path, - help="Current results JSON file" - ) - parser.add_argument( - "-b", "--baseline", - type=Path, - help="Baseline JSON file to compare against" - ) - parser.add_argument( - "-t", "--threshold", - type=float, - default=10.0, - help="Regression threshold percentage (default: 10)" - ) - parser.add_argument( - "--save-baseline", - type=Path, - help="Save results as new baseline" - ) - parser.add_argument( - "-o", "--output", - type=Path, - help="Output path for baseline (with --save-baseline)" - ) - parser.add_argument( - "-d", "--description", - default="", - help="Description for baseline (with 
--save-baseline)" - ) - parser.add_argument( - "--markdown", - action="store_true", - help="Output comparison as markdown table" - ) - parser.add_argument( - "--ci", - action="store_true", - help="CI mode: minimal output, exit code indicates pass/fail" - ) - - args = parser.parse_args() - - # Save baseline mode - if args.save_baseline: - if not args.output: - print(f"{Colors.RED}[ERROR]{Colors.NC} --output required with --save-baseline") - sys.exit(1) - - results = load_json(args.save_baseline) - baseline = create_baseline(results, args.description) - save_json(baseline, args.output) - sys.exit(0) - - # Comparison mode - if not args.current or not args.baseline: - parser.print_help() - sys.exit(1) - - if not args.current.exists(): - print(f"{Colors.RED}[ERROR]{Colors.NC} Current results not found: {args.current}") - sys.exit(1) - - if not args.baseline.exists(): - print(f"{Colors.YELLOW}[WARN]{Colors.NC} Baseline not found: {args.baseline}") - print("Run with --save-baseline to create initial baseline") - sys.exit(0) - - # Load and compare - current_data = load_json(args.current) - baseline_data = load_json(args.baseline) - - current_metrics = extract_metrics(current_data) - baseline_metrics = extract_metrics(baseline_data) - - passed, terminal_output, markdown_output = compare_metrics( - current_metrics, - baseline_metrics, - args.threshold - ) - - # Output - if args.markdown: - print("## Benchmark Comparison\n") - print(markdown_output) - print() - if passed: - print("**Result: ✅ PASSED** - No significant regressions detected") - else: - print("**Result: ❌ FAILED** - P99 latency regression exceeds threshold") - elif args.ci: - if not passed: - print(f"FAILED: P99 regression exceeds {args.threshold}% threshold") - else: - print() - print("=" * 60) - print(" Benchmark Comparison") - print("=" * 60) - print() - print(f" Baseline: {args.baseline}") - print(f" Current: {args.current}") - print(f" Threshold: {args.threshold}%") - print() - print(terminal_output) - print() - if passed: - print(f"{Colors.GREEN}PASSED{Colors.NC}: No significant regressions detected") - else: - print(f"{Colors.RED}FAILED{Colors.NC}: P99 latency regression exceeds {args.threshold}% threshold") - print() - - sys.exit(0 if passed else 1) - - -if __name__ == "__main__": - main() diff --git a/bench/scripts/generate_report.py b/bench/scripts/generate_report.py deleted file mode 100755 index 494b5ea..0000000 --- a/bench/scripts/generate_report.py +++ /dev/null @@ -1,334 +0,0 @@ -#!/usr/bin/env python3 -""" -generate_report.py - Generate markdown benchmark report - -This script aggregates results from all benchmark phases and produces -a comprehensive markdown report suitable for: -- PR comments -- Documentation -- Historical tracking - -Usage: - ./generate_report.py --input-dir bench/load_tests/results/run_YYYYMMDD_HHMMSS --output REPORT.md -""" - -import argparse -import json -import re -import sys -from datetime import datetime -from pathlib import Path -from typing import Dict, Any, Optional, List - - -def load_json_safe(path: Path) -> Optional[Dict[str, Any]]: - """Load JSON file, returning None on error.""" - try: - with open(path) as f: - return json.load(f) - except (FileNotFoundError, json.JSONDecodeError): - return None - - -def load_text_safe(path: Path) -> Optional[str]: - """Load text file, returning None on error.""" - try: - with open(path) as f: - return f.read() - except FileNotFoundError: - return None - - -def parse_criterion_output(text: str) -> List[Dict[str, Any]]: - """Parse Criterion benchmark 
output for key metrics.""" - results = [] - - # Pattern: "benchmark_name time: [123.45 µs 125.67 µs 127.89 µs]" - pattern = r'(\S+)\s+time:\s+\[(\d+\.?\d*)\s*(\w+)\s+(\d+\.?\d*)\s*(\w+)\s+(\d+\.?\d*)\s*(\w+)\]' - - for match in re.finditer(pattern, text): - name = match.group(1) - low = float(match.group(2)) - low_unit = match.group(3) - mid = float(match.group(4)) - mid_unit = match.group(5) - high = float(match.group(6)) - high_unit = match.group(7) - - # Normalize to microseconds - def to_us(val, unit): - if unit == 'ns': - return val / 1000 - elif unit == 'µs' or unit == 'us': - return val - elif unit == 'ms': - return val * 1000 - elif unit == 's': - return val * 1_000_000 - return val - - results.append({ - 'name': name, - 'low_us': to_us(low, low_unit), - 'mid_us': to_us(mid, mid_unit), - 'high_us': to_us(high, high_unit), - }) - - return results - - -def parse_websocket_output(text: str) -> Dict[str, Any]: - """Parse WebSocket load test output.""" - result = { - 'passed': 'PASS' in text, - 'messages_sent': 0, - 'messages_received': 0, - 'cursor_p99': None, - 'viewport_p99': None, - } - - # Extract metrics - if match := re.search(r'Messages sent:\s*(\d+)', text): - result['messages_sent'] = int(match.group(1)) - if match := re.search(r'Messages received:\s*(\d+)', text): - result['messages_received'] = int(match.group(1)) - if match := re.search(r'Cursor.*P99:\s*([\d.]+\w+)', text): - result['cursor_p99'] = match.group(1) - if match := re.search(r'Viewport.*P99:\s*([\d.]+\w+)', text): - result['viewport_p99'] = match.group(1) - - return result - - -def format_duration(us: float) -> str: - """Format duration in appropriate units.""" - if us < 1: - return f"{us * 1000:.2f}ns" - elif us < 1000: - return f"{us:.2f}µs" - elif us < 1_000_000: - return f"{us / 1000:.2f}ms" - else: - return f"{us / 1_000_000:.2f}s" - - -def generate_report(input_dir: Path) -> str: - """Generate markdown report from benchmark results.""" - - lines = [] - lines.append("# PathCollab Benchmark Report") - lines.append("") - lines.append(f"**Generated:** {datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')} UTC") - lines.append(f"**Run directory:** `{input_dir.name}`") - lines.append("") - - # Table of Contents - lines.append("## Table of Contents") - lines.append("- [Summary](#summary)") - lines.append("- [HTTP Tile Performance](#http-tile-performance)") - lines.append("- [WebSocket Performance](#websocket-performance)") - lines.append("- [Micro-benchmarks](#micro-benchmarks)") - lines.append("- [Server Metrics](#server-metrics)") - lines.append("") - - # Summary - lines.append("## Summary") - lines.append("") - - tile_data = load_json_safe(input_dir / "tile_stress.json") - ws_text = load_text_safe(input_dir / "websocket_load.txt") - ws_data = parse_websocket_output(ws_text) if ws_text else {} - - summary_items = [] - - if tile_data: - rps = tile_data.get('summary', {}).get('requestsPerSec', 0) - p99 = tile_data.get('latencyPercentiles', {}).get('p99', 0) * 1000 - success = tile_data.get('summary', {}).get('successRate', 1) * 100 - summary_items.append(f"- **Tile serving:** {rps:.0f} req/s, P99: {p99:.1f}ms, Success: {success:.1f}%") - tile_status = "✅ PASS" if p99 < 100 else "❌ FAIL (P99 > 100ms)" - else: - tile_status = "⚠️ No data" - summary_items.append("- **Tile serving:** No data collected") - - if ws_data.get('passed'): - summary_items.append(f"- **WebSocket:** P99 cursor: {ws_data.get('cursor_p99', 'N/A')}, P99 viewport: {ws_data.get('viewport_p99', 'N/A')}") - ws_status = "✅ PASS" - elif ws_text: - 
ws_status = "❌ FAIL" - summary_items.append("- **WebSocket:** Test failed") - else: - ws_status = "⚠️ No data" - summary_items.append("- **WebSocket:** No data collected") - - lines.append("| Component | Status |") - lines.append("|-----------|--------|") - lines.append(f"| HTTP Tile Serving | {tile_status} |") - lines.append(f"| WebSocket Broadcasting | {ws_status} |") - lines.append("") - lines.extend(summary_items) - lines.append("") - - # HTTP Tile Performance - lines.append("## HTTP Tile Performance") - lines.append("") - - if tile_data: - summary = tile_data.get('summary', {}) - latency = tile_data.get('latencyPercentiles', {}) - - lines.append("### Throughput") - lines.append("") - lines.append(f"- **Requests/sec:** {summary.get('requestsPerSec', 0):.1f}") - lines.append(f"- **Total requests:** {summary.get('total', 0)}") - lines.append(f"- **Success rate:** {summary.get('successRate', 1) * 100:.1f}%") - lines.append("") - - lines.append("### Latency Distribution") - lines.append("") - lines.append("| Percentile | Latency |") - lines.append("|------------|---------|") - for p in ['p50', 'p75', 'p90', 'p95', 'p99', 'p999']: - val = latency.get(p, 0) * 1000 # to ms - lines.append(f"| {p.upper()} | {val:.2f}ms |") - lines.append("") - - # Status codes - status_dist = tile_data.get('statusCodeDistribution', {}) - if status_dist: - lines.append("### Status Codes") - lines.append("") - lines.append("| Code | Count |") - lines.append("|------|-------|") - for code, count in sorted(status_dist.items()): - lines.append(f"| {code} | {count} |") - lines.append("") - else: - lines.append("*No HTTP tile performance data available.*") - lines.append("") - - # WebSocket Performance - lines.append("## WebSocket Performance") - lines.append("") - - if ws_text: - lines.append("### Results") - lines.append("") - lines.append(f"- **Status:** {'PASS' if ws_data.get('passed') else 'FAIL'}") - lines.append(f"- **Messages sent:** {ws_data.get('messages_sent', 'N/A')}") - lines.append(f"- **Messages received:** {ws_data.get('messages_received', 'N/A')}") - lines.append(f"- **Cursor P99:** {ws_data.get('cursor_p99', 'N/A')}") - lines.append(f"- **Viewport P99:** {ws_data.get('viewport_p99', 'N/A')}") - lines.append("") - - # Include raw output excerpt - lines.append("
") - lines.append("Raw Output") - lines.append("") - lines.append("```") - # Include just the results section - if "=== Load Test Results ===" in ws_text: - start = ws_text.find("=== Load Test Results ===") - lines.append(ws_text[start:start + 1500]) - else: - lines.append(ws_text[:1500]) - lines.append("```") - lines.append("
") - lines.append("") - else: - lines.append("*No WebSocket performance data available.*") - lines.append("") - - # Micro-benchmarks - lines.append("## Micro-benchmarks") - lines.append("") - - micro_text = load_text_safe(input_dir / "micro_benchmarks.txt") - if micro_text: - benchmarks = parse_criterion_output(micro_text) - - if benchmarks: - # Group by benchmark file - groups = {} - for b in benchmarks: - # Extract group from name like "jpeg_encoding/256x256/85" - parts = b['name'].split('/') - group = parts[0] if parts else 'other' - if group not in groups: - groups[group] = [] - groups[group].append(b) - - for group_name, items in sorted(groups.items()): - lines.append(f"### {group_name.replace('_', ' ').title()}") - lines.append("") - lines.append("| Benchmark | Time (median) | Range |") - lines.append("|-----------|---------------|-------|") - for b in items: - name = '/'.join(b['name'].split('/')[1:]) or b['name'] - lines.append(f"| {name} | {format_duration(b['mid_us'])} | {format_duration(b['low_us'])} - {format_duration(b['high_us'])} |") - lines.append("") - else: - lines.append("*Could not parse benchmark results.*") - lines.append("") - else: - lines.append("*No micro-benchmark data available.*") - lines.append("") - - # Server Metrics - lines.append("## Server Metrics") - lines.append("") - - metrics_data = load_json_safe(input_dir / "server_metrics.json") - if metrics_data: - lines.append("| Metric | Value |") - lines.append("|--------|-------|") - for key, value in sorted(metrics_data.items()): - lines.append(f"| {key} | {value} |") - lines.append("") - else: - lines.append("*No server metrics available.*") - lines.append("") - - # Footer - lines.append("---") - lines.append("") - lines.append("*Report generated by `bench/scripts/generate_report.py`*") - - return "\n".join(lines) - - -def main(): - parser = argparse.ArgumentParser( - description="Generate markdown benchmark report" - ) - parser.add_argument( - "--input-dir", - type=Path, - required=True, - help="Directory containing benchmark results" - ) - parser.add_argument( - "--output", - type=Path, - help="Output markdown file (default: stdout)" - ) - - args = parser.parse_args() - - if not args.input_dir.exists(): - print(f"Error: Input directory not found: {args.input_dir}", file=sys.stderr) - sys.exit(1) - - report = generate_report(args.input_dir) - - if args.output: - args.output.parent.mkdir(parents=True, exist_ok=True) - with open(args.output, 'w') as f: - f.write(report) - print(f"Report saved to: {args.output}") - else: - print(report) - - -if __name__ == "__main__": - main() diff --git a/bench/scripts/run_all.sh b/bench/scripts/run_all.sh deleted file mode 100755 index f705bda..0000000 --- a/bench/scripts/run_all.sh +++ /dev/null @@ -1,340 +0,0 @@ -#!/usr/bin/env bash -# -# run_all.sh - Orchestrate the complete benchmark suite -# -# This script runs all benchmarks in sequence and generates a comprehensive report. -# It handles server startup (optional), warmup, test execution, and cleanup. 
-# -# Usage: -# ./run_all.sh [OPTIONS] -# -# Options: -# --server-cmd CMD Command to start the server (default: auto-detect) -# --server-url URL Server URL (default: http://127.0.0.1:8080) -# --skip-micro Skip Criterion micro-benchmarks -# --skip-load Skip HTTP load tests -# --skip-websocket Skip WebSocket load tests -# --quick Quick mode: shorter durations, fewer iterations -# --compare-baseline Compare results to baseline and fail on regression -# --save-baseline Save results as new baseline -# -o, --output Output directory (default: bench/load_tests/results) -# -h, --help Show this help message - -set -euo pipefail - -# Script directory -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" -BENCH_DIR="$PROJECT_ROOT/bench" - -# Default configuration -SERVER_CMD="" -SERVER_URL="${SERVER_URL:-http://127.0.0.1:8080}" -SKIP_MICRO=false -SKIP_LOAD=false -SKIP_WEBSOCKET=false -QUICK_MODE=false -COMPARE_BASELINE=false -SAVE_BASELINE=false -OUTPUT_DIR="$BENCH_DIR/load_tests/results" -SERVER_PID="" - -# Colors -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -CYAN='\033[0;36m' -BOLD='\033[1m' -NC='\033[0m' - -usage() { - grep '^#' "$0" | grep -v '#!/' | cut -c3- - exit 0 -} - -log_header() { - echo "" - echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════════${NC}" - echo -e "${BOLD}${CYAN} $1${NC}" - echo -e "${BOLD}${CYAN}════════════════════════════════════════════════════════════${NC}" - echo "" -} - -log_info() { - echo -e "${BLUE}[INFO]${NC} $1" -} - -log_success() { - echo -e "${GREEN}[OK]${NC} $1" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $1" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $1" -} - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]]; then - log_info "Stopping server (PID: $SERVER_PID)..." - kill "$SERVER_PID" 2>/dev/null || true - wait "$SERVER_PID" 2>/dev/null || true - fi -} - -trap cleanup EXIT - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - --server-cmd) - SERVER_CMD="$2" - shift 2 - ;; - --server-url) - SERVER_URL="$2" - shift 2 - ;; - --skip-micro) - SKIP_MICRO=true - shift - ;; - --skip-load) - SKIP_LOAD=true - shift - ;; - --skip-websocket) - SKIP_WEBSOCKET=true - shift - ;; - --quick) - QUICK_MODE=true - shift - ;; - --compare-baseline) - COMPARE_BASELINE=true - shift - ;; - --save-baseline) - SAVE_BASELINE=true - shift - ;; - -o|--output) - OUTPUT_DIR="$2" - shift 2 - ;; - -h|--help) - usage - ;; - *) - log_error "Unknown option: $1" - usage - ;; - esac -done - -# Create output directory -TIMESTAMP=$(date +%Y%m%d_%H%M%S) -RUN_DIR="$OUTPUT_DIR/run_$TIMESTAMP" -mkdir -p "$RUN_DIR" - -log_header "PathCollab Benchmark Suite" - -echo "Configuration:" -echo " Project root: $PROJECT_ROOT" -echo " Server URL: $SERVER_URL" -echo " Output: $RUN_DIR" -echo " Quick mode: $QUICK_MODE" -echo " Skip micro: $SKIP_MICRO" -echo " Skip load: $SKIP_LOAD" -echo " Skip WebSocket: $SKIP_WEBSOCKET" -echo "" - -# Check if server is running, or start it -log_info "Checking server status..." -if curl -sf "$SERVER_URL/health" > /dev/null 2>&1; then - log_success "Server is already running at $SERVER_URL" -else - if [[ -n "$SERVER_CMD" ]]; then - log_info "Starting server with: $SERVER_CMD" - $SERVER_CMD & - SERVER_PID=$! 
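The EXIT trap above guarantees the spawned server is stopped even if a later phase fails. The same guarantee in Rust test-harness code is usually expressed as a drop guard; a rough sketch under that assumption (the type and function names here are hypothetical):

```rust
// Sketch: kill the spawned server process when the guard goes out of scope,
// mirroring the `trap cleanup EXIT` used in the script.
use std::process::{Child, Command};

struct ServerGuard(Child);

impl Drop for ServerGuard {
    fn drop(&mut self) {
        let _ = self.0.kill();
        let _ = self.0.wait();
    }
}

fn start_server(cmd: &str, args: &[&str]) -> std::io::Result<ServerGuard> {
    Command::new(cmd).args(args).spawn().map(ServerGuard)
}
```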
- - # Wait for server to be ready - for i in {1..30}; do - if curl -sf "$SERVER_URL/health" > /dev/null 2>&1; then - log_success "Server is ready" - break - fi - if [[ $i -eq 30 ]]; then - log_error "Server failed to start within 30 seconds" - exit 1 - fi - sleep 1 - done - else - log_error "Server not running at $SERVER_URL" - log_info "Either start the server manually or use --server-cmd" - exit 1 - fi -fi - -# Warmup -log_header "Warmup Phase" -log_info "Sending warmup requests..." -for i in {1..10}; do - curl -sf "$SERVER_URL/health" > /dev/null 2>&1 || true - curl -sf "$SERVER_URL/api/slides" > /dev/null 2>&1 || true -done -log_success "Warmup complete" - -# Track overall results -LOAD_PASSED=true -WS_PASSED=true - -# Phase 1: HTTP load tests -if [[ "$SKIP_LOAD" != "true" ]]; then - log_header "Phase 1: HTTP Load Tests" - - cd "$PROJECT_ROOT" - - if ! command -v oha &> /dev/null; then - log_warn "oha not installed, skipping HTTP load tests" - log_info "Install with: cargo install oha" - else - # Tile stress test - log_info "Running tile stress test..." - if [[ "$QUICK_MODE" == "true" ]]; then - bash "$BENCH_DIR/load_tests/scenarios/tile_stress.sh" \ - --url "$SERVER_URL" \ - --quick \ - --output "$RUN_DIR/tile_stress.json" 2>&1 | tee "$RUN_DIR/tile_stress.txt" || LOAD_PASSED=false - else - bash "$BENCH_DIR/load_tests/scenarios/tile_stress.sh" \ - --url "$SERVER_URL" \ - --concurrent 20 \ - --duration 30 \ - --output "$RUN_DIR/tile_stress.json" 2>&1 | tee "$RUN_DIR/tile_stress.txt" || LOAD_PASSED=false - fi - - # Overlay stress test - log_info "Running overlay stress test..." - if [[ "$QUICK_MODE" == "true" ]]; then - bash "$BENCH_DIR/load_tests/scenarios/overlay_stress.sh" \ - --url "$SERVER_URL" \ - --quick \ - --output "$RUN_DIR/overlay_stress.json" 2>&1 | tee "$RUN_DIR/overlay_stress.txt" || LOAD_PASSED=false - else - bash "$BENCH_DIR/load_tests/scenarios/overlay_stress.sh" \ - --url "$SERVER_URL" \ - --concurrent 20 \ - --duration 30 \ - --output "$RUN_DIR/overlay_stress.json" 2>&1 | tee "$RUN_DIR/overlay_stress.txt" || LOAD_PASSED=false - fi - - if [[ "$LOAD_PASSED" == "true" ]]; then - log_success "HTTP load tests complete" - else - log_warn "HTTP load tests had issues" - fi - fi -else - log_info "Skipping HTTP load tests (--skip-load)" -fi - -# Phase 2: WebSocket load tests -if [[ "$SKIP_WEBSOCKET" != "true" ]]; then - log_header "Phase 2: WebSocket Load Tests" - - cd "$PROJECT_ROOT/server" - - log_info "Running WebSocket load tests..." - if [[ "$QUICK_MODE" == "true" ]]; then - cargo test --test perf_tests test_fanout_minimal --release -- --ignored --nocapture 2>&1 | tee "$RUN_DIR/websocket_load.txt" || WS_PASSED=false - else - cargo test --test perf_tests test_fanout_standard --release -- --ignored --nocapture 2>&1 | tee "$RUN_DIR/websocket_load.txt" || WS_PASSED=false - fi - - if [[ "$WS_PASSED" == "true" ]]; then - log_success "WebSocket load tests complete" - else - log_warn "WebSocket load tests had issues" - fi -else - log_info "Skipping WebSocket load tests (--skip-websocket)" -fi - -# Phase 3: Collect metrics -log_header "Phase 3: Collecting Metrics" - -log_info "Fetching server metrics..." 
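The readiness loop above polls `/health` once per second for up to 30 attempts before giving up. A comparable sketch in Rust, assuming the `reqwest` and `tokio` crates (the function signature is illustrative):

```rust
use std::time::Duration;

/// Poll `{base_url}/health` once per second until it answers or we run out of attempts.
async fn wait_for_server(base_url: &str, attempts: u32) -> Result<(), String> {
    let client = reqwest::Client::new();
    for _ in 0..attempts {
        if let Ok(resp) = client.get(format!("{base_url}/health")).send().await {
            if resp.status().is_success() {
                return Ok(());
            }
        }
        tokio::time::sleep(Duration::from_secs(1)).await;
    }
    Err(format!("server did not become healthy after {attempts} attempts"))
}
```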
-curl -sf "$SERVER_URL/metrics" > "$RUN_DIR/server_metrics.json" 2>/dev/null || true -curl -sf "$SERVER_URL/metrics/prometheus" > "$RUN_DIR/prometheus_metrics.txt" 2>/dev/null || true -log_success "Metrics collected" - -# Phase 4: Generate report -log_header "Phase 4: Generating Report" - -python3 "$BENCH_DIR/scripts/generate_report.py" \ - --input-dir "$RUN_DIR" \ - --output "$RUN_DIR/REPORT.md" 2>&1 || log_warn "Report generation had issues" - -if [[ -f "$RUN_DIR/REPORT.md" ]]; then - log_success "Report generated: $RUN_DIR/REPORT.md" -fi - -# Phase 5: Baseline comparison (if requested) -if [[ "$COMPARE_BASELINE" == "true" ]] && [[ -f "$RUN_DIR/tile_stress.json" ]]; then - log_header "Phase 5: Baseline Comparison" - - BASELINE_FILE="$BENCH_DIR/baselines/tile_baseline.json" - - if [[ -f "$BASELINE_FILE" ]]; then - python3 "$BENCH_DIR/scripts/compare_baseline.py" \ - --current "$RUN_DIR/tile_stress.json" \ - --baseline "$BASELINE_FILE" \ - --threshold 10 2>&1 | tee "$RUN_DIR/baseline_comparison.txt" - - if [[ ${PIPESTATUS[0]} -ne 0 ]]; then - LOAD_PASSED=false - fi - else - log_warn "No baseline found at $BASELINE_FILE" - log_info "Create baseline with: --save-baseline" - fi -fi - -# Save baseline (if requested) -if [[ "$SAVE_BASELINE" == "true" ]] && [[ -f "$RUN_DIR/tile_stress.json" ]]; then - log_info "Saving new baseline..." - python3 "$BENCH_DIR/scripts/compare_baseline.py" \ - --save-baseline "$RUN_DIR/tile_stress.json" \ - --output "$BENCH_DIR/baselines/tile_baseline.json" \ - --description "Baseline from run $TIMESTAMP" -fi - -# Summary -log_header "Summary" - -echo "Results saved to: $RUN_DIR" -echo "" -echo "Test Results:" -echo " HTTP load tests: $([ "$LOAD_PASSED" == "true" ] && echo "✅ PASS" || echo "❌ FAIL")" -echo " WebSocket tests: $([ "$WS_PASSED" == "true" ] && echo "✅ PASS" || echo "⚠️ ISSUES")" -echo "" - -# Create symlink to latest run -ln -sfn "run_$TIMESTAMP" "$OUTPUT_DIR/latest" -echo "Latest results linked: $OUTPUT_DIR/latest" - -# Exit with appropriate code -if [[ "$LOAD_PASSED" == "true" ]] && [[ "$WS_PASSED" == "true" ]]; then - log_success "All benchmarks passed!" 
- exit 0 -else - log_error "Some benchmarks failed" - exit 1 -fi diff --git a/server/.benchmark-baseline.json b/server/.benchmark-baseline.json new file mode 100644 index 0000000..76086b7 --- /dev/null +++ b/server/.benchmark-baseline.json @@ -0,0 +1,12 @@ +{ + "SMOKE": { + "tier": "SMOKE", + "timestamp": "2026-01-22T11:21:45.573874+00:00", + "tile_p99_ms": 0.6844583333333333, + "overlay_p99_ms": 0.653153, + "cursor_p99_ms": null, + "viewport_p99_ms": null, + "error_rate_pct": 0.0, + "throughput": 105.66382458464953 + } +} \ No newline at end of file diff --git a/server/Cargo.toml b/server/Cargo.toml index b980bab..68dfcc7 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -60,3 +60,4 @@ prost-build = "0.13" [dev-dependencies] tokio-tungstenite = "0.26" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] } +chrono = { version = "0.4", features = ["serde"] } diff --git a/server/src/session/manager.rs b/server/src/session/manager.rs index 4b7617b..068f5a5 100644 --- a/server/src/session/manager.rs +++ b/server/src/session/manager.rs @@ -11,7 +11,7 @@ use metrics::{counter, histogram}; use std::collections::HashMap; use std::time::Instant; use thiserror::Error; -use tracing::{debug, error, info, warn}; +use tracing::{debug, info, warn}; use uuid::Uuid; /// Session manager errors diff --git a/server/src/slide/cache.rs b/server/src/slide/cache.rs index 96a342d..85b0275 100644 --- a/server/src/slide/cache.rs +++ b/server/src/slide/cache.rs @@ -121,7 +121,7 @@ impl SlideCache { // Probabilistic LRU update: only update every N accesses // This dramatically reduces write lock contention under load let count = self.access_counter.fetch_add(1, Ordering::Relaxed); - if count % LRU_UPDATE_FREQUENCY == 0 { + if count.is_multiple_of(LRU_UPDATE_FREQUENCY) { // Drop read lock before taking write lock drop(slides); // Update LRU order (best effort - may race but that's OK) diff --git a/server/src/slide/tile_cache.rs b/server/src/slide/tile_cache.rs index 60c6004..5c201d6 100644 --- a/server/src/slide/tile_cache.rs +++ b/server/src/slide/tile_cache.rs @@ -106,7 +106,7 @@ impl TileCache { counter!("pathcollab_tile_cache_hits_total").increment(1); // Update hit rate gauge periodically (every 100 hits) - if hits % 100 == 0 { + if hits.is_multiple_of(100) { self.update_hit_rate_gauge(); } } else { diff --git a/server/tests/load_tests/benchmark.rs b/server/tests/load_tests/benchmark.rs new file mode 100644 index 0000000..18c4f7a --- /dev/null +++ b/server/tests/load_tests/benchmark.rs @@ -0,0 +1,570 @@ +//! Benchmark runner with warm-up, multiple iterations, and baseline comparison +//! +//! Provides a production-grade benchmark system that: +//! - Runs a warm-up phase to prime caches and connection pools +//! - Executes multiple iterations for statistical significance +//! 
- Compares against stored baseline and detects regressions + +use super::BenchmarkTier; +use super::scenarios::{ComprehensiveStressConfig, ComprehensiveStressScenario}; +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; +use std::time::Duration; + +/// Configuration for benchmark runs +#[derive(Debug, Clone)] +pub struct BenchmarkRunConfig { + /// Benchmark tier + pub tier: BenchmarkTier, + /// Number of iterations to run (default: 3) + pub iterations: usize, + /// Warm-up duration before measuring (default: 3s for smoke, 5s for others) + pub warmup_duration: Duration, + /// Path to baseline file (default: .benchmark-baseline.json in project root) + pub baseline_path: PathBuf, + /// Regression threshold as percentage (default: 15%) + pub regression_threshold_pct: f64, +} + +impl BenchmarkRunConfig { + pub fn for_tier(tier: BenchmarkTier) -> Self { + let (iterations, warmup) = match tier { + BenchmarkTier::Smoke => (3, Duration::from_secs(2)), + BenchmarkTier::Standard => (3, Duration::from_secs(5)), + BenchmarkTier::Stress => (3, Duration::from_secs(5)), + }; + + Self { + tier, + iterations, + warmup_duration: warmup, + baseline_path: PathBuf::from(".benchmark-baseline.json"), + regression_threshold_pct: 15.0, + } + } +} + +/// Metrics extracted from a single benchmark run +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BenchmarkMetrics { + pub tile_p99_ms: Option, + pub overlay_p99_ms: Option, + pub cursor_p99_ms: Option, + pub viewport_p99_ms: Option, + pub error_rate: f64, + pub throughput: f64, +} + +impl BenchmarkMetrics { + /// Extract metrics from comprehensive stress results + pub fn from_results( + results: &super::scenarios::comprehensive::ComprehensiveStressResults, + ) -> Self { + let throughput = if results.duration.as_secs_f64() > 0.0 { + (results.ws_messages_sent + results.http_requests_sent) as f64 + / results.duration.as_secs_f64() + } else { + 0.0 + }; + + Self { + tile_p99_ms: results + .tile_latencies + .p99() + .map(|d| d.as_secs_f64() * 1000.0), + overlay_p99_ms: results + .overlay_latencies + .p99() + .map(|d| d.as_secs_f64() * 1000.0), + cursor_p99_ms: results + .cursor_latencies + .p99() + .map(|d| d.as_secs_f64() * 1000.0), + viewport_p99_ms: results + .viewport_latencies + .p99() + .map(|d| d.as_secs_f64() * 1000.0), + error_rate: results.error_rate(), + throughput, + } + } +} + +/// Statistical summary of a metric across iterations +#[derive(Debug, Clone)] +pub struct MetricStats { + pub mean: f64, + pub stddev: f64, +} + +impl MetricStats { + pub fn from_samples(samples: &[f64]) -> Option { + if samples.is_empty() { + return None; + } + + let n = samples.len() as f64; + let mean = samples.iter().sum::() / n; + + let variance = if samples.len() > 1 { + samples.iter().map(|x| (x - mean).powi(2)).sum::() / (n - 1.0) + } else { + 0.0 + }; + let stddev = variance.sqrt(); + + Some(Self { mean, stddev }) + } + + /// Format as "mean ± stddev" + pub fn format(&self) -> String { + if self.stddev < 0.1 { + format!("{:.1}ms", self.mean) + } else { + format!("{:.1}ms ± {:.1}ms", self.mean, self.stddev) + } + } +} + +/// Aggregated results from multiple benchmark iterations +#[derive(Debug)] +pub struct BenchmarkReport { + pub tier: BenchmarkTier, + pub iterations: usize, + pub warmup_duration: Duration, + pub tile_p99: Option, + pub overlay_p99: Option, + pub cursor_p99: Option, + pub viewport_p99: Option, + pub error_rate: MetricStats, + pub throughput: MetricStats, + pub all_passed: bool, +} + +impl BenchmarkReport { + /// Aggregate metrics from 
multiple runs + pub fn from_metrics( + tier: BenchmarkTier, + warmup_duration: Duration, + metrics: Vec, + all_passed: bool, + ) -> Self { + let iterations = metrics.len(); + + let tile_samples: Vec = metrics.iter().filter_map(|m| m.tile_p99_ms).collect(); + let overlay_samples: Vec = metrics.iter().filter_map(|m| m.overlay_p99_ms).collect(); + let cursor_samples: Vec = metrics.iter().filter_map(|m| m.cursor_p99_ms).collect(); + let viewport_samples: Vec = metrics.iter().filter_map(|m| m.viewport_p99_ms).collect(); + let error_samples: Vec = metrics.iter().map(|m| m.error_rate * 100.0).collect(); + let throughput_samples: Vec = metrics.iter().map(|m| m.throughput).collect(); + + Self { + tier, + iterations, + warmup_duration, + tile_p99: MetricStats::from_samples(&tile_samples), + overlay_p99: MetricStats::from_samples(&overlay_samples), + cursor_p99: MetricStats::from_samples(&cursor_samples), + viewport_p99: MetricStats::from_samples(&viewport_samples), + error_rate: MetricStats::from_samples(&error_samples).unwrap(), + throughput: MetricStats::from_samples(&throughput_samples).unwrap(), + all_passed, + } + } + + /// Convert to baseline format for storage + pub fn to_baseline(&self) -> Baseline { + Baseline { + tier: self.tier.name().to_string(), + timestamp: chrono::Utc::now().to_rfc3339(), + tile_p99_ms: self.tile_p99.as_ref().map(|s| s.mean), + overlay_p99_ms: self.overlay_p99.as_ref().map(|s| s.mean), + cursor_p99_ms: self.cursor_p99.as_ref().map(|s| s.mean), + viewport_p99_ms: self.viewport_p99.as_ref().map(|s| s.mean), + error_rate_pct: self.error_rate.mean, + throughput: self.throughput.mean, + } + } +} + +/// Stored baseline for comparison +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Baseline { + pub tier: String, + pub timestamp: String, + pub tile_p99_ms: Option, + pub overlay_p99_ms: Option, + pub cursor_p99_ms: Option, + pub viewport_p99_ms: Option, + pub error_rate_pct: f64, + pub throughput: f64, +} + +impl Baseline { + /// Load baseline from file + pub fn load(path: &PathBuf, tier: &str) -> Option { + let content = std::fs::read_to_string(path).ok()?; + let baselines: std::collections::HashMap = + serde_json::from_str(&content).ok()?; + baselines.get(tier).cloned() + } + + /// Save baseline to file (preserves other tiers) + pub fn save(&self, path: &PathBuf) -> std::io::Result<()> { + let mut baselines: std::collections::HashMap = + std::fs::read_to_string(path) + .ok() + .and_then(|c| serde_json::from_str(&c).ok()) + .unwrap_or_default(); + + baselines.insert(self.tier.clone(), self.clone()); + + let json = serde_json::to_string_pretty(&baselines)?; + std::fs::write(path, json) + } +} + +/// Comparison result between current run and baseline +#[derive(Debug)] +pub struct Comparison { + pub metric_name: &'static str, + pub current: Option, + pub baseline: Option, + pub change_pct: Option, + pub is_regression: bool, + pub higher_is_worse: bool, // true for latency/error, false for throughput +} + +impl Comparison { + fn new( + metric_name: &'static str, + current: Option, + baseline: Option, + threshold_pct: f64, + higher_is_worse: bool, + ) -> Self { + let change_pct = match (current, baseline) { + (Some(c), Some(b)) if b > 0.0 => Some((c - b) / b * 100.0), + _ => None, + }; + + let is_regression = change_pct + .map(|pct| { + if higher_is_worse { + pct > threshold_pct + } else { + pct < -threshold_pct + } + }) + .unwrap_or(false); + + Self { + metric_name, + current, + baseline, + change_pct, + is_regression, + higher_is_worse, + } + } + + fn 
format_value(&self, value: Option) -> String { + match value { + Some(v) => { + if self.metric_name.contains("P99") { + format!("{:.1}ms", v) + } else if self.metric_name == "Error Rate" { + format!("{:.2}%", v) + } else { + format!("{:.1}", v) + } + } + None => "N/A".to_string(), + } + } + + fn format_change(&self) -> String { + match self.change_pct { + Some(pct) => { + let sign = if pct >= 0.0 { "+" } else { "" }; + let status = if self.is_regression { + "[REGRESSION]" + } else if pct.abs() < 5.0 { + "[OK]" + } else if (self.higher_is_worse && pct < 0.0) + || (!self.higher_is_worse && pct > 0.0) + { + "[IMPROVED]" + } else { + "[WARNING]" + }; + format!("({}{:.1}%) {}", sign, pct, status) + } + None => "".to_string(), + } + } +} + +/// Benchmark runner that handles warm-up, iterations, and comparison +pub struct BenchmarkRunner { + config: BenchmarkRunConfig, +} + +impl BenchmarkRunner { + pub fn new(config: BenchmarkRunConfig) -> Self { + Self { config } + } + + /// Run the full benchmark with warm-up, iterations, and comparison + pub async fn run(&self) -> Result> { + let stress_config = ComprehensiveStressConfig::for_tier(self.config.tier); + + println!(); + println!("═══════════════════════════════════════════════════════════════"); + println!( + " BENCHMARK: {} ({} iterations)", + self.config.tier.name(), + self.config.iterations + ); + println!("═══════════════════════════════════════════════════════════════"); + + // Run warm-up phase + if self.config.warmup_duration > Duration::ZERO { + println!(); + println!( + " ─── Warm-up ({:.0}s) ───────────────────────────────────────────", + self.config.warmup_duration.as_secs_f64() + ); + + let warmup_config = ComprehensiveStressConfig { + duration: self.config.warmup_duration, + ..stress_config.clone() + }; + let warmup_scenario = ComprehensiveStressScenario::new(warmup_config); + let _ = warmup_scenario.run().await?; + println!(" Warm-up complete, starting measured iterations..."); + } + + // Run iterations + let mut metrics = Vec::new(); + let mut all_passed = true; + + for i in 0..self.config.iterations { + println!(); + println!( + " ─── Iteration {}/{} ─────────────────────────────────────────────", + i + 1, + self.config.iterations + ); + + let scenario = ComprehensiveStressScenario::new(stress_config.clone()); + let results = scenario.run().await?; + + let passed = results.meets_budgets(); + if !passed { + all_passed = false; + } + + let m = BenchmarkMetrics::from_results(&results); + println!( + " Tile P99: {:.1}ms | Error: {:.2}% | Throughput: {:.0} ops/s | {}", + m.tile_p99_ms.unwrap_or(0.0), + m.error_rate * 100.0, + m.throughput, + if passed { "PASS" } else { "FAIL" } + ); + + metrics.push(m); + } + + // Generate report + let report = BenchmarkReport::from_metrics( + self.config.tier, + self.config.warmup_duration, + metrics, + all_passed, + ); + + // Load baseline and compare + let baseline = Baseline::load(&self.config.baseline_path, self.config.tier.name()); + let comparisons = self.compare(&report, &baseline); + + // Print comparison + self.print_comparison(&report, &baseline, &comparisons); + + // Check for regressions + let has_regression = comparisons.iter().any(|c| c.is_regression); + + Ok(BenchmarkResult { + report, + has_regression, + all_passed, + }) + } + + fn compare(&self, report: &BenchmarkReport, baseline: &Option) -> Vec { + let threshold = self.config.regression_threshold_pct; + let baseline = baseline.as_ref(); + + vec![ + Comparison::new( + "Tile P99", + report.tile_p99.as_ref().map(|s| s.mean), + 
baseline.and_then(|b| b.tile_p99_ms), + threshold, + true, + ), + Comparison::new( + "Overlay P99", + report.overlay_p99.as_ref().map(|s| s.mean), + baseline.and_then(|b| b.overlay_p99_ms), + threshold, + true, + ), + Comparison::new( + "Error Rate", + Some(report.error_rate.mean), + baseline.map(|b| b.error_rate_pct), + threshold, + true, + ), + Comparison::new( + "Throughput", + Some(report.throughput.mean), + baseline.map(|b| b.throughput), + threshold, + false, + ), + ] + } + + #[allow(clippy::print_literal)] + fn print_comparison( + &self, + report: &BenchmarkReport, + baseline: &Option, + comparisons: &[Comparison], + ) { + println!(); + println!("═══════════════════════════════════════════════════════════════"); + println!( + " RESULTS: {} ({} iterations, {:.0}s warm-up)", + self.config.tier.name(), + report.iterations, + report.warmup_duration.as_secs_f64() + ); + println!("═══════════════════════════════════════════════════════════════"); + println!(); + + if baseline.is_some() { + println!(" ─── Comparison vs Baseline ──────────────────────────────────"); + println!(); + println!( + " {:12} {:>14} {:>14} {}", + "Metric", "Current", "Baseline", "Change" + ); + println!( + " {:12} {:>14} {:>14} {}", + "──────", "───────", "────────", "──────" + ); + + for c in comparisons { + if c.current.is_some() || c.baseline.is_some() { + println!( + " {:12} {:>14} {:>14} {}", + c.metric_name, + c.format_value(c.current), + c.format_value(c.baseline), + c.format_change() + ); + } + } + } else { + println!(" ─── Results (no baseline) ───────────────────────────────────"); + println!(); + if let Some(ref stats) = report.tile_p99 { + println!(" Tile P99: {}", stats.format()); + } + if let Some(ref stats) = report.overlay_p99 { + println!(" Overlay P99: {}", stats.format()); + } + println!( + " Error Rate: {:.2}% ± {:.2}%", + report.error_rate.mean, report.error_rate.stddev + ); + println!( + " Throughput: {:.0} ± {:.0} ops/s", + report.throughput.mean, report.throughput.stddev + ); + println!(); + println!(" (Run again to establish baseline, or use --save-baseline)"); + } + + println!(); + println!("═══════════════════════════════════════════════════════════════"); + + let has_regression = comparisons.iter().any(|c| c.is_regression); + let overall = if !report.all_passed { + "FAIL (budget exceeded)" + } else if has_regression { + "FAIL (regression detected)" + } else { + "PASS" + }; + println!(" OVERALL: {}", overall); + println!("═══════════════════════════════════════════════════════════════"); + println!(); + } + + /// Save current results as the new baseline + pub fn save_baseline(&self, report: &BenchmarkReport) -> std::io::Result<()> { + let baseline = report.to_baseline(); + baseline.save(&self.config.baseline_path)?; + println!( + "Baseline saved to {:?} for tier {}", + self.config.baseline_path, + self.config.tier.name() + ); + Ok(()) + } +} + +/// Full benchmark result +pub struct BenchmarkResult { + pub report: BenchmarkReport, + pub has_regression: bool, + pub all_passed: bool, +} + +impl BenchmarkResult { + /// Returns true if benchmark passed (no budget violations and no regressions) + pub fn passed(&self) -> bool { + self.all_passed && !self.has_regression + } + + /// Generate JSON output for CI + pub fn to_json(&self) -> String { + let tile_p99 = self.report.tile_p99.as_ref().map(|s| s.mean); + let overlay_p99 = self.report.overlay_p99.as_ref().map(|s| s.mean); + + let tile_str = tile_p99 + .map(|v| format!("{:.2}", v)) + .unwrap_or_else(|| "null".to_string()); + let overlay_str = 
overlay_p99 + .map(|v| format!("{:.2}", v)) + .unwrap_or_else(|| "null".to_string()); + + format!( + r#"{{"passed":{},"tier":"{}","iterations":{},"warmup_secs":{:.0},"tile_p99_ms":{},"overlay_p99_ms":{},"error_rate_pct":{:.2},"throughput":{:.1},"has_regression":{}}}"#, + self.passed(), + self.report.tier.name(), + self.report.iterations, + self.report.warmup_duration.as_secs_f64(), + tile_str, + overlay_str, + self.report.error_rate.mean, + self.report.throughput.mean, + self.has_regression + ) + } +} diff --git a/server/tests/load_tests/client.rs b/server/tests/load_tests/client.rs index e53f02d..388b78e 100644 --- a/server/tests/load_tests/client.rs +++ b/server/tests/load_tests/client.rs @@ -7,11 +7,9 @@ use futures_util::{SinkExt, StreamExt}; use serde::{Deserialize, Serialize}; -use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::{Duration, Instant}; +use std::time::Duration; use tokio::net::TcpStream; -use tokio::sync::mpsc; use tokio_tungstenite::{MaybeTlsStream, WebSocketStream, connect_async, tungstenite::Message}; /// Client message types (mirror of server protocol) @@ -89,8 +87,6 @@ pub enum ServerMessage { pub struct LoadTestClient { ws: WebSocketStream>, seq: AtomicU64, - /// Timestamps of sent messages for latency calculation - pending_acks: Arc>>, /// Session info after join/create pub session_id: Option, pub join_secret: Option, @@ -104,7 +100,6 @@ impl LoadTestClient { Ok(Self { ws, seq: AtomicU64::new(1), - pending_acks: Arc::new(tokio::sync::RwLock::new(std::collections::HashMap::new())), session_id: None, join_secret: None, presenter_key: None, @@ -116,7 +111,7 @@ impl LoadTestClient { self.seq.fetch_add(1, Ordering::SeqCst) } - /// Send a message and track for latency measurement + /// Send a message and return the sequence number pub async fn send( &mut self, msg: ClientMessage, @@ -130,12 +125,6 @@ impl LoadTestClient { ClientMessage::Ping { seq } => *seq, }; - // Track send time for latency calculation - { - let mut pending = self.pending_acks.write().await; - pending.insert(seq, Instant::now()); - } - let json = serde_json::to_string(&msg)?; self.ws.send(Message::Text(json.into())).await?; Ok(seq) @@ -274,84 +263,8 @@ impl LoadTestClient { } } -/// Spawn a client that sends updates at specified rates -pub async fn spawn_update_client( - mut client: LoadTestClient, - cursor_hz: u32, - viewport_hz: u32, - duration: Duration, - results_tx: mpsc::Sender, -) { - let cursor_interval = if cursor_hz > 0 { - Duration::from_secs_f64(1.0 / cursor_hz as f64) - } else { - Duration::from_secs(3600) // Effectively disabled - }; - - let viewport_interval = if viewport_hz > 0 { - Duration::from_secs_f64(1.0 / viewport_hz as f64) - } else { - Duration::from_secs(3600) - }; - - let start = Instant::now(); - let mut cursor_ticker = tokio::time::interval(cursor_interval); - let mut viewport_ticker = tokio::time::interval(viewport_interval); - let mut x = 0.5f64; - let mut y = 0.5f64; - - loop { - if start.elapsed() >= duration { - break; - } - - tokio::select! 
{ - _ = cursor_ticker.tick() => { - // Simulate cursor movement - x = (x + 0.001).min(1.0); - y = (y + 0.001).min(1.0); - if x >= 1.0 { x = 0.0; } - if y >= 1.0 { y = 0.0; } - - match client.send_cursor(x, y).await { - Ok(_) => { - let _ = results_tx.send(ClientEvent::MessageSent).await; - } - Err(_) => { - let _ = results_tx.send(ClientEvent::Error).await; - } - } - } - _ = viewport_ticker.tick() => { - match client.send_viewport(0.5, 0.5, 1.0).await { - Ok(_) => { - let _ = results_tx.send(ClientEvent::MessageSent).await; - } - Err(_) => { - let _ = results_tx.send(ClientEvent::Error).await; - } - } - } - } - } - - let _ = client.close().await; -} - -/// Events from client tasks -#[derive(Debug)] -pub enum ClientEvent { - MessageSent, - MessageReceived { - latency: Option, - msg_type: &'static str, - }, - Error, -} - /// Slide info returned from the API #[derive(Debug, Clone, Deserialize)] -#[allow(dead_code)] pub struct SlideInfo { pub id: String, pub name: String, diff --git a/server/tests/load_tests/mod.rs b/server/tests/load_tests/mod.rs index f8942a3..80305c7 100644 --- a/server/tests/load_tests/mod.rs +++ b/server/tests/load_tests/mod.rs @@ -1,59 +1,49 @@ //! Load testing module for PathCollab //! -//! This module provides load testing infrastructure to validate -//! that PathCollab can handle activity spikes with 20 followers -//! per session at 30Hz cursor + 10Hz viewport updates. +//! Provides a unified benchmark system with three tiers: +//! - **Smoke**: Quick CI validation on every push (<30s) +//! - **Standard**: PR merge gate (~2min) +//! - **Stress**: Manual/release testing (~5min) +//! +//! ## Running Benchmarks +//! +//! ```bash +//! # Smoke test (CI) +//! cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture +//! +//! # Standard test (PR gate) +//! cargo test --test perf_tests bench_standard --release -- --ignored --nocapture +//! +//! # Stress test (release) +//! cargo test --test perf_tests bench_stress --release -- --ignored --nocapture +//! 
``` #![allow(clippy::collapsible_if)] +pub mod benchmark; pub mod client; pub mod scenarios; use std::time::Duration; -/// Performance budget thresholds -pub mod budgets { - use std::time::Duration; - - /// Maximum acceptable P99 cursor broadcast latency - pub const CURSOR_P99_MAX: Duration = Duration::from_millis(100); - - /// Maximum acceptable P99 viewport broadcast latency - pub const VIEWPORT_P99_MAX: Duration = Duration::from_millis(150); - - /// Maximum acceptable message handling time - pub const MESSAGE_HANDLING_MAX: Duration = Duration::from_millis(10); -} - -/// Load test configuration -#[derive(Debug, Clone)] -pub struct LoadTestConfig { - /// Number of sessions to create - pub num_sessions: usize, - /// Number of followers per session - pub followers_per_session: usize, - /// Cursor update rate (Hz) - pub cursor_hz: u32, - /// Viewport update rate (Hz) - pub viewport_hz: u32, - /// Test duration - pub duration: Duration, - /// Server WebSocket URL - pub ws_url: String, - /// Server HTTP base URL (for fetching slide info) - pub http_url: String, +/// Benchmark tier for different testing scenarios +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BenchmarkTier { + /// Quick CI validation: 5 sessions, 10 users, 10s + Smoke, + /// PR merge gate: 25 sessions, 50 users, 30s + Standard, + /// Manual/release testing: 100 sessions, 200 users, 60s + Stress, } -impl Default for LoadTestConfig { - fn default() -> Self { - Self { - num_sessions: 5, - followers_per_session: 20, - cursor_hz: 30, - viewport_hz: 10, - duration: Duration::from_secs(60), - ws_url: "ws://127.0.0.1:8080/ws".to_string(), - http_url: "http://127.0.0.1:8080".to_string(), +impl BenchmarkTier { + /// Get the tier name for display + pub fn name(&self) -> &'static str { + match self { + BenchmarkTier::Smoke => "SMOKE", + BenchmarkTier::Standard => "STANDARD", + BenchmarkTier::Stress => "STRESS", } } } @@ -76,7 +66,7 @@ impl LatencyStats { } /// Calculate percentile (0-100) - pub fn percentile(&self, p: f64) -> Option { + fn percentile(&self, p: f64) -> Option { if self.samples.is_empty() { return None; } @@ -88,141 +78,8 @@ impl LatencyStats { Some(sorted[idx.min(sorted.len() - 1)]) } - /// Calculate P50 (median) - pub fn p50(&self) -> Option { - self.percentile(50.0) - } - - /// Calculate P95 - pub fn p95(&self) -> Option { - self.percentile(95.0) - } - /// Calculate P99 pub fn p99(&self) -> Option { self.percentile(99.0) } } - -/// Load test results -#[derive(Debug)] -pub struct LoadTestResults { - /// Cursor broadcast latencies - pub cursor_latencies: LatencyStats, - /// Viewport broadcast latencies - pub viewport_latencies: LatencyStats, - /// Message handling latencies - pub message_latencies: LatencyStats, - /// Total messages sent - pub messages_sent: u64, - /// Total messages received - pub messages_received: u64, - /// Connection errors - pub connection_errors: u64, - /// Test duration - pub duration: Duration, -} - -impl LoadTestResults { - pub fn new() -> Self { - Self { - cursor_latencies: LatencyStats::new(), - viewport_latencies: LatencyStats::new(), - message_latencies: LatencyStats::new(), - messages_sent: 0, - messages_received: 0, - connection_errors: 0, - duration: Duration::ZERO, - } - } - - /// Check if results meet performance budgets - pub fn meets_budgets(&self) -> bool { - let cursor_ok = self - .cursor_latencies - .p99() - .map(|p| p <= budgets::CURSOR_P99_MAX) - .unwrap_or(true); - - let viewport_ok = self - .viewport_latencies - .p99() - .map(|p| p <= budgets::VIEWPORT_P99_MAX) - 
.unwrap_or(true); - - let message_ok = self - .message_latencies - .p99() - .map(|p| p <= budgets::MESSAGE_HANDLING_MAX) - .unwrap_or(true); - - cursor_ok && viewport_ok && message_ok - } - - /// Generate a summary report - pub fn report(&self) -> String { - let mut report = String::new(); - report.push_str("=== Load Test Results ===\n\n"); - - report.push_str(&format!("Duration: {:.2}s\n", self.duration.as_secs_f64())); - report.push_str(&format!("Messages sent: {}\n", self.messages_sent)); - report.push_str(&format!("Messages received: {}\n", self.messages_received)); - report.push_str(&format!( - "Connection errors: {}\n\n", - self.connection_errors - )); - - report.push_str("Cursor Latencies:\n"); - if let Some(p50) = self.cursor_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.cursor_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.cursor_latencies.p99() { - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budgets::CURSOR_P99_MAX, - if p99 <= budgets::CURSOR_P99_MAX { - "OK" - } else { - "EXCEEDED" - } - )); - } - - report.push_str("\nViewport Latencies:\n"); - if let Some(p50) = self.viewport_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.viewport_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.viewport_latencies.p99() { - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budgets::VIEWPORT_P99_MAX, - if p99 <= budgets::VIEWPORT_P99_MAX { - "OK" - } else { - "EXCEEDED" - } - )); - } - - report.push_str(&format!( - "\nOverall: {}\n", - if self.meets_budgets() { "PASS" } else { "FAIL" } - )); - - report - } -} - -impl Default for LoadTestResults { - fn default() -> Self { - Self::new() - } -} diff --git a/server/tests/load_tests/scenarios/comprehensive.rs b/server/tests/load_tests/scenarios/comprehensive.rs index dd808c1..01c06b2 100644 --- a/server/tests/load_tests/scenarios/comprehensive.rs +++ b/server/tests/load_tests/scenarios/comprehensive.rs @@ -1,15 +1,24 @@ //! Comprehensive stress test scenario //! -//! Simulates 1000 concurrent users (500 sessions × 2 users each) hitting all server routes: +//! Simulates concurrent users hitting all server routes: //! - WebSocket sessions with cursor/viewport updates //! - HTTP tile requests //! - HTTP overlay requests (cell and tissue) //! - Metadata endpoints //! //! This tests the server's ability to handle realistic production-like load. +//! +//! ## Benchmark Tiers +//! +//! | Tier | Sessions | Users | Duration | +//! |----------|----------|-------|----------| +//! | Smoke | 5 | 10 | 10s | +//! | Standard | 25 | 50 | 30s | +//! 
| Stress | 100 | 200 | 60s | #![allow(clippy::collapsible_if)] +use super::super::BenchmarkTier; use super::super::LatencyStats; use super::super::client::{LoadTestClient, ServerMessage, fetch_first_slide}; use reqwest::Client; @@ -54,6 +63,41 @@ impl Default for ComprehensiveStressConfig { } } +impl ComprehensiveStressConfig { + /// Create configuration for a specific benchmark tier + pub fn for_tier(tier: BenchmarkTier) -> Self { + match tier { + BenchmarkTier::Smoke => Self { + num_sessions: 5, // 10 users + duration: Duration::from_secs(10), + cursor_hz: 10, + viewport_hz: 5, + tile_request_hz: 2, + overlay_request_hz: 1, + ..Default::default() + }, + BenchmarkTier::Standard => Self { + num_sessions: 25, // 50 users + duration: Duration::from_secs(30), + cursor_hz: 30, + viewport_hz: 10, + tile_request_hz: 5, + overlay_request_hz: 2, + ..Default::default() + }, + BenchmarkTier::Stress => Self { + num_sessions: 100, // 200 users + duration: Duration::from_secs(60), + cursor_hz: 30, + viewport_hz: 10, + tile_request_hz: 5, + overlay_request_hz: 2, + ..Default::default() + }, + } + } +} + /// Extended results for comprehensive stress test #[derive(Debug)] pub struct ComprehensiveStressResults { @@ -81,6 +125,22 @@ pub struct ComprehensiveStressResults { pub duration: Duration, } +/// Performance budgets for benchmarks +pub mod budgets { + use std::time::Duration; + + /// Maximum acceptable P99 cursor broadcast latency + pub const CURSOR_P99_MAX: Duration = Duration::from_millis(100); + /// Maximum acceptable P99 viewport broadcast latency + pub const VIEWPORT_P99_MAX: Duration = Duration::from_millis(150); + /// Maximum acceptable P99 tile serving latency + pub const TILE_P99_MAX: Duration = Duration::from_millis(500); + /// Maximum acceptable P99 overlay latency + pub const OVERLAY_P99_MAX: Duration = Duration::from_millis(1000); + /// Maximum acceptable error rate + pub const ERROR_RATE_MAX: f64 = 0.01; // 1% +} + impl ComprehensiveStressResults { pub fn new() -> Self { Self { @@ -100,176 +160,60 @@ impl ComprehensiveStressResults { } } + /// Calculate error rate as a fraction (0.0 to 1.0) + pub fn error_rate(&self) -> f64 { + let total_requests = self.http_requests_sent + self.ws_messages_sent; + let total_errors = self.http_requests_failed + self.ws_connection_errors; + if total_requests > 0 { + total_errors as f64 / total_requests as f64 + } else { + 0.0 + } + } + + /// Minimum samples required to consider a latency measurement valid + const MIN_LATENCY_SAMPLES: usize = 10; + /// Check if results meet performance budgets pub fn meets_budgets(&self) -> bool { // WebSocket latency budgets + // Note: The server doesn't send Acks for cursor/viewport updates (fire-and-forget + // for performance), so latency samples may be empty. That's OK - we check if + // we have samples, and only fail if samples exceed budget. 
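+        // Example: in a smoke run where no Acks come back, cursor_latencies has zero
+        // samples, p99() returns None, and unwrap_or(true) lets the check pass rather
+        // than failing the whole benchmark on missing data.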
let cursor_ok = self .cursor_latencies .p99() - .map(|p| p <= Duration::from_millis(100)) - .unwrap_or(true); + .map(|p| p <= budgets::CURSOR_P99_MAX) + .unwrap_or(true); // OK if no samples (server doesn't Ack cursor updates) let viewport_ok = self .viewport_latencies .p99() - .map(|p| p <= Duration::from_millis(150)) - .unwrap_or(true); - - // HTTP latency budgets - let tile_ok = self - .tile_latencies - .p99() - .map(|p| p <= Duration::from_millis(500)) - .unwrap_or(true); + .map(|p| p <= budgets::VIEWPORT_P99_MAX) + .unwrap_or(true); // OK if no samples (server doesn't Ack viewport updates) + + // HTTP latency budgets - require samples if we had successful requests + let tile_ok = if self.http_requests_success > 0 { + self.tile_latencies + .p99() + .map(|p| p <= budgets::TILE_P99_MAX) + .unwrap_or_else(|| self.tile_latencies.samples.len() >= Self::MIN_LATENCY_SAMPLES) + } else { + true + }; + // Overlay is optional - many test setups don't have overlay data let overlay_ok = self .overlay_latencies .p99() - .map(|p| p <= Duration::from_millis(1000)) - .unwrap_or(true); + .map(|p| p <= budgets::OVERLAY_P99_MAX) + .unwrap_or(true); // OK if no overlay data - // Error rate budget: < 1% - let total_requests = self.http_requests_sent + self.ws_messages_sent; - let total_errors = self.http_requests_failed + self.ws_connection_errors; - let error_rate_ok = if total_requests > 0 { - (total_errors as f64 / total_requests as f64) < 0.01 - } else { - true - }; + // Error rate budget + let error_rate_ok = self.error_rate() < budgets::ERROR_RATE_MAX; cursor_ok && viewport_ok && tile_ok && overlay_ok && error_rate_ok } - - /// Generate a summary report - pub fn report(&self) -> String { - let mut report = String::new(); - report.push_str("=== Comprehensive Stress Test Results ===\n\n"); - - report.push_str(&format!("Duration: {:.2}s\n", self.duration.as_secs_f64())); - report.push_str(&format!( - "Total users: {} (sessions: {}, joined: {})\n", - self.sessions_created + self.sessions_joined, - self.sessions_created, - self.sessions_joined - )); - - report.push_str("\n--- WebSocket Stats ---\n"); - report.push_str(&format!("Messages sent: {}\n", self.ws_messages_sent)); - report.push_str(&format!( - "Messages received: {}\n", - self.ws_messages_received - )); - report.push_str(&format!( - "Connection errors: {}\n", - self.ws_connection_errors - )); - - let ws_throughput = self.ws_messages_sent as f64 / self.duration.as_secs_f64(); - report.push_str(&format!("WS throughput: {:.1} msg/s\n", ws_throughput)); - - report.push_str("\n--- HTTP Stats ---\n"); - report.push_str(&format!("Requests sent: {}\n", self.http_requests_sent)); - report.push_str(&format!( - "Requests success: {}\n", - self.http_requests_success - )); - report.push_str(&format!("Requests failed: {}\n", self.http_requests_failed)); - - let http_throughput = self.http_requests_sent as f64 / self.duration.as_secs_f64(); - report.push_str(&format!("HTTP throughput: {:.1} req/s\n", http_throughput)); - - let total_throughput = ws_throughput + http_throughput; - report.push_str(&format!( - "\nTotal throughput: {:.1} ops/s\n", - total_throughput - )); - - report.push_str("\n--- Latencies ---\n"); - - report.push_str("\nCursor (WS) Latencies:\n"); - if let Some(p50) = self.cursor_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.cursor_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.cursor_latencies.p99() { - let budget = Duration::from_millis(100); 
- report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budget, - if p99 <= budget { "OK" } else { "EXCEEDED" } - )); - } - - report.push_str("\nViewport (WS) Latencies:\n"); - if let Some(p50) = self.viewport_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.viewport_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.viewport_latencies.p99() { - let budget = Duration::from_millis(150); - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budget, - if p99 <= budget { "OK" } else { "EXCEEDED" } - )); - } - - report.push_str("\nTile (HTTP) Latencies:\n"); - if let Some(p50) = self.tile_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.tile_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.tile_latencies.p99() { - let budget = Duration::from_millis(500); - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budget, - if p99 <= budget { "OK" } else { "EXCEEDED" } - )); - } - - report.push_str("\nOverlay (HTTP) Latencies:\n"); - if let Some(p50) = self.overlay_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.overlay_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.overlay_latencies.p99() { - let budget = Duration::from_millis(1000); - report.push_str(&format!( - " P99: {:?} (budget: {:?}) {}\n", - p99, - budget, - if p99 <= budget { "OK" } else { "EXCEEDED" } - )); - } - - let error_rate = if self.http_requests_sent + self.ws_messages_sent > 0 { - (self.http_requests_failed + self.ws_connection_errors) as f64 - / (self.http_requests_sent + self.ws_messages_sent) as f64 - * 100.0 - } else { - 0.0 - }; - report.push_str(&format!("\nError rate: {:.3}% (budget: <1%)\n", error_rate)); - - report.push_str(&format!( - "\nOverall: {}\n", - if self.meets_budgets() { "PASS" } else { "FAIL" } - )); - - report - } } impl Default for ComprehensiveStressResults { @@ -280,15 +224,11 @@ impl Default for ComprehensiveStressResults { /// Event types for comprehensive test #[derive(Debug)] -#[allow(dead_code)] pub enum ComprehensiveEvent { - WsMessageSent, - WsMessageReceived { msg_type: &'static str }, - WsError, + WsCursorAck { latency: Duration }, + WsViewportAck { latency: Duration }, HttpTileRequest { latency: Duration, success: bool }, HttpOverlayRequest { latency: Duration, success: bool }, - SessionCreated, - SessionJoined, } /// Comprehensive stress test scenario @@ -378,6 +318,8 @@ impl ComprehensiveStressScenario { true, // is_presenter http_client.clone(), slide.id.clone(), + slide.width, + slide.height, tx.clone(), ws_sent.clone(), ws_recv.clone(), @@ -411,6 +353,8 @@ impl ComprehensiveStressScenario { false, // is_presenter http_client.clone(), slide.id.clone(), + slide.width, + slide.height, tx.clone(), ws_sent.clone(), ws_recv.clone(), @@ -431,6 +375,8 @@ impl ComprehensiveStressScenario { drop(tx); // Collect events + let mut cursor_latencies = LatencyStats::new(); + let mut viewport_latencies = LatencyStats::new(); let mut tile_latencies = LatencyStats::new(); let mut overlay_latencies = LatencyStats::new(); @@ -440,6 +386,12 @@ impl ComprehensiveStressScenario { while collect_start.elapsed() < collect_duration { match tokio::time::timeout(Duration::from_millis(100), rx.recv()).await { Ok(Some(event)) => match event { + ComprehensiveEvent::WsCursorAck { 
latency } => {
+                        cursor_latencies.record(latency);
+                    }
+                    ComprehensiveEvent::WsViewportAck { latency } => {
+                        viewport_latencies.record(latency);
+                    }
                     ComprehensiveEvent::HttpTileRequest {
                         latency,
                         success: true,
@@ -474,6 +426,8 @@ impl ComprehensiveStressScenario {
         results.http_requests_failed = http_failed.load(Ordering::SeqCst);
         results.sessions_created = sessions_created.load(Ordering::SeqCst);
         results.sessions_joined = sessions_joined.load(Ordering::SeqCst);
+        results.cursor_latencies = cursor_latencies;
+        results.viewport_latencies = viewport_latencies;
         results.tile_latencies = tile_latencies;
         results.overlay_latencies = overlay_latencies;
         results.duration = start.elapsed();
@@ -482,12 +436,15 @@ impl ComprehensiveStressScenario {
     }
 
     /// Spawn a user task that does both WebSocket and HTTP operations
+    #[allow(clippy::too_many_arguments)]
     fn spawn_user_task(
         &self,
         mut client: LoadTestClient,
         is_presenter: bool,
         http_client: Client,
         slide_id: String,
+        slide_width: u64,
+        slide_height: u64,
         tx: mpsc::Sender<ComprehensiveEvent>,
         ws_sent: Arc<AtomicU64>,
         ws_recv: Arc<AtomicU64>,
@@ -511,6 +468,19 @@ impl ComprehensiveStressScenario {
         let overlay_hz = self.config.overlay_request_hz;
         let http_url = self.config.http_url.clone();
 
+        // Calculate valid tile range based on slide dimensions
+        // DZI convention: max_level = ceil(log2(max(width, height)))
+        // At level N, dimensions are width/2^(max_level-N) x height/2^(max_level-N)
+        let tile_size = 256u64;
+        let max_level = (slide_width.max(slide_height) as f64).log2().ceil() as u32;
+        // Use a level 3 below max to keep the tile grid manageable for testing
+        let test_level = max_level.saturating_sub(3);
+        let level_scale = 1u64 << (max_level - test_level);
+        let level_width = slide_width / level_scale.max(1);
+        let level_height = slide_height / level_scale.max(1);
+        let max_tile_x = level_width.div_ceil(tile_size).max(1) as u32;
+        let max_tile_y = level_height.div_ceil(tile_size).max(1) as u32;
+
         tokio::spawn(async move {
             let cursor_interval = if cursor_hz > 0 {
                 Duration::from_secs_f64(1.0 / cursor_hz as f64)
@@ -548,6 +518,11 @@ impl ComprehensiveStressScenario {
             let mut tile_x = 0u32;
             let mut tile_y = 0u32;
 
+            // Track pending operations for latency measurement
+            // Key: seq number, Value: (send_time, is_cursor)
+            let mut pending_ws: std::collections::HashMap<u64, (Instant, bool)> =
+                std::collections::HashMap::new();
+
             loop {
                 if start.elapsed() >= duration {
                     break;
                 }
@@ -561,45 +536,46 @@
                         if x >= 1.0 { x = 0.0; }
                         if y >= 1.0 { y = 0.0; }
 
-                        match client.send_cursor(x * 100000.0, y * 100000.0).await {
-                            Ok(_) => {
+                        let send_time = Instant::now();
+                        match client.send_cursor(x * slide_width as f64, y * slide_height as f64).await {
+                            Ok(seq) => {
                                 ws_sent.fetch_add(1, Ordering::SeqCst);
-                                let _ = tx.send(ComprehensiveEvent::WsMessageSent).await;
+                                pending_ws.insert(seq, (send_time, true)); // true = cursor
                             }
                             Err(_) => {
                                 ws_errors.fetch_add(1, Ordering::SeqCst);
-                                let _ = tx.send(ComprehensiveEvent::WsError).await;
                             }
                         }
                     }
 
                     // Presenter sends viewport updates
                     _ = viewport_ticker.tick(), if is_presenter => {
+                        let send_time = Instant::now();
                         match client.send_viewport(0.5, 0.5, 1.0).await {
-                            Ok(_) => {
+                            Ok(seq) => {
                                 ws_sent.fetch_add(1, Ordering::SeqCst);
-                                let _ = tx.send(ComprehensiveEvent::WsMessageSent).await;
+                                pending_ws.insert(seq, (send_time, false)); // false = viewport
                             }
                             Err(_) => {
                                 ws_errors.fetch_add(1, Ordering::SeqCst);
-                                let _ = tx.send(ComprehensiveEvent::WsError).await;
                             }
                         }
                     }
 
-                    // Both users request tiles
+                    // Both users request tiles - use valid coordinates
                     _ =
tile_ticker.tick() => { http_sent.fetch_add(1, Ordering::SeqCst); - let level = 5; let url = format!( "{}/api/slide/{}/tile/{}/{}/{}", - http_url, slide_id, level, tile_x, tile_y + http_url, slide_id, test_level, tile_x % max_tile_x, tile_y % max_tile_y ); let req_start = Instant::now(); match http_client.get(&url).send().await { Ok(resp) => { let latency = req_start.elapsed(); + // 200 = success, 404 = tile doesn't exist but server responded correctly + // Both count as successful server responses for latency measurement if resp.status().is_success() || resp.status().as_u16() == 404 { http_success.fetch_add(1, Ordering::SeqCst); let _ = tx.send(ComprehensiveEvent::HttpTileRequest { @@ -608,6 +584,10 @@ impl ComprehensiveStressScenario { }).await; } else { http_failed.fetch_add(1, Ordering::SeqCst); + let _ = tx.send(ComprehensiveEvent::HttpTileRequest { + latency, + success: false, + }).await; } } Err(_) => { @@ -615,9 +595,9 @@ impl ComprehensiveStressScenario { } } - tile_x = (tile_x + 1) % 40; - if tile_x == 0 { - tile_y = (tile_y + 1) % 40; + tile_x = tile_x.wrapping_add(1); + if tile_x.is_multiple_of(max_tile_x) { + tile_y = tile_y.wrapping_add(1); } } @@ -626,16 +606,18 @@ impl ComprehensiveStressScenario { http_sent.fetch_add(1, Ordering::SeqCst); // Alternate between tissue tiles and cell queries - let is_tissue = tile_x % 2 == 0; + let is_tissue = tile_x.is_multiple_of(2); let url = if is_tissue { format!( "{}/api/slide/{}/overlay/tissue/{}/{}/{}", - http_url, slide_id, 3, tile_x % 20, tile_y % 20 + http_url, slide_id, test_level.saturating_sub(2), tile_x % max_tile_x, tile_y % max_tile_y ) } else { format!( "{}/api/slide/{}/overlay/cells?x={}&y={}&width=5000&height=5000", - http_url, slide_id, (tile_x as f64) * 1000.0, (tile_y as f64) * 1000.0 + http_url, slide_id, + ((tile_x % max_tile_x) as f64) * 256.0 * (level_scale as f64), + ((tile_y % max_tile_y) as f64) * 256.0 * (level_scale as f64) ) }; @@ -643,6 +625,7 @@ impl ComprehensiveStressScenario { match http_client.get(&url).send().await { Ok(resp) => { let latency = req_start.elapsed(); + // Overlays may legitimately 404 if no overlay data exists if resp.status().is_success() || resp.status().as_u16() == 404 { http_success.fetch_add(1, Ordering::SeqCst); let _ = tx.send(ComprehensiveEvent::HttpOverlayRequest { @@ -659,18 +642,23 @@ impl ComprehensiveStressScenario { } } - // Receive WebSocket messages (followers receive presence updates) + // Receive WebSocket messages - track Ack latencies _ = ws_recv_interval.tick() => { match client.recv_timeout(Duration::from_millis(10)).await { Ok(Some(msg)) => { ws_recv.fetch_add(1, Ordering::SeqCst); - let msg_type = match &msg { - ServerMessage::PresenceDelta { .. } => "presence", - ServerMessage::PresenterViewport { .. } => "viewport", - ServerMessage::Ack { .. } => "ack", - _ => "other", - }; - let _ = tx.send(ComprehensiveEvent::WsMessageReceived { msg_type }).await; + if let ServerMessage::Ack { ack_seq, status, .. 
} = &msg { + if status == "ok" { + if let Some((send_time, is_cursor)) = pending_ws.remove(ack_seq) { + let latency = send_time.elapsed(); + if is_cursor { + let _ = tx.send(ComprehensiveEvent::WsCursorAck { latency }).await; + } else { + let _ = tx.send(ComprehensiveEvent::WsViewportAck { latency }).await; + } + } + } + } } Ok(None) => {} Err(_) => { @@ -679,6 +667,9 @@ impl ComprehensiveStressScenario { } } } + + // Clean up old pending entries (older than 5 seconds - likely missed) + pending_ws.retain(|_, (time, _)| time.elapsed() < Duration::from_secs(5)); } let _ = client.close().await; @@ -686,54 +677,4 @@ impl ComprehensiveStressScenario { } } -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - #[ignore = "requires running server"] - async fn test_comprehensive_minimal() { - let config = ComprehensiveStressConfig { - num_sessions: 5, // 10 users - duration: Duration::from_secs(10), - cursor_hz: 10, - viewport_hz: 5, - tile_request_hz: 2, - overlay_request_hz: 1, - ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.ws_messages_sent > 0, "Should have sent WS messages"); - assert!( - results.http_requests_sent > 0, - "Should have sent HTTP requests" - ); - } - - #[tokio::test] - #[ignore = "requires running server - long running"] - async fn test_comprehensive_1000_users() { - let config = ComprehensiveStressConfig { - num_sessions: 500, // 1000 users - duration: Duration::from_secs(60), - cursor_hz: 30, - viewport_hz: 10, - tile_request_hz: 5, - overlay_request_hz: 2, - ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!( - results.meets_budgets(), - "Should meet performance budgets under 1000 user load" - ); - } -} +// Tests are in perf_tests.rs using the tier-based approach diff --git a/server/tests/load_tests/scenarios/fanout.rs b/server/tests/load_tests/scenarios/fanout.rs deleted file mode 100644 index 1b66597..0000000 --- a/server/tests/load_tests/scenarios/fanout.rs +++ /dev/null @@ -1,257 +0,0 @@ -//! Fan-out load test scenario -//! -//! Validates that PathCollab can handle N sessions with 20 followers each, -//! where the presenter sends 30Hz cursor updates and 10Hz viewport updates. -//! All followers should receive broadcasts with P99 < 100ms for cursors. - -#![allow(clippy::collapsible_if)] - -use super::super::client::{ - ClientEvent, LoadTestClient, ServerMessage, fetch_first_slide, spawn_update_client, -}; -use super::super::{LatencyStats, LoadTestConfig, LoadTestResults}; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::{Duration, Instant}; -use tokio::sync::mpsc; - -/// Fan-out load test scenario -pub struct FanOutScenario { - config: LoadTestConfig, -} - -impl FanOutScenario { - pub fn new(config: LoadTestConfig) -> Self { - Self { config } - } - - /// Run the fan-out scenario - /// - /// Creates N sessions, each with 1 presenter + 20 followers. - /// Presenter sends 30Hz cursor + 10Hz viewport updates. - /// Measures broadcast latency across all followers. 
- pub async fn run(&self) -> Result> { - let start = Instant::now(); - let mut results = LoadTestResults::new(); - - // Fetch available slide from server - let slide = fetch_first_slide(&self.config.http_url).await?; - println!("Using slide: {} ({})", slide.name, slide.id); - - // Channel for collecting events from all clients - let (tx, mut rx) = mpsc::channel::(10000); - - // Atomic counters for quick stats - let messages_sent = Arc::new(AtomicU64::new(0)); - let messages_received = Arc::new(AtomicU64::new(0)); - let connection_errors = Arc::new(AtomicU64::new(0)); - - let mut join_handles = Vec::new(); - - // Create sessions and spawn presenter + follower tasks - for session_idx in 0..self.config.num_sessions { - println!( - "Setting up session {}/{}", - session_idx + 1, - self.config.num_sessions - ); - - // Create presenter client - let presenter = match LoadTestClient::connect(&self.config.ws_url).await { - Ok(mut client) => { - // Create session with the discovered slide - if let Err(e) = client.create_session(&slide.id).await { - eprintln!("Failed to create session {}: {}", session_idx, e); - connection_errors.fetch_add(1, Ordering::SeqCst); - continue; - } - client - } - Err(e) => { - eprintln!("Failed to connect presenter {}: {}", session_idx, e); - connection_errors.fetch_add(1, Ordering::SeqCst); - continue; - } - }; - - let session_id = presenter.session_id.clone().unwrap(); - let join_secret = presenter.join_secret.clone().unwrap(); - - // Spawn presenter task (sends updates) - let presenter_tx = tx.clone(); - let cursor_hz = self.config.cursor_hz; - let viewport_hz = self.config.viewport_hz; - let duration = self.config.duration; - let handle = tokio::spawn(async move { - spawn_update_client(presenter, cursor_hz, viewport_hz, duration, presenter_tx) - .await; - }); - join_handles.push(handle); - - // Create follower clients - for follower_idx in 0..self.config.followers_per_session { - let follower_tx = tx.clone(); - let ws_url = self.config.ws_url.clone(); - let session_id = session_id.clone(); - let join_secret = join_secret.clone(); - let duration = self.config.duration; - let errors = connection_errors.clone(); - let recv_count = messages_received.clone(); - - let handle = tokio::spawn(async move { - // Connect and join session - let client = match LoadTestClient::connect(&ws_url).await { - Ok(mut c) => { - if let Err(e) = c.join_session(&session_id, &join_secret).await { - eprintln!("Follower {} failed to join: {}", follower_idx, e); - errors.fetch_add(1, Ordering::SeqCst); - return; - } - c - } - Err(e) => { - eprintln!("Follower {} failed to connect: {}", follower_idx, e); - errors.fetch_add(1, Ordering::SeqCst); - return; - } - }; - - // Receive messages for duration - let start = Instant::now(); - let mut ws = client; - while start.elapsed() < duration { - match ws.recv_timeout(Duration::from_millis(100)).await { - Ok(Some(msg)) => { - recv_count.fetch_add(1, Ordering::SeqCst); - // Track message type for latency if it's an Ack - let msg_type = match &msg { - ServerMessage::PresenceDelta { .. } => "presence", - ServerMessage::PresenterViewport { .. } => "viewport", - ServerMessage::Ack { .. 
} => "ack", - _ => "other", - }; - let _ = follower_tx - .send(ClientEvent::MessageReceived { - latency: None, // We track latency on presenter side - msg_type, - }) - .await; - } - Ok(None) => {} - Err(_) => { - let _ = follower_tx.send(ClientEvent::Error).await; - } - } - } - - let _ = ws.close().await; - }); - join_handles.push(handle); - } - - // Small delay between session setups to avoid thundering herd - tokio::time::sleep(Duration::from_millis(50)).await; - } - - // Drop the original sender so rx completes when all tasks are done - drop(tx); - - // Collect events from all clients - let mut cursor_latencies = LatencyStats::new(); - let mut viewport_latencies = LatencyStats::new(); - - // Process events as they come in (but don't block forever) - let collect_duration = self.config.duration + Duration::from_secs(5); - let collect_start = Instant::now(); - - while collect_start.elapsed() < collect_duration { - match tokio::time::timeout(Duration::from_millis(100), rx.recv()).await { - Ok(Some(event)) => match event { - ClientEvent::MessageSent => { - messages_sent.fetch_add(1, Ordering::SeqCst); - } - ClientEvent::MessageReceived { latency, msg_type } => { - // Note: messages_received is already incremented in the follower tasks - // via recv_count, so we don't increment here to avoid double-counting - if let Some(lat) = latency { - match msg_type { - "presence" | "cursor" => cursor_latencies.record(lat), - "viewport" => viewport_latencies.record(lat), - _ => {} - } - } - } - ClientEvent::Error => { - connection_errors.fetch_add(1, Ordering::SeqCst); - } - }, - Ok(None) => break, // Channel closed - Err(_) => {} // Timeout, continue - } - } - - // Wait for all tasks to complete - for handle in join_handles { - let _ = handle.await; - } - - results.cursor_latencies = cursor_latencies; - results.viewport_latencies = viewport_latencies; - results.messages_sent = messages_sent.load(Ordering::SeqCst); - results.messages_received = messages_received.load(Ordering::SeqCst); - results.connection_errors = connection_errors.load(Ordering::SeqCst); - results.duration = start.elapsed(); - - Ok(results) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - // Note: These tests require a running server - // Run with: cargo test --test perf_tests -- --ignored - - #[tokio::test] - #[ignore = "requires running server"] - async fn test_fanout_single_session() { - let config = LoadTestConfig { - num_sessions: 1, - followers_per_session: 5, - cursor_hz: 10, - viewport_hz: 5, - duration: Duration::from_secs(5), - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.messages_sent > 0, "Should have sent messages"); - assert!( - results.messages_received > 0, - "Should have received messages" - ); - } - - #[tokio::test] - #[ignore = "requires running server"] - async fn test_fanout_full_load() { - let config = LoadTestConfig { - num_sessions: 5, - followers_per_session: 20, - cursor_hz: 30, - viewport_hz: 10, - duration: Duration::from_secs(60), - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.meets_budgets(), "Should meet performance budgets"); - } -} diff --git a/server/tests/load_tests/scenarios/mod.rs b/server/tests/load_tests/scenarios/mod.rs index 7fcb8a9..653cf0d 100644 --- 
a/server/tests/load_tests/scenarios/mod.rs +++ b/server/tests/load_tests/scenarios/mod.rs @@ -1,9 +1,10 @@ //! Load test scenarios +//! +//! Single comprehensive benchmark that tests all hot paths: +//! - WebSocket cursor/viewport broadcasts +//! - HTTP tile serving +//! - HTTP overlay requests pub mod comprehensive; -pub mod fanout; -pub mod overlay; pub use comprehensive::{ComprehensiveStressConfig, ComprehensiveStressScenario}; -pub use fanout::FanOutScenario; -pub use overlay::{OverlayStressConfig, OverlayStressScenario}; diff --git a/server/tests/load_tests/scenarios/overlay.rs b/server/tests/load_tests/scenarios/overlay.rs deleted file mode 100644 index 8840c3c..0000000 --- a/server/tests/load_tests/scenarios/overlay.rs +++ /dev/null @@ -1,402 +0,0 @@ -//! Overlay stress test scenario -//! -//! Validates that PathCollab can handle concurrent requests for: -//! - Tissue overlay tiles (GET /api/slide/:id/overlay/tissue/:level/:x/:y) -//! - Cell overlay queries (GET /api/slide/:id/overlay/cells?x=...&y=...&width=...&height=...) -//! - Overlay metadata endpoints -//! -//! This scenario focuses specifically on the HTTP overlay endpoints under load. - -#![allow(clippy::collapsible_if)] - -use super::super::client::fetch_first_slide; -use super::super::{LatencyStats, LoadTestResults}; -use reqwest::Client; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::{Duration, Instant}; -use tokio::sync::mpsc; - -/// Configuration for overlay stress test -#[derive(Debug, Clone)] -pub struct OverlayStressConfig { - /// Number of concurrent clients - pub num_clients: usize, - /// Test duration - pub duration: Duration, - /// Server base URL (e.g., "http://127.0.0.1:8080") - pub base_url: String, - /// Rate of tissue tile requests per client (Hz) - pub tissue_tile_hz: u32, - /// Rate of cell query requests per client (Hz) - pub cell_query_hz: u32, -} - -impl Default for OverlayStressConfig { - fn default() -> Self { - Self { - num_clients: 50, - duration: Duration::from_secs(30), - base_url: "http://127.0.0.1:8080".to_string(), - tissue_tile_hz: 10, - cell_query_hz: 2, - } - } -} - -/// Extended results for overlay stress test -#[derive(Debug)] -pub struct OverlayStressResults { - /// Base results - pub base: LoadTestResults, - /// Tissue tile request latencies - pub tissue_tile_latencies: LatencyStats, - /// Cell query latencies - pub cell_query_latencies: LatencyStats, - /// Metadata request latencies - pub metadata_latencies: LatencyStats, - /// Number of 404 responses (expected for non-existent tiles) - pub not_found_count: u64, - /// Number of successful requests - pub success_count: u64, -} - -impl OverlayStressResults { - pub fn new() -> Self { - Self { - base: LoadTestResults::new(), - tissue_tile_latencies: LatencyStats::new(), - cell_query_latencies: LatencyStats::new(), - metadata_latencies: LatencyStats::new(), - not_found_count: 0, - success_count: 0, - } - } - - /// Generate a summary report - pub fn report(&self) -> String { - let mut report = String::new(); - report.push_str("=== Overlay Stress Test Results ===\n\n"); - - report.push_str(&format!( - "Duration: {:.2}s\n", - self.base.duration.as_secs_f64() - )); - report.push_str(&format!("Total requests: {}\n", self.base.messages_sent)); - report.push_str(&format!("Successful: {}\n", self.success_count)); - report.push_str(&format!("Not found (404): {}\n", self.not_found_count)); - report.push_str(&format!("Errors: {}\n\n", self.base.connection_errors)); - - let throughput = self.base.messages_sent as f64 
/ self.base.duration.as_secs_f64(); - report.push_str(&format!("Throughput: {:.1} req/s\n\n", throughput)); - - report.push_str("Tissue Tile Latencies:\n"); - if let Some(p50) = self.tissue_tile_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.tissue_tile_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.tissue_tile_latencies.p99() { - report.push_str(&format!(" P99: {:?}\n", p99)); - } - - report.push_str("\nCell Query Latencies:\n"); - if let Some(p50) = self.cell_query_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.cell_query_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.cell_query_latencies.p99() { - report.push_str(&format!(" P99: {:?}\n", p99)); - } - - report.push_str("\nMetadata Latencies:\n"); - if let Some(p50) = self.metadata_latencies.p50() { - report.push_str(&format!(" P50: {:?}\n", p50)); - } - if let Some(p95) = self.metadata_latencies.p95() { - report.push_str(&format!(" P95: {:?}\n", p95)); - } - if let Some(p99) = self.metadata_latencies.p99() { - report.push_str(&format!(" P99: {:?}\n", p99)); - } - - report - } -} - -impl Default for OverlayStressResults { - fn default() -> Self { - Self::new() - } -} - -/// Event types from overlay client tasks -#[derive(Debug)] -#[allow(dead_code)] -pub enum OverlayEvent { - TissueTileRequest { latency: Duration, success: bool }, - CellQueryRequest { latency: Duration, success: bool }, - MetadataRequest { latency: Duration, success: bool }, - NotFound, - Error, -} - -/// Overlay stress test scenario -pub struct OverlayStressScenario { - config: OverlayStressConfig, -} - -impl OverlayStressScenario { - pub fn new(config: OverlayStressConfig) -> Self { - Self { config } - } - - /// Run the overlay stress test scenario - pub async fn run( - &self, - ) -> Result> { - let start = Instant::now(); - let mut results = OverlayStressResults::new(); - - // Fetch available slide from server - let slide = fetch_first_slide(&self.config.base_url).await?; - println!("Using slide: {} ({})", slide.name, slide.id); - - // Channel for collecting events - let (tx, mut rx) = mpsc::channel::(10000); - - // Atomic counters - let requests_sent = Arc::new(AtomicU64::new(0)); - let success_count = Arc::new(AtomicU64::new(0)); - let not_found_count = Arc::new(AtomicU64::new(0)); - let error_count = Arc::new(AtomicU64::new(0)); - - let mut join_handles = Vec::new(); - - // Create HTTP client with connection pooling - let http_client = Client::builder() - .pool_max_idle_per_host(100) - .timeout(Duration::from_secs(30)) - .build()?; - - println!( - "Starting overlay stress test with {} clients for {:?}", - self.config.num_clients, self.config.duration - ); - - // Spawn client tasks - for client_idx in 0..self.config.num_clients { - let client = http_client.clone(); - let tx = tx.clone(); - let base_url = self.config.base_url.clone(); - let slide_id = slide.id.clone(); - let duration = self.config.duration; - let tissue_hz = self.config.tissue_tile_hz; - let cell_hz = self.config.cell_query_hz; - let sent = requests_sent.clone(); - let success = success_count.clone(); - let not_found = not_found_count.clone(); - let errors = error_count.clone(); - - let handle = tokio::spawn(async move { - let tissue_interval = if tissue_hz > 0 { - Duration::from_secs_f64(1.0 / tissue_hz as f64) - } else { - Duration::from_secs(3600) - }; - - let cell_interval = if cell_hz > 0 { - 
Duration::from_secs_f64(1.0 / cell_hz as f64) - } else { - Duration::from_secs(3600) - }; - - let start = Instant::now(); - let mut tissue_ticker = tokio::time::interval(tissue_interval); - let mut cell_ticker = tokio::time::interval(cell_interval); - - // Vary tile coordinates to simulate realistic access patterns - let mut tile_x = client_idx as u32 % 10; - let mut tile_y = 0u32; - let level = 3; // Mid-level tiles - - loop { - if start.elapsed() >= duration { - break; - } - - tokio::select! { - _ = tissue_ticker.tick() => { - sent.fetch_add(1, Ordering::SeqCst); - - // Request tissue tile - let url = format!( - "{}/api/slide/{}/overlay/tissue/{}/{}/{}", - base_url, slide_id, level, tile_x, tile_y - ); - - let req_start = Instant::now(); - match client.get(&url).send().await { - Ok(resp) => { - let latency = req_start.elapsed(); - if resp.status().is_success() { - success.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::TissueTileRequest { - latency, - success: true, - }).await; - } else if resp.status().as_u16() == 404 { - not_found.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::NotFound).await; - } else { - errors.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::TissueTileRequest { - latency, - success: false, - }).await; - } - } - Err(_) => { - errors.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::Error).await; - } - } - - // Move to next tile - tile_x = (tile_x + 1) % 20; - if tile_x == 0 { - tile_y = (tile_y + 1) % 20; - } - } - _ = cell_ticker.tick() => { - sent.fetch_add(1, Ordering::SeqCst); - - // Request cells in region (varying region) - let region_x = (client_idx as f64 * 1000.0) % 50000.0; - let region_y = (client_idx as f64 * 500.0) % 50000.0; - let url = format!( - "{}/api/slide/{}/overlay/cells?x={}&y={}&width=5000&height=5000", - base_url, slide_id, region_x, region_y - ); - - let req_start = Instant::now(); - match client.get(&url).send().await { - Ok(resp) => { - let latency = req_start.elapsed(); - if resp.status().is_success() { - success.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::CellQueryRequest { - latency, - success: true, - }).await; - } else if resp.status().as_u16() == 404 { - not_found.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::NotFound).await; - } else { - errors.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::CellQueryRequest { - latency, - success: false, - }).await; - } - } - Err(_) => { - errors.fetch_add(1, Ordering::SeqCst); - let _ = tx.send(OverlayEvent::Error).await; - } - } - } - } - } - }); - join_handles.push(handle); - - // Small stagger to avoid thundering herd - if client_idx % 10 == 9 { - tokio::time::sleep(Duration::from_millis(10)).await; - } - } - - // Drop the original sender - drop(tx); - - // Collect events - let mut tissue_latencies = LatencyStats::new(); - let mut cell_latencies = LatencyStats::new(); - let mut metadata_latencies = LatencyStats::new(); - - let collect_duration = self.config.duration + Duration::from_secs(5); - let collect_start = Instant::now(); - - while collect_start.elapsed() < collect_duration { - match tokio::time::timeout(Duration::from_millis(100), rx.recv()).await { - Ok(Some(event)) => match event { - OverlayEvent::TissueTileRequest { - latency, - success: true, - } => { - tissue_latencies.record(latency); - } - OverlayEvent::CellQueryRequest { - latency, - success: true, - } => { - cell_latencies.record(latency); - } - OverlayEvent::MetadataRequest { - latency, - success: true, - } => { - 
metadata_latencies.record(latency); - } - _ => {} - }, - Ok(None) => break, - Err(_) => {} - } - } - - // Wait for all tasks - for handle in join_handles { - let _ = handle.await; - } - - // Populate results - results.base.messages_sent = requests_sent.load(Ordering::SeqCst); - results.success_count = success_count.load(Ordering::SeqCst); - results.not_found_count = not_found_count.load(Ordering::SeqCst); - results.base.connection_errors = error_count.load(Ordering::SeqCst); - results.base.duration = start.elapsed(); - results.tissue_tile_latencies = tissue_latencies; - results.cell_query_latencies = cell_latencies; - results.metadata_latencies = metadata_latencies; - - Ok(results) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - #[ignore = "requires running server"] - async fn test_overlay_stress_minimal() { - let config = OverlayStressConfig { - num_clients: 5, - duration: Duration::from_secs(5), - tissue_tile_hz: 5, - cell_query_hz: 1, - ..Default::default() - }; - - let scenario = OverlayStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.base.messages_sent > 0, "Should have sent requests"); - } -} diff --git a/server/tests/perf_tests.rs b/server/tests/perf_tests.rs index 51a7d5d..b906c9f 100644 --- a/server/tests/perf_tests.rs +++ b/server/tests/perf_tests.rs @@ -1,305 +1,108 @@ -//! Load test entry point +//! Unified Benchmark Suite for PathCollab //! -//! Run with: cargo test --test perf_tests -- --ignored --nocapture -//! Or for quick test: cargo test --test perf_tests test_connection -- --ignored --nocapture +//! This module provides a three-tier benchmark system for validating +//! server performance under load. //! -//! Available tests: -//! - test_connection: Quick connectivity test -//! - test_create_session: Session creation test -//! - test_fanout_minimal: Quick fan-out test (1 session, 3 followers, 3s) -//! - test_fanout_standard: Standard fan-out (5 sessions, 20 followers, 30s) -//! - test_fanout_extended: Extended fan-out (5 sessions, 20 followers, 5min) -//! - test_overlay_stress_minimal: Quick overlay stress test (5 clients, 5s) -//! - test_overlay_stress_standard: Standard overlay stress (50 clients, 30s) -//! - test_comprehensive_minimal: Quick comprehensive test (10 users, 10s) -//! - test_comprehensive_100_users: 100 users stress test (50 sessions, 30s) -//! - test_comprehensive_1000_users: Full 1000 users stress test (500 sessions, 60s) +//! ## Features +//! +//! - **Warm-up phase**: Primes caches and connection pools before measuring +//! - **Multiple iterations**: Runs 3 times for statistical significance +//! - **Baseline comparison**: Compares against stored baseline, detects regressions +//! +//! ## Benchmark Tiers +//! +//! | Tier | Purpose | Duration | Config | +//! |------------|-------------------|----------|---------------------------| +//! | `smoke` | CI on every push | <30s | 5 sessions, 10 users, 10s | +//! | `standard` | PR merge gate | ~2min | 25 sessions, 50 users, 30s| +//! | `stress` | Manual/release | ~5min | 100 sessions, 200 users | +//! +//! ## Running Benchmarks +//! +//! ```bash +//! # Quick smoke test (CI) - 3 iterations with warm-up +//! cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture +//! +//! # Standard test (PR merge gate) +//! cargo test --test perf_tests bench_standard --release -- --ignored --nocapture +//! +//! # Full stress test (manual/release) +//! 
cargo test --test perf_tests bench_stress --release -- --ignored --nocapture +//! +//! # Save current results as baseline +//! SAVE_BASELINE=1 cargo test --test perf_tests bench_smoke --release -- --ignored --nocapture +//! ``` +//! +//! ## Baseline Management +//! +//! Baselines are stored in `.benchmark-baseline.json`. Set `SAVE_BASELINE=1` to update. #![allow(clippy::collapsible_if)] mod load_tests; -use load_tests::scenarios::{ - ComprehensiveStressConfig, ComprehensiveStressScenario, FanOutScenario, OverlayStressConfig, - OverlayStressScenario, -}; -use load_tests::{LoadTestConfig, LoadTestResults}; -use std::time::Duration; +use load_tests::BenchmarkTier; +use load_tests::benchmark::{BenchmarkRunConfig, BenchmarkRunner}; -/// Quick connectivity test -#[tokio::test] -#[ignore = "requires running server"] -async fn test_connection() { - use load_tests::client::LoadTestClient; - - let url = "ws://127.0.0.1:8080/ws"; - let client: LoadTestClient = LoadTestClient::connect(url) - .await - .expect("Should connect to server"); - - println!("Connected successfully to {}", url); - client.close().await.expect("Should close cleanly"); -} - -/// Quick session creation test -#[tokio::test] -#[ignore = "requires running server"] -async fn test_create_session() { - use load_tests::client::{LoadTestClient, fetch_first_slide}; - - // Fetch available slide from server - let slide = fetch_first_slide("http://127.0.0.1:8080") - .await - .expect("Should have slides available"); - println!("Using slide: {} ({})", slide.name, slide.id); +/// Run a benchmark for the given tier with warm-up, iterations, and comparison +async fn run_benchmark(tier: BenchmarkTier) { + let config = BenchmarkRunConfig::for_tier(tier); + let runner = BenchmarkRunner::new(config.clone()); - let url = "ws://127.0.0.1:8080/ws"; - let mut client: LoadTestClient = LoadTestClient::connect(url) - .await - .expect("Should connect to server"); + let result = runner.run().await.expect("Benchmark should complete"); - client - .create_session(&slide.id) - .await - .expect("Should create session"); + // Print JSON for CI parsing + println!("JSON: {}", result.to_json()); - println!("Session created: {:?}", client.session_id); - assert!(client.session_id.is_some()); - assert!(client.join_secret.is_some()); - assert!(client.presenter_key.is_some()); - - client.close().await.expect("Should close cleanly"); -} - -/// Quick fan-out test with minimal load -#[tokio::test] -#[ignore = "requires running server"] -async fn test_fanout_minimal() { - let config = LoadTestConfig { - num_sessions: 1, - followers_per_session: 3, - cursor_hz: 10, - viewport_hz: 5, - duration: Duration::from_secs(3), - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results: LoadTestResults = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.messages_sent > 0, "Should have sent messages"); -} - -/// Standard fan-out test: 5 sessions, 20 followers each, 30 seconds -#[tokio::test] -#[ignore = "requires running server"] -async fn test_fanout_standard() { - let config = LoadTestConfig { - num_sessions: 5, - followers_per_session: 20, - cursor_hz: 30, - viewport_hz: 10, - duration: Duration::from_secs(30), - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results: LoadTestResults = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - - // Verify basic functionality - assert!(results.messages_sent > 0, "Should have 
sent messages"); - assert!( - results.messages_received > 0, - "Should have received messages" - ); - - // Check performance budgets - if !results.meets_budgets() { - println!("WARNING: Performance budgets exceeded!"); - // Don't fail the test yet, just warn + // Save baseline if requested + if std::env::var("SAVE_BASELINE").is_ok() { + runner + .save_baseline(&result.report) + .expect("Failed to save baseline"); } -} - -/// Extended fan-out test: 5 sessions, 20 followers each, 5 minutes -#[tokio::test] -#[ignore = "requires running server - long running"] -async fn test_fanout_extended() { - let config = LoadTestConfig { - num_sessions: 5, - followers_per_session: 20, - cursor_hz: 30, - viewport_hz: 10, - duration: Duration::from_secs(300), // 5 minutes - ..Default::default() - }; - - let scenario = FanOutScenario::new(config); - let results: LoadTestResults = scenario.run().await.expect("Scenario should complete"); - println!("{}", results.report()); - - // This is the primary performance validation + // Assert no regressions and budgets met assert!( - results.meets_budgets(), - "Should meet performance budgets under sustained load" + result.all_passed, + "Performance budgets not met for {} tier", + tier.name() ); -} - -/// Quick overlay stress test: 5 clients, 5 seconds -#[tokio::test] -#[ignore = "requires running server"] -async fn test_overlay_stress_minimal() { - let config = OverlayStressConfig { - num_clients: 5, - duration: Duration::from_secs(5), - tissue_tile_hz: 5, - cell_query_hz: 1, - ..Default::default() - }; - - let scenario = OverlayStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.base.messages_sent > 0, "Should have sent requests"); -} - -/// Standard overlay stress test: 50 clients, 30 seconds -#[tokio::test] -#[ignore = "requires running server"] -async fn test_overlay_stress_standard() { - let config = OverlayStressConfig { - num_clients: 50, - duration: Duration::from_secs(30), - tissue_tile_hz: 10, - cell_query_hz: 2, - ..Default::default() - }; - - let scenario = OverlayStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - - // Basic validation - ensure we actually did work - assert!(results.base.messages_sent > 0, "Should have sent requests"); - - // Most requests should succeed (allow for 404s on non-existent overlays) - let success_rate = (results.success_count + results.not_found_count) as f64 - / results.base.messages_sent as f64; assert!( - success_rate > 0.95, - "Success rate should be > 95%, was {:.1}%", - success_rate * 100.0 + !result.has_regression, + "Performance regression detected for {} tier", + tier.name() ); } -// ============================================================================ -// Comprehensive Stress Tests -// ============================================================================ - -/// Quick comprehensive test: 10 users (5 sessions), 10 seconds +/// Smoke benchmark: Quick CI validation on every push +/// +/// - Duration: ~30 seconds (2s warm-up + 3 × 10s iterations) +/// - Config: 5 sessions, 10 users +/// - Purpose: Fast feedback on obvious regressions #[tokio::test] #[ignore = "requires running server"] -async fn test_comprehensive_minimal() { - let config = ComprehensiveStressConfig { - num_sessions: 5, // 10 users - duration: Duration::from_secs(10), - cursor_hz: 10, - viewport_hz: 5, - tile_request_hz: 2, - overlay_request_hz: 1, 
- ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - assert!(results.ws_messages_sent > 0, "Should have sent WS messages"); - assert!( - results.http_requests_sent > 0, - "Should have sent HTTP requests" - ); +async fn bench_smoke() { + run_benchmark(BenchmarkTier::Smoke).await; } -/// 100 users comprehensive test: 50 sessions × 2 users, 30 seconds +/// Standard benchmark: PR merge gate +/// +/// - Duration: ~2 minutes (5s warm-up + 3 × 30s iterations) +/// - Config: 25 sessions, 50 users +/// - Purpose: Validate performance before merging PRs #[tokio::test] #[ignore = "requires running server"] -async fn test_comprehensive_100_users() { - let config = ComprehensiveStressConfig { - num_sessions: 50, // 100 users - duration: Duration::from_secs(30), - cursor_hz: 30, - viewport_hz: 10, - tile_request_hz: 5, - overlay_request_hz: 2, - ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - - // Basic validation - assert!(results.ws_messages_sent > 0, "Should have sent WS messages"); - assert!( - results.http_requests_sent > 0, - "Should have sent HTTP requests" - ); - - // Check we created and joined sessions successfully - assert!( - results.sessions_created >= 40, - "Should have created at least 40 sessions (got {})", - results.sessions_created - ); - assert!( - results.sessions_joined >= 40, - "Should have at least 40 followers (got {})", - results.sessions_joined - ); +async fn bench_standard() { + run_benchmark(BenchmarkTier::Standard).await; } -/// Full 1000 users stress test: 500 sessions × 2 users, 60 seconds -/// This is the primary performance validation for production readiness. +/// Stress benchmark: Manual/release testing +/// +/// - Duration: ~4 minutes (5s warm-up + 3 × 60s iterations) +/// - Config: 100 sessions, 200 users +/// - Purpose: Full stress test for releases #[tokio::test] #[ignore = "requires running server - long running"] -async fn test_comprehensive_1000_users() { - let config = ComprehensiveStressConfig { - num_sessions: 500, // 1000 users - duration: Duration::from_secs(60), - cursor_hz: 30, - viewport_hz: 10, - tile_request_hz: 5, - overlay_request_hz: 2, - ..Default::default() - }; - - let scenario = ComprehensiveStressScenario::new(config); - let results = scenario.run().await.expect("Scenario should complete"); - - println!("{}", results.report()); - - // This is the primary performance validation - assert!( - results.meets_budgets(), - "Should meet performance budgets under 1000 user load" - ); - - // Verify we actually achieved the target load - assert!( - results.sessions_created >= 450, - "Should have created at least 450 sessions (got {})", - results.sessions_created - ); - assert!( - results.sessions_joined >= 450, - "Should have at least 450 followers (got {})", - results.sessions_joined - ); +async fn bench_stress() { + run_benchmark(BenchmarkTier::Stress).await; }
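
Note on the types referenced above: the new `perf_tests.rs` imports `BenchmarkTier`, `BenchmarkRunConfig`, and `BenchmarkRunner` from a `load_tests::benchmark` module that this diff does not show. The sketch below is only an illustration of what the tier plumbing might look like, assuming the warm-up and iteration numbers quoted in the module docs (2s/5s warm-up, 3 iterations of 10s/30s/60s); the names and fields are placeholders, not the actual implementation.

```rust
use std::time::Duration;

/// Benchmark tiers referenced by `bench_smoke` / `bench_standard` / `bench_stress`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BenchmarkTier {
    Smoke,
    Standard,
    Stress,
}

impl BenchmarkTier {
    /// Lower-case name used in test names and CI artifact names.
    pub fn name(&self) -> &'static str {
        match self {
            BenchmarkTier::Smoke => "smoke",
            BenchmarkTier::Standard => "standard",
            BenchmarkTier::Stress => "stress",
        }
    }
}

/// Per-tier run settings: warm-up, iteration count, and measured duration.
#[derive(Debug, Clone)]
pub struct BenchmarkRunConfig {
    pub tier: BenchmarkTier,
    pub warmup: Duration,
    pub iterations: usize,
    pub iteration_duration: Duration,
}

impl BenchmarkRunConfig {
    /// Map a tier to the numbers quoted in the module docs.
    pub fn for_tier(tier: BenchmarkTier) -> Self {
        let (warmup, iteration_duration) = match tier {
            BenchmarkTier::Smoke => (Duration::from_secs(2), Duration::from_secs(10)),
            BenchmarkTier::Standard => (Duration::from_secs(5), Duration::from_secs(30)),
            BenchmarkTier::Stress => (Duration::from_secs(5), Duration::from_secs(60)),
        };
        Self { tier, warmup, iterations: 3, iteration_duration }
    }
}
```

A runner built on top of this would presumably run the comprehensive scenario `iterations` times after the warm-up, aggregate the per-iteration reports, and compare the result against `.benchmark-baseline.json` before setting `all_passed` and `has_regression`.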
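
The tile-grid arithmetic added to `spawn_user_task` in `comprehensive.rs` can also be sanity-checked in isolation. This standalone sketch repeats the same math and walks through one hypothetical slide size (20000 × 15000 px with 256-px tiles; real dimensions come from the slide metadata, so the numbers here are illustrative only):

```rust
/// Standalone sketch of the tile-grid math used in `spawn_user_task`.
/// Returns (test_level, max_tile_x, max_tile_y) for a slide of the given size.
fn tile_grid(slide_width: u64, slide_height: u64, levels_below_max: u32) -> (u32, u32, u32) {
    let tile_size = 256u64;
    // DZI: the deepest level is ceil(log2(max(width, height))).
    let max_level = (slide_width.max(slide_height) as f64).log2().ceil() as u32;
    let test_level = max_level.saturating_sub(levels_below_max);
    let level_scale = 1u64 << (max_level - test_level);
    let level_width = slide_width / level_scale;
    let level_height = slide_height / level_scale;
    (
        test_level,
        level_width.div_ceil(tile_size).max(1) as u32,
        level_height.div_ceil(tile_size).max(1) as u32,
    )
}

fn main() {
    // Hypothetical 20000 x 15000 px slide:
    // max_level = ceil(log2(20000)) = 15, test_level = 12, scale = 8,
    // level size = 2500 x 1875, grid = 10 x 8 tiles for the request cycle.
    assert_eq!(tile_grid(20_000, 15_000, 3), (12, 10, 8));
}
```

Taking `tile_x % max_tile_x` and `tile_y % max_tile_y` against this grid is what keeps the stress test's tile URLs within the range the server can actually serve, instead of the fixed 40 × 40 sweep the old code used.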