diff --git a/.github/workflows/ci-bench.yml b/.github/workflows/ci-bench.yml
new file mode 100644
index 00000000..81bfcd68
--- /dev/null
+++ b/.github/workflows/ci-bench.yml
@@ -0,0 +1,97 @@
+# PR gate: runs the graph benchmark and compares gated rows against
+# bench/baselines.json unless a justified bench-baseline-bump label is present.
+name: Graph Bench
+
+on:
+  pull_request:
+    # labeled/unlabeled/edited re-run the gate when the label or PR body changes.
+    types: [opened, synchronize, reopened, labeled, unlabeled, edited]
+    paths:
+      - "crates/**/*.rs"
+      - "tools/**/*.rs"
+      - "Cargo.toml"
+      - "crates/**/Cargo.toml"
+      - "tools/**/Cargo.toml"
+      - "Cargo.lock"
+      - "bench/baselines.json"
+      - "bench/check_baseline.sh"
+      - "bench/run_graph_bench_ci.sh"
+      - ".github/workflows/ci-bench.yml"
+
+permissions:
+  contents: read
+
+env:
+  CARGO_TERM_COLOR: always
+  GRAPH_BENCH_RUNS: "3"
+  GRAPH_BENCH_RUNNER_IMAGE: ubuntu-24.04
+
+jobs:
+  ci-bench:
+    name: Graph Bench
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4.3.1
+        with:
+          ref: ${{ github.head_ref }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8  # stable (2026-05-07)
+
+      - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830  # v4.3.0
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-graph-bench-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-graph-bench-cargo-
+            ${{ runner.os }}-cargo-
+
+      - name: Verify runner profile
+        run: |
+          set -euo pipefail
+          cores="$(getconf _NPROCESSORS_ONLN)"
+          memory_mib="$(awk '/MemTotal/ { print int($2 / 1024) }' /proc/meminfo)"
+          if [[ "$cores" -ne 4 || "$memory_mib" -lt 15000 || "$memory_mib" -gt 17000 ]]; then
+            echo "::error::Graph benchmark requires ubuntu-24.04 with 4 vCPU and approximately 16GB RAM; saw ${cores} cores and ${memory_mib} MiB."
+            exit 1
+          fi
+
+      - name: Run graph benchmark
+        run: ./bench/run_graph_bench_ci.sh
+
+      - name: Upload graph benchmark results
+        # Upload even when an earlier step failed, as long as results were written.
+        if: always() && hashFiles('target/bench/results.json') != ''
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
+        with:
+          name: graph-bench-results
+          path: target/bench/results.json
+          if-no-files-found: error
+
+      - name: Validate baseline bump justification
+        id: baseline_bump
+        run: |
+          set -euo pipefail
+          has_bump_label="$(jq -r 'any(.pull_request.labels[]?; .name == "bench-baseline-bump")' "$GITHUB_EVENT_PATH")"
+          echo "has_bump_label=${has_bump_label}" >> "$GITHUB_OUTPUT"
+
+          if [[ "$has_bump_label" == "true" ]]; then
+            # Test each body line on its own: jq rejects "m" as a regex modifier,
+            # and per-line matching keeps the justification on the same line as
+            # the marker.
+            if ! jq -e '(.pull_request.body // "") | split("\n") | any(.[]; test("^\\s*bench-baseline-bump\\s*:\\s*\\S"; "i"))' "$GITHUB_EVENT_PATH" >/dev/null; then
+              echo "::error::PRs labeled bench-baseline-bump must include a one-line PR body justification like 'bench-baseline-bump: REASON'."
+              exit 1
+            fi
+          fi
+
+      - name: Check graph benchmark baseline
+        if: steps.baseline_bump.outputs.has_bump_label != 'true'
+        run: ./bench/check_baseline.sh --results target/bench/results.json --baseline bench/baselines.json
+
+      - name: Note baseline bump bypass
+        if: steps.baseline_bump.outputs.has_bump_label == 'true'
+        run: echo "bench-baseline-bump label present; baseline regression check bypassed after PR-body justification validation."
diff --git a/bench/README.md b/bench/README.md
index 0be06a40..d4f63d47 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -36,9 +36,42 @@ P6.2 owns wiring the permanent `graph_bench.rs` CI gate. Once that lands, use
 the gate artifact under `target/bench/` as the capture source instead of an
 ad-hoc capture script.
 
+## CI Regression Gate
+
+The `Graph Bench / Graph Bench` PR job runs on the pinned
+`ubuntu-24.04` runner profile from the spec. The workflow is path-filtered to
+Rust source, Cargo, graph-bench, and workflow files so docs-only PRs do not
+spend a benchmark runner.
+
+The job builds the release benchmark binaries, runs the graph benchmark three
+times, and writes `target/bench/results.json`. Each reported row stores the raw
+samples and the median value used for the gate. The artifact is uploaded as
+`graph-bench-results` on the PR's Actions page.
+
+Gate decision for ORB-00321: v2 `orbit-graph` rows are gated against
+`bench/baselines.json`; v1 `orbit-knowledge` rows from `graph_bench.rs` remain
+informational in the artifact. GRAPH_SPEC section 12 sets budgets for the v2
+implementation, while the v1 rows are retained during the transition so
+reviewers can compare old and new behavior.
+
+`bench/check_baseline.sh` compares every gated row in
+`target/bench/results.json` against the committed baseline row with the same
+ID. A row fails when its median value is more than 20 percent slower than the
+baseline value.
+
 ## Updating Baselines
 
 A PR that changes `bench/baselines.json` must have the `bench-baseline-bump`
 label and a one-line justification in the PR body.
+Use a line in this exact form so CI can verify it:
+
+```text
+bench-baseline-bump: REASON
+```
+
+When the label and justification are present, CI still runs the benchmark and
+uploads `target/bench/results.json`, but skips the regression check. Without
+the label, or with the label but no justification line, the gate runs normally
+or fails the PR-body validation.
 Routine performance wins should bump the baseline down. Routine performance
 drift should not bump the baseline; fix the regression instead.
diff --git a/bench/check_baseline.sh b/bench/check_baseline.sh
new file mode 100755
index 00000000..73e00702
--- /dev/null
+++ b/bench/check_baseline.sh
@@ -0,0 +1,151 @@
+#!/usr/bin/env bash
+# Gate check: compares gated rows of a graph-bench results file against the
+# committed baselines. Exits 1 on regression, missing/invalid baseline, or
+# unit mismatch, and 2 on usage errors or missing input files.
+set -euo pipefail
+
+repo_root="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$repo_root"
+
+baseline_path="bench/baselines.json"
+results_path="target/bench/results.json"
+threshold_percent="20"
+
+usage() {
+  cat <<'EOF'
+Usage: bench/check_baseline.sh [--baseline PATH] [--results PATH] [--threshold-percent N]
+
+Compares gated rows in target/bench/results.json against bench/baselines.json.
+Rows fail when their median value is more than N percent slower than baseline.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --baseline)
+      baseline_path="$2"
+      shift 2
+      ;;
+    --results)
+      results_path="$2"
+      shift 2
+      ;;
+    --threshold-percent)
+      threshold_percent="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ ! -f "$baseline_path" ]]; then
+  echo "baseline file not found: $baseline_path" >&2
+  exit 2
+fi
+
+if [[ ! -f "$results_path" ]]; then
+  echo "results file not found: $results_path" >&2
+  exit 2
+fi
+
+report="$(
+  jq -n \
+    --slurpfile baseline "$baseline_path" \
+    --slurpfile results "$results_path" \
+    --argjson threshold_percent "$threshold_percent" '
+    ($baseline[0].rows // []) as $baseline_rows
+    | ($results[0].rows // []) as $result_rows
+    | (($threshold_percent + 100) / 100) as $multiplier
+    # first(empty) produces no output, so append null as a fallback; otherwise
+    # rows without a baseline would vanish instead of landing in "missing".
+    | def baseline_row($id):
+        first(($baseline_rows[] | select(.id == $id)), null);
+      [ $result_rows[] | select(.gate != false) ] as $gated
+    | {
+        checked: [
+          $gated[] as $row
+          | ($row.baseline_id // $row.id) as $baseline_id
+          | (baseline_row($baseline_id)) as $baseline
+          | select($baseline != null and (($baseline.baseline.value // null) != null))
+          | {
+              id: $row.id,
+              baseline_id: $baseline_id,
+              implementation: ($row.implementation // "unknown"),
+              value: $row.value,
+              baseline_value: $baseline.baseline.value,
+              unit: ($row.unit // ""),
+              baseline_unit: ($baseline.baseline.unit // ""),
+              limit: ($baseline.baseline.value * $multiplier),
+              slower_percent: (
+                if ($baseline.baseline.value | tonumber) > 0 then
+                  (($row.value - $baseline.baseline.value) / $baseline.baseline.value * 100)
+                else
+                  null
+                end
+              )
+            }
+        ],
+        missing: [
+          $gated[] as $row
+          | ($row.baseline_id // $row.id) as $baseline_id
+          | select((baseline_row($baseline_id)) == null)
+          | { id: $row.id, baseline_id: $baseline_id }
+        ],
+        invalid_baselines: [
+          $gated[] as $row
+          | ($row.baseline_id // $row.id) as $baseline_id
+          | (baseline_row($baseline_id)) as $baseline
+          | select($baseline != null and (($baseline.baseline.value // null) == null or ($baseline.baseline.value | tonumber) <= 0))
+          | { id: $row.id, baseline_id: $baseline_id }
+        ]
+      }
+    | . + {
+        unit_mismatches: [
+          .checked[]
+          | select(.unit != .baseline_unit)
+          | { id, unit, baseline_unit }
+        ],
+        regressions: [
+          .checked[]
+          | select(.value > .limit)
+        ]
+      }
+  '
+)"
+
+echo "$report" | jq -r '
+  .checked[]
+  | "\(.id) [\(.implementation)]: \(.value)\(.unit) vs baseline \(.baseline_value)\(.baseline_unit) (limit \(.limit | floor)\(.baseline_unit))"
+'
+
+# Rows with a missing or invalid baseline are reported below, so only treat
+# the run as empty when no gated row was seen at all.
+if [[ "$(echo "$report" | jq '(.checked | length) + (.missing | length) + (.invalid_baselines | length)')" -eq 0 ]]; then
+  echo "no gated result rows found in $results_path" >&2
+  exit 1
+fi
+
+if ! echo "$report" | jq -e '
+  (.missing | length) == 0
+  and (.invalid_baselines | length) == 0
+  and (.unit_mismatches | length) == 0
+  and (.regressions | length) == 0
+' >/dev/null; then
+  echo "$report" | jq -r '
+    (.missing[]? | "missing baseline for result row \(.id) (baseline_id=\(.baseline_id))"),
+    (.invalid_baselines[]? | "invalid baseline for result row \(.id) (baseline_id=\(.baseline_id))"),
+    (.unit_mismatches[]? | "unit mismatch for \(.id): result \(.unit), baseline \(.baseline_unit)"),
+    (.regressions[]? | "regression: \(.id) [\(.implementation)] \(.value)\(.unit) is \(.slower_percent | floor)% slower than baseline \(.baseline_value)\(.baseline_unit)")
+  ' >&2
+  exit 1
+fi
+
+echo "graph benchmark baseline check passed"
diff --git a/bench/run_graph_bench_ci.sh b/bench/run_graph_bench_ci.sh
new file mode 100755
index 00000000..f9021417
--- /dev/null
+++ b/bench/run_graph_bench_ci.sh
@@ -0,0 +1,257 @@
+#!/usr/bin/env bash
+# Builds the release benchmark binaries, runs the v1/v2 graph benchmarks
+# GRAPH_BENCH_RUNS times, and writes per-row medians to results.json under
+# GRAPH_BENCH_TARGET_DIR (default target/bench).
+set -euo pipefail
+
+repo_root="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$repo_root"
+
+run_count="${GRAPH_BENCH_RUNS:-3}"
+if ! [[ "$run_count" =~ ^[0-9]+$ ]] || (( run_count < 1 )); then
+  echo "GRAPH_BENCH_RUNS must be a positive integer" >&2
+  exit 2
+fi
+
+target_dir="${GRAPH_BENCH_TARGET_DIR:-target/bench}"
+samples_path="$target_dir/samples.ndjson"
+results_path="$target_dir/results.json"
+touch_path="crates/orbit-graph-cli/src/__graph_bench_touch.rs"
+
+# Removes the temporary source file written for the one-file-changed scenario.
+cleanup() {
+  rm -f "$touch_path"
+}
+trap cleanup EXIT
+
+rm -rf "$target_dir"
+mkdir -p "$target_dir"
+: > "$samples_path"
+cleanup
+
+cargo build -p orbit-knowledge --example graph_build --release
+cargo build -p orbit-graph-cli --release
+
+graph_build_bin="$repo_root/target/release/examples/graph_build"
+graph_cli_bin="$repo_root/target/release/orbit-graph-cli"
+
+# Prints the current wall-clock time in milliseconds.
+now_ms() {
+  perl -MTime::HiRes=time -e 'printf "%.0f\n", time() * 1000'
+}
+
+# Prints the size of a file in bytes (GNU stat first, BSD stat as fallback).
+file_size_bytes() {
+  local path="$1"
+  if stat -c %s "$path" >/dev/null 2>&1; then
+    stat -c %s "$path"
+  else
+    stat -f %z "$path"
+  fi
+}
+
+# Prints MemTotal from /proc/meminfo in MiB, or 0 when it is unreadable.
+memory_mib() {
+  if [[ -r /proc/meminfo ]]; then
+    awk '/MemTotal/ { print int($2 / 1024) }' /proc/meminfo
+  else
+    echo 0
+  fi
+}
+
+# Runs "$@" with stdout redirected to $1 and prints the elapsed wall time in ms.
+# Bash clears errexit inside $(...), so propagate a command failure explicitly
+# instead of reporting a timing for a run that did not succeed.
+measure_ms() {
+  local output_path="$1"
+  shift
+  local start_ms end_ms
+  start_ms="$(now_ms)"
+  "$@" > "$output_path" || return
+  end_ms="$(now_ms)"
+  echo $((end_ms - start_ms))
+}
+
+# Runs "$@" with stdout redirected to $1 and prints its peak RSS in KiB as
+# reported by /usr/bin/time %M, or 0 when that time binary is unusable.
+# Command failures are propagated for the same reason as in measure_ms.
+measure_peak_rss_kib() {
+  local output_path="$1"
+  shift
+  local rss_path="${output_path}.rss"
+  if /usr/bin/time -f "%M" -o "$rss_path" true >/dev/null 2>&1; then
+    /usr/bin/time -f "%M" -o "$rss_path" "$@" > "$output_path" || return
+    cat "$rss_path"
+  else
+    "$@" > "$output_path" || return
+    echo 0
+  fi
+}
+
+# Appends one sample as a compact JSON line to $samples_path; run, gate, and
+# value are passed as JSON, the remaining fields as strings.
+add_sample() {
+  local run="$1"
+  local id="$2"
+  local baseline_id="$3"
+  local implementation="$4"
+  local gate="$5"
+  local unit="$6"
+  local value="$7"
+
+  jq -nc \
+    --argjson run "$run" \
+    --arg id "$id" \
+    --arg baseline_id "$baseline_id" \
+    --arg implementation "$implementation" \
+    --argjson gate "$gate" \
+    --arg unit "$unit" \
+    --argjson value "$value" \
+    '{
+      run: $run,
+      id: $id,
+      baseline_id: $baseline_id,
+      implementation: $implementation,
+      gate: $gate,
+      unit: $unit,
+      value: $value
+    }' >> "$samples_path"
+}
+
+for run_index in $(seq 1 "$run_count"); do
+  run_dir="$target_dir/run-$run_index"
+  mkdir -p "$run_dir"
+
+  rm -rf .orbit/graph
+  cleanup
+
+  "$graph_build_bin" \
+    --workspace "$repo_root" \
+    --knowledge-dir "$run_dir/v1-knowledge" \
+    --scoreboard "$run_dir/v1-scoreboard.json" \
+    > "$run_dir/v1-summary.txt"
+
+  jq -c --argjson run "$run_index" '
+    .[-1] as $record
+    | [
+        {
+          id: "v1_cold_build",
+          baseline_id: "cold_full_build",
+          implementation: "v1",
+          gate: false,
+          unit: "ms",
+          value: $record.scenarios.cold_build.wall_time_ms
+        },
+        {
+          id: "v1_incremental_no_changes",
+          baseline_id: "incremental_no_changes",
+          implementation: "v1",
+          gate: false,
+          unit: "ms",
+          value: $record.scenarios.warm_incremental_noop.wall_time_ms
+        },
+        {
+          id: "v1_resident_memory",
+          baseline_id: "resident_memory",
+          implementation: "v1",
+          gate: false,
+          unit: "KiB",
+          value: ($record.scenarios.warm_incremental_noop.peak_rss_kib // 0)
+        }
+      ]
+    | .[] + { run: $run }
+  ' "$run_dir/v1-scoreboard.json" >> "$samples_path"
+
+  v2_full_sync_json="$run_dir/v2-full-sync.json"
+  v2_rss_kib="$(measure_peak_rss_kib "$v2_full_sync_json" "$graph_cli_bin" sync --full)"
+  add_sample "$run_index" "cold_full_build" "cold_full_build" "v2" "true" "ms" "$(jq -r '.duration_ms' "$v2_full_sync_json")"
+  add_sample "$run_index" "resident_memory" "resident_memory" "v2" "true" "KiB" "$v2_rss_kib"
+
+  v2_noop_sync_json="$run_dir/v2-noop-sync.json"
+  "$graph_cli_bin" sync > "$v2_noop_sync_json"
+  add_sample "$run_index" "incremental_no_changes" "incremental_no_changes" "v2" "true" "ms" "$(jq -r '.duration_ms' "$v2_noop_sync_json")"
+
+  cat > "$touch_path" <<'RS'
+pub fn graph_bench_touch() -> u8 {
+    1
+}
+RS
+  v2_one_file_json="$run_dir/v2-one-file-sync.json"
+  "$graph_cli_bin" sync > "$v2_one_file_json"
+  add_sample "$run_index" "incremental_one_file_changed" "incremental_one_file_changed" "v2" "true" "ms" "$(jq -r '.duration_ms' "$v2_one_file_json")"
+
+  search_ms="$(measure_ms "$run_dir/v2-search.json" "$graph_cli_bin" search GraphBenchOptions --kind symbol --limit 20)"
+  add_sample "$run_index" "search" "search" "v2" "true" "ms" "$search_ms"
+
+  refs_selector="symbol:crates/orbit-knowledge/src/graph_bench.rs#run_benchmark_with_child_process:function"
+  refs_ms="$(measure_ms "$run_dir/v2-refs.json" "$graph_cli_bin" refs "$refs_selector" --confidence same_module)"
+  add_sample "$run_index" "refs" "refs" "v2" "true" "ms" "$refs_ms"
+
+  impact_selector="symbol:crates/orbit-knowledge/src/graph_bench.rs#append_scoreboard:function"
+  impact_ms="$(measure_ms "$run_dir/v2-impact.json" "$graph_cli_bin" impact "$impact_selector" --depth 3)"
+  add_sample "$run_index" "impact_depth_3" "impact_depth_3" "v2" "true" "ms" "$impact_ms"
+
+  v2_db_path_json="$run_dir/v2-db-path.json"
+  "$graph_cli_bin" db-path > "$v2_db_path_json"
+  db_path="$(jq -r '.path' "$v2_db_path_json")"
+  add_sample "$run_index" "db_size" "db_size" "v2" "true" "bytes" "$(file_size_bytes "$db_path")"
+done
+
+generated_at="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+git_sha="$(git rev-parse HEAD 2>/dev/null || echo unknown)"
+logical_core_count="$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 0)"
+runner_image="${GRAPH_BENCH_RUNNER_IMAGE:-ubuntu-24.04}"
+runner_os_release="$(if [[ -r /etc/os-release ]]; then . /etc/os-release && echo "${PRETTY_NAME:-unknown}"; else echo unknown; fi)"
+
+jq -s \
+  --arg generated_at "$generated_at" \
+  --arg git_sha "$git_sha" \
+  --argjson run_count "$run_count" \
+  --arg runner_image "$runner_image" \
+  --arg runner_os_release "$runner_os_release" \
+  --argjson logical_core_count "$logical_core_count" \
+  --argjson memory_mib "$(memory_mib)" '
+  # Middle element of the sorted samples; the lower middle for even counts.
+  def median:
+    sort
+    | .[((length - 1) / 2 | floor)];
+
+  sort_by(.id)
+  | group_by(.id)
+  | map(
+      sort_by(.run) as $items
+      | $items[0] as $first
+      | ($items | map(.value) | median) as $median
+      | {
+          id: $first.id,
+          baseline_id: $first.baseline_id,
+          implementation: $first.implementation,
+          gate: $first.gate,
+          unit: $first.unit,
+          statistic: ("median_of_" + ($items | length | tostring)),
+          value: $median,
+          samples: ($items | map({ run, value }))
+        }
+    )
+  | sort_by((.gate == false), .id)
+  | {
+      schema_version: 1,
+      generated_at: $generated_at,
+      git_sha: $git_sha,
+      run_count: $run_count,
+      gate: {
+        implementation: "v2",
+        regression_threshold_percent: 20,
+        statistic: "median"
+      },
+      runner: {
+        image: $runner_image,
+        os_release: $runner_os_release,
+        logical_core_count: $logical_core_count,
+        memory_mib: $memory_mib
+      },
+      rows: .
+    }
+  ' "$samples_path" > "$results_path"
+
+echo "Wrote $results_path"