Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions .github/workflows/ci-bench.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Pull-request workflow that runs the graph benchmark and gates it against
# the committed baselines.
name: Graph Bench

# Runs on PR lifecycle events and also on label changes and PR edits; the
# baseline-bump validation step in this workflow reads the PR labels and body
# from the event payload.
on:
pull_request:
types: [opened, synchronize, reopened, labeled, unlabeled, edited]
# Path filter: Rust sources, Cargo manifests and lockfile, the benchmark
# baselines and scripts, and this workflow file.
paths:
- "crates/**/*.rs"
- "tools/**/*.rs"
- "Cargo.toml"
- "crates/**/Cargo.toml"
- "tools/**/Cargo.toml"
- "Cargo.lock"
- "bench/baselines.json"
- "bench/check_baseline.sh"
- "bench/run_graph_bench_ci.sh"
- ".github/workflows/ci-bench.yml"

# The job token only gets read access to repository contents.
permissions:
contents: read

env:
CARGO_TERM_COLOR: always
# NOTE(review): the two GRAPH_BENCH_* values are not read anywhere in this
# workflow file; presumably bench/run_graph_bench_ci.sh consumes them - confirm.
GRAPH_BENCH_RUNS: 3
GRAPH_BENCH_RUNNER_IMAGE: ubuntu-24.04

# Single job: check out the PR, set up Rust and the cargo cache, verify the
# runner hardware, run the benchmark, and upload its results file.
jobs:
ci-bench:
name: Graph Bench
runs-on: ubuntu-24.04
steps:
# NOTE(review): `ref` is the PR head branch name rather than the default merge
# ref. For PRs opened from forks that branch may not exist in this repository -
# confirm fork PRs are out of scope or switch to the head SHA.
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
with:
ref: ${{ github.head_ref }}
token: ${{ secrets.GITHUB_TOKEN }}

- uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable (2026-05-07)

# Cache the cargo registry, cargo git checkouts and the target directory,
# keyed on the hash of all Cargo.lock files, with two broader fallback keys.
- uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
with:
path: |
~/.cargo/registry
~/.cargo/git
target
key: ${{ runner.os }}-graph-bench-cargo-${{ hashFiles('**/Cargo.lock') }}
restore-keys: |
${{ runner.os }}-graph-bench-cargo-
${{ runner.os }}-cargo-

# Fail the job unless the runner reports exactly 4 online processors and a
# MemTotal between 15000 and 17000 MiB.
- name: Verify runner profile
run: |
set -euo pipefail
cores="$(getconf _NPROCESSORS_ONLN)"
memory_mib="$(awk '/MemTotal/ { print int($2 / 1024) }' /proc/meminfo)"
if [[ "$cores" -ne 4 || "$memory_mib" -lt 15000 || "$memory_mib" -gt 17000 ]]; then
echo "::error::Graph benchmark requires ubuntu-24.04 with 4 vCPU and approximately 16GB RAM; saw ${cores} cores and ${memory_mib} MiB."
exit 1
fi

- name: Run graph benchmark
run: ./bench/run_graph_bench_ci.sh

# always() lets the upload run after a failed earlier step; the hashFiles
# guard skips it when no results file was produced.
- name: Upload graph benchmark results
if: always() && hashFiles('target/bench/results.json') != ''
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: graph-bench-results
path: target/bench/results.json
if-no-files-found: error

# Exposes `has_bump_label` ("true"/"false") as a step output and, when the
# bench-baseline-bump label is present, requires a justification line of the
# form `bench-baseline-bump: <reason>` somewhere in the PR body.
- name: Validate baseline bump justification
  id: baseline_bump
  run: |
    set -euo pipefail
    has_bump_label="$(jq -r 'any(.pull_request.labels[]?; .name == "bench-baseline-bump")' "$GITHUB_EVENT_PATH")"
    echo "has_bump_label=${has_bump_label}" >> "$GITHUB_OUTPUT"

    if [[ "$has_bump_label" == "true" ]]; then
      # jq has no "m" regex modifier (valid ones are g, i, x, s, n, p, l); an
      # unknown modifier makes test() raise an error, which would fail this
      # check even for a correct PR body. Split the body into lines and match
      # each line on its own so "^" anchors at a line start and the reason
      # must sit on the same line as the marker (a trailing CR is whitespace
      # and therefore never satisfies \S).
      if ! jq -e '
        (.pull_request.body // "")
        | split("\n")
        | any(test("^\\s*bench-baseline-bump\\s*:\\s*\\S"; "i"))
      ' "$GITHUB_EVENT_PATH" >/dev/null; then
        echo "::error::PRs labeled bench-baseline-bump must include a one-line PR body justification like 'bench-baseline-bump: <reason>'."
        exit 1
      fi
    fi

# Regression gate: runs only when the baseline_bump step did not report the
# bench-baseline-bump label.
- name: Check graph benchmark baseline
if: steps.baseline_bump.outputs.has_bump_label != 'true'
run: ./bench/check_baseline.sh --results target/bench/results.json --baseline bench/baselines.json

# Counterpart of the gate: when the label is present, only log that the
# regression check was skipped.
- name: Note baseline bump bypass
if: steps.baseline_bump.outputs.has_bump_label == 'true'
run: echo "bench-baseline-bump label present; baseline regression check bypassed after PR-body justification validation."
33 changes: 33 additions & 0 deletions bench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,42 @@ P6.2 owns wiring the permanent `graph_bench.rs` CI gate. Once that lands, use
the gate artifact under `target/bench/` as the capture source instead of an
ad-hoc capture script.

## CI Regression Gate

The `Graph Bench / Graph Bench` PR job runs on the pinned
`ubuntu-24.04` runner profile from the spec. The workflow is path-filtered to
Rust source, Cargo, graph-bench, and workflow files so docs-only PRs do not
spend a benchmark runner.

The job builds the release benchmark binaries, runs the graph benchmark three
times, and writes `target/bench/results.json`. Each reported row stores the raw
samples and the median value used for the gate. The artifact is uploaded as
`graph-bench-results` on the PR's Actions page.

Gate decision for ORB-00321: v2 `orbit-graph` rows are gated against
`bench/baselines.json`; v1 `orbit-knowledge` rows from `graph_bench.rs` remain
informational in the artifact. GRAPH_SPEC section 12 sets budgets for the v2
implementation, while the v1 rows are retained during the transition so
reviewers can compare old and new behavior.

`bench/check_baseline.sh` compares every gated row in
`target/bench/results.json` against the committed baseline row with the same
ID. A row fails when its median value is more than 20 percent slower than the
baseline value.

## Updating Baselines

A PR that changes `bench/baselines.json` must have the
`bench-baseline-bump` label and a one-line justification in the PR body.
Use a line in this exact form so CI can verify it:

```text
bench-baseline-bump: <why this baseline change is intentional>
```

When the label and justification are present, CI still runs the benchmark and
uploads `target/bench/results.json`, but skips the regression check. Without
the label, the gate runs normally. With the label but no justification line,
the PR-body validation step fails the job.
Routine performance wins should bump the baseline down. Routine performance
drift should not bump the baseline; fix the regression instead.
144 changes: 144 additions & 0 deletions bench/check_baseline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/env bash
# Regression gate for the graph benchmark: compares benchmark result rows
# against the committed baseline rows.
set -euo pipefail

# Work from the repository root (the parent of this script's directory) so the
# relative default paths below resolve regardless of the caller's cwd.
script_dir="$(dirname "$0")"
repo_root="$(cd "${script_dir}/.." && pwd)"
cd "$repo_root"

# Defaults; the command-line options may override each of them.
threshold_percent="20"
results_path="target/bench/results.json"
baseline_path="bench/baselines.json"

# Print the command-line synopsis and a two-line description to stdout.
usage() {
  printf '%s\n' \
    'Usage: bench/check_baseline.sh [--baseline PATH] [--results PATH] [--threshold-percent N]' \
    '' \
    'Compares gated rows in target/bench/results.json against bench/baselines.json.' \
    'Rows fail when their median value is more than N percent slower than baseline.'
}

#######################################
# Parse command-line options into the script-level settings.
# Globals:   baseline_path, results_path, threshold_percent (written)
# Arguments: the script's command-line arguments
# Outputs:   usage text on stdout for -h/--help; diagnostics plus usage on
#            stderr for a bad invocation
# Returns:   exits 0 after --help and 2 on an unknown argument or an option
#            that is missing its value; otherwise returns 0
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --baseline | --results | --threshold-percent)
        # Under `set -u` a bare "$2" would abort with an opaque
        # "unbound variable" error, so check for the value explicitly.
        if [[ $# -lt 2 ]]; then
          echo "missing value for $1" >&2
          usage >&2
          exit 2
        fi
        case "$1" in
          --baseline) baseline_path="$2" ;;
          --results) results_path="$2" ;;
          --threshold-percent) threshold_percent="$2" ;;
        esac
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"

# Both inputs must be regular files before they are handed to jq; a missing
# input is reported on stderr and exits with status 2. The baseline is checked
# first.
[[ -f "$baseline_path" ]] || {
  echo "baseline file not found: $baseline_path" >&2
  exit 2
}

[[ -f "$results_path" ]] || {
  echo "results file not found: $results_path" >&2
  exit 2
}

# Join every gated result row (gate != false) with its baseline row and
# classify it. The JSON report has these keys:
#   gated_count        number of gated result rows
#   checked            rows whose baseline value is non-null and positive
#   missing            rows whose baseline id has no row in the baseline file
#   invalid_baselines  rows whose baseline row has a null or non-positive value
#   unit_mismatches    checked rows whose unit differs from the baseline unit
#   regressions        checked rows whose value exceeds baseline * (1 + N/100)
# Every gated row lands in exactly one of checked / missing / invalid_baselines.
report="$(
  jq -n \
    --slurpfile baseline "$baseline_path" \
    --slurpfile results "$results_path" \
    --argjson threshold_percent "$threshold_percent" '
    ($baseline[0].rows // []) as $baseline_rows
    | ($results[0].rows // []) as $result_rows
    | (($threshold_percent + 100) / 100) as $multiplier
    # first() emits nothing at all when no row matches, so fall back to null
    # explicitly; without it the missing-baseline filter below never matches
    # and rows without a baseline would pass silently.
    | def baseline_row($id):
        first($baseline_rows[] | select(.id == $id)) // null;
      def usable($b):
        $b != null
        and (($b.baseline.value // null) != null)
        and (($b.baseline.value | tonumber) > 0);
      [ $result_rows[] | select(.gate != false) ] as $gated
    | {
        gated_count: ($gated | length),
        checked: [
          $gated[] as $row
          | ($row.baseline_id // $row.id) as $baseline_id
          | baseline_row($baseline_id) as $baseline
          | select(usable($baseline))
          | {
              id: $row.id,
              baseline_id: $baseline_id,
              implementation: ($row.implementation // "unknown"),
              value: $row.value,
              baseline_value: $baseline.baseline.value,
              unit: ($row.unit // ""),
              baseline_unit: ($baseline.baseline.unit // ""),
              limit: ($baseline.baseline.value * $multiplier),
              slower_percent: (
                ($row.value - $baseline.baseline.value)
                / $baseline.baseline.value * 100
              )
            }
        ],
        missing: [
          $gated[] as $row
          | ($row.baseline_id // $row.id) as $baseline_id
          | select(baseline_row($baseline_id) == null)
          | { id: $row.id, baseline_id: $baseline_id }
        ],
        invalid_baselines: [
          $gated[] as $row
          | ($row.baseline_id // $row.id) as $baseline_id
          | baseline_row($baseline_id) as $baseline
          | select($baseline != null and (usable($baseline) | not))
          | { id: $row.id, baseline_id: $baseline_id }
        ]
      }
    | . + {
        unit_mismatches: [
          .checked[]
          | select(.unit != .baseline_unit)
          | { id, unit, baseline_unit }
        ],
        regressions: [
          .checked[]
          | select(.value > .limit)
        ]
      }
    '
)"

# One summary line per comparable row, printed whether or not the gate passes.
jq -r '
  .checked[]
  | "\(.id) [\(.implementation)]: \(.value)\(.unit) vs baseline \(.baseline_value)\(.baseline_unit) (limit \(.limit | floor)\(.baseline_unit))"
' <<<"$report"

# An empty gate is a failure: it means the results file carried nothing to
# check. Rows that are gated but lack a usable baseline are reported by the
# failure branch below instead of being mislabelled as "no gated rows".
if [[ "$(jq '.gated_count' <<<"$report")" -eq 0 ]]; then
  echo "no gated result rows found in $results_path" >&2
  exit 1
fi

if ! jq -e '
  (.missing | length) == 0
  and (.invalid_baselines | length) == 0
  and (.unit_mismatches | length) == 0
  and (.regressions | length) == 0
' <<<"$report" >/dev/null; then
  jq -r '
    (.missing[]? | "missing baseline for result row \(.id) (baseline_id=\(.baseline_id))"),
    (.invalid_baselines[]? | "invalid baseline for result row \(.id) (baseline_id=\(.baseline_id))"),
    (.unit_mismatches[]? | "unit mismatch for \(.id): result \(.unit), baseline \(.baseline_unit)"),
    (.regressions[]? | "regression: \(.id) [\(.implementation)] \(.value)\(.unit) is \(.slower_percent | floor)% slower than baseline \(.baseline_value)\(.baseline_unit)")
  ' <<<"$report" >&2
  exit 1
fi

echo "graph benchmark baseline check passed"
Loading
Loading