danieljhkim · danieljhkim · May 25, 2026 · May 25, 2026
diff --git a/.github/workflows/ci-bench.yml b/.github/workflows/ci-bench.yml
@@ -0,0 +1,90 @@
+# Pull-request benchmark gate for the graph crates.
+name: Graph Bench
+
+on:
+  pull_request:
+    # labeled/unlabeled/edited are listed because the job reads the PR's
+    # labels and body when validating a baseline bump.
+    types:
+      - opened
+      - synchronize
+      - reopened
+      - labeled
+      - unlabeled
+      - edited
+    # Only Rust sources, Cargo manifests and lockfile, benchmark inputs, and
+    # this workflow file trigger a run.
+    paths:
+      - 'crates/**/*.rs'
+      - 'tools/**/*.rs'
+      - 'Cargo.toml'
+      - 'crates/**/Cargo.toml'
+      - 'tools/**/Cargo.toml'
+      - 'Cargo.lock'
+      - 'bench/baselines.json'
+      - 'bench/check_baseline.sh'
+      - 'bench/run_graph_bench_ci.sh'
+      - '.github/workflows/ci-bench.yml'
+
+# The job only needs to read repository contents.
+permissions:
+  contents: read
+
+# NOTE(review): the GRAPH_BENCH_* values are presumably consumed by
+# bench/run_graph_bench_ci.sh, which is not visible here; confirm.
+env:
+  CARGO_TERM_COLOR: always
+  GRAPH_BENCH_RUNS: 3
+  GRAPH_BENCH_RUNNER_IMAGE: ubuntu-24.04
+jobs:
+  ci-bench:
+    name: Graph Bench
+    runs-on: ubuntu-24.04
+    steps:
+      # Check out the exact PR head commit from the event payload. A branch
+      # name (github.head_ref) is resolved in this repository, so for PRs from
+      # forks it is either missing or names an unrelated branch, which would
+      # fail the checkout or benchmark the wrong code.
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      # Install the Rust toolchain from a commit-pinned action revision.
+      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable (2026-05-07)
+
+      # Cache the cargo registry, git checkouts, and build output. The key is
+      # derived from the Cargo.lock hash; the restore-keys allow falling back
+      # to an older graph-bench cache and then to any "<os>-cargo-" cache.
+      - uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-graph-bench-cargo-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-graph-bench-cargo-
+            ${{ runner.os }}-cargo-
+
+      # Fail early unless the runner reports exactly 4 online CPUs and between
+      # 15000 and 17000 MiB of total memory, so results stay comparable with
+      # the committed baselines.
+      - name: Verify runner profile
+        run: |
+          set -euo pipefail
+          cores="$(getconf _NPROCESSORS_ONLN)"
+          memory_mib="$(awk '/MemTotal/ { print int($2 / 1024) }' /proc/meminfo)"
+          if [[ "$cores" -ne 4 || "$memory_mib" -lt 15000 || "$memory_mib" -gt 17000 ]]; then
+            echo "::error::Graph benchmark requires ubuntu-24.04 with 4 vCPU and approximately 16GB RAM; saw ${cores} cores and ${memory_mib} MiB."
+            exit 1
+          fi
+
+      # NOTE(review): the script is not visible here; later steps expect it to
+      # write target/bench/results.json.
+      - name: Run graph benchmark
+        run: ./bench/run_graph_bench_ci.sh
+
+      # always() keeps the upload running after an earlier step failed; the
+      # hashFiles() guard skips it when no results file was produced.
+      - name: Upload graph benchmark results
+        if: always() && hashFiles('target/bench/results.json') != ''
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
+        with:
+          name: graph-bench-results
+          path: target/bench/results.json
+          if-no-files-found: error
+
+      # Detect the bench-baseline-bump label and, when it is present, require a
+      # justification line in the PR body. The label state is exposed as the
+      # has_bump_label step output for the two steps that follow.
+      - name: Validate baseline bump justification
+        id: baseline_bump
+        run: |
+          set -euo pipefail
+          has_bump_label="$(jq -r 'any(.pull_request.labels[]?; .name == "bench-baseline-bump")' "$GITHUB_EVENT_PATH")"
+          echo "has_bump_label=${has_bump_label}" >> "$GITHUB_OUTPUT"
+
+          if [[ "$has_bump_label" == "true" ]]; then
+            # Test each body line on its own. jq has no "m" regex flag (an
+            # unknown flag makes test() raise an error, which would reject
+            # every labelled PR), and per-line matching also keeps the reason
+            # on the same line as the marker.
+            if ! jq -e '(.pull_request.body // "") | any(split("\n")[]; test("^\\s*bench-baseline-bump\\s*:\\s*\\S"; "i"))' "$GITHUB_EVENT_PATH" >/dev/null; then
+              echo "::error::PRs labeled bench-baseline-bump must include a one-line PR body justification like 'bench-baseline-bump: <reason>'."
+              exit 1
+            fi
+          fi
+
+      # Regression gate: runs only when the bump label is absent.
+      - name: Check graph benchmark baseline
+        if: steps.baseline_bump.outputs.has_bump_label != 'true'
+        run: ./bench/check_baseline.sh --results target/bench/results.json --baseline bench/baselines.json
+
+      # Leave a visible log line when the gate was bypassed by the label.
+      - name: Note baseline bump bypass
+        if: steps.baseline_bump.outputs.has_bump_label == 'true'
+        run: echo "bench-baseline-bump label present; baseline regression check bypassed after PR-body justification validation."
diff --git a/bench/README.md b/bench/README.md
@@ -36,9 +36,42 @@ P6.2 owns wiring the permanent `graph_bench.rs` CI gate. Once that lands, use
 the gate artifact under `target/bench/` as the capture source instead of an
 ad-hoc capture script.
 
+## CI Regression Gate
+
+The `Graph Bench / Graph Bench` PR job runs on the pinned
+`ubuntu-24.04` runner profile from the spec. The workflow is path-filtered to
+Rust source, Cargo, graph-bench, and workflow files so docs-only PRs do not
+spend a benchmark runner.
+
+The job builds the release benchmark binaries, runs the graph benchmark three
+times, and writes `target/bench/results.json`. Each reported row stores the raw
+samples and the median value used for the gate. The artifact is uploaded as
+`graph-bench-results` on the PR's Actions page.
+
+Gate decision for ORB-00321: v2 `orbit-graph` rows are gated against
+`bench/baselines.json`; v1 `orbit-knowledge` rows from `graph_bench.rs` remain
+informational in the artifact. GRAPH_SPEC section 12 sets budgets for the v2
+implementation, while the v1 rows are retained during the transition so
+reviewers can compare old and new behavior.
+
+`bench/check_baseline.sh` compares every gated row in
+`target/bench/results.json` against the committed baseline row with the same
+ID, or with the row's `baseline_id` when one is set. A row fails when its
+median value is more than 20 percent slower than the baseline value.
+
 ## Updating Baselines
 
 A PR that changes `bench/baselines.json` must have the
 `bench-baseline-bump` label and a one-line justification in the PR body.
+Use a line in this exact form so CI can verify it:
+
+```text
+bench-baseline-bump: <why this baseline change is intentional>
+```
+
+When the label and justification are present, CI still runs the benchmark and
+uploads `target/bench/results.json`, but skips the regression check. Without
+the label, the gate runs normally; with the label but no justification line,
+the job fails the PR-body validation.
 Routine performance wins should bump the baseline down. Routine performance
 drift should not bump the baseline; fix the regression instead.
diff --git a/bench/check_baseline.sh b/bench/check_baseline.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+repo_root="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$repo_root"
+
+baseline_path="bench/baselines.json"
+results_path="target/bench/results.json"
+threshold_percent="20"
+
+# Print the command-line synopsis and a short description to stdout.
+usage() {
+  printf '%s\n' \
+    'Usage: bench/check_baseline.sh [--baseline PATH] [--results PATH] [--threshold-percent N]' \
+    '' \
+    'Compares gated rows in target/bench/results.json against bench/baselines.json.' \
+    'Rows fail when their median value is more than N percent slower than baseline.'
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --baseline)
+      baseline_path="$2"
+      shift 2
+      ;;
+    --results)
+      results_path="$2"
+      shift 2
+      ;;
+    --threshold-percent)
+      threshold_percent="$2"
+      shift 2
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "unknown argument: $1" >&2
+      usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+if [[ ! -f "$baseline_path" ]]; then
+  echo "baseline file not found: $baseline_path" >&2
+  exit 2
+fi
+
+if [[ ! -f "$results_path" ]]; then
+  echo "results file not found: $results_path" >&2
+  exit 2
+fi
+
+report="$(
+  jq -n \
+    --slurpfile baseline "$baseline_path" \
+    --slurpfile results "$results_path" \
+    --argjson threshold_percent "$threshold_percent" '
+      ($baseline[0].rows // []) as $baseline_rows
+      | ($results[0].rows // []) as $result_rows
+      | (($threshold_percent + 100) / 100) as $multiplier
+      | def baseline_row($id):
+          first($baseline_rows[] | select(.id == $id));
+        [ $result_rows[] | select(.gate != false) ] as $gated
+      | {
+          checked: [
+            $gated[] as $row
+            | ($row.baseline_id // $row.id) as $baseline_id
+            | (baseline_row($baseline_id)) as $baseline
+            | select($baseline != null and (($baseline.baseline.value // null) != null))
+            | {
+                id: $row.id,
+                baseline_id: $baseline_id,
+                implementation: ($row.implementation // "unknown"),
+                value: $row.value,
+                baseline_value: $baseline.baseline.value,
+                unit: ($row.unit // ""),
+                baseline_unit: ($baseline.baseline.unit // ""),
+                limit: ($baseline.baseline.value * $multiplier),
+                slower_percent: (
+                  if ($baseline.baseline.value | tonumber) > 0 then
+                    (($row.value - $baseline.baseline.value) / $baseline.baseline.value * 100)
+                  else
+                    null
+                  end
+                )
+              }
+          ],
+          missing: [
+            $gated[] as $row
+            | ($row.baseline_id // $row.id) as $baseline_id
+            | select((baseline_row($baseline_id)) == null)
+            | { id: $row.id, baseline_id: $baseline_id }
+          ],
+          invalid_baselines: [
+            $gated[] as $row
+            | ($row.baseline_id // $row.id) as $baseline_id
+            | (baseline_row($baseline_id)) as $baseline
+            | select($baseline != null and (($baseline.baseline.value // null) == null or ($baseline.baseline.value | tonumber) <= 0))
+            | { id: $row.id, baseline_id: $baseline_id }
+          ]
+        }
+      | . + {
+          unit_mismatches: [
+            .checked[]
+            | select(.unit != .baseline_unit)
+            | { id, unit, baseline_unit }
+          ],
+          regressions: [
+            .checked[]
+            | select(.value > .limit)
+          ]
+        }
+    '
+)"
+
+# Print one summary line per row that was compared against a baseline.
+echo "$report" | jq -r '
+  .checked[]
+  | "\(.id) [\(.implementation)]: \(.value)\(.unit) vs baseline \(.baseline_value)\(.baseline_unit) (limit \(.limit | floor)\(.baseline_unit))"
+'
+
+# Report concrete failures before the "nothing checked" case: a row whose
+# baseline has no value is excluded from .checked and should be reported as an
+# invalid baseline rather than as an absence of gated rows.
+if ! echo "$report" | jq -e '
+  (.missing | length) == 0
+  and (.invalid_baselines | length) == 0
+  and (.unit_mismatches | length) == 0
+  and (.regressions | length) == 0
+' >/dev/null; then
+  # slower_percent is null when the baseline value is not positive; print
+  # "n/a" instead of letting floor fail on null and cut the report short.
+  echo "$report" | jq -r '
+    (.missing[]? | "missing baseline for result row \(.id) (baseline_id=\(.baseline_id))"),
+    (.invalid_baselines[]? | "invalid baseline for result row \(.id) (baseline_id=\(.baseline_id))"),
+    (.unit_mismatches[]? | "unit mismatch for \(.id): result \(.unit), baseline \(.baseline_unit)"),
+    (.regressions[]? | "regression: \(.id) [\(.implementation)] \(.value)\(.unit) is \(if .slower_percent == null then "n/a" else (.slower_percent | floor) end)% slower than baseline \(.baseline_value)\(.baseline_unit)")
+  ' >&2
+  exit 1
+fi
+
+# With no failures recorded, an empty .checked list means nothing was gated.
+if [[ "$(echo "$report" | jq '.checked | length')" -eq 0 ]]; then
+  echo "no gated result rows found in $results_path" >&2
+  exit 1
+fi
+
+echo "graph benchmark baseline check passed"