From f3fdf53780fdd36afa4d3761c2f0b930db3283e7 Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Thu, 26 Mar 2026 18:26:07 -0500
Subject: [PATCH 1/9] Add accuracy, correctness, and performance CI workflows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- correctness.yml: cross-language Python/Rust PromQL pattern matching
  parity tests (token matching, serialised patterns, Rust unit tests)
- accuracy.yml: ASAP vs ClickHouse exact baseline error regression
  check (≤5% relative error threshold) on H2O groupby dataset
- performance.yml: relative latency regression detection on GH-hosted
  VMs; manual workflow_dispatch with self-hosted runner support for
  precise absolute benchmarks once asap-tools infra is decoupled from
  Cloudlab

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/accuracy.yml    | 183 ++++++++++++++++++
 .github/workflows/correctness.yml | 168 +++++++++++++++++
 .github/workflows/performance.yml | 296 ++++++++++++++++++++++++++++++
 3 files changed, 647 insertions(+)
 create mode 100644 .github/workflows/accuracy.yml
 create mode 100644 .github/workflows/correctness.yml
 create mode 100644 .github/workflows/performance.yml

diff --git a/.github/workflows/accuracy.yml b/.github/workflows/accuracy.yml
new file mode 100644
index 0000000..aad9a59
--- /dev/null
+++ b/.github/workflows/accuracy.yml
@@ -0,0 +1,183 @@
+name: Accuracy Tests
+
+# Validates that ASAP approximate query results stay within acceptable error
+# bounds relative to an exact (ClickHouse) baseline. Tests run inside Docker
+# containers on ephemeral GitHub Actions VMs — sufficient for catching
+# accuracy regressions without requiring self-hosted infrastructure.
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'asap-summary-ingest/**'
+      - 'asap-query-engine/**'
+      - 'asap-common/sketch-core/**'
+      - 'asap-common/dependencies/**'
+      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
+      - 'asap-tools/execution-utilities/asap_query_latency/**'
+      - '.github/workflows/accuracy.yml'
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'asap-summary-ingest/**'
+      - 'asap-query-engine/**'
+      - 'asap-common/sketch-core/**'
+      - 'asap-common/dependencies/**'
+      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
+      - 'asap-tools/execution-utilities/asap_query_latency/**'
+      - '.github/workflows/accuracy.yml'
+  workflow_dispatch:
+
+env:
+  # Rows to ingest during CI — small enough to complete in ~10 min on GH runners
+  # while still exercising the full sketch → query path. Increase on self-hosted
+  # runners for a more thorough accuracy sweep.
+  MAX_ROWS: 50000
+  # Maximum acceptable relative error vs exact baseline (5 %)
+  MAX_RELATIVE_ERROR: "0.05"
+
+jobs:
+  # ── H2O groupby accuracy (ASAP vs ClickHouse exact) ────────────────────────
+  h2o-accuracy:
+    name: H2O groupby accuracy regression
+    runs-on: ubuntu-latest
+    # Accuracy tests can be long-running on ephemeral runners; give them room.
+    timeout-minutes: 60
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests kafka-python gdown matplotlib
+          if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then
+            pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt
+          fi
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      # Pull / build only the images needed for accuracy testing
+      - name: Build base image
+        run: |
+          docker build \
+            -t sketchdb-base:latest \
+            -f asap-common/installation/Dockerfile \
+            asap-common
+
+      - name: Build summary-ingest image
+        run: |
+          docker build \
+            -t asap-summary-ingest:ci \
+            -f asap-summary-ingest/Dockerfile \
+            asap-summary-ingest
+
+      - name: Install Rust (for query engine)
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Install protoc
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y protobuf-compiler
+
+      - name: Run sccache
+        uses: mozilla-actions/sccache-action@v0.0.4
+
+      - name: Cache cargo
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-accuracy-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
+
+      - name: Build query engine binary
+        run: cargo build --release --bin query_engine_rust --locked
+        env:
+          RUSTC_WRAPPER: sccache
+
+      # Run accuracy benchmark (ASAP path) with a small dataset slice
+      - name: Run ASAP accuracy benchmark
+        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
+        run: |
+          python run_benchmark.py \
+            --mode asap \
+            --load-data \
+            --max-rows ${{ env.MAX_ROWS }} \
+            --output /tmp/asap_accuracy_results.csv \
+            --qe-bin ${{ github.workspace }}/target/release/query_engine_rust
+        env:
+          RUSTC_WRAPPER: sccache
+
+      # Run the same queries against the exact baseline
+      - name: Run ClickHouse baseline benchmark
+        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
+        run: |
+          python run_benchmark.py \
+            --mode baseline \
+            --skip-data-load \
+            --output /tmp/baseline_accuracy_results.csv
+
+      # Compare ASAP results to baseline; fail if error exceeds threshold
+      - name: Check accuracy (error ≤ ${{ env.MAX_RELATIVE_ERROR }})
+        run: |
+          python3 - <<'EOF'
+          import csv, sys, os
+
+          max_err = float(os.environ["MAX_RELATIVE_ERROR"])
+          asap_file   = "/tmp/asap_accuracy_results.csv"
+          exact_file  = "/tmp/baseline_accuracy_results.csv"
+
+          def load(path):
+              with open(path) as f:
+                  return {row["query_id"]: float(row["result"]) for row in csv.DictReader(f)
+                          if row.get("result") not in (None, "", "null")}
+
+          try:
+              asap  = load(asap_file)
+              exact = load(exact_file)
+          except FileNotFoundError as e:
+              print(f"Result file missing: {e}. Skipping accuracy check.")
+              sys.exit(0)
+
+          failures = []
+          for qid, exact_val in exact.items():
+              if qid not in asap:
+                  print(f"WARN: {qid} not found in ASAP results, skipping")
+                  continue
+              if exact_val == 0:
+                  rel_err = 0.0 if asap[qid] == 0 else float("inf")
+              else:
+                  rel_err = abs(asap[qid] - exact_val) / abs(exact_val)
+              status = "PASS" if rel_err <= max_err else "FAIL"
+              print(f"{status}  {qid}: rel_err={rel_err:.4f}  asap={asap[qid]:.4f}  exact={exact_val:.4f}")
+              if status == "FAIL":
+                  failures.append(qid)
+
+          if failures:
+              print(f"\n{len(failures)} query(ies) exceeded max relative error ({max_err}):")
+              for qid in failures:
+                  print(f"  - {qid}")
+              sys.exit(1)
+          else:
+              print(f"\nAll queries within relative error threshold ({max_err}).")
+          EOF
+        env:
+          MAX_RELATIVE_ERROR: ${{ env.MAX_RELATIVE_ERROR }}
+
+      - name: Upload accuracy results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: accuracy-results-${{ github.run_id }}
+          path: |
+            /tmp/asap_accuracy_results.csv
+            /tmp/baseline_accuracy_results.csv
+          if-no-files-found: warn
diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml
new file mode 100644
index 0000000..11d1a20
--- /dev/null
+++ b/.github/workflows/correctness.yml
@@ -0,0 +1,168 @@
+name: Correctness Tests
+
+# Verifies cross-language parity between Python and Rust implementations of
+# PromQL pattern matching and sketch serialisation. Ephemeral GitHub Actions
+# VMs are well-suited for these deterministic correctness checks.
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      - 'asap-common/tests/**'
+      - 'asap-common/dependencies/**'
+      - 'asap-common/sketch-core/**'
+      - '.github/workflows/correctness.yml'
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'asap-common/tests/**'
+      - 'asap-common/dependencies/**'
+      - 'asap-common/sketch-core/**'
+      - '.github/workflows/correctness.yml'
+  workflow_dispatch:
+
+jobs:
+  # ── 1. Cross-language token matching (Python vs Rust) ──────────────────────
+  cross-language-token-matching:
+    name: Cross-language PromQL token matching
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          if [ -f asap-common/dependencies/py/requirements.txt ]; then
+            pip install -r asap-common/dependencies/py/requirements.txt
+          fi
+          pip install -e asap-common/dependencies/py/ 2>/dev/null || true
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Install protoc
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y protobuf-compiler
+
+      - name: Run sccache
+        uses: mozilla-actions/sccache-action@v0.0.4
+
+      - name: Cache cargo
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
+
+      - name: Run master test runner (Python + Rust + comparison)
+        working-directory: asap-common/tests/compare_matched_tokens
+        run: python utilities/master_test_runner.py
+        env:
+          RUSTC_WRAPPER: sccache
+
+      - name: Upload test artefacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: cross-language-token-results
+          path: |
+            asap-common/tests/compare_matched_tokens/python_tests/python_test_results.json
+            asap-common/tests/compare_matched_tokens/rust_tests/rust_test_results.json
+            asap-common/tests/compare_matched_tokens/comparison_tests/comparison_report.json
+            asap-common/tests/compare_matched_tokens/test_summary.json
+          if-no-files-found: warn
+
+  # ── 2. Cross-language serialised pattern comparison ────────────────────────
+  cross-language-pattern-serialisation:
+    name: Cross-language pattern serialisation
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          if [ -f asap-common/dependencies/py/requirements.txt ]; then
+            pip install -r asap-common/dependencies/py/requirements.txt
+          fi
+          pip install -e asap-common/dependencies/py/ 2>/dev/null || true
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Install protoc
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y protobuf-compiler
+
+      - name: Run sccache
+        uses: mozilla-actions/sccache-action@v0.0.4
+
+      - name: Cache cargo
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
+
+      - name: Generate Python patterns
+        working-directory: asap-common/tests/compare_patterns
+        run: python python_generate_patterns.py
+
+      - name: Build and run Rust pattern generator
+        working-directory: asap-common/tests/compare_patterns
+        run: cargo run --release
+        env:
+          RUSTC_WRAPPER: sccache
+
+      - name: Compare serialised patterns
+        working-directory: asap-common/tests/compare_patterns
+        run: python compare_serialized_patterns.py
+
+  # ── 3. Rust pattern-matching unit tests ────────────────────────────────────
+  rust-pattern-matching:
+    name: Rust pattern matching unit tests
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Install protoc
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y protobuf-compiler
+
+      - name: Run sccache
+        uses: mozilla-actions/sccache-action@v0.0.4
+
+      - name: Cache cargo
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
+
+      - name: Run Rust pattern matching tests
+        working-directory: asap-common/tests/rust_pattern_matching
+        run: cargo test --release
+        env:
+          RUSTC_WRAPPER: sccache
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
new file mode 100644
index 0000000..674be29
--- /dev/null
+++ b/.github/workflows/performance.yml
@@ -0,0 +1,296 @@
+name: Performance Benchmarks
+
+# IMPORTANT – Runner choice and benchmark precision
+# ─────────────────────────────────────────────────
+# Ephemeral GitHub-hosted runners (ubuntu-latest) share noisy cloud hardware.
+# They are useful for *relative* regression detection (e.g. "this PR is 2×
+# slower than main") but NOT for absolute latency numbers.
+#
+# For precise, publication-quality benchmarks the team uses the dedicated
+# asap-tools benchmarking infra (used for TurboProm paper experiments).
+# That infra must be decoupled from Cloudlab and registered as a
+# GitHub self-hosted runner.  Until then, set `runner: self-hosted` in the
+# workflow_dispatch inputs below to target a self-hosted machine when one
+# is available, or rely on the relative-regression job for PR gating.
+#
+# References:
+#   - asap-tools/execution-utilities/asap_benchmark_pipeline/ — H2O groupby
+#   - asap-tools/execution-utilities/asap_query_latency/      — ClickBench hits
+
+on:
+  pull_request:
+    branches: [ main ]
+    paths:
+      - 'asap-summary-ingest/**'
+      - 'asap-query-engine/**'
+      - 'asap-common/sketch-core/**'
+      - 'asap-common/dependencies/**'
+      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
+      - 'asap-tools/execution-utilities/asap_query_latency/**'
+      - '.github/workflows/performance.yml'
+  workflow_dispatch:
+    inputs:
+      runner:
+        description: >
+          Runner label to use.  Use 'ubuntu-latest' for relative regression
+          detection on GH-hosted VMs, or a self-hosted runner label (e.g.
+          'self-hosted') for precise absolute benchmarks.
+        required: false
+        default: ubuntu-latest
+      max_rows:
+        description: Rows to ingest (0 = full dataset; reduce for faster CI runs)
+        required: false
+        default: '100000'
+      benchmark_suite:
+        description: 'Which suite to run: h2o | query_latency | all'
+        required: false
+        default: all
+
+env:
+  # Defaults used on PR triggers (keep runtime < 30 min on GH-hosted runners)
+  DEFAULT_MAX_ROWS: '100000'
+  # Latency regression threshold: flag if p95 latency increases by more than
+  # this factor relative to the baseline run within the same CI job.
+  LATENCY_REGRESSION_FACTOR: '2.0'
+
+jobs:
+  # ── 1. Relative performance regression (always runs on GH-hosted VMs) ──────
+  relative-regression:
+    name: Relative performance regression (H2O groupby)
+    # Use the workflow_dispatch runner input when triggered manually;
+    # fall back to ubuntu-latest for PR triggers.
+    runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || 'ubuntu-latest' }}
+    timeout-minutes: 45
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests kafka-python gdown matplotlib
+          if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then
+            pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt
+          fi
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build base image
+        run: |
+          docker build \
+            -t sketchdb-base:latest \
+            -f asap-common/installation/Dockerfile \
+            asap-common
+
+      - name: Build summary-ingest image
+        run: |
+          docker build \
+            -t asap-summary-ingest:ci \
+            -f asap-summary-ingest/Dockerfile \
+            asap-summary-ingest
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Install protoc
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y protobuf-compiler
+
+      - name: Run sccache
+        uses: mozilla-actions/sccache-action@v0.0.4
+
+      - name: Cache cargo
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
+
+      - name: Build query engine binary
+        run: cargo build --release --bin query_engine_rust --locked
+        env:
+          RUSTC_WRAPPER: sccache
+
+      - name: Resolve max_rows
+        id: config
+        run: |
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
+            echo "max_rows=${{ inputs.max_rows }}" >> "$GITHUB_OUTPUT"
+          else
+            echo "max_rows=${{ env.DEFAULT_MAX_ROWS }}" >> "$GITHUB_OUTPUT"
+          fi
+
+      # ── Baseline (ClickHouse exact) ──
+      - name: Run ClickHouse baseline benchmark
+        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
+        run: |
+          python run_benchmark.py \
+            --mode baseline \
+            --load-data \
+            --max-rows ${{ steps.config.outputs.max_rows }} \
+            --output /tmp/baseline_perf_results.csv
+
+      # ── ASAP path ──
+      - name: Run ASAP benchmark
+        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
+        run: |
+          python run_benchmark.py \
+            --mode asap \
+            --load-data \
+            --max-rows ${{ steps.config.outputs.max_rows }} \
+            --output /tmp/asap_perf_results.csv \
+            --qe-bin ${{ github.workspace }}/target/release/query_engine_rust
+
+      # ── Regression check ──
+      - name: Check for latency regressions
+        run: |
+          python3 - <<'EOF'
+          import csv, os, sys
+
+          factor    = float(os.environ["LATENCY_REGRESSION_FACTOR"])
+          asap_file = "/tmp/asap_perf_results.csv"
+          base_file = "/tmp/baseline_perf_results.csv"
+
+          def load_latency(path):
+              with open(path) as f:
+                  return {row["query_id"]: float(row["latency_ms"])
+                          for row in csv.DictReader(f)
+                          if row.get("latency_ms") not in (None, "", "null")}
+
+          try:
+              asap = load_latency(asap_file)
+              base = load_latency(base_file)
+          except FileNotFoundError as e:
+              print(f"Result file missing: {e}. Skipping regression check.")
+              sys.exit(0)
+
+          regressions = []
+          print(f"{'Query':<30} {'ASAP (ms)':>12} {'Baseline (ms)':>14} {'Ratio':>8} {'Status'}")
+          print("-" * 72)
+          for qid, base_lat in base.items():
+              if qid not in asap:
+                  continue
+              ratio = asap[qid] / base_lat if base_lat > 0 else float("inf")
+              status = "REGRESSION" if ratio > factor else "ok"
+              print(f"{qid:<30} {asap[qid]:>12.1f} {base_lat:>14.1f} {ratio:>8.2f}  {status}")
+              if status == "REGRESSION":
+                  regressions.append((qid, ratio))
+
+          if regressions:
+              print(f"\n{len(regressions)} regression(s) detected (threshold: {factor}x):")
+              for qid, r in regressions:
+                  print(f"  - {qid}: {r:.2f}x slower than baseline")
+              print("\nNOTE: This job runs on ephemeral GH-hosted VMs and is subject to")
+              print("      cloud noise. For authoritative numbers use a self-hosted runner.")
+              sys.exit(1)
+          else:
+              print(f"\nNo regressions detected (threshold: {factor}x).")
+          EOF
+        env:
+          LATENCY_REGRESSION_FACTOR: ${{ env.LATENCY_REGRESSION_FACTOR }}
+
+      - name: Generate latency comparison plot
+        if: always()
+        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
+        run: |
+          python plot_latency.py \
+            --asap /tmp/asap_perf_results.csv \
+            --baseline /tmp/baseline_perf_results.csv \
+            --output /tmp/latency_comparison.png 2>/dev/null || \
+          python plot_latency.py 2>/dev/null || true
+
+      - name: Upload benchmark results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: performance-results-${{ github.run_id }}
+          path: |
+            /tmp/asap_perf_results.csv
+            /tmp/baseline_perf_results.csv
+            /tmp/latency_comparison.png
+          if-no-files-found: warn
+
+  # ── 2. Query latency micro-benchmark (manual / self-hosted) ─────────────────
+  query-latency:
+    name: Query latency micro-benchmark
+    # Skip on PRs — run manually or on a scheduled trigger once self-hosted
+    # runners are available. On GH-hosted VMs the numbers are too noisy to be
+    # actionable for absolute latency SLOs.
+    if: >
+      github.event_name == 'workflow_dispatch' &&
+      (inputs.benchmark_suite == 'query_latency' || inputs.benchmark_suite == 'all')
+    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
+    timeout-minutes: 60
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Self-hosted runner notice
+        if: ${{ inputs.runner == 'ubuntu-latest' || inputs.runner == '' }}
+        run: |
+          echo "::warning::Running query_latency benchmark on a GH-hosted VM."
+          echo "::warning::Results are indicative only. Register the asap-tools"
+          echo "::warning::benchmarking host as a self-hosted runner for"
+          echo "::warning::publication-quality absolute latency measurements."
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install requests
+          if [ -f asap-tools/execution-utilities/asap_query_latency/requirements.txt ]; then
+            pip install -r asap-tools/execution-utilities/asap_query_latency/requirements.txt
+          fi
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Install protoc
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y protobuf-compiler
+
+      - name: Run sccache
+        uses: mozilla-actions/sccache-action@v0.0.4
+
+      - name: Cache cargo
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            target
+          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
+
+      - name: Build query engine binary
+        run: cargo build --release --bin query_engine_rust --locked
+        env:
+          RUSTC_WRAPPER: sccache
+
+      - name: Run query latency benchmark
+        working-directory: asap-tools/execution-utilities/asap_query_latency
+        run: |
+          python run_benchmark.py \
+            --output /tmp/query_latency_results.csv \
+            --qe-bin ${{ github.workspace }}/target/release/query_engine_rust
+
+      - name: Upload query latency results
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: query-latency-results-${{ github.run_id }}
+          path: /tmp/query_latency_results.csv
+          if-no-files-found: warn

From 5152c326e3873f48f3d12a885545baed7fc7c94c Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Thu, 26 Mar 2026 18:29:21 -0500
Subject: [PATCH 2/9] Fix workspace conflicts and add missing test data for
 correctness CI

- Add [workspace] table to three standalone test Cargo.toml files
  (compare_patterns, compare_matched_tokens/rust_tests, rust_pattern_matching)
  to prevent "believes it's in a workspace" error on the root Cargo.toml
- Add test_data/promql_queries.json with 10 PromQL test cases covering
  ONLY_TEMPORAL, ONLY_SPATIAL, and ONE_TEMPORAL_ONE_SPATIAL patterns,
  required by the cross-language master_test_runner
- Unignore test_data/*.json in asap-common/.gitignore (tests/**/*.json
  was intended for generated result files, not fixture input data)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 asap-common/.gitignore                        |   1 +
 .../rust_tests/Cargo.toml                     |   2 +
 .../test_data/promql_queries.json             | 228 ++++++++++++++++++
 asap-common/tests/compare_patterns/Cargo.toml |   2 +
 .../tests/rust_pattern_matching/Cargo.toml    |   2 +
 5 files changed, 235 insertions(+)
 create mode 100644 asap-common/tests/compare_matched_tokens/test_data/promql_queries.json

diff --git a/asap-common/.gitignore b/asap-common/.gitignore
index 9e7080c..102b6ea 100644
--- a/asap-common/.gitignore
+++ b/asap-common/.gitignore
@@ -8,4 +8,5 @@ dependencies/py/promql_utilities/promql_utilities.egg-info/
 dependencies/rs/**/target/
 
 tests/**/*.json
+!tests/**/test_data/*.json
 tests/**/target/
diff --git a/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml b/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml
index e38e483..c4155dc 100644
--- a/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml
+++ b/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml
@@ -1,3 +1,5 @@
+[workspace]
+
 [package]
 name = "promql_cross_lang_tests"
 version = "0.1.0"
diff --git a/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json b/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json
new file mode 100644
index 0000000..0d03af4
--- /dev/null
+++ b/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json
@@ -0,0 +1,228 @@
+{
+  "test_cases": [
+    {
+      "id": "temporal_rate_basic",
+      "description": "Basic rate function over a time range",
+      "query": "rate(http_requests_total{job=\"api\"}[5m])",
+      "expected_pattern_type": "ONLY_TEMPORAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "http_requests_total",
+          "labels": {"job": "api"},
+          "at_modifier": null
+        },
+        "function": {
+          "name": "rate"
+        },
+        "range_vector": {
+          "range": "5m"
+        }
+      }
+    },
+    {
+      "id": "temporal_increase_basic",
+      "description": "Increase function over a time range",
+      "query": "increase(http_requests_total[1h])",
+      "expected_pattern_type": "ONLY_TEMPORAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "http_requests_total",
+          "labels": {},
+          "at_modifier": null
+        },
+        "function": {
+          "name": "increase"
+        },
+        "range_vector": {
+          "range": "1h"
+        }
+      }
+    },
+    {
+      "id": "temporal_quantile_over_time",
+      "description": "Quantile over time function",
+      "query": "quantile_over_time(0.95, cpu_usage{instance=\"host1\"}[10m])",
+      "expected_pattern_type": "ONLY_TEMPORAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "cpu_usage",
+          "labels": {"instance": "host1"},
+          "at_modifier": null
+        },
+        "function": {
+          "name": "quantile_over_time"
+        },
+        "range_vector": {
+          "range": "10m"
+        }
+      }
+    },
+    {
+      "id": "temporal_avg_over_time",
+      "description": "Average over time function",
+      "query": "avg_over_time(memory_bytes[30m])",
+      "expected_pattern_type": "ONLY_TEMPORAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "memory_bytes",
+          "labels": {},
+          "at_modifier": null
+        },
+        "function": {
+          "name": "avg_over_time"
+        },
+        "range_vector": {
+          "range": "30m"
+        }
+      }
+    },
+    {
+      "id": "spatial_sum_aggregation",
+      "description": "Sum aggregation across all series",
+      "query": "sum(http_requests_total{job=\"api\"})",
+      "expected_pattern_type": "ONLY_SPATIAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "http_requests_total",
+          "labels": {"job": "api"},
+          "at_modifier": null
+        },
+        "aggregation": {
+          "op": "sum",
+          "modifier": null
+        }
+      }
+    },
+    {
+      "id": "spatial_avg_aggregation",
+      "description": "Average aggregation by label",
+      "query": "avg by (instance) (cpu_usage)",
+      "expected_pattern_type": "ONLY_SPATIAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "cpu_usage",
+          "labels": {},
+          "at_modifier": null
+        },
+        "aggregation": {
+          "op": "avg",
+          "modifier": null
+        }
+      }
+    },
+    {
+      "id": "spatial_count_aggregation",
+      "description": "Count aggregation",
+      "query": "count(up{job=\"node\"})",
+      "expected_pattern_type": "ONLY_SPATIAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "up",
+          "labels": {"job": "node"},
+          "at_modifier": null
+        },
+        "aggregation": {
+          "op": "count",
+          "modifier": null
+        }
+      }
+    },
+    {
+      "id": "combined_sum_of_rate",
+      "description": "Sum aggregation of rate (temporal + spatial)",
+      "query": "sum(rate(http_requests_total{job=\"api\"}[5m]))",
+      "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "http_requests_total",
+          "labels": {"job": "api"},
+          "at_modifier": null
+        },
+        "function": {
+          "name": "rate"
+        },
+        "aggregation": {
+          "op": "sum",
+          "modifier": null
+        },
+        "range_vector": {
+          "range": "5m"
+        }
+      }
+    },
+    {
+      "id": "combined_avg_of_quantile_over_time",
+      "description": "Avg aggregation of quantile_over_time (temporal + spatial)",
+      "query": "avg(quantile_over_time(0.99, response_time_seconds[15m]))",
+      "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "response_time_seconds",
+          "labels": {},
+          "at_modifier": null
+        },
+        "function": {
+          "name": "quantile_over_time"
+        },
+        "aggregation": {
+          "op": "avg",
+          "modifier": null
+        },
+        "range_vector": {
+          "range": "15m"
+        }
+      }
+    },
+    {
+      "id": "combined_sum_of_avg_over_time",
+      "description": "Sum aggregation of avg_over_time",
+      "query": "sum by (job) (avg_over_time(memory_bytes{env=\"prod\"}[1h]))",
+      "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL",
+      "expected_tokens": {
+        "metric": {
+          "name": "memory_bytes",
+          "labels": {"env": "prod"},
+          "at_modifier": null
+        },
+        "function": {
+          "name": "avg_over_time"
+        },
+        "aggregation": {
+          "op": "sum",
+          "modifier": null
+        },
+        "range_vector": {
+          "range": "1h"
+        }
+      }
+    }
+  ],
+  "pattern_builder_tests": [
+    {
+      "id": "builder_metric_no_labels",
+      "description": "Build a simple metric selector with no labels",
+      "builder_call": "metric",
+      "parameters": {
+        "collect_as": "metric"
+      },
+      "expected_pattern": {
+        "type": "metric",
+        "collect_as": "metric"
+      }
+    },
+    {
+      "id": "builder_function_rate",
+      "description": "Build a rate function pattern",
+      "builder_call": "function",
+      "parameters": {
+        "names": ["rate", "increase"],
+        "collect_as": "function"
+      },
+      "expected_pattern": {
+        "type": "function",
+        "names": ["rate", "increase"],
+        "collect_as": "function"
+      }
+    }
+  ]
+}
diff --git a/asap-common/tests/compare_patterns/Cargo.toml b/asap-common/tests/compare_patterns/Cargo.toml
index 8d04a6f..3f94a69 100644
--- a/asap-common/tests/compare_patterns/Cargo.toml
+++ b/asap-common/tests/compare_patterns/Cargo.toml
@@ -1,3 +1,5 @@
+[workspace]
+
 [package]
 name = "compare_patterns_runner"
 version = "0.1.0"
diff --git a/asap-common/tests/rust_pattern_matching/Cargo.toml b/asap-common/tests/rust_pattern_matching/Cargo.toml
index 14b2980..571641e 100644
--- a/asap-common/tests/rust_pattern_matching/Cargo.toml
+++ b/asap-common/tests/rust_pattern_matching/Cargo.toml
@@ -1,3 +1,5 @@
+[workspace]
+
 [package]
 name = "promql_cross_lang_tests"
 version = "0.1.0"

From c7ac5a76d9de84acca962584a85833e5d2be076d Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Thu, 26 Mar 2026 18:42:16 -0500
Subject: [PATCH 3/9] Replace accuracy/performance workflows with full e2e
 eval-pr based on PDF guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implements all 20 steps from the ASAPQuery PR Performance & Accuracy
Evaluation Guide:

- eval-pr.yml: triggers on core code changes, builds the 3 ASAP-owned
  services (planner-rs, summary-ingest, query-engine) from PR source,
  spins up the full quickstart stack, waits for pipeline + ingestion,
  runs queries against both Prometheus (baseline) and ASAPQuery, then
  evaluates accuracy and performance

- benchmarks/docker-compose.yml: Compose override that replaces the
  GHCR images for planner-rs/summary-ingest/queryengine with `build:`
  directives so CI always tests the latest committed code in the branch

- benchmarks/queries/promql_suite.json: 14-query fixed suite covering
  avg/sum/max/min/quantile at p50/p90/p95/p99, with and without
  grouping by pattern/region/service; marks ASAP-native queries

- benchmarks/scripts/wait_for_stack.sh: polls Prometheus, Arroyo, and
  QueryEngine until healthy (180s timeout each)

- benchmarks/scripts/ingest_wait.sh: waits for asap-demo Arroyo
  pipeline to reach RUNNING, then 90s for sketch accumulation

- benchmarks/scripts/run_baseline.py: queries Prometheus /api/v1/query
  3x per query, records latencies and results

- benchmarks/scripts/run_asap.py: same for ASAPQuery /api/v1/query

- benchmarks/scripts/compare.py: normalises vector results by label
  set, enforces pass/fail policy (no query failures; ASAP-native
  relative error ≤ 1%); latency regressions are warn-only per PDF
  practical caution note on GH runner noise

Pass/fail policy (PDF page 3):
  - No query failures
  - Max relative error ≤ 1%
  - No >10% p95 latency regression (warn-only on ephemeral runners)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/accuracy.yml       | 183 ----------------
 .github/workflows/eval-pr.yml        |  98 +++++++++
 .github/workflows/performance.yml    | 296 -------------------------
 benchmarks/docker-compose.yml        |  26 +++
 benchmarks/golden/.gitkeep           |   0
 benchmarks/queries/promql_suite.json |  18 ++
 benchmarks/reports/.gitkeep          |   0
 benchmarks/scripts/compare.py        | 313 +++++++++++++++++++++++++++
 benchmarks/scripts/ingest_wait.sh    |  72 ++++++
 benchmarks/scripts/run_asap.py       | 113 ++++++++++
 benchmarks/scripts/run_baseline.py   | 111 ++++++++++
 benchmarks/scripts/wait_for_stack.sh |  34 +++
 12 files changed, 785 insertions(+), 479 deletions(-)
 delete mode 100644 .github/workflows/accuracy.yml
 create mode 100644 .github/workflows/eval-pr.yml
 delete mode 100644 .github/workflows/performance.yml
 create mode 100644 benchmarks/docker-compose.yml
 create mode 100644 benchmarks/golden/.gitkeep
 create mode 100644 benchmarks/queries/promql_suite.json
 create mode 100644 benchmarks/reports/.gitkeep
 create mode 100644 benchmarks/scripts/compare.py
 create mode 100644 benchmarks/scripts/ingest_wait.sh
 create mode 100644 benchmarks/scripts/run_asap.py
 create mode 100644 benchmarks/scripts/run_baseline.py
 create mode 100644 benchmarks/scripts/wait_for_stack.sh

diff --git a/.github/workflows/accuracy.yml b/.github/workflows/accuracy.yml
deleted file mode 100644
index aad9a59..0000000
--- a/.github/workflows/accuracy.yml
+++ /dev/null
@@ -1,183 +0,0 @@
-name: Accuracy Tests
-
-# Validates that ASAP approximate query results stay within acceptable error
-# bounds relative to an exact (ClickHouse) baseline. Tests run inside Docker
-# containers on ephemeral GitHub Actions VMs — sufficient for catching
-# accuracy regressions without requiring self-hosted infrastructure.
-
-on:
-  push:
-    branches: [ main ]
-    paths:
-      - 'asap-summary-ingest/**'
-      - 'asap-query-engine/**'
-      - 'asap-common/sketch-core/**'
-      - 'asap-common/dependencies/**'
-      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
-      - 'asap-tools/execution-utilities/asap_query_latency/**'
-      - '.github/workflows/accuracy.yml'
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'asap-summary-ingest/**'
-      - 'asap-query-engine/**'
-      - 'asap-common/sketch-core/**'
-      - 'asap-common/dependencies/**'
-      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
-      - 'asap-tools/execution-utilities/asap_query_latency/**'
-      - '.github/workflows/accuracy.yml'
-  workflow_dispatch:
-
-env:
-  # Rows to ingest during CI — small enough to complete in ~10 min on GH runners
-  # while still exercising the full sketch → query path. Increase on self-hosted
-  # runners for a more thorough accuracy sweep.
-  MAX_ROWS: 50000
-  # Maximum acceptable relative error vs exact baseline (5 %)
-  MAX_RELATIVE_ERROR: "0.05"
-
-jobs:
-  # ── H2O groupby accuracy (ASAP vs ClickHouse exact) ────────────────────────
-  h2o-accuracy:
-    name: H2O groupby accuracy regression
-    runs-on: ubuntu-latest
-    # Accuracy tests can be long-running on ephemeral runners; give them room.
-    timeout-minutes: 60
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install Python dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install requests kafka-python gdown matplotlib
-          if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then
-            pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt
-          fi
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      # Pull / build only the images needed for accuracy testing
-      - name: Build base image
-        run: |
-          docker build \
-            -t sketchdb-base:latest \
-            -f asap-common/installation/Dockerfile \
-            asap-common
-
-      - name: Build summary-ingest image
-        run: |
-          docker build \
-            -t asap-summary-ingest:ci \
-            -f asap-summary-ingest/Dockerfile \
-            asap-summary-ingest
-
-      - name: Install Rust (for query engine)
-        uses: dtolnay/rust-toolchain@stable
-
-      - name: Install protoc
-        run: |
-          sudo apt-get update -qq
-          sudo apt-get install -y protobuf-compiler
-
-      - name: Run sccache
-        uses: mozilla-actions/sccache-action@v0.0.4
-
-      - name: Cache cargo
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
-            target
-          key: ${{ runner.os }}-cargo-accuracy-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
-
-      - name: Build query engine binary
-        run: cargo build --release --bin query_engine_rust --locked
-        env:
-          RUSTC_WRAPPER: sccache
-
-      # Run accuracy benchmark (ASAP path) with a small dataset slice
-      - name: Run ASAP accuracy benchmark
-        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
-        run: |
-          python run_benchmark.py \
-            --mode asap \
-            --load-data \
-            --max-rows ${{ env.MAX_ROWS }} \
-            --output /tmp/asap_accuracy_results.csv \
-            --qe-bin ${{ github.workspace }}/target/release/query_engine_rust
-        env:
-          RUSTC_WRAPPER: sccache
-
-      # Run the same queries against the exact baseline
-      - name: Run ClickHouse baseline benchmark
-        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
-        run: |
-          python run_benchmark.py \
-            --mode baseline \
-            --skip-data-load \
-            --output /tmp/baseline_accuracy_results.csv
-
-      # Compare ASAP results to baseline; fail if error exceeds threshold
-      - name: Check accuracy (error ≤ ${{ env.MAX_RELATIVE_ERROR }})
-        run: |
-          python3 - <<'EOF'
-          import csv, sys, os
-
-          max_err = float(os.environ["MAX_RELATIVE_ERROR"])
-          asap_file   = "/tmp/asap_accuracy_results.csv"
-          exact_file  = "/tmp/baseline_accuracy_results.csv"
-
-          def load(path):
-              with open(path) as f:
-                  return {row["query_id"]: float(row["result"]) for row in csv.DictReader(f)
-                          if row.get("result") not in (None, "", "null")}
-
-          try:
-              asap  = load(asap_file)
-              exact = load(exact_file)
-          except FileNotFoundError as e:
-              print(f"Result file missing: {e}. Skipping accuracy check.")
-              sys.exit(0)
-
-          failures = []
-          for qid, exact_val in exact.items():
-              if qid not in asap:
-                  print(f"WARN: {qid} not found in ASAP results, skipping")
-                  continue
-              if exact_val == 0:
-                  rel_err = 0.0 if asap[qid] == 0 else float("inf")
-              else:
-                  rel_err = abs(asap[qid] - exact_val) / abs(exact_val)
-              status = "PASS" if rel_err <= max_err else "FAIL"
-              print(f"{status}  {qid}: rel_err={rel_err:.4f}  asap={asap[qid]:.4f}  exact={exact_val:.4f}")
-              if status == "FAIL":
-                  failures.append(qid)
-
-          if failures:
-              print(f"\n{len(failures)} query(ies) exceeded max relative error ({max_err}):")
-              for qid in failures:
-                  print(f"  - {qid}")
-              sys.exit(1)
-          else:
-              print(f"\nAll queries within relative error threshold ({max_err}).")
-          EOF
-        env:
-          MAX_RELATIVE_ERROR: ${{ env.MAX_RELATIVE_ERROR }}
-
-      - name: Upload accuracy results
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: accuracy-results-${{ github.run_id }}
-          path: |
-            /tmp/asap_accuracy_results.csv
-            /tmp/baseline_accuracy_results.csv
-          if-no-files-found: warn
diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml
new file mode 100644
index 0000000..a60cc31
--- /dev/null
+++ b/.github/workflows/eval-pr.yml
@@ -0,0 +1,98 @@
+name: PR Evaluation
+
+# NOTE: GitHub-hosted runners are noisy. Latency numbers are indicative only.
+# For precise benchmarks, register a self-hosted runner once asap-tools infra
+# is decoupled from Cloudlab. See PDF eval guide Phase 3.
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'asap-query-engine/**'
+      - 'asap-planner-rs/**'
+      - 'asap-summary-ingest/**'
+      - 'asap-quickstart/**'
+      - '.github/workflows/eval-pr.yml'
+      - 'benchmarks/**'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  eval:
+    name: Full-stack PR evaluation
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Build and start full stack
+        env:
+          COMPOSE_DOCKER_CLI_BUILD: "1"
+          DOCKER_BUILDKIT: "1"
+        run: |
+          docker compose \
+            -f asap-quickstart/docker-compose.yml \
+            -f benchmarks/docker-compose.yml \
+            up -d --build 2>&1
+
+      - name: Show running containers
+        run: |
+          docker compose \
+            -f asap-quickstart/docker-compose.yml \
+            -f benchmarks/docker-compose.yml \
+            ps
+
+      - name: Wait for all services to be healthy
+        run: bash benchmarks/scripts/wait_for_stack.sh
+
+      - name: Wait for pipeline and data ingestion
+        run: bash benchmarks/scripts/ingest_wait.sh
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: pip install requests
+
+      - name: Run baseline queries (Prometheus)
+        run: python benchmarks/scripts/run_baseline.py
+
+      - name: Run ASAP queries (query engine)
+        run: python benchmarks/scripts/run_asap.py
+
+      - name: Compare results and evaluate
+        run: python benchmarks/scripts/compare.py
+
+      - name: Upload evaluation reports
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-reports-${{ github.run_id }}
+          path: benchmarks/reports/
+
+      - name: Print docker logs on failure
+        if: failure()
+        run: |
+          docker compose \
+            -f asap-quickstart/docker-compose.yml \
+            -f benchmarks/docker-compose.yml \
+            logs --no-color
+
+      - name: Teardown stack
+        if: always()
+        run: |
+          docker compose \
+            -f asap-quickstart/docker-compose.yml \
+            -f benchmarks/docker-compose.yml \
+            down -v
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
deleted file mode 100644
index 674be29..0000000
--- a/.github/workflows/performance.yml
+++ /dev/null
@@ -1,296 +0,0 @@
-name: Performance Benchmarks
-
-# IMPORTANT – Runner choice and benchmark precision
-# ─────────────────────────────────────────────────
-# Ephemeral GitHub-hosted runners (ubuntu-latest) share noisy cloud hardware.
-# They are useful for *relative* regression detection (e.g. "this PR is 2×
-# slower than main") but NOT for absolute latency numbers.
-#
-# For precise, publication-quality benchmarks the team uses the dedicated
-# asap-tools benchmarking infra (used for TurboProm paper experiments).
-# That infra must be decoupled from Cloudlab and registered as a
-# GitHub self-hosted runner.  Until then, set `runner: self-hosted` in the
-# workflow_dispatch inputs below to target a self-hosted machine when one
-# is available, or rely on the relative-regression job for PR gating.
-#
-# References:
-#   - asap-tools/execution-utilities/asap_benchmark_pipeline/ — H2O groupby
-#   - asap-tools/execution-utilities/asap_query_latency/      — ClickBench hits
-
-on:
-  pull_request:
-    branches: [ main ]
-    paths:
-      - 'asap-summary-ingest/**'
-      - 'asap-query-engine/**'
-      - 'asap-common/sketch-core/**'
-      - 'asap-common/dependencies/**'
-      - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**'
-      - 'asap-tools/execution-utilities/asap_query_latency/**'
-      - '.github/workflows/performance.yml'
-  workflow_dispatch:
-    inputs:
-      runner:
-        description: >
-          Runner label to use.  Use 'ubuntu-latest' for relative regression
-          detection on GH-hosted VMs, or a self-hosted runner label (e.g.
-          'self-hosted') for precise absolute benchmarks.
-        required: false
-        default: ubuntu-latest
-      max_rows:
-        description: Rows to ingest (0 = full dataset; reduce for faster CI runs)
-        required: false
-        default: '100000'
-      benchmark_suite:
-        description: 'Which suite to run: h2o | query_latency | all'
-        required: false
-        default: all
-
-env:
-  # Defaults used on PR triggers (keep runtime < 30 min on GH-hosted runners)
-  DEFAULT_MAX_ROWS: '100000'
-  # Latency regression threshold: flag if p95 latency increases by more than
-  # this factor relative to the baseline run within the same CI job.
-  LATENCY_REGRESSION_FACTOR: '2.0'
-
-jobs:
-  # ── 1. Relative performance regression (always runs on GH-hosted VMs) ──────
-  relative-regression:
-    name: Relative performance regression (H2O groupby)
-    # Use the workflow_dispatch runner input when triggered manually;
-    # fall back to ubuntu-latest for PR triggers.
-    runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || 'ubuntu-latest' }}
-    timeout-minutes: 45
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install Python dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install requests kafka-python gdown matplotlib
-          if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then
-            pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt
-          fi
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Build base image
-        run: |
-          docker build \
-            -t sketchdb-base:latest \
-            -f asap-common/installation/Dockerfile \
-            asap-common
-
-      - name: Build summary-ingest image
-        run: |
-          docker build \
-            -t asap-summary-ingest:ci \
-            -f asap-summary-ingest/Dockerfile \
-            asap-summary-ingest
-
-      - name: Install Rust
-        uses: dtolnay/rust-toolchain@stable
-
-      - name: Install protoc
-        run: |
-          sudo apt-get update -qq
-          sudo apt-get install -y protobuf-compiler
-
-      - name: Run sccache
-        uses: mozilla-actions/sccache-action@v0.0.4
-
-      - name: Cache cargo
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
-            target
-          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
-
-      - name: Build query engine binary
-        run: cargo build --release --bin query_engine_rust --locked
-        env:
-          RUSTC_WRAPPER: sccache
-
-      - name: Resolve max_rows
-        id: config
-        run: |
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
-            echo "max_rows=${{ inputs.max_rows }}" >> "$GITHUB_OUTPUT"
-          else
-            echo "max_rows=${{ env.DEFAULT_MAX_ROWS }}" >> "$GITHUB_OUTPUT"
-          fi
-
-      # ── Baseline (ClickHouse exact) ──
-      - name: Run ClickHouse baseline benchmark
-        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
-        run: |
-          python run_benchmark.py \
-            --mode baseline \
-            --load-data \
-            --max-rows ${{ steps.config.outputs.max_rows }} \
-            --output /tmp/baseline_perf_results.csv
-
-      # ── ASAP path ──
-      - name: Run ASAP benchmark
-        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
-        run: |
-          python run_benchmark.py \
-            --mode asap \
-            --load-data \
-            --max-rows ${{ steps.config.outputs.max_rows }} \
-            --output /tmp/asap_perf_results.csv \
-            --qe-bin ${{ github.workspace }}/target/release/query_engine_rust
-
-      # ── Regression check ──
-      - name: Check for latency regressions
-        run: |
-          python3 - <<'EOF'
-          import csv, os, sys
-
-          factor    = float(os.environ["LATENCY_REGRESSION_FACTOR"])
-          asap_file = "/tmp/asap_perf_results.csv"
-          base_file = "/tmp/baseline_perf_results.csv"
-
-          def load_latency(path):
-              with open(path) as f:
-                  return {row["query_id"]: float(row["latency_ms"])
-                          for row in csv.DictReader(f)
-                          if row.get("latency_ms") not in (None, "", "null")}
-
-          try:
-              asap = load_latency(asap_file)
-              base = load_latency(base_file)
-          except FileNotFoundError as e:
-              print(f"Result file missing: {e}. Skipping regression check.")
-              sys.exit(0)
-
-          regressions = []
-          print(f"{'Query':<30} {'ASAP (ms)':>12} {'Baseline (ms)':>14} {'Ratio':>8} {'Status'}")
-          print("-" * 72)
-          for qid, base_lat in base.items():
-              if qid not in asap:
-                  continue
-              ratio = asap[qid] / base_lat if base_lat > 0 else float("inf")
-              status = "REGRESSION" if ratio > factor else "ok"
-              print(f"{qid:<30} {asap[qid]:>12.1f} {base_lat:>14.1f} {ratio:>8.2f}  {status}")
-              if status == "REGRESSION":
-                  regressions.append((qid, ratio))
-
-          if regressions:
-              print(f"\n{len(regressions)} regression(s) detected (threshold: {factor}x):")
-              for qid, r in regressions:
-                  print(f"  - {qid}: {r:.2f}x slower than baseline")
-              print("\nNOTE: This job runs on ephemeral GH-hosted VMs and is subject to")
-              print("      cloud noise. For authoritative numbers use a self-hosted runner.")
-              sys.exit(1)
-          else:
-              print(f"\nNo regressions detected (threshold: {factor}x).")
-          EOF
-        env:
-          LATENCY_REGRESSION_FACTOR: ${{ env.LATENCY_REGRESSION_FACTOR }}
-
-      - name: Generate latency comparison plot
-        if: always()
-        working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline
-        run: |
-          python plot_latency.py \
-            --asap /tmp/asap_perf_results.csv \
-            --baseline /tmp/baseline_perf_results.csv \
-            --output /tmp/latency_comparison.png 2>/dev/null || \
-          python plot_latency.py 2>/dev/null || true
-
-      - name: Upload benchmark results
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: performance-results-${{ github.run_id }}
-          path: |
-            /tmp/asap_perf_results.csv
-            /tmp/baseline_perf_results.csv
-            /tmp/latency_comparison.png
-          if-no-files-found: warn
-
-  # ── 2. Query latency micro-benchmark (manual / self-hosted) ─────────────────
-  query-latency:
-    name: Query latency micro-benchmark
-    # Skip on PRs — run manually or on a scheduled trigger once self-hosted
-    # runners are available. On GH-hosted VMs the numbers are too noisy to be
-    # actionable for absolute latency SLOs.
-    if: >
-      github.event_name == 'workflow_dispatch' &&
-      (inputs.benchmark_suite == 'query_latency' || inputs.benchmark_suite == 'all')
-    runs-on: ${{ inputs.runner || 'ubuntu-latest' }}
-    timeout-minutes: 60
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Self-hosted runner notice
-        if: ${{ inputs.runner == 'ubuntu-latest' || inputs.runner == '' }}
-        run: |
-          echo "::warning::Running query_latency benchmark on a GH-hosted VM."
-          echo "::warning::Results are indicative only. Register the asap-tools"
-          echo "::warning::benchmarking host as a self-hosted runner for"
-          echo "::warning::publication-quality absolute latency measurements."
-
-      - name: Set up Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install Python dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install requests
-          if [ -f asap-tools/execution-utilities/asap_query_latency/requirements.txt ]; then
-            pip install -r asap-tools/execution-utilities/asap_query_latency/requirements.txt
-          fi
-
-      - name: Install Rust
-        uses: dtolnay/rust-toolchain@stable
-
-      - name: Install protoc
-        run: |
-          sudo apt-get update -qq
-          sudo apt-get install -y protobuf-compiler
-
-      - name: Run sccache
-        uses: mozilla-actions/sccache-action@v0.0.4
-
-      - name: Cache cargo
-        uses: actions/cache@v4
-        with:
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
-            target
-          key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }}
-
-      - name: Build query engine binary
-        run: cargo build --release --bin query_engine_rust --locked
-        env:
-          RUSTC_WRAPPER: sccache
-
-      - name: Run query latency benchmark
-        working-directory: asap-tools/execution-utilities/asap_query_latency
-        run: |
-          python run_benchmark.py \
-            --output /tmp/query_latency_results.csv \
-            --qe-bin ${{ github.workspace }}/target/release/query_engine_rust
-
-      - name: Upload query latency results
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: query-latency-results-${{ github.run_id }}
-          path: /tmp/query_latency_results.csv
-          if-no-files-found: warn
diff --git a/benchmarks/docker-compose.yml b/benchmarks/docker-compose.yml
new file mode 100644
index 0000000..dd3084f
--- /dev/null
+++ b/benchmarks/docker-compose.yml
@@ -0,0 +1,26 @@
+# CI override: replaces pulled images with locally-built versions for the
+# three ASAP-owned services. Intended for use as a Compose override:
+#
+#   docker compose \
+#     -f asap-quickstart/docker-compose.yml \
+#     -f benchmarks/docker-compose.yml \
+#     up -d --build
+
+services:
+  asap-planner-rs:
+    image: asap-planner-rs:ci
+    build:
+      context: .
+      dockerfile: asap-planner-rs/Dockerfile
+
+  asap-summary-ingest:
+    image: asap-summary-ingest:ci
+    build:
+      context: .
+      dockerfile: asap-summary-ingest/Dockerfile
+
+  queryengine:
+    image: asap-query-engine:ci
+    build:
+      context: .
+      dockerfile: asap-query-engine/Dockerfile
diff --git a/benchmarks/golden/.gitkeep b/benchmarks/golden/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/queries/promql_suite.json b/benchmarks/queries/promql_suite.json
new file mode 100644
index 0000000..52b525e
--- /dev/null
+++ b/benchmarks/queries/promql_suite.json
@@ -0,0 +1,18 @@
+{
+  "queries": [
+    {"id": "avg_all", "expr": "avg(sensor_reading)", "asap_native": false},
+    {"id": "sum_all", "expr": "sum(sensor_reading)", "asap_native": false},
+    {"id": "max_all", "expr": "max(sensor_reading)", "asap_native": false},
+    {"id": "min_all", "expr": "min(sensor_reading)", "asap_native": false},
+    {"id": "q50_all", "expr": "quantile(0.50, sensor_reading)", "asap_native": true},
+    {"id": "q90_all", "expr": "quantile(0.90, sensor_reading)", "asap_native": true},
+    {"id": "q95_all", "expr": "quantile(0.95, sensor_reading)", "asap_native": true},
+    {"id": "q99_all", "expr": "quantile(0.99, sensor_reading)", "asap_native": true},
+    {"id": "q95_by_pattern", "expr": "quantile by (pattern) (0.95, sensor_reading)", "asap_native": true},
+    {"id": "q99_by_pattern", "expr": "quantile by (pattern) (0.99, sensor_reading)", "asap_native": true},
+    {"id": "q50_by_pattern", "expr": "quantile by (pattern) (0.50, sensor_reading)", "asap_native": true},
+    {"id": "avg_by_pattern", "expr": "avg by (pattern) (sensor_reading)", "asap_native": false},
+    {"id": "sum_by_region", "expr": "sum by (region) (sensor_reading)", "asap_native": false},
+    {"id": "max_by_service", "expr": "max by (service) (sensor_reading)", "asap_native": false}
+  ]
+}
diff --git a/benchmarks/reports/.gitkeep b/benchmarks/reports/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/benchmarks/scripts/compare.py b/benchmarks/scripts/compare.py
new file mode 100644
index 0000000..eadcef2
--- /dev/null
+++ b/benchmarks/scripts/compare.py
@@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+"""compare.py — compares baseline (Prometheus) vs ASAP query engine results.
+
+Applies pass/fail policy:
+  - FAIL if any query returned an error in ASAP (query failure)
+  - FAIL if any ASAP-native query has relative error > max_error (default 1%)
+  - WARN (not FAIL) on >10% p95 latency regression (GH runner noise)
+
+Generates benchmarks/reports/eval_report.md and prints it to stdout.
+Exits 1 if any PASS/FAIL check failed.
+
+Usage:
+    python benchmarks/scripts/compare.py \
+        [--baseline FILE] \
+        [--asap FILE] \
+        [--output FILE] \
+        [--max-error FLOAT]
+"""
+
+import argparse
+import json
+import os
+import sys
+from datetime import datetime, timezone
+
+DEFAULT_BASELINE = os.path.join(
+    os.path.dirname(__file__), "..", "reports", "baseline_results.json"
+)
+DEFAULT_ASAP = os.path.join(
+    os.path.dirname(__file__), "..", "reports", "asap_results.json"
+)
+DEFAULT_OUTPUT = os.path.join(
+    os.path.dirname(__file__), "..", "reports", "eval_report.md"
+)
+DEFAULT_MAX_ERROR = 0.01   # 1 %
+LATENCY_REGRESSION_THRESHOLD = 0.10  # 10 % — warn only
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+def percentile(values: list[float], pct: float) -> float:
+    """Compute percentile (0-100) from a list of floats using linear interpolation."""
+    if not values:
+        return float("nan")
+    sorted_vals = sorted(values)
+    n = len(sorted_vals)
+    if n == 1:
+        return sorted_vals[0]
+    idx = (pct / 100.0) * (n - 1)
+    lo = int(idx)
+    hi = lo + 1
+    if hi >= n:
+        return sorted_vals[-1]
+    frac = idx - lo
+    return sorted_vals[lo] * (1 - frac) + sorted_vals[hi] * frac
+
+
+def valid_latencies(latencies: list) -> list[float]:
+    """Strip None entries (failed HTTP requests) from a latency list."""
+    return [x for x in latencies if x is not None]
+
+
+def label_key(metric: dict) -> str:
+    """Canonical sort key for a Prometheus metric label set."""
+    return json.dumps(metric, sort_keys=True)
+
+
+def extract_scalar_value(result: list) -> float | None:
+    """Return a single float from a scalar/single-entry vector result."""
+    if not result:
+        return None
+    entry = result[0]
+    try:
+        return float(entry["value"][1])
+    except (KeyError, IndexError, ValueError, TypeError):
+        return None
+
+
+def compute_relative_error(baseline_val: float, asap_val: float) -> float:
+    """Relative error: |asap - baseline| / max(|baseline|, 1e-9)."""
+    denom = max(abs(baseline_val), 1e-9)
+    return abs(asap_val - baseline_val) / denom
+
+
+def compare_results(
+    baseline_result: list, asap_result: list
+) -> tuple[float | None, str]:
+    """
+    Compare two Prometheus vector result arrays.
+
+    Returns (max_relative_error, note).
+    For grouped results, matches entries by label set.
+    """
+    if not baseline_result and not asap_result:
+        return 0.0, "both empty"
+    if not baseline_result:
+        return None, "baseline empty"
+    if not asap_result:
+        return None, "asap empty"
+
+    # Build label-key → float maps
+    baseline_map: dict[str, float] = {}
+    for entry in baseline_result:
+        key = label_key(entry.get("metric", {}))
+        try:
+            baseline_map[key] = float(entry["value"][1])
+        except (KeyError, IndexError, ValueError, TypeError):
+            pass
+
+    asap_map: dict[str, float] = {}
+    for entry in asap_result:
+        key = label_key(entry.get("metric", {}))
+        try:
+            asap_map[key] = float(entry["value"][1])
+        except (KeyError, IndexError, ValueError, TypeError):
+            pass
+
+    # Single-value result (no labels / collapsed)
+    if len(baseline_map) == 1 and len(asap_map) == 1:
+        bv = next(iter(baseline_map.values()))
+        av = next(iter(asap_map.values()))
+        return compute_relative_error(bv, av), ""
+
+    # Multi-value: match by label set
+    max_err = 0.0
+    missing = []
+    for key, bv in baseline_map.items():
+        if key not in asap_map:
+            missing.append(key)
+            continue
+        err = compute_relative_error(bv, asap_map[key])
+        max_err = max(max_err, err)
+
+    note = f"{len(missing)} label set(s) missing from ASAP" if missing else ""
+    return max_err, note
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Compare baseline vs ASAP results")
+    parser.add_argument("--baseline", default=DEFAULT_BASELINE, help="Baseline JSON file")
+    parser.add_argument("--asap", default=DEFAULT_ASAP, help="ASAP results JSON file")
+    parser.add_argument("--output", default=DEFAULT_OUTPUT, help="Output Markdown file")
+    parser.add_argument(
+        "--max-error",
+        type=float,
+        default=DEFAULT_MAX_ERROR,
+        help="Max relative error for ASAP-native queries (default: 0.01 = 1%%)",
+    )
+    args = parser.parse_args()
+
+    with open(args.baseline) as f:
+        baseline_data = json.load(f)
+    with open(args.asap) as f:
+        asap_data = json.load(f)
+
+    baseline_results = baseline_data["results"]
+    asap_results = asap_data["results"]
+
+    # Collect all query IDs (union)
+    all_ids = sorted(set(list(baseline_results.keys()) + list(asap_results.keys())))
+
+    failures: list[str] = []
+    warnings: list[str] = []
+    rows: list[dict] = []
+
+    for qid in all_ids:
+        b = baseline_results.get(qid, {})
+        a = asap_results.get(qid, {})
+
+        asap_native = a.get("asap_native", False)
+        asap_status = a.get("status", "missing")
+        asap_error = a.get("error")
+
+        # Latency p95
+        b_lats = valid_latencies(b.get("latencies_ms", []))
+        a_lats = valid_latencies(a.get("latencies_ms", []))
+        b_p95 = percentile(b_lats, 95) if b_lats else float("nan")
+        a_p95 = percentile(a_lats, 95) if a_lats else float("nan")
+
+        # Relative error
+        rel_error: float | None = None
+        note = ""
+        if asap_status == "success" and b.get("status") == "success":
+            rel_error, note = compare_results(
+                b.get("data", []), a.get("data", [])
+            )
+
+        # Pass/fail determination
+        row_status = "PASS"
+
+        if asap_status == "error" or asap_status == "missing":
+            row_status = "FAIL"
+            reason = asap_error or "query not present in ASAP results"
+            failures.append(f"{qid}: ASAP query failed — {reason}")
+
+        elif asap_native and rel_error is not None and rel_error > args.max_error:
+            row_status = "FAIL"
+            failures.append(
+                f"{qid}: relative error {rel_error:.4f} > threshold {args.max_error:.4f}"
+            )
+
+        # Latency regression — warn only
+        if (
+            not (b_p95 != b_p95)  # not NaN
+            and not (a_p95 != a_p95)
+            and b_p95 > 0
+        ):
+            latency_regression = (a_p95 - b_p95) / b_p95
+            if latency_regression > LATENCY_REGRESSION_THRESHOLD:
+                warnings.append(
+                    f"{qid}: p95 latency regression {latency_regression:.1%} "
+                    f"(baseline={b_p95:.1f}ms, asap={a_p95:.1f}ms) — "
+                    "GH runner noise expected; informational only"
+                )
+
+        rows.append(
+            {
+                "id": qid,
+                "asap_native": asap_native,
+                "b_p95": b_p95,
+                "a_p95": a_p95,
+                "rel_error": rel_error,
+                "status": row_status,
+                "note": note,
+            }
+        )
+
+    overall = "PASS" if not failures else "FAIL"
+
+    # ── Build Markdown report ────────────────────────────────────────────────
+    now = datetime.now(timezone.utc).isoformat()
+    lines: list[str] = []
+    lines.append("# ASAP PR Evaluation Report")
+    lines.append("")
+    lines.append(f"**Generated:** {now}")
+    lines.append(
+        f"**Overall verdict:** {'✅ PASS' if overall == 'PASS' else '❌ FAIL'}"
+    )
+    lines.append(f"**Max error threshold:** {args.max_error:.2%}")
+    lines.append("")
+
+    # Summary table
+    lines.append(
+        "| Query | ASAP Native | Baseline p95 (ms) | ASAP p95 (ms) | Rel Error | Status |"
+    )
+    lines.append("|-------|:-----------:|:-----------------:|:-------------:|:---------:|:------:|")
+    for row in rows:
+        native_mark = "yes" if row["asap_native"] else "no"
+        b_p95_s = f"{row['b_p95']:.1f}" if row["b_p95"] == row["b_p95"] else "n/a"
+        a_p95_s = f"{row['a_p95']:.1f}" if row["a_p95"] == row["a_p95"] else "n/a"
+        if row["rel_error"] is None:
+            rel_err_s = "n/a"
+        else:
+            rel_err_s = f"{row['rel_error']:.4f}"
+        status_s = "✅ PASS" if row["status"] == "PASS" else "❌ FAIL"
+        note_s = f" ({row['note']})" if row["note"] else ""
+        lines.append(
+            f"| {row['id']}{note_s} | {native_mark} | {b_p95_s} | {a_p95_s} "
+            f"| {rel_err_s} | {status_s} |"
+        )
+
+    lines.append("")
+
+    if failures:
+        lines.append("## Failures")
+        lines.append("")
+        for f_msg in failures:
+            lines.append(f"- {f_msg}")
+        lines.append("")
+
+    if warnings:
+        lines.append("## Latency Warnings (informational)")
+        lines.append("")
+        lines.append(
+            "> **Note:** GitHub-hosted runners exhibit significant timing noise. "
+            "Latency numbers are indicative only. For precise benchmarks, register "
+            "a self-hosted runner once asap-tools infra is decoupled from Cloudlab "
+            "(see PDF eval guide Phase 3)."
+        )
+        lines.append("")
+        for w in warnings:
+            lines.append(f"- {w}")
+        lines.append("")
+
+    lines.append("---")
+    lines.append(
+        "_This report was generated automatically by `benchmarks/scripts/compare.py`._"
+    )
+
+    report = "\n".join(lines)
+    print(report)
+
+    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
+    with open(args.output, "w") as f:
+        f.write(report + "\n")
+    print(f"\n[compare] Report saved to {args.output}", file=sys.stderr)
+
+    if overall == "FAIL":
+        print(
+            f"\n[compare] Evaluation FAILED. {len(failures)} check(s) failed:",
+            file=sys.stderr,
+        )
+        for msg in failures:
+            print(f"  - {msg}", file=sys.stderr)
+        sys.exit(1)
+    else:
+        print("\n[compare] Evaluation PASSED.", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/scripts/ingest_wait.sh b/benchmarks/scripts/ingest_wait.sh
new file mode 100644
index 0000000..6e9b38d
--- /dev/null
+++ b/benchmarks/scripts/ingest_wait.sh
@@ -0,0 +1,72 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# ingest_wait.sh — waits for the asap-demo Arroyo pipeline to reach RUNNING
+# state, then sleeps to allow sketches to accumulate before verifying that the
+# query engine has ingested data.
+
+ARROYO_URL="http://localhost:5115/api/v1/pipelines"
+QE_URL="http://localhost:8088/api/v1/query"
+PIPELINE_NAME="asap-demo"
+MAX_PIPELINE_WAIT=300   # seconds
+ACCUMULATE_SLEEP=90     # seconds after pipeline is running
+SLEEP=5
+
+# ── 1. Wait for asap-demo pipeline to reach RUNNING ─────────────────────────
+echo "[ingest_wait] Waiting for Arroyo pipeline '${PIPELINE_NAME}' to reach RUNNING state ..."
+elapsed=0
+while true; do
+  state=$(curl -sf --max-time 10 "${ARROYO_URL}" 2>/dev/null \
+    | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+pipelines = data if isinstance(data, list) else data.get('data', [])
+for p in pipelines:
+    name = p.get('name', '') or p.get('id', '')
+    if '${PIPELINE_NAME}' in str(name):
+        print(p.get('state', p.get('status', '')))
+        break
+" 2>/dev/null || true)
+
+  if [ "${state}" = "Running" ] || [ "${state}" = "RUNNING" ]; then
+    echo "[ingest_wait] Pipeline '${PIPELINE_NAME}' is RUNNING (${elapsed}s elapsed)"
+    break
+  fi
+
+  if [ "${elapsed}" -ge "${MAX_PIPELINE_WAIT}" ]; then
+    echo "[ingest_wait] ERROR: Pipeline '${PIPELINE_NAME}' did not reach RUNNING within ${MAX_PIPELINE_WAIT}s (last state: '${state}')" >&2
+    exit 1
+  fi
+
+  echo "[ingest_wait] Pipeline state: '${state:-unknown}' — retrying in ${SLEEP}s (${elapsed}s elapsed) ..."
+  sleep "${SLEEP}"
+  elapsed=$(( elapsed + SLEEP ))
+done
+
+# ── 2. Allow sketches to accumulate ─────────────────────────────────────────
+echo "[ingest_wait] Pipeline running. Sleeping ${ACCUMULATE_SLEEP}s for sketches to accumulate ..."
+sleep "${ACCUMULATE_SLEEP}"
+
+# ── 3. Verify query engine has data ─────────────────────────────────────────
+echo "[ingest_wait] Verifying query engine has data ..."
+response=$(curl -sf --max-time 10 \
+  "${QE_URL}?query=avg%28sensor_reading%29" 2>/dev/null || true)
+
+if [ -z "${response}" ]; then
+  echo "[ingest_wait] ERROR: Query engine returned empty response." >&2
+  exit 1
+fi
+
+result_count=$(echo "${response}" | python3 -c "
+import sys, json
+data = json.load(sys.stdin)
+result = data.get('data', {}).get('result', [])
+print(len(result))
+" 2>/dev/null || echo "0")
+
+if [ "${result_count}" -eq 0 ]; then
+  echo "[ingest_wait] ERROR: Query engine has no data yet (result array is empty)." >&2
+  exit 1
+fi
+
+echo "[ingest_wait] Query engine has data (${result_count} result entries). Ready for benchmarking."
diff --git a/benchmarks/scripts/run_asap.py b/benchmarks/scripts/run_asap.py
new file mode 100644
index 0000000..10747cb
--- /dev/null
+++ b/benchmarks/scripts/run_asap.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""run_asap.py — queries ASAPQuery engine for each entry in promql_suite.json.
+
+Runs each query 3 times to gather latency samples, then writes results to
+benchmarks/reports/asap_results.json.
+
+Usage:
+    python benchmarks/scripts/run_asap.py \
+        [--asap-url URL] \
+        [--output FILE]
+"""
+
+import argparse
+import json
+import os
+import time
+import urllib.parse
+from datetime import datetime, timezone
+
+import requests
+
+SUITE_PATH = os.path.join(
+    os.path.dirname(__file__), "..", "queries", "promql_suite.json"
+)
+DEFAULT_OUTPUT = os.path.join(
+    os.path.dirname(__file__), "..", "reports", "asap_results.json"
+)
+RUNS_PER_QUERY = 3
+
+
+def query_asap(base_url: str, expr: str, ts: float) -> tuple[dict, float]:
+    """Issue a single instant query to the ASAP query engine, return (parsed_json, latency_ms)."""
+    encoded = urllib.parse.quote(expr, safe="")
+    url = f"{base_url}/api/v1/query?query={encoded}&time={ts}"
+    t0 = time.monotonic()
+    resp = requests.get(url, timeout=30)
+    latency_ms = (time.monotonic() - t0) * 1000.0
+    resp.raise_for_status()
+    return resp.json(), latency_ms
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Run ASAP query engine benchmark queries")
+    parser.add_argument(
+        "--asap-url",
+        default="http://localhost:8088",
+        help="ASAP query engine base URL (default: http://localhost:8088)",
+    )
+    parser.add_argument(
+        "--output",
+        default=DEFAULT_OUTPUT,
+        help="Output JSON file path",
+    )
+    args = parser.parse_args()
+
+    with open(SUITE_PATH) as f:
+        suite = json.load(f)
+
+    results: dict[str, dict] = {}
+    now = time.time()
+
+    for q in suite["queries"]:
+        qid = q["id"]
+        expr = q["expr"]
+        asap_native = q.get("asap_native", False)
+        latencies = []
+        last_data = []
+        last_error = None
+        last_status = "success"
+
+        print(f"[asap] Running query '{qid}' (native={asap_native}): {expr}")
+        for run in range(1, RUNS_PER_QUERY + 1):
+            try:
+                payload, lat = query_asap(args.asap_url, expr, now)
+                latencies.append(lat)
+                if payload.get("status") == "success":
+                    last_data = payload.get("data", {}).get("result", [])
+                    last_status = "success"
+                    last_error = None
+                else:
+                    last_status = "error"
+                    last_error = payload.get("error", "unknown error")
+                    last_data = []
+                print(f"  run {run}/{RUNS_PER_QUERY}: {lat:.1f} ms  status={last_status}")
+            except Exception as exc:  # noqa: BLE001
+                last_status = "error"
+                last_error = str(exc)
+                last_data = []
+                latencies.append(None)
+                print(f"  run {run}/{RUNS_PER_QUERY}: ERROR — {exc}")
+
+        results[qid] = {
+            "status": last_status,
+            "asap_native": asap_native,
+            "latencies_ms": latencies,
+            "data": last_data,
+            "error": last_error,
+        }
+
+    output = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "asap_url": args.asap_url,
+        "results": results,
+    }
+
+    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
+    with open(args.output, "w") as f:
+        json.dump(output, f, indent=2)
+    print(f"\n[asap] Results saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/scripts/run_baseline.py b/benchmarks/scripts/run_baseline.py
new file mode 100644
index 0000000..6e09e41
--- /dev/null
+++ b/benchmarks/scripts/run_baseline.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""run_baseline.py — queries Prometheus for each entry in promql_suite.json.
+
+Runs each query 3 times to gather latency samples, then writes results to
+benchmarks/reports/baseline_results.json.
+
+Usage:
+    python benchmarks/scripts/run_baseline.py \
+        [--prometheus-url URL] \
+        [--output FILE]
+"""
+
+import argparse
+import json
+import os
+import time
+import urllib.parse
+from datetime import datetime, timezone
+
+import requests
+
+SUITE_PATH = os.path.join(
+    os.path.dirname(__file__), "..", "queries", "promql_suite.json"
+)
+DEFAULT_OUTPUT = os.path.join(
+    os.path.dirname(__file__), "..", "reports", "baseline_results.json"
+)
+RUNS_PER_QUERY = 3
+
+
+def query_prometheus(base_url: str, expr: str, ts: float) -> tuple[dict, float]:
+    """Issue a single instant query, return (parsed_json, latency_ms)."""
+    encoded = urllib.parse.quote(expr, safe="")
+    url = f"{base_url}/api/v1/query?query={encoded}&time={ts}"
+    t0 = time.monotonic()
+    resp = requests.get(url, timeout=30)
+    latency_ms = (time.monotonic() - t0) * 1000.0
+    resp.raise_for_status()
+    return resp.json(), latency_ms
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Run baseline Prometheus queries")
+    parser.add_argument(
+        "--prometheus-url",
+        default="http://localhost:9090",
+        help="Prometheus base URL (default: http://localhost:9090)",
+    )
+    parser.add_argument(
+        "--output",
+        default=DEFAULT_OUTPUT,
+        help="Output JSON file path",
+    )
+    args = parser.parse_args()
+
+    with open(SUITE_PATH) as f:
+        suite = json.load(f)
+
+    results: dict[str, dict] = {}
+    now = time.time()
+
+    for q in suite["queries"]:
+        qid = q["id"]
+        expr = q["expr"]
+        latencies = []
+        last_data = []
+        last_error = None
+        last_status = "success"
+
+        print(f"[baseline] Running query '{qid}': {expr}")
+        for run in range(1, RUNS_PER_QUERY + 1):
+            try:
+                payload, lat = query_prometheus(args.prometheus_url, expr, now)
+                latencies.append(lat)
+                if payload.get("status") == "success":
+                    last_data = payload.get("data", {}).get("result", [])
+                    last_status = "success"
+                    last_error = None
+                else:
+                    last_status = "error"
+                    last_error = payload.get("error", "unknown error")
+                    last_data = []
+                print(f"  run {run}/{RUNS_PER_QUERY}: {lat:.1f} ms  status={last_status}")
+            except Exception as exc:  # noqa: BLE001
+                last_status = "error"
+                last_error = str(exc)
+                last_data = []
+                latencies.append(None)
+                print(f"  run {run}/{RUNS_PER_QUERY}: ERROR — {exc}")
+
+        results[qid] = {
+            "status": last_status,
+            "latencies_ms": latencies,
+            "data": last_data,
+            "error": last_error,
+        }
+
+    output = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "prometheus_url": args.prometheus_url,
+        "results": results,
+    }
+
+    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
+    with open(args.output, "w") as f:
+        json.dump(output, f, indent=2)
+    print(f"\n[baseline] Results saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/scripts/wait_for_stack.sh b/benchmarks/scripts/wait_for_stack.sh
new file mode 100644
index 0000000..672aa6d
--- /dev/null
+++ b/benchmarks/scripts/wait_for_stack.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# wait_for_stack.sh — polls each service until healthy or times out.
+# Used in CI after `docker compose up -d --build`.
+
+MAX_WAIT=180   # seconds per service
+SLEEP=5
+
+wait_for_url() {
+  local name="$1"
+  local url="$2"
+  local elapsed=0
+
+  echo "[wait_for_stack] Waiting for ${name} at ${url} ..."
+  while true; do
+    if curl -sf --max-time 5 "${url}" > /dev/null 2>&1; then
+      echo "[wait_for_stack] ${name} is healthy (${elapsed}s elapsed)"
+      return 0
+    fi
+    if [ "${elapsed}" -ge "${MAX_WAIT}" ]; then
+      echo "[wait_for_stack] ERROR: ${name} did not become healthy within ${MAX_WAIT}s" >&2
+      return 1
+    fi
+    sleep "${SLEEP}"
+    elapsed=$(( elapsed + SLEEP ))
+  done
+}
+
+wait_for_url "Prometheus"   "http://localhost:9090/-/healthy"
+wait_for_url "Arroyo API"   "http://localhost:5115/api/v1/pipelines"
+wait_for_url "QueryEngine"  "http://localhost:8088/api/v1/query?query=vector(1)"
+
+echo "[wait_for_stack] All services are healthy."

From 8ecdf35deb4eb88f5737a6566e584fc9c4c9fdde Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Thu, 26 Mar 2026 18:43:22 -0500
Subject: [PATCH 4/9] Raise ASAP-native accuracy threshold from 1% to 5%

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/scripts/compare.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/scripts/compare.py b/benchmarks/scripts/compare.py
index eadcef2..48274ef 100644
--- a/benchmarks/scripts/compare.py
+++ b/benchmarks/scripts/compare.py
@@ -32,7 +32,7 @@
 DEFAULT_OUTPUT = os.path.join(
     os.path.dirname(__file__), "..", "reports", "eval_report.md"
 )
-DEFAULT_MAX_ERROR = 0.01   # 1 %
+DEFAULT_MAX_ERROR = 0.05   # 5 %
 LATENCY_REGRESSION_THRESHOLD = 0.10  # 10 % — warn only
 
 

From 78297c8011ab4aca35c32828559c16d4f1234fb3 Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Thu, 26 Mar 2026 18:43:47 -0500
Subject: [PATCH 5/9] Make ASAP-native accuracy threshold warning-only (>5%
 warns, does not fail)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/scripts/compare.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/scripts/compare.py b/benchmarks/scripts/compare.py
index 48274ef..2f901d9 100644
--- a/benchmarks/scripts/compare.py
+++ b/benchmarks/scripts/compare.py
@@ -3,7 +3,7 @@
 
 Applies pass/fail policy:
   - FAIL if any query returned an error in ASAP (query failure)
-  - FAIL if any ASAP-native query has relative error > max_error (default 1%)
+  - WARN if any ASAP-native query has relative error > max_error (default 5%)
   - WARN (not FAIL) on >10% p95 latency regression (GH runner noise)
 
 Generates benchmarks/reports/eval_report.md and prints it to stdout.
@@ -196,9 +196,9 @@ def main() -> None:
             failures.append(f"{qid}: ASAP query failed — {reason}")
 
         elif asap_native and rel_error is not None and rel_error > args.max_error:
-            row_status = "FAIL"
-            failures.append(
-                f"{qid}: relative error {rel_error:.4f} > threshold {args.max_error:.4f}"
+            row_status = "WARN"
+            warnings.append(
+                f"{qid}: relative error {rel_error:.4f} > threshold {args.max_error:.4f} — informational only"
             )
 
         # Latency regression — warn only

From 70d2f06098bcf500f29583f3f76a878e44241abc Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Thu, 26 Mar 2026 18:58:37 -0500
Subject: [PATCH 6/9] Rename eval-pr.yml to accuracy_performance.yml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/{eval-pr.yml => accuracy_performance.yml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename .github/workflows/{eval-pr.yml => accuracy_performance.yml} (100%)

diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/accuracy_performance.yml
similarity index 100%
rename from .github/workflows/eval-pr.yml
rename to .github/workflows/accuracy_performance.yml

From e828ccaeac434b85de6bc6d220d87b32d8e455f9 Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Thu, 26 Mar 2026 19:12:55 -0500
Subject: [PATCH 7/9] Restructure CI to build images once and reuse across eval

- Split accuracy_performance.yml into build + eval jobs: images are built
  and pushed to GHCR with a sha-<short-sha> tag in the build job, then
  pulled (not rebuilt) by the eval job via ASAP_IMAGE_TAG env var
- Remove build: sections from benchmarks/docker-compose.yml; it now only
  overrides image tags using ${ASAP_IMAGE_TAG}
- Change asap-summary-ingest/Dockerfile FROM to ghcr.io/projectasap/asap-base:latest
  so it can be built in any job without requiring a local sketchdb-base image
- Add --project-directory . to all docker compose calls to fix build context
  path resolution (asap-quickstart/asap-summary-ingest lstat error)
- Fix stale eval-pr.yml path reference in accuracy_performance.yml triggers
- Fix PromQLPattern::new call sites in three test files: remove stale second
  argument after collect_tokens was removed from the function signature

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/accuracy_performance.yml    | 84 +++++++++++++++++--
 .../rust_tests/src/pattern_tests.rs           | 19 -----
 .../tests/compare_patterns/src/main.rs        |  6 --
 .../tests/rust_pattern_matching/src/main.rs   | 12 ---
 asap-summary-ingest/Dockerfile                |  2 +-
 benchmarks/docker-compose.yml                 | 26 +++---
 6 files changed, 87 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/accuracy_performance.yml b/.github/workflows/accuracy_performance.yml
index a60cc31..a2dace5 100644
--- a/.github/workflows/accuracy_performance.yml
+++ b/.github/workflows/accuracy_performance.yml
@@ -13,40 +13,106 @@ on:
       - 'asap-planner-rs/**'
       - 'asap-summary-ingest/**'
       - 'asap-quickstart/**'
-      - '.github/workflows/eval-pr.yml'
+      - '.github/workflows/accuracy_performance.yml'
       - 'benchmarks/**'
   workflow_dispatch:
 
 permissions:
   contents: read
+  packages: write
   pull-requests: write
 
 jobs:
+  # ---------------------------------------------------------------------------
+  # Job 1: build images once from branch code and push with a SHA-based tag.
+  # All downstream jobs pull these images instead of rebuilding.
+  # ---------------------------------------------------------------------------
+  build:
+    name: Build CI images
+    runs-on: ubuntu-latest
+    outputs:
+      image-tag: ${{ steps.tag.outputs.value }}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Compute image tag
+        id: tag
+        run: echo "value=sha-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
+
+      - name: Build and push asap-planner-rs
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: asap-planner-rs/Dockerfile
+          push: true
+          tags: ghcr.io/projectasap/asap-planner-rs:${{ steps.tag.outputs.value }}
+          cache-from: type=registry,ref=ghcr.io/projectasap/asap-planner-rs:buildcache
+          cache-to: type=registry,ref=ghcr.io/projectasap/asap-planner-rs:buildcache,mode=max
+
+      - name: Build and push asap-summary-ingest
+        uses: docker/build-push-action@v6
+        with:
+          context: asap-summary-ingest
+          file: asap-summary-ingest/Dockerfile
+          push: true
+          tags: ghcr.io/projectasap/asap-summary-ingest:${{ steps.tag.outputs.value }}
+
+      - name: Build and push asap-query-engine
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: asap-query-engine/Dockerfile
+          push: true
+          tags: ghcr.io/projectasap/asap-query-engine:${{ steps.tag.outputs.value }}
+          cache-from: type=registry,ref=ghcr.io/projectasap/asap-query-engine:buildcache
+          cache-to: type=registry,ref=ghcr.io/projectasap/asap-query-engine:buildcache,mode=max
+
+  # ---------------------------------------------------------------------------
+  # Job 2: pull the images built above, deploy the full stack, and evaluate.
+  # ---------------------------------------------------------------------------
   eval:
     name: Full-stack PR evaluation
+    needs: build
     runs-on: ubuntu-latest
     timeout-minutes: 60
+    env:
+      ASAP_IMAGE_TAG: ${{ needs.build.outputs.image-tag }}
 
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Build and start full stack
-        env:
-          COMPOSE_DOCKER_CLI_BUILD: "1"
-          DOCKER_BUILDKIT: "1"
+      - name: Pull and start full stack
         run: |
           docker compose \
+            --project-directory . \
             -f asap-quickstart/docker-compose.yml \
             -f benchmarks/docker-compose.yml \
-            up -d --build 2>&1
+            up -d
 
       - name: Show running containers
         run: |
           docker compose \
+            --project-directory . \
             -f asap-quickstart/docker-compose.yml \
             -f benchmarks/docker-compose.yml \
             ps
@@ -85,6 +151,7 @@ jobs:
         if: failure()
         run: |
           docker compose \
+            --project-directory . \
             -f asap-quickstart/docker-compose.yml \
             -f benchmarks/docker-compose.yml \
             logs --no-color
@@ -93,6 +160,7 @@ jobs:
         if: always()
         run: |
           docker compose \
+            --project-directory . \
             -f asap-quickstart/docker-compose.yml \
             -f benchmarks/docker-compose.yml \
             down -v
diff --git a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs
index fcc210f..c9c1032 100644
--- a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs
+++ b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs
@@ -19,22 +19,11 @@ impl PatternTester {
             // Rate pattern
             PromQLPattern::new(
                 Self::build_rate_pattern(),
-                vec![
-                    "metric".to_string(),
-                    "function".to_string(),
-                    "range_vector".to_string(),
-                ],
                 // Some("ONLY_TEMPORAL".to_string()),
             ),
             // Quantile over time pattern
             PromQLPattern::new(
                 Self::build_quantile_over_time_pattern(),
-                vec![
-                    "metric".to_string(),
-                    "function".to_string(),
-                    "range_vector".to_string(),
-                    "function_args".to_string(),
-                ],
                 // Some("ONLY_TEMPORAL".to_string()),
             ),
         ];
@@ -44,13 +33,11 @@ impl PatternTester {
             // Sum aggregation pattern
             PromQLPattern::new(
                 Self::build_sum_pattern(),
-                vec!["metric".to_string(), "aggregation".to_string()],
                 // Some("ONLY_SPATIAL".to_string()),
             ),
             // Simple metric pattern
             PromQLPattern::new(
                 Self::build_metric_pattern(),
-                vec!["metric".to_string()],
                 // Some("ONLY_SPATIAL".to_string()),
             ),
         ];
@@ -60,12 +47,6 @@ impl PatternTester {
             // Sum of rate pattern
             PromQLPattern::new(
                 Self::build_one_temporal_one_spatial_pattern(),
-                vec![
-                    "metric".to_string(),
-                    "function".to_string(),
-                    "aggregation".to_string(),
-                    "range_vector".to_string(),
-                ],
                 // Some("ONE_TEMPORAL_ONE_SPATIAL".to_string()),
             ),
         ];
diff --git a/asap-common/tests/compare_patterns/src/main.rs b/asap-common/tests/compare_patterns/src/main.rs
index 6c429f1..d75fa8b 100644
--- a/asap-common/tests/compare_patterns/src/main.rs
+++ b/asap-common/tests/compare_patterns/src/main.rs
@@ -25,7 +25,6 @@ fn main() {
     let pattern_1 = PromQLPatternBuilder::function(vec!["rate", "increase"], func_args1, Some("function"), None);
     let pattern1 = PromQLPattern::new(
         pattern_1,
-        vec!["metric".to_string(), "function".to_string(), "range_vector".to_string()],
     );
     if let Some(ast) = pattern1.ast_pattern {
         only_temporal_patterns.push(serde_json::Value::Object(ast.into_iter().collect()));
@@ -44,7 +43,6 @@ fn main() {
     let pattern_2 = PromQLPatternBuilder::function(vec!["quantile_over_time"], func_args2, Some("function"), Some("function_args"));
     let pattern2 = PromQLPattern::new(
         pattern_2,
-        vec!["metric".to_string(), "function".to_string(), "range_vector".to_string(), "function_args".to_string()],
     );
     if let Some(ast) = pattern2.ast_pattern {
         only_temporal_patterns.push(serde_json::Value::Object(ast.into_iter().collect()));
@@ -66,7 +64,6 @@ fn main() {
     );
     let pattern3 = PromQLPattern::new(
         pattern_3,
-        vec!["metric".to_string(), "aggregation".to_string()],
     );
     if let Some(ast) = pattern3.ast_pattern {
         only_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect()));
@@ -76,7 +73,6 @@ fn main() {
     let pattern_4 = PromQLPatternBuilder::metric(None, None, None, Some("metric"));
     let pattern4 = PromQLPattern::new(
         pattern_4,
-        vec!["metric".to_string()],
     );
     if let Some(ast) = pattern4.ast_pattern {
         only_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect()));
@@ -108,7 +104,6 @@ fn main() {
     );
     let pattern5 = PromQLPattern::new(
         pattern_5,
-        vec!["metric".to_string(), "range_vector".to_string(), "function".to_string(), "function_args".to_string(), "aggregation".to_string()],
     );
     if let Some(ast) = pattern5.ast_pattern {
         one_temporal_one_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect()));
@@ -137,7 +132,6 @@ fn main() {
     );
     let pattern6 = PromQLPattern::new(
         pattern_6,
-        vec!["metric".to_string(), "range_vector".to_string(), "function".to_string(), "aggregation".to_string()],
     );
     if let Some(ast) = pattern6.ast_pattern {
         one_temporal_one_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect()));
diff --git a/asap-common/tests/rust_pattern_matching/src/main.rs b/asap-common/tests/rust_pattern_matching/src/main.rs
index 3a540af..4a29352 100644
--- a/asap-common/tests/rust_pattern_matching/src/main.rs
+++ b/asap-common/tests/rust_pattern_matching/src/main.rs
@@ -10,11 +10,6 @@ fn temporal_pattern(
 ) -> PromQLPattern {
     PromQLPattern::new(
         blocks[pattern_type].clone(),
-        vec![
-            "metric".to_string(),
-            "function".to_string(),
-            "range_vector".to_string(),
-        ],
     )
 }
 
@@ -24,7 +19,6 @@ fn spatial_pattern(
 ) -> PromQLPattern {
     PromQLPattern::new(
         blocks[pattern_type].clone(),
-        vec!["metric".to_string(), "aggregation".to_string()],
     )
 }
 
@@ -39,12 +33,6 @@ fn spatial_of_temporal_pattern(temporal_block: &Option<HashMap<String, Value>>)
     );
     PromQLPattern::new(
         pattern,
-        vec![
-            "metric".to_string(),
-            "function".to_string(),
-            "range_vector".to_string(),
-            "aggregation".to_string(),
-        ],
     )
 }
 
diff --git a/asap-summary-ingest/Dockerfile b/asap-summary-ingest/Dockerfile
index 8432840..7a79036 100644
--- a/asap-summary-ingest/Dockerfile
+++ b/asap-summary-ingest/Dockerfile
@@ -1,4 +1,4 @@
-FROM sketchdb-base:latest
+FROM ghcr.io/projectasap/asap-base:latest
 
 LABEL maintainer="SketchDB Team"
 LABEL description="ArroyoSketch pipeline configuration service"
diff --git a/benchmarks/docker-compose.yml b/benchmarks/docker-compose.yml
index dd3084f..fea9921 100644
--- a/benchmarks/docker-compose.yml
+++ b/benchmarks/docker-compose.yml
@@ -1,26 +1,20 @@
-# CI override: replaces pulled images with locally-built versions for the
-# three ASAP-owned services. Intended for use as a Compose override:
+# CI image override: replaces quickstart's pinned release images with images
+# built from the current branch. Intended for use as a Compose override:
 #
-#   docker compose \
+#   ASAP_IMAGE_TAG=sha-<short-sha> docker compose \
+#     --project-directory . \
 #     -f asap-quickstart/docker-compose.yml \
 #     -f benchmarks/docker-compose.yml \
-#     up -d --build
+#     up -d
+#
+# ASAP_IMAGE_TAG is set automatically by the 'build' job in accuracy_performance.yml.
 
 services:
   asap-planner-rs:
-    image: asap-planner-rs:ci
-    build:
-      context: .
-      dockerfile: asap-planner-rs/Dockerfile
+    image: ghcr.io/projectasap/asap-planner-rs:${ASAP_IMAGE_TAG}
 
   asap-summary-ingest:
-    image: asap-summary-ingest:ci
-    build:
-      context: .
-      dockerfile: asap-summary-ingest/Dockerfile
+    image: ghcr.io/projectasap/asap-summary-ingest:${ASAP_IMAGE_TAG}
 
   queryengine:
-    image: asap-query-engine:ci
-    build:
-      context: .
-      dockerfile: asap-query-engine/Dockerfile
+    image: ghcr.io/projectasap/asap-query-engine:${ASAP_IMAGE_TAG}

From 7070ba714aa326b3ee64da69e423c23a0c4572af Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Thu, 26 Mar 2026 21:18:18 -0500
Subject: [PATCH 8/9] Fix three CI failures: eval path resolution, Python deps,
 Rust pattern tests

- accuracy_performance.yml: remove --project-directory . from eval job so
  bind mounts in asap-quickstart/docker-compose.yml resolve from asap-quickstart/
  (fixes asap-planner-rs exit 1 due to missing controller-config.yaml)
- correctness.yml: fix pip install path to asap-common/dependencies/py/promql_utilities/
  (setup.py is there, not at the parent dir; fixes silent failure + missing promql_parser)
- pattern_tests.rs: add avg_over_time and *_over_time variants to ONLY_TEMPORAL;
  remove duplicate ONLY_VECTOR key so disambiguation works deterministically;
  add build_combined_quantile_pattern for 2-arg quantile_over_time in combined queries

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/accuracy_performance.yml    |  4 --
 .github/workflows/correctness.yml             |  4 +-
 .../rust_tests/src/pattern_tests.rs           | 49 +++++++++++++++++--
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/accuracy_performance.yml b/.github/workflows/accuracy_performance.yml
index a2dace5..79eff1e 100644
--- a/.github/workflows/accuracy_performance.yml
+++ b/.github/workflows/accuracy_performance.yml
@@ -104,7 +104,6 @@ jobs:
       - name: Pull and start full stack
         run: |
           docker compose \
-            --project-directory . \
             -f asap-quickstart/docker-compose.yml \
             -f benchmarks/docker-compose.yml \
             up -d
@@ -112,7 +111,6 @@ jobs:
       - name: Show running containers
         run: |
           docker compose \
-            --project-directory . \
             -f asap-quickstart/docker-compose.yml \
             -f benchmarks/docker-compose.yml \
             ps
@@ -151,7 +149,6 @@ jobs:
         if: failure()
         run: |
           docker compose \
-            --project-directory . \
             -f asap-quickstart/docker-compose.yml \
             -f benchmarks/docker-compose.yml \
             logs --no-color
@@ -160,7 +157,6 @@ jobs:
         if: always()
         run: |
           docker compose \
-            --project-directory . \
             -f asap-quickstart/docker-compose.yml \
             -f benchmarks/docker-compose.yml \
             down -v
diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml
index 11d1a20..7ba0178 100644
--- a/.github/workflows/correctness.yml
+++ b/.github/workflows/correctness.yml
@@ -40,7 +40,7 @@ jobs:
           if [ -f asap-common/dependencies/py/requirements.txt ]; then
             pip install -r asap-common/dependencies/py/requirements.txt
           fi
-          pip install -e asap-common/dependencies/py/ 2>/dev/null || true
+          pip install -e asap-common/dependencies/py/promql_utilities/
 
       - name: Install Rust
         uses: dtolnay/rust-toolchain@stable
@@ -98,7 +98,7 @@ jobs:
           if [ -f asap-common/dependencies/py/requirements.txt ]; then
             pip install -r asap-common/dependencies/py/requirements.txt
           fi
-          pip install -e asap-common/dependencies/py/ 2>/dev/null || true
+          pip install -e asap-common/dependencies/py/promql_utilities/
 
       - name: Install Rust
         uses: dtolnay/rust-toolchain@stable
diff --git a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs
index c9c1032..6b0fef9 100644
--- a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs
+++ b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs
@@ -44,15 +44,20 @@ impl PatternTester {
 
         // ONE_TEMPORAL_ONE_SPATIAL patterns
         let combined_patterns = vec![
-            // Sum of rate pattern
+            // Aggregation of single-arg temporal functions
             PromQLPattern::new(
                 Self::build_one_temporal_one_spatial_pattern(),
                 // Some("ONE_TEMPORAL_ONE_SPATIAL".to_string()),
             ),
+            // Aggregation of quantile_over_time (2-arg)
+            PromQLPattern::new(
+                Self::build_combined_quantile_pattern(),
+                // Some("ONE_TEMPORAL_ONE_SPATIAL".to_string()),
+            ),
         ];
 
         // Insert in order from simple to complex to avoid panics
-        patterns.insert("ONLY_VECTOR".to_string(), spatial_patterns.clone());
+        // ONLY_VECTOR is derived via disambiguation in test_query, not a separate entry
         patterns.insert("ONLY_SPATIAL".to_string(), spatial_patterns);
         patterns.insert("ONLY_TEMPORAL".to_string(), temporal_patterns);
         patterns.insert("ONE_TEMPORAL_ONE_SPATIAL".to_string(), combined_patterns);
@@ -261,7 +266,20 @@ impl PatternTester {
 
         let args: Vec<Option<HashMap<String, Value>>> = vec![ms];
 
-        PromQLPatternBuilder::function(vec!["rate", "increase"], args, Some("function"), None)
+        PromQLPatternBuilder::function(
+            vec![
+                "rate",
+                "increase",
+                "avg_over_time",
+                "sum_over_time",
+                "count_over_time",
+                "min_over_time",
+                "max_over_time",
+            ],
+            args,
+            Some("function"),
+            None,
+        )
     }
 
     fn build_quantile_over_time_pattern() -> Option<HashMap<String, Value>> {
@@ -308,7 +326,6 @@ impl PatternTester {
 
         let func = PromQLPatternBuilder::function(
             vec![
-                "quantile_over_time",
                 "sum_over_time",
                 "count_over_time",
                 "avg_over_time",
@@ -332,6 +349,30 @@ impl PatternTester {
         )
     }
 
+    fn build_combined_quantile_pattern() -> Option<HashMap<String, Value>> {
+        let num = PromQLPatternBuilder::number(None, None);
+        let ms = PromQLPatternBuilder::matrix_selector(
+            PromQLPatternBuilder::metric(None, None, None, Some("metric")),
+            None,
+            Some("range_vector"),
+        );
+        let func_args: Vec<Option<HashMap<String, Value>>> = vec![num, ms];
+        let func = PromQLPatternBuilder::function(
+            vec!["quantile_over_time"],
+            func_args,
+            Some("function"),
+            None,
+        );
+        PromQLPatternBuilder::aggregation(
+            vec!["sum", "count", "avg", "quantile", "min", "max"],
+            func,
+            None,
+            None,
+            None,
+            Some("aggregation"),
+        )
+    }
+
     fn build_sum_rate_pattern() -> Option<HashMap<String, Value>> {
         let ms = PromQLPatternBuilder::matrix_selector(
             PromQLPatternBuilder::metric(None, None, None, Some("metric")),

From 4259d152ef938a85de00f2e686b1efd3d34e3f07 Mon Sep 17 00:00:00 2001
From: zz_y <zeyingz@umd.edu>
Date: Fri, 27 Mar 2026 11:34:56 -0500
Subject: [PATCH 9/9] Fix ingest_wait.sh: null-data handling, Arroyo state
 detection, longer timeout

Three bugs caused the pipeline to always show as 'not found':

1. data.get('data', []) returns None when Arroyo returns {"data": null} for an
   empty pipeline list. dict.get() only falls back to the default when the key
   is absent, not when the value is null. Fixed with (data.get('data') or []).

2. Arroyo signals a running pipeline via state=null + stop='none', not a literal
   "Running" string. The ingest_wait.sh state check was looking for the wrong
   value; the correct pattern is already used in asap-tools/run_pipeline.sh.

3. MAX_PIPELINE_WAIT=300s is too short: Arroyo must compile Rust UDFs before
   the pipeline can start, which takes several minutes in CI. Raised to 600s.

Also: normalise hyphens/underscores in the name match so 'asap-demo' matches
whether Arroyo stores it as 'asap-demo' or 'asap_demo'; add pipeline list dump
on timeout for easier future diagnosis.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 benchmarks/scripts/ingest_wait.sh | 41 ++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/benchmarks/scripts/ingest_wait.sh b/benchmarks/scripts/ingest_wait.sh
index 6e9b38d..8b12d3a 100644
--- a/benchmarks/scripts/ingest_wait.sh
+++ b/benchmarks/scripts/ingest_wait.sh
@@ -8,7 +8,7 @@ set -euo pipefail
 ARROYO_URL="http://localhost:5115/api/v1/pipelines"
 QE_URL="http://localhost:8088/api/v1/query"
 PIPELINE_NAME="asap-demo"
-MAX_PIPELINE_WAIT=300   # seconds
+MAX_PIPELINE_WAIT=600   # seconds — Arroyo must compile Rust UDFs; allow extra time
 ACCUMULATE_SLEEP=90     # seconds after pipeline is running
 SLEEP=5
 
@@ -19,26 +19,45 @@ while true; do
   state=$(curl -sf --max-time 10 "${ARROYO_URL}" 2>/dev/null \
     | python3 -c "
 import sys, json
-data = json.load(sys.stdin)
-pipelines = data if isinstance(data, list) else data.get('data', [])
-for p in pipelines:
-    name = p.get('name', '') or p.get('id', '')
-    if '${PIPELINE_NAME}' in str(name):
-        print(p.get('state', p.get('status', '')))
-        break
+try:
+    data = json.load(sys.stdin)
+    # 'data' key may be null when no pipelines exist; use 'or []' to handle that
+    pipelines = data if isinstance(data, list) else (data.get('data') or [])
+    for p in pipelines:
+        name = str(p.get('name') or p.get('id') or '')
+        # Normalise hyphens/underscores so 'asap-demo' matches 'asap_demo'
+        if '${PIPELINE_NAME}'.replace('-', '_') in name.replace('-', '_'):
+            state = p.get('state')
+            stop  = p.get('stop', '')
+            # Arroyo signals a running pipeline via state=null and stop='none'
+            if state is None and stop == 'none':
+                print('Running')
+            elif state is not None:
+                print(str(state))
+            else:
+                print('stopped')
+            break
+    else:
+        # No matching pipeline found yet — print nothing so caller retries
+        pass
+except Exception:
+    pass
 " 2>/dev/null || true)
 
-  if [ "${state}" = "Running" ] || [ "${state}" = "RUNNING" ]; then
+  if [ "${state}" = "Running" ] || [ "${state}" = "RUNNING" ] || [ "${state}" = "running" ]; then
     echo "[ingest_wait] Pipeline '${PIPELINE_NAME}' is RUNNING (${elapsed}s elapsed)"
     break
   fi
 
   if [ "${elapsed}" -ge "${MAX_PIPELINE_WAIT}" ]; then
-    echo "[ingest_wait] ERROR: Pipeline '${PIPELINE_NAME}' did not reach RUNNING within ${MAX_PIPELINE_WAIT}s (last state: '${state}')" >&2
+    echo "[ingest_wait] ERROR: Pipeline '${PIPELINE_NAME}' did not reach RUNNING within ${MAX_PIPELINE_WAIT}s (last state: '${state:-unknown}')" >&2
+    # Dump pipeline list for diagnosis
+    echo "[ingest_wait] Current Arroyo pipeline list:" >&2
+    curl -sf --max-time 10 "${ARROYO_URL}" 2>/dev/null | python3 -m json.tool 2>/dev/null >&2 || true
     exit 1
   fi
 
-  echo "[ingest_wait] Pipeline state: '${state:-unknown}' — retrying in ${SLEEP}s (${elapsed}s elapsed) ..."
+  echo "[ingest_wait] Pipeline state: '${state:-not found}' — retrying in ${SLEEP}s (${elapsed}s elapsed) ..."
   sleep "${SLEEP}"
   elapsed=$(( elapsed + SLEEP ))
 done