From f3fdf53780fdd36afa4d3761c2f0b930db3283e7 Mon Sep 17 00:00:00 2001 From: zz_y Date: Thu, 26 Mar 2026 18:26:07 -0500 Subject: [PATCH 1/9] Add accuracy, correctness, and performance CI workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - correctness.yml: cross-language Python/Rust PromQL pattern matching parity tests (token matching, serialised patterns, Rust unit tests) - accuracy.yml: ASAP vs ClickHouse exact baseline error regression check (≤5% relative error threshold) on H2O groupby dataset - performance.yml: relative latency regression detection on GH-hosted VMs; manual workflow_dispatch with self-hosted runner support for precise absolute benchmarks once asap-tools infra is decoupled from Cloudlab Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/accuracy.yml | 183 ++++++++++++++++++ .github/workflows/correctness.yml | 168 +++++++++++++++++ .github/workflows/performance.yml | 296 ++++++++++++++++++++++++++++++ 3 files changed, 647 insertions(+) create mode 100644 .github/workflows/accuracy.yml create mode 100644 .github/workflows/correctness.yml create mode 100644 .github/workflows/performance.yml diff --git a/.github/workflows/accuracy.yml b/.github/workflows/accuracy.yml new file mode 100644 index 0000000..aad9a59 --- /dev/null +++ b/.github/workflows/accuracy.yml @@ -0,0 +1,183 @@ +name: Accuracy Tests + +# Validates that ASAP approximate query results stay within acceptable error +# bounds relative to an exact (ClickHouse) baseline. Tests run inside Docker +# containers on ephemeral GitHub Actions VMs — sufficient for catching +# accuracy regressions without requiring self-hosted infrastructure. + +on: + push: + branches: [ main ] + paths: + - 'asap-summary-ingest/**' + - 'asap-query-engine/**' + - 'asap-common/sketch-core/**' + - 'asap-common/dependencies/**' + - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**' + - 'asap-tools/execution-utilities/asap_query_latency/**' + - '.github/workflows/accuracy.yml' + pull_request: + branches: [ main ] + paths: + - 'asap-summary-ingest/**' + - 'asap-query-engine/**' + - 'asap-common/sketch-core/**' + - 'asap-common/dependencies/**' + - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**' + - 'asap-tools/execution-utilities/asap_query_latency/**' + - '.github/workflows/accuracy.yml' + workflow_dispatch: + +env: + # Rows to ingest during CI — small enough to complete in ~10 min on GH runners + # while still exercising the full sketch → query path. Increase on self-hosted + # runners for a more thorough accuracy sweep. + MAX_ROWS: 50000 + # Maximum acceptable relative error vs exact baseline (5 %) + MAX_RELATIVE_ERROR: "0.05" + +jobs: + # ── H2O groupby accuracy (ASAP vs ClickHouse exact) ──────────────────────── + h2o-accuracy: + name: H2O groupby accuracy regression + runs-on: ubuntu-latest + # Accuracy tests can be long-running on ephemeral runners; give them room. + timeout-minutes: 60 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install requests kafka-python gdown matplotlib + if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then + pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt + fi + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # Pull / build only the images needed for accuracy testing + - name: Build base image + run: | + docker build \ + -t sketchdb-base:latest \ + -f asap-common/installation/Dockerfile \ + asap-common + + - name: Build summary-ingest image + run: | + docker build \ + -t asap-summary-ingest:ci \ + -f asap-summary-ingest/Dockerfile \ + asap-summary-ingest + + - name: Install Rust (for query engine) + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-accuracy-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Build query engine binary + run: cargo build --release --bin query_engine_rust --locked + env: + RUSTC_WRAPPER: sccache + + # Run accuracy benchmark (ASAP path) with a small dataset slice + - name: Run ASAP accuracy benchmark + working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline + run: | + python run_benchmark.py \ + --mode asap \ + --load-data \ + --max-rows ${{ env.MAX_ROWS }} \ + --output /tmp/asap_accuracy_results.csv \ + --qe-bin ${{ github.workspace }}/target/release/query_engine_rust + env: + RUSTC_WRAPPER: sccache + + # Run the same queries against the exact baseline + - name: Run ClickHouse baseline benchmark + working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline + run: | + python run_benchmark.py \ + --mode baseline \ + --skip-data-load \ + --output /tmp/baseline_accuracy_results.csv + + # Compare ASAP results to baseline; fail if error exceeds threshold + - name: Check accuracy (error ≤ ${{ env.MAX_RELATIVE_ERROR }}) + run: | + python3 - <<'EOF' + import csv, sys, os + + max_err = float(os.environ["MAX_RELATIVE_ERROR"]) + asap_file = "/tmp/asap_accuracy_results.csv" + exact_file = "/tmp/baseline_accuracy_results.csv" + + def load(path): + with open(path) as f: + return {row["query_id"]: float(row["result"]) for row in csv.DictReader(f) + if row.get("result") not in (None, "", "null")} + + try: + asap = load(asap_file) + exact = load(exact_file) + except FileNotFoundError as e: + print(f"Result file missing: {e}. Skipping accuracy check.") + sys.exit(0) + + failures = [] + for qid, exact_val in exact.items(): + if qid not in asap: + print(f"WARN: {qid} not found in ASAP results, skipping") + continue + if exact_val == 0: + rel_err = 0.0 if asap[qid] == 0 else float("inf") + else: + rel_err = abs(asap[qid] - exact_val) / abs(exact_val) + status = "PASS" if rel_err <= max_err else "FAIL" + print(f"{status} {qid}: rel_err={rel_err:.4f} asap={asap[qid]:.4f} exact={exact_val:.4f}") + if status == "FAIL": + failures.append(qid) + + if failures: + print(f"\n{len(failures)} query(ies) exceeded max relative error ({max_err}):") + for qid in failures: + print(f" - {qid}") + sys.exit(1) + else: + print(f"\nAll queries within relative error threshold ({max_err}).") + EOF + env: + MAX_RELATIVE_ERROR: ${{ env.MAX_RELATIVE_ERROR }} + + - name: Upload accuracy results + if: always() + uses: actions/upload-artifact@v4 + with: + name: accuracy-results-${{ github.run_id }} + path: | + /tmp/asap_accuracy_results.csv + /tmp/baseline_accuracy_results.csv + if-no-files-found: warn diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml new file mode 100644 index 0000000..11d1a20 --- /dev/null +++ b/.github/workflows/correctness.yml @@ -0,0 +1,168 @@ +name: Correctness Tests + +# Verifies cross-language parity between Python and Rust implementations of +# PromQL pattern matching and sketch serialisation. Ephemeral GitHub Actions +# VMs are well-suited for these deterministic correctness checks. + +on: + push: + branches: [ main ] + paths: + - 'asap-common/tests/**' + - 'asap-common/dependencies/**' + - 'asap-common/sketch-core/**' + - '.github/workflows/correctness.yml' + pull_request: + branches: [ main ] + paths: + - 'asap-common/tests/**' + - 'asap-common/dependencies/**' + - 'asap-common/sketch-core/**' + - '.github/workflows/correctness.yml' + workflow_dispatch: + +jobs: + # ── 1. Cross-language token matching (Python vs Rust) ────────────────────── + cross-language-token-matching: + name: Cross-language PromQL token matching + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + if [ -f asap-common/dependencies/py/requirements.txt ]; then + pip install -r asap-common/dependencies/py/requirements.txt + fi + pip install -e asap-common/dependencies/py/ 2>/dev/null || true + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Run master test runner (Python + Rust + comparison) + working-directory: asap-common/tests/compare_matched_tokens + run: python utilities/master_test_runner.py + env: + RUSTC_WRAPPER: sccache + + - name: Upload test artefacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: cross-language-token-results + path: | + asap-common/tests/compare_matched_tokens/python_tests/python_test_results.json + asap-common/tests/compare_matched_tokens/rust_tests/rust_test_results.json + asap-common/tests/compare_matched_tokens/comparison_tests/comparison_report.json + asap-common/tests/compare_matched_tokens/test_summary.json + if-no-files-found: warn + + # ── 2. Cross-language serialised pattern comparison ──────────────────────── + cross-language-pattern-serialisation: + name: Cross-language pattern serialisation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + if [ -f asap-common/dependencies/py/requirements.txt ]; then + pip install -r asap-common/dependencies/py/requirements.txt + fi + pip install -e asap-common/dependencies/py/ 2>/dev/null || true + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Generate Python patterns + working-directory: asap-common/tests/compare_patterns + run: python python_generate_patterns.py + + - name: Build and run Rust pattern generator + working-directory: asap-common/tests/compare_patterns + run: cargo run --release + env: + RUSTC_WRAPPER: sccache + + - name: Compare serialised patterns + working-directory: asap-common/tests/compare_patterns + run: python compare_serialized_patterns.py + + # ── 3. Rust pattern-matching unit tests ──────────────────────────────────── + rust-pattern-matching: + name: Rust pattern matching unit tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Run Rust pattern matching tests + working-directory: asap-common/tests/rust_pattern_matching + run: cargo test --release + env: + RUSTC_WRAPPER: sccache diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml new file mode 100644 index 0000000..674be29 --- /dev/null +++ b/.github/workflows/performance.yml @@ -0,0 +1,296 @@ +name: Performance Benchmarks + +# IMPORTANT – Runner choice and benchmark precision +# ───────────────────────────────────────────────── +# Ephemeral GitHub-hosted runners (ubuntu-latest) share noisy cloud hardware. +# They are useful for *relative* regression detection (e.g. "this PR is 2× +# slower than main") but NOT for absolute latency numbers. +# +# For precise, publication-quality benchmarks the team uses the dedicated +# asap-tools benchmarking infra (used for TurboProm paper experiments). +# That infra must be decoupled from Cloudlab and registered as a +# GitHub self-hosted runner. Until then, set `runner: self-hosted` in the +# workflow_dispatch inputs below to target a self-hosted machine when one +# is available, or rely on the relative-regression job for PR gating. +# +# References: +# - asap-tools/execution-utilities/asap_benchmark_pipeline/ — H2O groupby +# - asap-tools/execution-utilities/asap_query_latency/ — ClickBench hits + +on: + pull_request: + branches: [ main ] + paths: + - 'asap-summary-ingest/**' + - 'asap-query-engine/**' + - 'asap-common/sketch-core/**' + - 'asap-common/dependencies/**' + - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**' + - 'asap-tools/execution-utilities/asap_query_latency/**' + - '.github/workflows/performance.yml' + workflow_dispatch: + inputs: + runner: + description: > + Runner label to use. Use 'ubuntu-latest' for relative regression + detection on GH-hosted VMs, or a self-hosted runner label (e.g. + 'self-hosted') for precise absolute benchmarks. + required: false + default: ubuntu-latest + max_rows: + description: Rows to ingest (0 = full dataset; reduce for faster CI runs) + required: false + default: '100000' + benchmark_suite: + description: 'Which suite to run: h2o | query_latency | all' + required: false + default: all + +env: + # Defaults used on PR triggers (keep runtime < 30 min on GH-hosted runners) + DEFAULT_MAX_ROWS: '100000' + # Latency regression threshold: flag if p95 latency increases by more than + # this factor relative to the baseline run within the same CI job. + LATENCY_REGRESSION_FACTOR: '2.0' + +jobs: + # ── 1. Relative performance regression (always runs on GH-hosted VMs) ────── + relative-regression: + name: Relative performance regression (H2O groupby) + # Use the workflow_dispatch runner input when triggered manually; + # fall back to ubuntu-latest for PR triggers. + runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || 'ubuntu-latest' }} + timeout-minutes: 45 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install requests kafka-python gdown matplotlib + if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then + pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt + fi + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build base image + run: | + docker build \ + -t sketchdb-base:latest \ + -f asap-common/installation/Dockerfile \ + asap-common + + - name: Build summary-ingest image + run: | + docker build \ + -t asap-summary-ingest:ci \ + -f asap-summary-ingest/Dockerfile \ + asap-summary-ingest + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Build query engine binary + run: cargo build --release --bin query_engine_rust --locked + env: + RUSTC_WRAPPER: sccache + + - name: Resolve max_rows + id: config + run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + echo "max_rows=${{ inputs.max_rows }}" >> "$GITHUB_OUTPUT" + else + echo "max_rows=${{ env.DEFAULT_MAX_ROWS }}" >> "$GITHUB_OUTPUT" + fi + + # ── Baseline (ClickHouse exact) ── + - name: Run ClickHouse baseline benchmark + working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline + run: | + python run_benchmark.py \ + --mode baseline \ + --load-data \ + --max-rows ${{ steps.config.outputs.max_rows }} \ + --output /tmp/baseline_perf_results.csv + + # ── ASAP path ── + - name: Run ASAP benchmark + working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline + run: | + python run_benchmark.py \ + --mode asap \ + --load-data \ + --max-rows ${{ steps.config.outputs.max_rows }} \ + --output /tmp/asap_perf_results.csv \ + --qe-bin ${{ github.workspace }}/target/release/query_engine_rust + + # ── Regression check ── + - name: Check for latency regressions + run: | + python3 - <<'EOF' + import csv, os, sys + + factor = float(os.environ["LATENCY_REGRESSION_FACTOR"]) + asap_file = "/tmp/asap_perf_results.csv" + base_file = "/tmp/baseline_perf_results.csv" + + def load_latency(path): + with open(path) as f: + return {row["query_id"]: float(row["latency_ms"]) + for row in csv.DictReader(f) + if row.get("latency_ms") not in (None, "", "null")} + + try: + asap = load_latency(asap_file) + base = load_latency(base_file) + except FileNotFoundError as e: + print(f"Result file missing: {e}. Skipping regression check.") + sys.exit(0) + + regressions = [] + print(f"{'Query':<30} {'ASAP (ms)':>12} {'Baseline (ms)':>14} {'Ratio':>8} {'Status'}") + print("-" * 72) + for qid, base_lat in base.items(): + if qid not in asap: + continue + ratio = asap[qid] / base_lat if base_lat > 0 else float("inf") + status = "REGRESSION" if ratio > factor else "ok" + print(f"{qid:<30} {asap[qid]:>12.1f} {base_lat:>14.1f} {ratio:>8.2f} {status}") + if status == "REGRESSION": + regressions.append((qid, ratio)) + + if regressions: + print(f"\n{len(regressions)} regression(s) detected (threshold: {factor}x):") + for qid, r in regressions: + print(f" - {qid}: {r:.2f}x slower than baseline") + print("\nNOTE: This job runs on ephemeral GH-hosted VMs and is subject to") + print(" cloud noise. For authoritative numbers use a self-hosted runner.") + sys.exit(1) + else: + print(f"\nNo regressions detected (threshold: {factor}x).") + EOF + env: + LATENCY_REGRESSION_FACTOR: ${{ env.LATENCY_REGRESSION_FACTOR }} + + - name: Generate latency comparison plot + if: always() + working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline + run: | + python plot_latency.py \ + --asap /tmp/asap_perf_results.csv \ + --baseline /tmp/baseline_perf_results.csv \ + --output /tmp/latency_comparison.png 2>/dev/null || \ + python plot_latency.py 2>/dev/null || true + + - name: Upload benchmark results + if: always() + uses: actions/upload-artifact@v4 + with: + name: performance-results-${{ github.run_id }} + path: | + /tmp/asap_perf_results.csv + /tmp/baseline_perf_results.csv + /tmp/latency_comparison.png + if-no-files-found: warn + + # ── 2. Query latency micro-benchmark (manual / self-hosted) ───────────────── + query-latency: + name: Query latency micro-benchmark + # Skip on PRs — run manually or on a scheduled trigger once self-hosted + # runners are available. On GH-hosted VMs the numbers are too noisy to be + # actionable for absolute latency SLOs. + if: > + github.event_name == 'workflow_dispatch' && + (inputs.benchmark_suite == 'query_latency' || inputs.benchmark_suite == 'all') + runs-on: ${{ inputs.runner || 'ubuntu-latest' }} + timeout-minutes: 60 + + steps: + - uses: actions/checkout@v4 + + - name: Self-hosted runner notice + if: ${{ inputs.runner == 'ubuntu-latest' || inputs.runner == '' }} + run: | + echo "::warning::Running query_latency benchmark on a GH-hosted VM." + echo "::warning::Results are indicative only. Register the asap-tools" + echo "::warning::benchmarking host as a self-hosted runner for" + echo "::warning::publication-quality absolute latency measurements." + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install requests + if [ -f asap-tools/execution-utilities/asap_query_latency/requirements.txt ]; then + pip install -r asap-tools/execution-utilities/asap_query_latency/requirements.txt + fi + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Build query engine binary + run: cargo build --release --bin query_engine_rust --locked + env: + RUSTC_WRAPPER: sccache + + - name: Run query latency benchmark + working-directory: asap-tools/execution-utilities/asap_query_latency + run: | + python run_benchmark.py \ + --output /tmp/query_latency_results.csv \ + --qe-bin ${{ github.workspace }}/target/release/query_engine_rust + + - name: Upload query latency results + if: always() + uses: actions/upload-artifact@v4 + with: + name: query-latency-results-${{ github.run_id }} + path: /tmp/query_latency_results.csv + if-no-files-found: warn From 5152c326e3873f48f3d12a885545baed7fc7c94c Mon Sep 17 00:00:00 2001 From: zz_y Date: Thu, 26 Mar 2026 18:29:21 -0500 Subject: [PATCH 2/9] Fix workspace conflicts and add missing test data for correctness CI - Add [workspace] table to three standalone test Cargo.toml files (compare_patterns, compare_matched_tokens/rust_tests, rust_pattern_matching) to prevent "believes it's in a workspace" error on the root Cargo.toml - Add test_data/promql_queries.json with 10 PromQL test cases covering ONLY_TEMPORAL, ONLY_SPATIAL, and ONE_TEMPORAL_ONE_SPATIAL patterns, required by the cross-language master_test_runner - Unignore test_data/*.json in asap-common/.gitignore (tests/**/*.json was intended for generated result files, not fixture input data) Co-Authored-By: Claude Sonnet 4.6 --- asap-common/.gitignore | 1 + .../rust_tests/Cargo.toml | 2 + .../test_data/promql_queries.json | 228 ++++++++++++++++++ asap-common/tests/compare_patterns/Cargo.toml | 2 + .../tests/rust_pattern_matching/Cargo.toml | 2 + 5 files changed, 235 insertions(+) create mode 100644 asap-common/tests/compare_matched_tokens/test_data/promql_queries.json diff --git a/asap-common/.gitignore b/asap-common/.gitignore index 9e7080c..102b6ea 100644 --- a/asap-common/.gitignore +++ b/asap-common/.gitignore @@ -8,4 +8,5 @@ dependencies/py/promql_utilities/promql_utilities.egg-info/ dependencies/rs/**/target/ tests/**/*.json +!tests/**/test_data/*.json tests/**/target/ diff --git a/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml b/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml index e38e483..c4155dc 100644 --- a/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml +++ b/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml @@ -1,3 +1,5 @@ +[workspace] + [package] name = "promql_cross_lang_tests" version = "0.1.0" diff --git a/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json b/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json new file mode 100644 index 0000000..0d03af4 --- /dev/null +++ b/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json @@ -0,0 +1,228 @@ +{ + "test_cases": [ + { + "id": "temporal_rate_basic", + "description": "Basic rate function over a time range", + "query": "rate(http_requests_total{job=\"api\"}[5m])", + "expected_pattern_type": "ONLY_TEMPORAL", + "expected_tokens": { + "metric": { + "name": "http_requests_total", + "labels": {"job": "api"}, + "at_modifier": null + }, + "function": { + "name": "rate" + }, + "range_vector": { + "range": "5m" + } + } + }, + { + "id": "temporal_increase_basic", + "description": "Increase function over a time range", + "query": "increase(http_requests_total[1h])", + "expected_pattern_type": "ONLY_TEMPORAL", + "expected_tokens": { + "metric": { + "name": "http_requests_total", + "labels": {}, + "at_modifier": null + }, + "function": { + "name": "increase" + }, + "range_vector": { + "range": "1h" + } + } + }, + { + "id": "temporal_quantile_over_time", + "description": "Quantile over time function", + "query": "quantile_over_time(0.95, cpu_usage{instance=\"host1\"}[10m])", + "expected_pattern_type": "ONLY_TEMPORAL", + "expected_tokens": { + "metric": { + "name": "cpu_usage", + "labels": {"instance": "host1"}, + "at_modifier": null + }, + "function": { + "name": "quantile_over_time" + }, + "range_vector": { + "range": "10m" + } + } + }, + { + "id": "temporal_avg_over_time", + "description": "Average over time function", + "query": "avg_over_time(memory_bytes[30m])", + "expected_pattern_type": "ONLY_TEMPORAL", + "expected_tokens": { + "metric": { + "name": "memory_bytes", + "labels": {}, + "at_modifier": null + }, + "function": { + "name": "avg_over_time" + }, + "range_vector": { + "range": "30m" + } + } + }, + { + "id": "spatial_sum_aggregation", + "description": "Sum aggregation across all series", + "query": "sum(http_requests_total{job=\"api\"})", + "expected_pattern_type": "ONLY_SPATIAL", + "expected_tokens": { + "metric": { + "name": "http_requests_total", + "labels": {"job": "api"}, + "at_modifier": null + }, + "aggregation": { + "op": "sum", + "modifier": null + } + } + }, + { + "id": "spatial_avg_aggregation", + "description": "Average aggregation by label", + "query": "avg by (instance) (cpu_usage)", + "expected_pattern_type": "ONLY_SPATIAL", + "expected_tokens": { + "metric": { + "name": "cpu_usage", + "labels": {}, + "at_modifier": null + }, + "aggregation": { + "op": "avg", + "modifier": null + } + } + }, + { + "id": "spatial_count_aggregation", + "description": "Count aggregation", + "query": "count(up{job=\"node\"})", + "expected_pattern_type": "ONLY_SPATIAL", + "expected_tokens": { + "metric": { + "name": "up", + "labels": {"job": "node"}, + "at_modifier": null + }, + "aggregation": { + "op": "count", + "modifier": null + } + } + }, + { + "id": "combined_sum_of_rate", + "description": "Sum aggregation of rate (temporal + spatial)", + "query": "sum(rate(http_requests_total{job=\"api\"}[5m]))", + "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL", + "expected_tokens": { + "metric": { + "name": "http_requests_total", + "labels": {"job": "api"}, + "at_modifier": null + }, + "function": { + "name": "rate" + }, + "aggregation": { + "op": "sum", + "modifier": null + }, + "range_vector": { + "range": "5m" + } + } + }, + { + "id": "combined_avg_of_quantile_over_time", + "description": "Avg aggregation of quantile_over_time (temporal + spatial)", + "query": "avg(quantile_over_time(0.99, response_time_seconds[15m]))", + "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL", + "expected_tokens": { + "metric": { + "name": "response_time_seconds", + "labels": {}, + "at_modifier": null + }, + "function": { + "name": "quantile_over_time" + }, + "aggregation": { + "op": "avg", + "modifier": null + }, + "range_vector": { + "range": "15m" + } + } + }, + { + "id": "combined_sum_of_avg_over_time", + "description": "Sum aggregation of avg_over_time", + "query": "sum by (job) (avg_over_time(memory_bytes{env=\"prod\"}[1h]))", + "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL", + "expected_tokens": { + "metric": { + "name": "memory_bytes", + "labels": {"env": "prod"}, + "at_modifier": null + }, + "function": { + "name": "avg_over_time" + }, + "aggregation": { + "op": "sum", + "modifier": null + }, + "range_vector": { + "range": "1h" + } + } + } + ], + "pattern_builder_tests": [ + { + "id": "builder_metric_no_labels", + "description": "Build a simple metric selector with no labels", + "builder_call": "metric", + "parameters": { + "collect_as": "metric" + }, + "expected_pattern": { + "type": "metric", + "collect_as": "metric" + } + }, + { + "id": "builder_function_rate", + "description": "Build a rate function pattern", + "builder_call": "function", + "parameters": { + "names": ["rate", "increase"], + "collect_as": "function" + }, + "expected_pattern": { + "type": "function", + "names": ["rate", "increase"], + "collect_as": "function" + } + } + ] +} diff --git a/asap-common/tests/compare_patterns/Cargo.toml b/asap-common/tests/compare_patterns/Cargo.toml index 8d04a6f..3f94a69 100644 --- a/asap-common/tests/compare_patterns/Cargo.toml +++ b/asap-common/tests/compare_patterns/Cargo.toml @@ -1,3 +1,5 @@ +[workspace] + [package] name = "compare_patterns_runner" version = "0.1.0" diff --git a/asap-common/tests/rust_pattern_matching/Cargo.toml b/asap-common/tests/rust_pattern_matching/Cargo.toml index 14b2980..571641e 100644 --- a/asap-common/tests/rust_pattern_matching/Cargo.toml +++ b/asap-common/tests/rust_pattern_matching/Cargo.toml @@ -1,3 +1,5 @@ +[workspace] + [package] name = "promql_cross_lang_tests" version = "0.1.0" From c7ac5a76d9de84acca962584a85833e5d2be076d Mon Sep 17 00:00:00 2001 From: zz_y Date: Thu, 26 Mar 2026 18:42:16 -0500 Subject: [PATCH 3/9] Replace accuracy/performance workflows with full e2e eval-pr based on PDF guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements all 20 steps from the ASAPQuery PR Performance & Accuracy Evaluation Guide: - eval-pr.yml: triggers on core code changes, builds the 3 ASAP-owned services (planner-rs, summary-ingest, query-engine) from PR source, spins up the full quickstart stack, waits for pipeline + ingestion, runs queries against both Prometheus (baseline) and ASAPQuery, then evaluates accuracy and performance - benchmarks/docker-compose.yml: Compose override that replaces the GHCR images for planner-rs/summary-ingest/queryengine with `build:` directives so CI always tests the latest committed code in the branch - benchmarks/queries/promql_suite.json: 14-query fixed suite covering avg/sum/max/min/quantile at p50/p90/p95/p99, with and without grouping by pattern/region/service; marks ASAP-native queries - benchmarks/scripts/wait_for_stack.sh: polls Prometheus, Arroyo, and QueryEngine until healthy (180s timeout each) - benchmarks/scripts/ingest_wait.sh: waits for asap-demo Arroyo pipeline to reach RUNNING, then 90s for sketch accumulation - benchmarks/scripts/run_baseline.py: queries Prometheus /api/v1/query 3x per query, records latencies and results - benchmarks/scripts/run_asap.py: same for ASAPQuery /api/v1/query - benchmarks/scripts/compare.py: normalises vector results by label set, enforces pass/fail policy (no query failures; ASAP-native relative error ≤ 1%); latency regressions are warn-only per PDF practical caution note on GH runner noise Pass/fail policy (PDF page 3): - No query failures - Max relative error ≤ 1% - No >10% p95 latency regression (warn-only on ephemeral runners) Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/accuracy.yml | 183 ---------------- .github/workflows/eval-pr.yml | 98 +++++++++ .github/workflows/performance.yml | 296 ------------------------- benchmarks/docker-compose.yml | 26 +++ benchmarks/golden/.gitkeep | 0 benchmarks/queries/promql_suite.json | 18 ++ benchmarks/reports/.gitkeep | 0 benchmarks/scripts/compare.py | 313 +++++++++++++++++++++++++++ benchmarks/scripts/ingest_wait.sh | 72 ++++++ benchmarks/scripts/run_asap.py | 113 ++++++++++ benchmarks/scripts/run_baseline.py | 111 ++++++++++ benchmarks/scripts/wait_for_stack.sh | 34 +++ 12 files changed, 785 insertions(+), 479 deletions(-) delete mode 100644 .github/workflows/accuracy.yml create mode 100644 .github/workflows/eval-pr.yml delete mode 100644 .github/workflows/performance.yml create mode 100644 benchmarks/docker-compose.yml create mode 100644 benchmarks/golden/.gitkeep create mode 100644 benchmarks/queries/promql_suite.json create mode 100644 benchmarks/reports/.gitkeep create mode 100644 benchmarks/scripts/compare.py create mode 100644 benchmarks/scripts/ingest_wait.sh create mode 100644 benchmarks/scripts/run_asap.py create mode 100644 benchmarks/scripts/run_baseline.py create mode 100644 benchmarks/scripts/wait_for_stack.sh diff --git a/.github/workflows/accuracy.yml b/.github/workflows/accuracy.yml deleted file mode 100644 index aad9a59..0000000 --- a/.github/workflows/accuracy.yml +++ /dev/null @@ -1,183 +0,0 @@ -name: Accuracy Tests - -# Validates that ASAP approximate query results stay within acceptable error -# bounds relative to an exact (ClickHouse) baseline. Tests run inside Docker -# containers on ephemeral GitHub Actions VMs — sufficient for catching -# accuracy regressions without requiring self-hosted infrastructure. - -on: - push: - branches: [ main ] - paths: - - 'asap-summary-ingest/**' - - 'asap-query-engine/**' - - 'asap-common/sketch-core/**' - - 'asap-common/dependencies/**' - - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**' - - 'asap-tools/execution-utilities/asap_query_latency/**' - - '.github/workflows/accuracy.yml' - pull_request: - branches: [ main ] - paths: - - 'asap-summary-ingest/**' - - 'asap-query-engine/**' - - 'asap-common/sketch-core/**' - - 'asap-common/dependencies/**' - - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**' - - 'asap-tools/execution-utilities/asap_query_latency/**' - - '.github/workflows/accuracy.yml' - workflow_dispatch: - -env: - # Rows to ingest during CI — small enough to complete in ~10 min on GH runners - # while still exercising the full sketch → query path. Increase on self-hosted - # runners for a more thorough accuracy sweep. - MAX_ROWS: 50000 - # Maximum acceptable relative error vs exact baseline (5 %) - MAX_RELATIVE_ERROR: "0.05" - -jobs: - # ── H2O groupby accuracy (ASAP vs ClickHouse exact) ──────────────────────── - h2o-accuracy: - name: H2O groupby accuracy regression - runs-on: ubuntu-latest - # Accuracy tests can be long-running on ephemeral runners; give them room. - timeout-minutes: 60 - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install requests kafka-python gdown matplotlib - if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then - pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt - fi - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - # Pull / build only the images needed for accuracy testing - - name: Build base image - run: | - docker build \ - -t sketchdb-base:latest \ - -f asap-common/installation/Dockerfile \ - asap-common - - - name: Build summary-ingest image - run: | - docker build \ - -t asap-summary-ingest:ci \ - -f asap-summary-ingest/Dockerfile \ - asap-summary-ingest - - - name: Install Rust (for query engine) - uses: dtolnay/rust-toolchain@stable - - - name: Install protoc - run: | - sudo apt-get update -qq - sudo apt-get install -y protobuf-compiler - - - name: Run sccache - uses: mozilla-actions/sccache-action@v0.0.4 - - - name: Cache cargo - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: ${{ runner.os }}-cargo-accuracy-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} - - - name: Build query engine binary - run: cargo build --release --bin query_engine_rust --locked - env: - RUSTC_WRAPPER: sccache - - # Run accuracy benchmark (ASAP path) with a small dataset slice - - name: Run ASAP accuracy benchmark - working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline - run: | - python run_benchmark.py \ - --mode asap \ - --load-data \ - --max-rows ${{ env.MAX_ROWS }} \ - --output /tmp/asap_accuracy_results.csv \ - --qe-bin ${{ github.workspace }}/target/release/query_engine_rust - env: - RUSTC_WRAPPER: sccache - - # Run the same queries against the exact baseline - - name: Run ClickHouse baseline benchmark - working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline - run: | - python run_benchmark.py \ - --mode baseline \ - --skip-data-load \ - --output /tmp/baseline_accuracy_results.csv - - # Compare ASAP results to baseline; fail if error exceeds threshold - - name: Check accuracy (error ≤ ${{ env.MAX_RELATIVE_ERROR }}) - run: | - python3 - <<'EOF' - import csv, sys, os - - max_err = float(os.environ["MAX_RELATIVE_ERROR"]) - asap_file = "/tmp/asap_accuracy_results.csv" - exact_file = "/tmp/baseline_accuracy_results.csv" - - def load(path): - with open(path) as f: - return {row["query_id"]: float(row["result"]) for row in csv.DictReader(f) - if row.get("result") not in (None, "", "null")} - - try: - asap = load(asap_file) - exact = load(exact_file) - except FileNotFoundError as e: - print(f"Result file missing: {e}. Skipping accuracy check.") - sys.exit(0) - - failures = [] - for qid, exact_val in exact.items(): - if qid not in asap: - print(f"WARN: {qid} not found in ASAP results, skipping") - continue - if exact_val == 0: - rel_err = 0.0 if asap[qid] == 0 else float("inf") - else: - rel_err = abs(asap[qid] - exact_val) / abs(exact_val) - status = "PASS" if rel_err <= max_err else "FAIL" - print(f"{status} {qid}: rel_err={rel_err:.4f} asap={asap[qid]:.4f} exact={exact_val:.4f}") - if status == "FAIL": - failures.append(qid) - - if failures: - print(f"\n{len(failures)} query(ies) exceeded max relative error ({max_err}):") - for qid in failures: - print(f" - {qid}") - sys.exit(1) - else: - print(f"\nAll queries within relative error threshold ({max_err}).") - EOF - env: - MAX_RELATIVE_ERROR: ${{ env.MAX_RELATIVE_ERROR }} - - - name: Upload accuracy results - if: always() - uses: actions/upload-artifact@v4 - with: - name: accuracy-results-${{ github.run_id }} - path: | - /tmp/asap_accuracy_results.csv - /tmp/baseline_accuracy_results.csv - if-no-files-found: warn diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/eval-pr.yml new file mode 100644 index 0000000..a60cc31 --- /dev/null +++ b/.github/workflows/eval-pr.yml @@ -0,0 +1,98 @@ +name: PR Evaluation + +# NOTE: GitHub-hosted runners are noisy. Latency numbers are indicative only. +# For precise benchmarks, register a self-hosted runner once asap-tools infra +# is decoupled from Cloudlab. See PDF eval guide Phase 3. + +on: + pull_request: + branches: + - main + paths: + - 'asap-query-engine/**' + - 'asap-planner-rs/**' + - 'asap-summary-ingest/**' + - 'asap-quickstart/**' + - '.github/workflows/eval-pr.yml' + - 'benchmarks/**' + workflow_dispatch: + +permissions: + contents: read + pull-requests: write + +jobs: + eval: + name: Full-stack PR evaluation + runs-on: ubuntu-latest + timeout-minutes: 60 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build and start full stack + env: + COMPOSE_DOCKER_CLI_BUILD: "1" + DOCKER_BUILDKIT: "1" + run: | + docker compose \ + -f asap-quickstart/docker-compose.yml \ + -f benchmarks/docker-compose.yml \ + up -d --build 2>&1 + + - name: Show running containers + run: | + docker compose \ + -f asap-quickstart/docker-compose.yml \ + -f benchmarks/docker-compose.yml \ + ps + + - name: Wait for all services to be healthy + run: bash benchmarks/scripts/wait_for_stack.sh + + - name: Wait for pipeline and data ingestion + run: bash benchmarks/scripts/ingest_wait.sh + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: pip install requests + + - name: Run baseline queries (Prometheus) + run: python benchmarks/scripts/run_baseline.py + + - name: Run ASAP queries (query engine) + run: python benchmarks/scripts/run_asap.py + + - name: Compare results and evaluate + run: python benchmarks/scripts/compare.py + + - name: Upload evaluation reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-reports-${{ github.run_id }} + path: benchmarks/reports/ + + - name: Print docker logs on failure + if: failure() + run: | + docker compose \ + -f asap-quickstart/docker-compose.yml \ + -f benchmarks/docker-compose.yml \ + logs --no-color + + - name: Teardown stack + if: always() + run: | + docker compose \ + -f asap-quickstart/docker-compose.yml \ + -f benchmarks/docker-compose.yml \ + down -v diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml deleted file mode 100644 index 674be29..0000000 --- a/.github/workflows/performance.yml +++ /dev/null @@ -1,296 +0,0 @@ -name: Performance Benchmarks - -# IMPORTANT – Runner choice and benchmark precision -# ───────────────────────────────────────────────── -# Ephemeral GitHub-hosted runners (ubuntu-latest) share noisy cloud hardware. -# They are useful for *relative* regression detection (e.g. "this PR is 2× -# slower than main") but NOT for absolute latency numbers. -# -# For precise, publication-quality benchmarks the team uses the dedicated -# asap-tools benchmarking infra (used for TurboProm paper experiments). -# That infra must be decoupled from Cloudlab and registered as a -# GitHub self-hosted runner. Until then, set `runner: self-hosted` in the -# workflow_dispatch inputs below to target a self-hosted machine when one -# is available, or rely on the relative-regression job for PR gating. -# -# References: -# - asap-tools/execution-utilities/asap_benchmark_pipeline/ — H2O groupby -# - asap-tools/execution-utilities/asap_query_latency/ — ClickBench hits - -on: - pull_request: - branches: [ main ] - paths: - - 'asap-summary-ingest/**' - - 'asap-query-engine/**' - - 'asap-common/sketch-core/**' - - 'asap-common/dependencies/**' - - 'asap-tools/execution-utilities/asap_benchmark_pipeline/**' - - 'asap-tools/execution-utilities/asap_query_latency/**' - - '.github/workflows/performance.yml' - workflow_dispatch: - inputs: - runner: - description: > - Runner label to use. Use 'ubuntu-latest' for relative regression - detection on GH-hosted VMs, or a self-hosted runner label (e.g. - 'self-hosted') for precise absolute benchmarks. - required: false - default: ubuntu-latest - max_rows: - description: Rows to ingest (0 = full dataset; reduce for faster CI runs) - required: false - default: '100000' - benchmark_suite: - description: 'Which suite to run: h2o | query_latency | all' - required: false - default: all - -env: - # Defaults used on PR triggers (keep runtime < 30 min on GH-hosted runners) - DEFAULT_MAX_ROWS: '100000' - # Latency regression threshold: flag if p95 latency increases by more than - # this factor relative to the baseline run within the same CI job. - LATENCY_REGRESSION_FACTOR: '2.0' - -jobs: - # ── 1. Relative performance regression (always runs on GH-hosted VMs) ────── - relative-regression: - name: Relative performance regression (H2O groupby) - # Use the workflow_dispatch runner input when triggered manually; - # fall back to ubuntu-latest for PR triggers. - runs-on: ${{ github.event_name == 'workflow_dispatch' && inputs.runner || 'ubuntu-latest' }} - timeout-minutes: 45 - - steps: - - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install requests kafka-python gdown matplotlib - if [ -f asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt ]; then - pip install -r asap-tools/execution-utilities/asap_benchmark_pipeline/requirements.txt - fi - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Build base image - run: | - docker build \ - -t sketchdb-base:latest \ - -f asap-common/installation/Dockerfile \ - asap-common - - - name: Build summary-ingest image - run: | - docker build \ - -t asap-summary-ingest:ci \ - -f asap-summary-ingest/Dockerfile \ - asap-summary-ingest - - - name: Install Rust - uses: dtolnay/rust-toolchain@stable - - - name: Install protoc - run: | - sudo apt-get update -qq - sudo apt-get install -y protobuf-compiler - - - name: Run sccache - uses: mozilla-actions/sccache-action@v0.0.4 - - - name: Cache cargo - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} - - - name: Build query engine binary - run: cargo build --release --bin query_engine_rust --locked - env: - RUSTC_WRAPPER: sccache - - - name: Resolve max_rows - id: config - run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - echo "max_rows=${{ inputs.max_rows }}" >> "$GITHUB_OUTPUT" - else - echo "max_rows=${{ env.DEFAULT_MAX_ROWS }}" >> "$GITHUB_OUTPUT" - fi - - # ── Baseline (ClickHouse exact) ── - - name: Run ClickHouse baseline benchmark - working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline - run: | - python run_benchmark.py \ - --mode baseline \ - --load-data \ - --max-rows ${{ steps.config.outputs.max_rows }} \ - --output /tmp/baseline_perf_results.csv - - # ── ASAP path ── - - name: Run ASAP benchmark - working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline - run: | - python run_benchmark.py \ - --mode asap \ - --load-data \ - --max-rows ${{ steps.config.outputs.max_rows }} \ - --output /tmp/asap_perf_results.csv \ - --qe-bin ${{ github.workspace }}/target/release/query_engine_rust - - # ── Regression check ── - - name: Check for latency regressions - run: | - python3 - <<'EOF' - import csv, os, sys - - factor = float(os.environ["LATENCY_REGRESSION_FACTOR"]) - asap_file = "/tmp/asap_perf_results.csv" - base_file = "/tmp/baseline_perf_results.csv" - - def load_latency(path): - with open(path) as f: - return {row["query_id"]: float(row["latency_ms"]) - for row in csv.DictReader(f) - if row.get("latency_ms") not in (None, "", "null")} - - try: - asap = load_latency(asap_file) - base = load_latency(base_file) - except FileNotFoundError as e: - print(f"Result file missing: {e}. Skipping regression check.") - sys.exit(0) - - regressions = [] - print(f"{'Query':<30} {'ASAP (ms)':>12} {'Baseline (ms)':>14} {'Ratio':>8} {'Status'}") - print("-" * 72) - for qid, base_lat in base.items(): - if qid not in asap: - continue - ratio = asap[qid] / base_lat if base_lat > 0 else float("inf") - status = "REGRESSION" if ratio > factor else "ok" - print(f"{qid:<30} {asap[qid]:>12.1f} {base_lat:>14.1f} {ratio:>8.2f} {status}") - if status == "REGRESSION": - regressions.append((qid, ratio)) - - if regressions: - print(f"\n{len(regressions)} regression(s) detected (threshold: {factor}x):") - for qid, r in regressions: - print(f" - {qid}: {r:.2f}x slower than baseline") - print("\nNOTE: This job runs on ephemeral GH-hosted VMs and is subject to") - print(" cloud noise. For authoritative numbers use a self-hosted runner.") - sys.exit(1) - else: - print(f"\nNo regressions detected (threshold: {factor}x).") - EOF - env: - LATENCY_REGRESSION_FACTOR: ${{ env.LATENCY_REGRESSION_FACTOR }} - - - name: Generate latency comparison plot - if: always() - working-directory: asap-tools/execution-utilities/asap_benchmark_pipeline - run: | - python plot_latency.py \ - --asap /tmp/asap_perf_results.csv \ - --baseline /tmp/baseline_perf_results.csv \ - --output /tmp/latency_comparison.png 2>/dev/null || \ - python plot_latency.py 2>/dev/null || true - - - name: Upload benchmark results - if: always() - uses: actions/upload-artifact@v4 - with: - name: performance-results-${{ github.run_id }} - path: | - /tmp/asap_perf_results.csv - /tmp/baseline_perf_results.csv - /tmp/latency_comparison.png - if-no-files-found: warn - - # ── 2. Query latency micro-benchmark (manual / self-hosted) ───────────────── - query-latency: - name: Query latency micro-benchmark - # Skip on PRs — run manually or on a scheduled trigger once self-hosted - # runners are available. On GH-hosted VMs the numbers are too noisy to be - # actionable for absolute latency SLOs. - if: > - github.event_name == 'workflow_dispatch' && - (inputs.benchmark_suite == 'query_latency' || inputs.benchmark_suite == 'all') - runs-on: ${{ inputs.runner || 'ubuntu-latest' }} - timeout-minutes: 60 - - steps: - - uses: actions/checkout@v4 - - - name: Self-hosted runner notice - if: ${{ inputs.runner == 'ubuntu-latest' || inputs.runner == '' }} - run: | - echo "::warning::Running query_latency benchmark on a GH-hosted VM." - echo "::warning::Results are indicative only. Register the asap-tools" - echo "::warning::benchmarking host as a self-hosted runner for" - echo "::warning::publication-quality absolute latency measurements." - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install requests - if [ -f asap-tools/execution-utilities/asap_query_latency/requirements.txt ]; then - pip install -r asap-tools/execution-utilities/asap_query_latency/requirements.txt - fi - - - name: Install Rust - uses: dtolnay/rust-toolchain@stable - - - name: Install protoc - run: | - sudo apt-get update -qq - sudo apt-get install -y protobuf-compiler - - - name: Run sccache - uses: mozilla-actions/sccache-action@v0.0.4 - - - name: Cache cargo - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - ~/.cargo/git - target - key: ${{ runner.os }}-cargo-perf-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} - - - name: Build query engine binary - run: cargo build --release --bin query_engine_rust --locked - env: - RUSTC_WRAPPER: sccache - - - name: Run query latency benchmark - working-directory: asap-tools/execution-utilities/asap_query_latency - run: | - python run_benchmark.py \ - --output /tmp/query_latency_results.csv \ - --qe-bin ${{ github.workspace }}/target/release/query_engine_rust - - - name: Upload query latency results - if: always() - uses: actions/upload-artifact@v4 - with: - name: query-latency-results-${{ github.run_id }} - path: /tmp/query_latency_results.csv - if-no-files-found: warn diff --git a/benchmarks/docker-compose.yml b/benchmarks/docker-compose.yml new file mode 100644 index 0000000..dd3084f --- /dev/null +++ b/benchmarks/docker-compose.yml @@ -0,0 +1,26 @@ +# CI override: replaces pulled images with locally-built versions for the +# three ASAP-owned services. Intended for use as a Compose override: +# +# docker compose \ +# -f asap-quickstart/docker-compose.yml \ +# -f benchmarks/docker-compose.yml \ +# up -d --build + +services: + asap-planner-rs: + image: asap-planner-rs:ci + build: + context: . + dockerfile: asap-planner-rs/Dockerfile + + asap-summary-ingest: + image: asap-summary-ingest:ci + build: + context: . + dockerfile: asap-summary-ingest/Dockerfile + + queryengine: + image: asap-query-engine:ci + build: + context: . + dockerfile: asap-query-engine/Dockerfile diff --git a/benchmarks/golden/.gitkeep b/benchmarks/golden/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/queries/promql_suite.json b/benchmarks/queries/promql_suite.json new file mode 100644 index 0000000..52b525e --- /dev/null +++ b/benchmarks/queries/promql_suite.json @@ -0,0 +1,18 @@ +{ + "queries": [ + {"id": "avg_all", "expr": "avg(sensor_reading)", "asap_native": false}, + {"id": "sum_all", "expr": "sum(sensor_reading)", "asap_native": false}, + {"id": "max_all", "expr": "max(sensor_reading)", "asap_native": false}, + {"id": "min_all", "expr": "min(sensor_reading)", "asap_native": false}, + {"id": "q50_all", "expr": "quantile(0.50, sensor_reading)", "asap_native": true}, + {"id": "q90_all", "expr": "quantile(0.90, sensor_reading)", "asap_native": true}, + {"id": "q95_all", "expr": "quantile(0.95, sensor_reading)", "asap_native": true}, + {"id": "q99_all", "expr": "quantile(0.99, sensor_reading)", "asap_native": true}, + {"id": "q95_by_pattern", "expr": "quantile by (pattern) (0.95, sensor_reading)", "asap_native": true}, + {"id": "q99_by_pattern", "expr": "quantile by (pattern) (0.99, sensor_reading)", "asap_native": true}, + {"id": "q50_by_pattern", "expr": "quantile by (pattern) (0.50, sensor_reading)", "asap_native": true}, + {"id": "avg_by_pattern", "expr": "avg by (pattern) (sensor_reading)", "asap_native": false}, + {"id": "sum_by_region", "expr": "sum by (region) (sensor_reading)", "asap_native": false}, + {"id": "max_by_service", "expr": "max by (service) (sensor_reading)", "asap_native": false} + ] +} diff --git a/benchmarks/reports/.gitkeep b/benchmarks/reports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/scripts/compare.py b/benchmarks/scripts/compare.py new file mode 100644 index 0000000..eadcef2 --- /dev/null +++ b/benchmarks/scripts/compare.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +"""compare.py — compares baseline (Prometheus) vs ASAP query engine results. + +Applies pass/fail policy: + - FAIL if any query returned an error in ASAP (query failure) + - FAIL if any ASAP-native query has relative error > max_error (default 1%) + - WARN (not FAIL) on >10% p95 latency regression (GH runner noise) + +Generates benchmarks/reports/eval_report.md and prints it to stdout. +Exits 1 if any PASS/FAIL check failed. + +Usage: + python benchmarks/scripts/compare.py \ + [--baseline FILE] \ + [--asap FILE] \ + [--output FILE] \ + [--max-error FLOAT] +""" + +import argparse +import json +import os +import sys +from datetime import datetime, timezone + +DEFAULT_BASELINE = os.path.join( + os.path.dirname(__file__), "..", "reports", "baseline_results.json" +) +DEFAULT_ASAP = os.path.join( + os.path.dirname(__file__), "..", "reports", "asap_results.json" +) +DEFAULT_OUTPUT = os.path.join( + os.path.dirname(__file__), "..", "reports", "eval_report.md" +) +DEFAULT_MAX_ERROR = 0.01 # 1 % +LATENCY_REGRESSION_THRESHOLD = 0.10 # 10 % — warn only + + +# ── helpers ────────────────────────────────────────────────────────────────── + +def percentile(values: list[float], pct: float) -> float: + """Compute percentile (0-100) from a list of floats using linear interpolation.""" + if not values: + return float("nan") + sorted_vals = sorted(values) + n = len(sorted_vals) + if n == 1: + return sorted_vals[0] + idx = (pct / 100.0) * (n - 1) + lo = int(idx) + hi = lo + 1 + if hi >= n: + return sorted_vals[-1] + frac = idx - lo + return sorted_vals[lo] * (1 - frac) + sorted_vals[hi] * frac + + +def valid_latencies(latencies: list) -> list[float]: + """Strip None entries (failed HTTP requests) from a latency list.""" + return [x for x in latencies if x is not None] + + +def label_key(metric: dict) -> str: + """Canonical sort key for a Prometheus metric label set.""" + return json.dumps(metric, sort_keys=True) + + +def extract_scalar_value(result: list) -> float | None: + """Return a single float from a scalar/single-entry vector result.""" + if not result: + return None + entry = result[0] + try: + return float(entry["value"][1]) + except (KeyError, IndexError, ValueError, TypeError): + return None + + +def compute_relative_error(baseline_val: float, asap_val: float) -> float: + """Relative error: |asap - baseline| / max(|baseline|, 1e-9).""" + denom = max(abs(baseline_val), 1e-9) + return abs(asap_val - baseline_val) / denom + + +def compare_results( + baseline_result: list, asap_result: list +) -> tuple[float | None, str]: + """ + Compare two Prometheus vector result arrays. + + Returns (max_relative_error, note). + For grouped results, matches entries by label set. + """ + if not baseline_result and not asap_result: + return 0.0, "both empty" + if not baseline_result: + return None, "baseline empty" + if not asap_result: + return None, "asap empty" + + # Build label-key → float maps + baseline_map: dict[str, float] = {} + for entry in baseline_result: + key = label_key(entry.get("metric", {})) + try: + baseline_map[key] = float(entry["value"][1]) + except (KeyError, IndexError, ValueError, TypeError): + pass + + asap_map: dict[str, float] = {} + for entry in asap_result: + key = label_key(entry.get("metric", {})) + try: + asap_map[key] = float(entry["value"][1]) + except (KeyError, IndexError, ValueError, TypeError): + pass + + # Single-value result (no labels / collapsed) + if len(baseline_map) == 1 and len(asap_map) == 1: + bv = next(iter(baseline_map.values())) + av = next(iter(asap_map.values())) + return compute_relative_error(bv, av), "" + + # Multi-value: match by label set + max_err = 0.0 + missing = [] + for key, bv in baseline_map.items(): + if key not in asap_map: + missing.append(key) + continue + err = compute_relative_error(bv, asap_map[key]) + max_err = max(max_err, err) + + note = f"{len(missing)} label set(s) missing from ASAP" if missing else "" + return max_err, note + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + parser = argparse.ArgumentParser(description="Compare baseline vs ASAP results") + parser.add_argument("--baseline", default=DEFAULT_BASELINE, help="Baseline JSON file") + parser.add_argument("--asap", default=DEFAULT_ASAP, help="ASAP results JSON file") + parser.add_argument("--output", default=DEFAULT_OUTPUT, help="Output Markdown file") + parser.add_argument( + "--max-error", + type=float, + default=DEFAULT_MAX_ERROR, + help="Max relative error for ASAP-native queries (default: 0.01 = 1%%)", + ) + args = parser.parse_args() + + with open(args.baseline) as f: + baseline_data = json.load(f) + with open(args.asap) as f: + asap_data = json.load(f) + + baseline_results = baseline_data["results"] + asap_results = asap_data["results"] + + # Collect all query IDs (union) + all_ids = sorted(set(list(baseline_results.keys()) + list(asap_results.keys()))) + + failures: list[str] = [] + warnings: list[str] = [] + rows: list[dict] = [] + + for qid in all_ids: + b = baseline_results.get(qid, {}) + a = asap_results.get(qid, {}) + + asap_native = a.get("asap_native", False) + asap_status = a.get("status", "missing") + asap_error = a.get("error") + + # Latency p95 + b_lats = valid_latencies(b.get("latencies_ms", [])) + a_lats = valid_latencies(a.get("latencies_ms", [])) + b_p95 = percentile(b_lats, 95) if b_lats else float("nan") + a_p95 = percentile(a_lats, 95) if a_lats else float("nan") + + # Relative error + rel_error: float | None = None + note = "" + if asap_status == "success" and b.get("status") == "success": + rel_error, note = compare_results( + b.get("data", []), a.get("data", []) + ) + + # Pass/fail determination + row_status = "PASS" + + if asap_status == "error" or asap_status == "missing": + row_status = "FAIL" + reason = asap_error or "query not present in ASAP results" + failures.append(f"{qid}: ASAP query failed — {reason}") + + elif asap_native and rel_error is not None and rel_error > args.max_error: + row_status = "FAIL" + failures.append( + f"{qid}: relative error {rel_error:.4f} > threshold {args.max_error:.4f}" + ) + + # Latency regression — warn only + if ( + not (b_p95 != b_p95) # not NaN + and not (a_p95 != a_p95) + and b_p95 > 0 + ): + latency_regression = (a_p95 - b_p95) / b_p95 + if latency_regression > LATENCY_REGRESSION_THRESHOLD: + warnings.append( + f"{qid}: p95 latency regression {latency_regression:.1%} " + f"(baseline={b_p95:.1f}ms, asap={a_p95:.1f}ms) — " + "GH runner noise expected; informational only" + ) + + rows.append( + { + "id": qid, + "asap_native": asap_native, + "b_p95": b_p95, + "a_p95": a_p95, + "rel_error": rel_error, + "status": row_status, + "note": note, + } + ) + + overall = "PASS" if not failures else "FAIL" + + # ── Build Markdown report ──────────────────────────────────────────────── + now = datetime.now(timezone.utc).isoformat() + lines: list[str] = [] + lines.append("# ASAP PR Evaluation Report") + lines.append("") + lines.append(f"**Generated:** {now}") + lines.append( + f"**Overall verdict:** {'✅ PASS' if overall == 'PASS' else '❌ FAIL'}" + ) + lines.append(f"**Max error threshold:** {args.max_error:.2%}") + lines.append("") + + # Summary table + lines.append( + "| Query | ASAP Native | Baseline p95 (ms) | ASAP p95 (ms) | Rel Error | Status |" + ) + lines.append("|-------|:-----------:|:-----------------:|:-------------:|:---------:|:------:|") + for row in rows: + native_mark = "yes" if row["asap_native"] else "no" + b_p95_s = f"{row['b_p95']:.1f}" if row["b_p95"] == row["b_p95"] else "n/a" + a_p95_s = f"{row['a_p95']:.1f}" if row["a_p95"] == row["a_p95"] else "n/a" + if row["rel_error"] is None: + rel_err_s = "n/a" + else: + rel_err_s = f"{row['rel_error']:.4f}" + status_s = "✅ PASS" if row["status"] == "PASS" else "❌ FAIL" + note_s = f" ({row['note']})" if row["note"] else "" + lines.append( + f"| {row['id']}{note_s} | {native_mark} | {b_p95_s} | {a_p95_s} " + f"| {rel_err_s} | {status_s} |" + ) + + lines.append("") + + if failures: + lines.append("## Failures") + lines.append("") + for f_msg in failures: + lines.append(f"- {f_msg}") + lines.append("") + + if warnings: + lines.append("## Latency Warnings (informational)") + lines.append("") + lines.append( + "> **Note:** GitHub-hosted runners exhibit significant timing noise. " + "Latency numbers are indicative only. For precise benchmarks, register " + "a self-hosted runner once asap-tools infra is decoupled from Cloudlab " + "(see PDF eval guide Phase 3)." + ) + lines.append("") + for w in warnings: + lines.append(f"- {w}") + lines.append("") + + lines.append("---") + lines.append( + "_This report was generated automatically by `benchmarks/scripts/compare.py`._" + ) + + report = "\n".join(lines) + print(report) + + os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True) + with open(args.output, "w") as f: + f.write(report + "\n") + print(f"\n[compare] Report saved to {args.output}", file=sys.stderr) + + if overall == "FAIL": + print( + f"\n[compare] Evaluation FAILED. {len(failures)} check(s) failed:", + file=sys.stderr, + ) + for msg in failures: + print(f" - {msg}", file=sys.stderr) + sys.exit(1) + else: + print("\n[compare] Evaluation PASSED.", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/scripts/ingest_wait.sh b/benchmarks/scripts/ingest_wait.sh new file mode 100644 index 0000000..6e9b38d --- /dev/null +++ b/benchmarks/scripts/ingest_wait.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ingest_wait.sh — waits for the asap-demo Arroyo pipeline to reach RUNNING +# state, then sleeps to allow sketches to accumulate before verifying that the +# query engine has ingested data. + +ARROYO_URL="http://localhost:5115/api/v1/pipelines" +QE_URL="http://localhost:8088/api/v1/query" +PIPELINE_NAME="asap-demo" +MAX_PIPELINE_WAIT=300 # seconds +ACCUMULATE_SLEEP=90 # seconds after pipeline is running +SLEEP=5 + +# ── 1. Wait for asap-demo pipeline to reach RUNNING ───────────────────────── +echo "[ingest_wait] Waiting for Arroyo pipeline '${PIPELINE_NAME}' to reach RUNNING state ..." +elapsed=0 +while true; do + state=$(curl -sf --max-time 10 "${ARROYO_URL}" 2>/dev/null \ + | python3 -c " +import sys, json +data = json.load(sys.stdin) +pipelines = data if isinstance(data, list) else data.get('data', []) +for p in pipelines: + name = p.get('name', '') or p.get('id', '') + if '${PIPELINE_NAME}' in str(name): + print(p.get('state', p.get('status', ''))) + break +" 2>/dev/null || true) + + if [ "${state}" = "Running" ] || [ "${state}" = "RUNNING" ]; then + echo "[ingest_wait] Pipeline '${PIPELINE_NAME}' is RUNNING (${elapsed}s elapsed)" + break + fi + + if [ "${elapsed}" -ge "${MAX_PIPELINE_WAIT}" ]; then + echo "[ingest_wait] ERROR: Pipeline '${PIPELINE_NAME}' did not reach RUNNING within ${MAX_PIPELINE_WAIT}s (last state: '${state}')" >&2 + exit 1 + fi + + echo "[ingest_wait] Pipeline state: '${state:-unknown}' — retrying in ${SLEEP}s (${elapsed}s elapsed) ..." + sleep "${SLEEP}" + elapsed=$(( elapsed + SLEEP )) +done + +# ── 2. Allow sketches to accumulate ───────────────────────────────────────── +echo "[ingest_wait] Pipeline running. Sleeping ${ACCUMULATE_SLEEP}s for sketches to accumulate ..." +sleep "${ACCUMULATE_SLEEP}" + +# ── 3. Verify query engine has data ───────────────────────────────────────── +echo "[ingest_wait] Verifying query engine has data ..." +response=$(curl -sf --max-time 10 \ + "${QE_URL}?query=avg%28sensor_reading%29" 2>/dev/null || true) + +if [ -z "${response}" ]; then + echo "[ingest_wait] ERROR: Query engine returned empty response." >&2 + exit 1 +fi + +result_count=$(echo "${response}" | python3 -c " +import sys, json +data = json.load(sys.stdin) +result = data.get('data', {}).get('result', []) +print(len(result)) +" 2>/dev/null || echo "0") + +if [ "${result_count}" -eq 0 ]; then + echo "[ingest_wait] ERROR: Query engine has no data yet (result array is empty)." >&2 + exit 1 +fi + +echo "[ingest_wait] Query engine has data (${result_count} result entries). Ready for benchmarking." diff --git a/benchmarks/scripts/run_asap.py b/benchmarks/scripts/run_asap.py new file mode 100644 index 0000000..10747cb --- /dev/null +++ b/benchmarks/scripts/run_asap.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +"""run_asap.py — queries ASAPQuery engine for each entry in promql_suite.json. + +Runs each query 3 times to gather latency samples, then writes results to +benchmarks/reports/asap_results.json. + +Usage: + python benchmarks/scripts/run_asap.py \ + [--asap-url URL] \ + [--output FILE] +""" + +import argparse +import json +import os +import time +import urllib.parse +from datetime import datetime, timezone + +import requests + +SUITE_PATH = os.path.join( + os.path.dirname(__file__), "..", "queries", "promql_suite.json" +) +DEFAULT_OUTPUT = os.path.join( + os.path.dirname(__file__), "..", "reports", "asap_results.json" +) +RUNS_PER_QUERY = 3 + + +def query_asap(base_url: str, expr: str, ts: float) -> tuple[dict, float]: + """Issue a single instant query to the ASAP query engine, return (parsed_json, latency_ms).""" + encoded = urllib.parse.quote(expr, safe="") + url = f"{base_url}/api/v1/query?query={encoded}&time={ts}" + t0 = time.monotonic() + resp = requests.get(url, timeout=30) + latency_ms = (time.monotonic() - t0) * 1000.0 + resp.raise_for_status() + return resp.json(), latency_ms + + +def main() -> None: + parser = argparse.ArgumentParser(description="Run ASAP query engine benchmark queries") + parser.add_argument( + "--asap-url", + default="http://localhost:8088", + help="ASAP query engine base URL (default: http://localhost:8088)", + ) + parser.add_argument( + "--output", + default=DEFAULT_OUTPUT, + help="Output JSON file path", + ) + args = parser.parse_args() + + with open(SUITE_PATH) as f: + suite = json.load(f) + + results: dict[str, dict] = {} + now = time.time() + + for q in suite["queries"]: + qid = q["id"] + expr = q["expr"] + asap_native = q.get("asap_native", False) + latencies = [] + last_data = [] + last_error = None + last_status = "success" + + print(f"[asap] Running query '{qid}' (native={asap_native}): {expr}") + for run in range(1, RUNS_PER_QUERY + 1): + try: + payload, lat = query_asap(args.asap_url, expr, now) + latencies.append(lat) + if payload.get("status") == "success": + last_data = payload.get("data", {}).get("result", []) + last_status = "success" + last_error = None + else: + last_status = "error" + last_error = payload.get("error", "unknown error") + last_data = [] + print(f" run {run}/{RUNS_PER_QUERY}: {lat:.1f} ms status={last_status}") + except Exception as exc: # noqa: BLE001 + last_status = "error" + last_error = str(exc) + last_data = [] + latencies.append(None) + print(f" run {run}/{RUNS_PER_QUERY}: ERROR — {exc}") + + results[qid] = { + "status": last_status, + "asap_native": asap_native, + "latencies_ms": latencies, + "data": last_data, + "error": last_error, + } + + output = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "asap_url": args.asap_url, + "results": results, + } + + os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True) + with open(args.output, "w") as f: + json.dump(output, f, indent=2) + print(f"\n[asap] Results saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/scripts/run_baseline.py b/benchmarks/scripts/run_baseline.py new file mode 100644 index 0000000..6e09e41 --- /dev/null +++ b/benchmarks/scripts/run_baseline.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +"""run_baseline.py — queries Prometheus for each entry in promql_suite.json. + +Runs each query 3 times to gather latency samples, then writes results to +benchmarks/reports/baseline_results.json. + +Usage: + python benchmarks/scripts/run_baseline.py \ + [--prometheus-url URL] \ + [--output FILE] +""" + +import argparse +import json +import os +import time +import urllib.parse +from datetime import datetime, timezone + +import requests + +SUITE_PATH = os.path.join( + os.path.dirname(__file__), "..", "queries", "promql_suite.json" +) +DEFAULT_OUTPUT = os.path.join( + os.path.dirname(__file__), "..", "reports", "baseline_results.json" +) +RUNS_PER_QUERY = 3 + + +def query_prometheus(base_url: str, expr: str, ts: float) -> tuple[dict, float]: + """Issue a single instant query, return (parsed_json, latency_ms).""" + encoded = urllib.parse.quote(expr, safe="") + url = f"{base_url}/api/v1/query?query={encoded}&time={ts}" + t0 = time.monotonic() + resp = requests.get(url, timeout=30) + latency_ms = (time.monotonic() - t0) * 1000.0 + resp.raise_for_status() + return resp.json(), latency_ms + + +def main() -> None: + parser = argparse.ArgumentParser(description="Run baseline Prometheus queries") + parser.add_argument( + "--prometheus-url", + default="http://localhost:9090", + help="Prometheus base URL (default: http://localhost:9090)", + ) + parser.add_argument( + "--output", + default=DEFAULT_OUTPUT, + help="Output JSON file path", + ) + args = parser.parse_args() + + with open(SUITE_PATH) as f: + suite = json.load(f) + + results: dict[str, dict] = {} + now = time.time() + + for q in suite["queries"]: + qid = q["id"] + expr = q["expr"] + latencies = [] + last_data = [] + last_error = None + last_status = "success" + + print(f"[baseline] Running query '{qid}': {expr}") + for run in range(1, RUNS_PER_QUERY + 1): + try: + payload, lat = query_prometheus(args.prometheus_url, expr, now) + latencies.append(lat) + if payload.get("status") == "success": + last_data = payload.get("data", {}).get("result", []) + last_status = "success" + last_error = None + else: + last_status = "error" + last_error = payload.get("error", "unknown error") + last_data = [] + print(f" run {run}/{RUNS_PER_QUERY}: {lat:.1f} ms status={last_status}") + except Exception as exc: # noqa: BLE001 + last_status = "error" + last_error = str(exc) + last_data = [] + latencies.append(None) + print(f" run {run}/{RUNS_PER_QUERY}: ERROR — {exc}") + + results[qid] = { + "status": last_status, + "latencies_ms": latencies, + "data": last_data, + "error": last_error, + } + + output = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "prometheus_url": args.prometheus_url, + "results": results, + } + + os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True) + with open(args.output, "w") as f: + json.dump(output, f, indent=2) + print(f"\n[baseline] Results saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/scripts/wait_for_stack.sh b/benchmarks/scripts/wait_for_stack.sh new file mode 100644 index 0000000..672aa6d --- /dev/null +++ b/benchmarks/scripts/wait_for_stack.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -euo pipefail + +# wait_for_stack.sh — polls each service until healthy or times out. +# Used in CI after `docker compose up -d --build`. + +MAX_WAIT=180 # seconds per service +SLEEP=5 + +wait_for_url() { + local name="$1" + local url="$2" + local elapsed=0 + + echo "[wait_for_stack] Waiting for ${name} at ${url} ..." + while true; do + if curl -sf --max-time 5 "${url}" > /dev/null 2>&1; then + echo "[wait_for_stack] ${name} is healthy (${elapsed}s elapsed)" + return 0 + fi + if [ "${elapsed}" -ge "${MAX_WAIT}" ]; then + echo "[wait_for_stack] ERROR: ${name} did not become healthy within ${MAX_WAIT}s" >&2 + return 1 + fi + sleep "${SLEEP}" + elapsed=$(( elapsed + SLEEP )) + done +} + +wait_for_url "Prometheus" "http://localhost:9090/-/healthy" +wait_for_url "Arroyo API" "http://localhost:5115/api/v1/pipelines" +wait_for_url "QueryEngine" "http://localhost:8088/api/v1/query?query=vector(1)" + +echo "[wait_for_stack] All services are healthy." From 8ecdf35deb4eb88f5737a6566e584fc9c4c9fdde Mon Sep 17 00:00:00 2001 From: zz_y Date: Thu, 26 Mar 2026 18:43:22 -0500 Subject: [PATCH 4/9] Raise ASAP-native accuracy threshold from 1% to 5% Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/scripts/compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/scripts/compare.py b/benchmarks/scripts/compare.py index eadcef2..48274ef 100644 --- a/benchmarks/scripts/compare.py +++ b/benchmarks/scripts/compare.py @@ -32,7 +32,7 @@ DEFAULT_OUTPUT = os.path.join( os.path.dirname(__file__), "..", "reports", "eval_report.md" ) -DEFAULT_MAX_ERROR = 0.01 # 1 % +DEFAULT_MAX_ERROR = 0.05 # 5 % LATENCY_REGRESSION_THRESHOLD = 0.10 # 10 % — warn only From 78297c8011ab4aca35c32828559c16d4f1234fb3 Mon Sep 17 00:00:00 2001 From: zz_y Date: Thu, 26 Mar 2026 18:43:47 -0500 Subject: [PATCH 5/9] Make ASAP-native accuracy threshold warning-only (>5% warns, does not fail) Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/scripts/compare.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/scripts/compare.py b/benchmarks/scripts/compare.py index 48274ef..2f901d9 100644 --- a/benchmarks/scripts/compare.py +++ b/benchmarks/scripts/compare.py @@ -3,7 +3,7 @@ Applies pass/fail policy: - FAIL if any query returned an error in ASAP (query failure) - - FAIL if any ASAP-native query has relative error > max_error (default 1%) + - WARN if any ASAP-native query has relative error > max_error (default 5%) - WARN (not FAIL) on >10% p95 latency regression (GH runner noise) Generates benchmarks/reports/eval_report.md and prints it to stdout. @@ -196,9 +196,9 @@ def main() -> None: failures.append(f"{qid}: ASAP query failed — {reason}") elif asap_native and rel_error is not None and rel_error > args.max_error: - row_status = "FAIL" - failures.append( - f"{qid}: relative error {rel_error:.4f} > threshold {args.max_error:.4f}" + row_status = "WARN" + warnings.append( + f"{qid}: relative error {rel_error:.4f} > threshold {args.max_error:.4f} — informational only" ) # Latency regression — warn only From 70d2f06098bcf500f29583f3f76a878e44241abc Mon Sep 17 00:00:00 2001 From: zz_y Date: Thu, 26 Mar 2026 18:58:37 -0500 Subject: [PATCH 6/9] Rename eval-pr.yml to accuracy_performance.yml Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/{eval-pr.yml => accuracy_performance.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{eval-pr.yml => accuracy_performance.yml} (100%) diff --git a/.github/workflows/eval-pr.yml b/.github/workflows/accuracy_performance.yml similarity index 100% rename from .github/workflows/eval-pr.yml rename to .github/workflows/accuracy_performance.yml From e828ccaeac434b85de6bc6d220d87b32d8e455f9 Mon Sep 17 00:00:00 2001 From: zz_y Date: Thu, 26 Mar 2026 19:12:55 -0500 Subject: [PATCH 7/9] Restructure CI to build images once and reuse across eval - Split accuracy_performance.yml into build + eval jobs: images are built and pushed to GHCR with a sha- tag in the build job, then pulled (not rebuilt) by the eval job via ASAP_IMAGE_TAG env var - Remove build: sections from benchmarks/docker-compose.yml; it now only overrides image tags using ${ASAP_IMAGE_TAG} - Change asap-summary-ingest/Dockerfile FROM to ghcr.io/projectasap/asap-base:latest so it can be built in any job without requiring a local sketchdb-base image - Add --project-directory . to all docker compose calls to fix build context path resolution (asap-quickstart/asap-summary-ingest lstat error) - Fix stale eval-pr.yml path reference in accuracy_performance.yml triggers - Fix PromQLPattern::new call sites in three test files: remove stale second argument after collect_tokens was removed from the function signature Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/accuracy_performance.yml | 84 +++++++++++++++++-- .../rust_tests/src/pattern_tests.rs | 19 ----- .../tests/compare_patterns/src/main.rs | 6 -- .../tests/rust_pattern_matching/src/main.rs | 12 --- asap-summary-ingest/Dockerfile | 2 +- benchmarks/docker-compose.yml | 26 +++--- 6 files changed, 87 insertions(+), 62 deletions(-) diff --git a/.github/workflows/accuracy_performance.yml b/.github/workflows/accuracy_performance.yml index a60cc31..a2dace5 100644 --- a/.github/workflows/accuracy_performance.yml +++ b/.github/workflows/accuracy_performance.yml @@ -13,40 +13,106 @@ on: - 'asap-planner-rs/**' - 'asap-summary-ingest/**' - 'asap-quickstart/**' - - '.github/workflows/eval-pr.yml' + - '.github/workflows/accuracy_performance.yml' - 'benchmarks/**' workflow_dispatch: permissions: contents: read + packages: write pull-requests: write jobs: + # --------------------------------------------------------------------------- + # Job 1: build images once from branch code and push with a SHA-based tag. + # All downstream jobs pull these images instead of rebuilding. + # --------------------------------------------------------------------------- + build: + name: Build CI images + runs-on: ubuntu-latest + outputs: + image-tag: ${{ steps.tag.outputs.value }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Compute image tag + id: tag + run: echo "value=sha-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + + - name: Build and push asap-planner-rs + uses: docker/build-push-action@v6 + with: + context: . + file: asap-planner-rs/Dockerfile + push: true + tags: ghcr.io/projectasap/asap-planner-rs:${{ steps.tag.outputs.value }} + cache-from: type=registry,ref=ghcr.io/projectasap/asap-planner-rs:buildcache + cache-to: type=registry,ref=ghcr.io/projectasap/asap-planner-rs:buildcache,mode=max + + - name: Build and push asap-summary-ingest + uses: docker/build-push-action@v6 + with: + context: asap-summary-ingest + file: asap-summary-ingest/Dockerfile + push: true + tags: ghcr.io/projectasap/asap-summary-ingest:${{ steps.tag.outputs.value }} + + - name: Build and push asap-query-engine + uses: docker/build-push-action@v6 + with: + context: . + file: asap-query-engine/Dockerfile + push: true + tags: ghcr.io/projectasap/asap-query-engine:${{ steps.tag.outputs.value }} + cache-from: type=registry,ref=ghcr.io/projectasap/asap-query-engine:buildcache + cache-to: type=registry,ref=ghcr.io/projectasap/asap-query-engine:buildcache,mode=max + + # --------------------------------------------------------------------------- + # Job 2: pull the images built above, deploy the full stack, and evaluate. + # --------------------------------------------------------------------------- eval: name: Full-stack PR evaluation + needs: build runs-on: ubuntu-latest timeout-minutes: 60 + env: + ASAP_IMAGE_TAG: ${{ needs.build.outputs.image-tag }} steps: - name: Checkout repository uses: actions/checkout@v4 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} - - name: Build and start full stack - env: - COMPOSE_DOCKER_CLI_BUILD: "1" - DOCKER_BUILDKIT: "1" + - name: Pull and start full stack run: | docker compose \ + --project-directory . \ -f asap-quickstart/docker-compose.yml \ -f benchmarks/docker-compose.yml \ - up -d --build 2>&1 + up -d - name: Show running containers run: | docker compose \ + --project-directory . \ -f asap-quickstart/docker-compose.yml \ -f benchmarks/docker-compose.yml \ ps @@ -85,6 +151,7 @@ jobs: if: failure() run: | docker compose \ + --project-directory . \ -f asap-quickstart/docker-compose.yml \ -f benchmarks/docker-compose.yml \ logs --no-color @@ -93,6 +160,7 @@ jobs: if: always() run: | docker compose \ + --project-directory . \ -f asap-quickstart/docker-compose.yml \ -f benchmarks/docker-compose.yml \ down -v diff --git a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs index fcc210f..c9c1032 100644 --- a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs +++ b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs @@ -19,22 +19,11 @@ impl PatternTester { // Rate pattern PromQLPattern::new( Self::build_rate_pattern(), - vec![ - "metric".to_string(), - "function".to_string(), - "range_vector".to_string(), - ], // Some("ONLY_TEMPORAL".to_string()), ), // Quantile over time pattern PromQLPattern::new( Self::build_quantile_over_time_pattern(), - vec![ - "metric".to_string(), - "function".to_string(), - "range_vector".to_string(), - "function_args".to_string(), - ], // Some("ONLY_TEMPORAL".to_string()), ), ]; @@ -44,13 +33,11 @@ impl PatternTester { // Sum aggregation pattern PromQLPattern::new( Self::build_sum_pattern(), - vec!["metric".to_string(), "aggregation".to_string()], // Some("ONLY_SPATIAL".to_string()), ), // Simple metric pattern PromQLPattern::new( Self::build_metric_pattern(), - vec!["metric".to_string()], // Some("ONLY_SPATIAL".to_string()), ), ]; @@ -60,12 +47,6 @@ impl PatternTester { // Sum of rate pattern PromQLPattern::new( Self::build_one_temporal_one_spatial_pattern(), - vec![ - "metric".to_string(), - "function".to_string(), - "aggregation".to_string(), - "range_vector".to_string(), - ], // Some("ONE_TEMPORAL_ONE_SPATIAL".to_string()), ), ]; diff --git a/asap-common/tests/compare_patterns/src/main.rs b/asap-common/tests/compare_patterns/src/main.rs index 6c429f1..d75fa8b 100644 --- a/asap-common/tests/compare_patterns/src/main.rs +++ b/asap-common/tests/compare_patterns/src/main.rs @@ -25,7 +25,6 @@ fn main() { let pattern_1 = PromQLPatternBuilder::function(vec!["rate", "increase"], func_args1, Some("function"), None); let pattern1 = PromQLPattern::new( pattern_1, - vec!["metric".to_string(), "function".to_string(), "range_vector".to_string()], ); if let Some(ast) = pattern1.ast_pattern { only_temporal_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -44,7 +43,6 @@ fn main() { let pattern_2 = PromQLPatternBuilder::function(vec!["quantile_over_time"], func_args2, Some("function"), Some("function_args")); let pattern2 = PromQLPattern::new( pattern_2, - vec!["metric".to_string(), "function".to_string(), "range_vector".to_string(), "function_args".to_string()], ); if let Some(ast) = pattern2.ast_pattern { only_temporal_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -66,7 +64,6 @@ fn main() { ); let pattern3 = PromQLPattern::new( pattern_3, - vec!["metric".to_string(), "aggregation".to_string()], ); if let Some(ast) = pattern3.ast_pattern { only_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -76,7 +73,6 @@ fn main() { let pattern_4 = PromQLPatternBuilder::metric(None, None, None, Some("metric")); let pattern4 = PromQLPattern::new( pattern_4, - vec!["metric".to_string()], ); if let Some(ast) = pattern4.ast_pattern { only_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -108,7 +104,6 @@ fn main() { ); let pattern5 = PromQLPattern::new( pattern_5, - vec!["metric".to_string(), "range_vector".to_string(), "function".to_string(), "function_args".to_string(), "aggregation".to_string()], ); if let Some(ast) = pattern5.ast_pattern { one_temporal_one_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -137,7 +132,6 @@ fn main() { ); let pattern6 = PromQLPattern::new( pattern_6, - vec!["metric".to_string(), "range_vector".to_string(), "function".to_string(), "aggregation".to_string()], ); if let Some(ast) = pattern6.ast_pattern { one_temporal_one_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); diff --git a/asap-common/tests/rust_pattern_matching/src/main.rs b/asap-common/tests/rust_pattern_matching/src/main.rs index 3a540af..4a29352 100644 --- a/asap-common/tests/rust_pattern_matching/src/main.rs +++ b/asap-common/tests/rust_pattern_matching/src/main.rs @@ -10,11 +10,6 @@ fn temporal_pattern( ) -> PromQLPattern { PromQLPattern::new( blocks[pattern_type].clone(), - vec![ - "metric".to_string(), - "function".to_string(), - "range_vector".to_string(), - ], ) } @@ -24,7 +19,6 @@ fn spatial_pattern( ) -> PromQLPattern { PromQLPattern::new( blocks[pattern_type].clone(), - vec!["metric".to_string(), "aggregation".to_string()], ) } @@ -39,12 +33,6 @@ fn spatial_of_temporal_pattern(temporal_block: &Option>) ); PromQLPattern::new( pattern, - vec![ - "metric".to_string(), - "function".to_string(), - "range_vector".to_string(), - "aggregation".to_string(), - ], ) } diff --git a/asap-summary-ingest/Dockerfile b/asap-summary-ingest/Dockerfile index 8432840..7a79036 100644 --- a/asap-summary-ingest/Dockerfile +++ b/asap-summary-ingest/Dockerfile @@ -1,4 +1,4 @@ -FROM sketchdb-base:latest +FROM ghcr.io/projectasap/asap-base:latest LABEL maintainer="SketchDB Team" LABEL description="ArroyoSketch pipeline configuration service" diff --git a/benchmarks/docker-compose.yml b/benchmarks/docker-compose.yml index dd3084f..fea9921 100644 --- a/benchmarks/docker-compose.yml +++ b/benchmarks/docker-compose.yml @@ -1,26 +1,20 @@ -# CI override: replaces pulled images with locally-built versions for the -# three ASAP-owned services. Intended for use as a Compose override: +# CI image override: replaces quickstart's pinned release images with images +# built from the current branch. Intended for use as a Compose override: # -# docker compose \ +# ASAP_IMAGE_TAG=sha- docker compose \ +# --project-directory . \ # -f asap-quickstart/docker-compose.yml \ # -f benchmarks/docker-compose.yml \ -# up -d --build +# up -d +# +# ASAP_IMAGE_TAG is set automatically by the 'build' job in accuracy_performance.yml. services: asap-planner-rs: - image: asap-planner-rs:ci - build: - context: . - dockerfile: asap-planner-rs/Dockerfile + image: ghcr.io/projectasap/asap-planner-rs:${ASAP_IMAGE_TAG} asap-summary-ingest: - image: asap-summary-ingest:ci - build: - context: . - dockerfile: asap-summary-ingest/Dockerfile + image: ghcr.io/projectasap/asap-summary-ingest:${ASAP_IMAGE_TAG} queryengine: - image: asap-query-engine:ci - build: - context: . - dockerfile: asap-query-engine/Dockerfile + image: ghcr.io/projectasap/asap-query-engine:${ASAP_IMAGE_TAG} From 7070ba714aa326b3ee64da69e423c23a0c4572af Mon Sep 17 00:00:00 2001 From: zz_y Date: Thu, 26 Mar 2026 21:18:18 -0500 Subject: [PATCH 8/9] Fix three CI failures: eval path resolution, Python deps, Rust pattern tests - accuracy_performance.yml: remove --project-directory . from eval job so bind mounts in asap-quickstart/docker-compose.yml resolve from asap-quickstart/ (fixes asap-planner-rs exit 1 due to missing controller-config.yaml) - correctness.yml: fix pip install path to asap-common/dependencies/py/promql_utilities/ (setup.py is there, not at the parent dir; fixes silent failure + missing promql_parser) - pattern_tests.rs: add avg_over_time and *_over_time variants to ONLY_TEMPORAL; remove duplicate ONLY_VECTOR key so disambiguation works deterministically; add build_combined_quantile_pattern for 2-arg quantile_over_time in combined queries Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/accuracy_performance.yml | 4 -- .github/workflows/correctness.yml | 4 +- .../rust_tests/src/pattern_tests.rs | 49 +++++++++++++++++-- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/.github/workflows/accuracy_performance.yml b/.github/workflows/accuracy_performance.yml index a2dace5..79eff1e 100644 --- a/.github/workflows/accuracy_performance.yml +++ b/.github/workflows/accuracy_performance.yml @@ -104,7 +104,6 @@ jobs: - name: Pull and start full stack run: | docker compose \ - --project-directory . \ -f asap-quickstart/docker-compose.yml \ -f benchmarks/docker-compose.yml \ up -d @@ -112,7 +111,6 @@ jobs: - name: Show running containers run: | docker compose \ - --project-directory . \ -f asap-quickstart/docker-compose.yml \ -f benchmarks/docker-compose.yml \ ps @@ -151,7 +149,6 @@ jobs: if: failure() run: | docker compose \ - --project-directory . \ -f asap-quickstart/docker-compose.yml \ -f benchmarks/docker-compose.yml \ logs --no-color @@ -160,7 +157,6 @@ jobs: if: always() run: | docker compose \ - --project-directory . \ -f asap-quickstart/docker-compose.yml \ -f benchmarks/docker-compose.yml \ down -v diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml index 11d1a20..7ba0178 100644 --- a/.github/workflows/correctness.yml +++ b/.github/workflows/correctness.yml @@ -40,7 +40,7 @@ jobs: if [ -f asap-common/dependencies/py/requirements.txt ]; then pip install -r asap-common/dependencies/py/requirements.txt fi - pip install -e asap-common/dependencies/py/ 2>/dev/null || true + pip install -e asap-common/dependencies/py/promql_utilities/ - name: Install Rust uses: dtolnay/rust-toolchain@stable @@ -98,7 +98,7 @@ jobs: if [ -f asap-common/dependencies/py/requirements.txt ]; then pip install -r asap-common/dependencies/py/requirements.txt fi - pip install -e asap-common/dependencies/py/ 2>/dev/null || true + pip install -e asap-common/dependencies/py/promql_utilities/ - name: Install Rust uses: dtolnay/rust-toolchain@stable diff --git a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs index c9c1032..6b0fef9 100644 --- a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs +++ b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs @@ -44,15 +44,20 @@ impl PatternTester { // ONE_TEMPORAL_ONE_SPATIAL patterns let combined_patterns = vec![ - // Sum of rate pattern + // Aggregation of single-arg temporal functions PromQLPattern::new( Self::build_one_temporal_one_spatial_pattern(), // Some("ONE_TEMPORAL_ONE_SPATIAL".to_string()), ), + // Aggregation of quantile_over_time (2-arg) + PromQLPattern::new( + Self::build_combined_quantile_pattern(), + // Some("ONE_TEMPORAL_ONE_SPATIAL".to_string()), + ), ]; // Insert in order from simple to complex to avoid panics - patterns.insert("ONLY_VECTOR".to_string(), spatial_patterns.clone()); + // ONLY_VECTOR is derived via disambiguation in test_query, not a separate entry patterns.insert("ONLY_SPATIAL".to_string(), spatial_patterns); patterns.insert("ONLY_TEMPORAL".to_string(), temporal_patterns); patterns.insert("ONE_TEMPORAL_ONE_SPATIAL".to_string(), combined_patterns); @@ -261,7 +266,20 @@ impl PatternTester { let args: Vec>> = vec![ms]; - PromQLPatternBuilder::function(vec!["rate", "increase"], args, Some("function"), None) + PromQLPatternBuilder::function( + vec![ + "rate", + "increase", + "avg_over_time", + "sum_over_time", + "count_over_time", + "min_over_time", + "max_over_time", + ], + args, + Some("function"), + None, + ) } fn build_quantile_over_time_pattern() -> Option> { @@ -308,7 +326,6 @@ impl PatternTester { let func = PromQLPatternBuilder::function( vec![ - "quantile_over_time", "sum_over_time", "count_over_time", "avg_over_time", @@ -332,6 +349,30 @@ impl PatternTester { ) } + fn build_combined_quantile_pattern() -> Option> { + let num = PromQLPatternBuilder::number(None, None); + let ms = PromQLPatternBuilder::matrix_selector( + PromQLPatternBuilder::metric(None, None, None, Some("metric")), + None, + Some("range_vector"), + ); + let func_args: Vec>> = vec![num, ms]; + let func = PromQLPatternBuilder::function( + vec!["quantile_over_time"], + func_args, + Some("function"), + None, + ); + PromQLPatternBuilder::aggregation( + vec!["sum", "count", "avg", "quantile", "min", "max"], + func, + None, + None, + None, + Some("aggregation"), + ) + } + fn build_sum_rate_pattern() -> Option> { let ms = PromQLPatternBuilder::matrix_selector( PromQLPatternBuilder::metric(None, None, None, Some("metric")), From 4259d152ef938a85de00f2e686b1efd3d34e3f07 Mon Sep 17 00:00:00 2001 From: zz_y Date: Fri, 27 Mar 2026 11:34:56 -0500 Subject: [PATCH 9/9] Fix ingest_wait.sh: null-data handling, Arroyo state detection, longer timeout Three bugs caused the pipeline to always show as 'not found': 1. data.get('data', []) returns None when Arroyo returns {"data": null} for an empty pipeline list. dict.get() only falls back to the default when the key is absent, not when the value is null. Fixed with (data.get('data') or []). 2. Arroyo signals a running pipeline via state=null + stop='none', not a literal "Running" string. The ingest_wait.sh state check was looking for the wrong value; the correct pattern is already used in asap-tools/run_pipeline.sh. 3. MAX_PIPELINE_WAIT=300s is too short: Arroyo must compile Rust UDFs before the pipeline can start, which takes several minutes in CI. Raised to 600s. Also: normalise hyphens/underscores in the name match so 'asap-demo' matches whether Arroyo stores it as 'asap-demo' or 'asap_demo'; add pipeline list dump on timeout for easier future diagnosis. Co-Authored-By: Claude Sonnet 4.6 --- benchmarks/scripts/ingest_wait.sh | 41 ++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/benchmarks/scripts/ingest_wait.sh b/benchmarks/scripts/ingest_wait.sh index 6e9b38d..8b12d3a 100644 --- a/benchmarks/scripts/ingest_wait.sh +++ b/benchmarks/scripts/ingest_wait.sh @@ -8,7 +8,7 @@ set -euo pipefail ARROYO_URL="http://localhost:5115/api/v1/pipelines" QE_URL="http://localhost:8088/api/v1/query" PIPELINE_NAME="asap-demo" -MAX_PIPELINE_WAIT=300 # seconds +MAX_PIPELINE_WAIT=600 # seconds — Arroyo must compile Rust UDFs; allow extra time ACCUMULATE_SLEEP=90 # seconds after pipeline is running SLEEP=5 @@ -19,26 +19,45 @@ while true; do state=$(curl -sf --max-time 10 "${ARROYO_URL}" 2>/dev/null \ | python3 -c " import sys, json -data = json.load(sys.stdin) -pipelines = data if isinstance(data, list) else data.get('data', []) -for p in pipelines: - name = p.get('name', '') or p.get('id', '') - if '${PIPELINE_NAME}' in str(name): - print(p.get('state', p.get('status', ''))) - break +try: + data = json.load(sys.stdin) + # 'data' key may be null when no pipelines exist; use 'or []' to handle that + pipelines = data if isinstance(data, list) else (data.get('data') or []) + for p in pipelines: + name = str(p.get('name') or p.get('id') or '') + # Normalise hyphens/underscores so 'asap-demo' matches 'asap_demo' + if '${PIPELINE_NAME}'.replace('-', '_') in name.replace('-', '_'): + state = p.get('state') + stop = p.get('stop', '') + # Arroyo signals a running pipeline via state=null and stop='none' + if state is None and stop == 'none': + print('Running') + elif state is not None: + print(str(state)) + else: + print('stopped') + break + else: + # No matching pipeline found yet — print nothing so caller retries + pass +except Exception: + pass " 2>/dev/null || true) - if [ "${state}" = "Running" ] || [ "${state}" = "RUNNING" ]; then + if [ "${state}" = "Running" ] || [ "${state}" = "RUNNING" ] || [ "${state}" = "running" ]; then echo "[ingest_wait] Pipeline '${PIPELINE_NAME}' is RUNNING (${elapsed}s elapsed)" break fi if [ "${elapsed}" -ge "${MAX_PIPELINE_WAIT}" ]; then - echo "[ingest_wait] ERROR: Pipeline '${PIPELINE_NAME}' did not reach RUNNING within ${MAX_PIPELINE_WAIT}s (last state: '${state}')" >&2 + echo "[ingest_wait] ERROR: Pipeline '${PIPELINE_NAME}' did not reach RUNNING within ${MAX_PIPELINE_WAIT}s (last state: '${state:-unknown}')" >&2 + # Dump pipeline list for diagnosis + echo "[ingest_wait] Current Arroyo pipeline list:" >&2 + curl -sf --max-time 10 "${ARROYO_URL}" 2>/dev/null | python3 -m json.tool 2>/dev/null >&2 || true exit 1 fi - echo "[ingest_wait] Pipeline state: '${state:-unknown}' — retrying in ${SLEEP}s (${elapsed}s elapsed) ..." + echo "[ingest_wait] Pipeline state: '${state:-not found}' — retrying in ${SLEEP}s (${elapsed}s elapsed) ..." sleep "${SLEEP}" elapsed=$(( elapsed + SLEEP )) done