diff --git a/.github/workflows/accuracy_performance.yml b/.github/workflows/accuracy_performance.yml new file mode 100644 index 0000000..79eff1e --- /dev/null +++ b/.github/workflows/accuracy_performance.yml @@ -0,0 +1,162 @@ +name: PR Evaluation + +# NOTE: GitHub-hosted runners are noisy. Latency numbers are indicative only. +# For precise benchmarks, register a self-hosted runner once asap-tools infra +# is decoupled from Cloudlab. See PDF eval guide Phase 3. + +on: + pull_request: + branches: + - main + paths: + - 'asap-query-engine/**' + - 'asap-planner-rs/**' + - 'asap-summary-ingest/**' + - 'asap-quickstart/**' + - '.github/workflows/accuracy_performance.yml' + - 'benchmarks/**' + workflow_dispatch: + +permissions: + contents: read + packages: write + pull-requests: write + +jobs: + # --------------------------------------------------------------------------- + # Job 1: build images once from branch code and push with a SHA-based tag. + # All downstream jobs pull these images instead of rebuilding. + # --------------------------------------------------------------------------- + build: + name: Build CI images + runs-on: ubuntu-latest + outputs: + image-tag: ${{ steps.tag.outputs.value }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Compute image tag + id: tag + run: echo "value=sha-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT + + - name: Build and push asap-planner-rs + uses: docker/build-push-action@v6 + with: + context: . 
+ file: asap-planner-rs/Dockerfile + push: true + tags: ghcr.io/projectasap/asap-planner-rs:${{ steps.tag.outputs.value }} + cache-from: type=registry,ref=ghcr.io/projectasap/asap-planner-rs:buildcache + cache-to: type=registry,ref=ghcr.io/projectasap/asap-planner-rs:buildcache,mode=max + + - name: Build and push asap-summary-ingest + uses: docker/build-push-action@v6 + with: + context: asap-summary-ingest + file: asap-summary-ingest/Dockerfile + push: true + tags: ghcr.io/projectasap/asap-summary-ingest:${{ steps.tag.outputs.value }} + + - name: Build and push asap-query-engine + uses: docker/build-push-action@v6 + with: + context: . + file: asap-query-engine/Dockerfile + push: true + tags: ghcr.io/projectasap/asap-query-engine:${{ steps.tag.outputs.value }} + cache-from: type=registry,ref=ghcr.io/projectasap/asap-query-engine:buildcache + cache-to: type=registry,ref=ghcr.io/projectasap/asap-query-engine:buildcache,mode=max + + # --------------------------------------------------------------------------- + # Job 2: pull the images built above, deploy the full stack, and evaluate. 
+ # --------------------------------------------------------------------------- + eval: + name: Full-stack PR evaluation + needs: build + runs-on: ubuntu-latest + timeout-minutes: 60 + env: + ASAP_IMAGE_TAG: ${{ needs.build.outputs.image-tag }} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull and start full stack + run: | + docker compose \ + -f asap-quickstart/docker-compose.yml \ + -f benchmarks/docker-compose.yml \ + up -d + + - name: Show running containers + run: | + docker compose \ + -f asap-quickstart/docker-compose.yml \ + -f benchmarks/docker-compose.yml \ + ps + + - name: Wait for all services to be healthy + run: bash benchmarks/scripts/wait_for_stack.sh + + - name: Wait for pipeline and data ingestion + run: bash benchmarks/scripts/ingest_wait.sh + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: pip install requests + + - name: Run baseline queries (Prometheus) + run: python benchmarks/scripts/run_baseline.py + + - name: Run ASAP queries (query engine) + run: python benchmarks/scripts/run_asap.py + + - name: Compare results and evaluate + run: python benchmarks/scripts/compare.py + + - name: Upload evaluation reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: eval-reports-${{ github.run_id }} + path: benchmarks/reports/ + + - name: Print docker logs on failure + if: failure() + run: | + docker compose \ + -f asap-quickstart/docker-compose.yml \ + -f benchmarks/docker-compose.yml \ + logs --no-color + + - name: Teardown stack + if: always() + run: | + docker compose \ + -f asap-quickstart/docker-compose.yml \ + -f benchmarks/docker-compose.yml \ + down -v diff --git a/.github/workflows/correctness.yml b/.github/workflows/correctness.yml 
new file mode 100644 index 0000000..7ba0178 --- /dev/null +++ b/.github/workflows/correctness.yml @@ -0,0 +1,168 @@ +name: Correctness Tests + +# Verifies cross-language parity between Python and Rust implementations of +# PromQL pattern matching and sketch serialisation. Ephemeral GitHub Actions +# VMs are well-suited for these deterministic correctness checks. + +on: + push: + branches: [ main ] + paths: + - 'asap-common/tests/**' + - 'asap-common/dependencies/**' + - 'asap-common/sketch-core/**' + - '.github/workflows/correctness.yml' + pull_request: + branches: [ main ] + paths: + - 'asap-common/tests/**' + - 'asap-common/dependencies/**' + - 'asap-common/sketch-core/**' + - '.github/workflows/correctness.yml' + workflow_dispatch: + +jobs: + # ── 1. Cross-language token matching (Python vs Rust) ────────────────────── + cross-language-token-matching: + name: Cross-language PromQL token matching + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + if [ -f asap-common/dependencies/py/requirements.txt ]; then + pip install -r asap-common/dependencies/py/requirements.txt + fi + pip install -e asap-common/dependencies/py/promql_utilities/ + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Run master test runner (Python + Rust + comparison) + working-directory: asap-common/tests/compare_matched_tokens + run: python utilities/master_test_runner.py + env: + RUSTC_WRAPPER: sccache + + - 
name: Upload test artefacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: cross-language-token-results + path: | + asap-common/tests/compare_matched_tokens/python_tests/python_test_results.json + asap-common/tests/compare_matched_tokens/rust_tests/rust_test_results.json + asap-common/tests/compare_matched_tokens/comparison_tests/comparison_report.json + asap-common/tests/compare_matched_tokens/test_summary.json + if-no-files-found: warn + + # ── 2. Cross-language serialised pattern comparison ──────────────────────── + cross-language-pattern-serialisation: + name: Cross-language pattern serialisation + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + if [ -f asap-common/dependencies/py/requirements.txt ]; then + pip install -r asap-common/dependencies/py/requirements.txt + fi + pip install -e asap-common/dependencies/py/promql_utilities/ + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Generate Python patterns + working-directory: asap-common/tests/compare_patterns + run: python python_generate_patterns.py + + - name: Build and run Rust pattern generator + working-directory: asap-common/tests/compare_patterns + run: cargo run --release + env: + RUSTC_WRAPPER: sccache + + - name: Compare serialised patterns + working-directory: asap-common/tests/compare_patterns + run: python compare_serialized_patterns.py + + # ── 3. 
Rust pattern-matching unit tests ──────────────────────────────────── + rust-pattern-matching: + name: Rust pattern matching unit tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + + - name: Install protoc + run: | + sudo apt-get update -qq + sudo apt-get install -y protobuf-compiler + + - name: Run sccache + uses: mozilla-actions/sccache-action@v0.0.4 + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-correctness-${{ hashFiles('**/Cargo.lock', '**/Cargo.toml') }} + + - name: Run Rust pattern matching tests + working-directory: asap-common/tests/rust_pattern_matching + run: cargo test --release + env: + RUSTC_WRAPPER: sccache diff --git a/asap-common/.gitignore b/asap-common/.gitignore index 9e7080c..102b6ea 100644 --- a/asap-common/.gitignore +++ b/asap-common/.gitignore @@ -8,4 +8,5 @@ dependencies/py/promql_utilities/promql_utilities.egg-info/ dependencies/rs/**/target/ tests/**/*.json +!tests/**/test_data/*.json tests/**/target/ diff --git a/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml b/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml index e38e483..c4155dc 100644 --- a/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml +++ b/asap-common/tests/compare_matched_tokens/rust_tests/Cargo.toml @@ -1,3 +1,5 @@ +[workspace] + [package] name = "promql_cross_lang_tests" version = "0.1.0" diff --git a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs index fcc210f..6b0fef9 100644 --- a/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs +++ b/asap-common/tests/compare_matched_tokens/rust_tests/src/pattern_tests.rs @@ -19,22 +19,11 @@ impl PatternTester { // Rate pattern PromQLPattern::new( Self::build_rate_pattern(), - vec![ - 
"metric".to_string(), - "function".to_string(), - "range_vector".to_string(), - ], // Some("ONLY_TEMPORAL".to_string()), ), // Quantile over time pattern PromQLPattern::new( Self::build_quantile_over_time_pattern(), - vec![ - "metric".to_string(), - "function".to_string(), - "range_vector".to_string(), - "function_args".to_string(), - ], // Some("ONLY_TEMPORAL".to_string()), ), ]; @@ -44,34 +33,31 @@ impl PatternTester { // Sum aggregation pattern PromQLPattern::new( Self::build_sum_pattern(), - vec!["metric".to_string(), "aggregation".to_string()], // Some("ONLY_SPATIAL".to_string()), ), // Simple metric pattern PromQLPattern::new( Self::build_metric_pattern(), - vec!["metric".to_string()], // Some("ONLY_SPATIAL".to_string()), ), ]; // ONE_TEMPORAL_ONE_SPATIAL patterns let combined_patterns = vec![ - // Sum of rate pattern + // Aggregation of single-arg temporal functions PromQLPattern::new( Self::build_one_temporal_one_spatial_pattern(), - vec![ - "metric".to_string(), - "function".to_string(), - "aggregation".to_string(), - "range_vector".to_string(), - ], + // Some("ONE_TEMPORAL_ONE_SPATIAL".to_string()), + ), + // Aggregation of quantile_over_time (2-arg) + PromQLPattern::new( + Self::build_combined_quantile_pattern(), // Some("ONE_TEMPORAL_ONE_SPATIAL".to_string()), ), ]; // Insert in order from simple to complex to avoid panics - patterns.insert("ONLY_VECTOR".to_string(), spatial_patterns.clone()); + // ONLY_VECTOR is derived via disambiguation in test_query, not a separate entry patterns.insert("ONLY_SPATIAL".to_string(), spatial_patterns); patterns.insert("ONLY_TEMPORAL".to_string(), temporal_patterns); patterns.insert("ONE_TEMPORAL_ONE_SPATIAL".to_string(), combined_patterns); @@ -280,7 +266,20 @@ impl PatternTester { let args: Vec>> = vec![ms]; - PromQLPatternBuilder::function(vec!["rate", "increase"], args, Some("function"), None) + PromQLPatternBuilder::function( + vec![ + "rate", + "increase", + "avg_over_time", + "sum_over_time", + "count_over_time", 
+ "min_over_time", + "max_over_time", + ], + args, + Some("function"), + None, + ) } fn build_quantile_over_time_pattern() -> Option> { @@ -327,7 +326,6 @@ impl PatternTester { let func = PromQLPatternBuilder::function( vec![ - "quantile_over_time", "sum_over_time", "count_over_time", "avg_over_time", @@ -351,6 +349,30 @@ impl PatternTester { ) } + fn build_combined_quantile_pattern() -> Option> { + let num = PromQLPatternBuilder::number(None, None); + let ms = PromQLPatternBuilder::matrix_selector( + PromQLPatternBuilder::metric(None, None, None, Some("metric")), + None, + Some("range_vector"), + ); + let func_args: Vec>> = vec![num, ms]; + let func = PromQLPatternBuilder::function( + vec!["quantile_over_time"], + func_args, + Some("function"), + None, + ); + PromQLPatternBuilder::aggregation( + vec!["sum", "count", "avg", "quantile", "min", "max"], + func, + None, + None, + None, + Some("aggregation"), + ) + } + fn build_sum_rate_pattern() -> Option> { let ms = PromQLPatternBuilder::matrix_selector( PromQLPatternBuilder::metric(None, None, None, Some("metric")), diff --git a/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json b/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json new file mode 100644 index 0000000..0d03af4 --- /dev/null +++ b/asap-common/tests/compare_matched_tokens/test_data/promql_queries.json @@ -0,0 +1,228 @@ +{ + "test_cases": [ + { + "id": "temporal_rate_basic", + "description": "Basic rate function over a time range", + "query": "rate(http_requests_total{job=\"api\"}[5m])", + "expected_pattern_type": "ONLY_TEMPORAL", + "expected_tokens": { + "metric": { + "name": "http_requests_total", + "labels": {"job": "api"}, + "at_modifier": null + }, + "function": { + "name": "rate" + }, + "range_vector": { + "range": "5m" + } + } + }, + { + "id": "temporal_increase_basic", + "description": "Increase function over a time range", + "query": "increase(http_requests_total[1h])", + "expected_pattern_type": 
"ONLY_TEMPORAL", + "expected_tokens": { + "metric": { + "name": "http_requests_total", + "labels": {}, + "at_modifier": null + }, + "function": { + "name": "increase" + }, + "range_vector": { + "range": "1h" + } + } + }, + { + "id": "temporal_quantile_over_time", + "description": "Quantile over time function", + "query": "quantile_over_time(0.95, cpu_usage{instance=\"host1\"}[10m])", + "expected_pattern_type": "ONLY_TEMPORAL", + "expected_tokens": { + "metric": { + "name": "cpu_usage", + "labels": {"instance": "host1"}, + "at_modifier": null + }, + "function": { + "name": "quantile_over_time" + }, + "range_vector": { + "range": "10m" + } + } + }, + { + "id": "temporal_avg_over_time", + "description": "Average over time function", + "query": "avg_over_time(memory_bytes[30m])", + "expected_pattern_type": "ONLY_TEMPORAL", + "expected_tokens": { + "metric": { + "name": "memory_bytes", + "labels": {}, + "at_modifier": null + }, + "function": { + "name": "avg_over_time" + }, + "range_vector": { + "range": "30m" + } + } + }, + { + "id": "spatial_sum_aggregation", + "description": "Sum aggregation across all series", + "query": "sum(http_requests_total{job=\"api\"})", + "expected_pattern_type": "ONLY_SPATIAL", + "expected_tokens": { + "metric": { + "name": "http_requests_total", + "labels": {"job": "api"}, + "at_modifier": null + }, + "aggregation": { + "op": "sum", + "modifier": null + } + } + }, + { + "id": "spatial_avg_aggregation", + "description": "Average aggregation by label", + "query": "avg by (instance) (cpu_usage)", + "expected_pattern_type": "ONLY_SPATIAL", + "expected_tokens": { + "metric": { + "name": "cpu_usage", + "labels": {}, + "at_modifier": null + }, + "aggregation": { + "op": "avg", + "modifier": null + } + } + }, + { + "id": "spatial_count_aggregation", + "description": "Count aggregation", + "query": "count(up{job=\"node\"})", + "expected_pattern_type": "ONLY_SPATIAL", + "expected_tokens": { + "metric": { + "name": "up", + "labels": {"job": "node"}, 
+ "at_modifier": null + }, + "aggregation": { + "op": "count", + "modifier": null + } + } + }, + { + "id": "combined_sum_of_rate", + "description": "Sum aggregation of rate (temporal + spatial)", + "query": "sum(rate(http_requests_total{job=\"api\"}[5m]))", + "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL", + "expected_tokens": { + "metric": { + "name": "http_requests_total", + "labels": {"job": "api"}, + "at_modifier": null + }, + "function": { + "name": "rate" + }, + "aggregation": { + "op": "sum", + "modifier": null + }, + "range_vector": { + "range": "5m" + } + } + }, + { + "id": "combined_avg_of_quantile_over_time", + "description": "Avg aggregation of quantile_over_time (temporal + spatial)", + "query": "avg(quantile_over_time(0.99, response_time_seconds[15m]))", + "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL", + "expected_tokens": { + "metric": { + "name": "response_time_seconds", + "labels": {}, + "at_modifier": null + }, + "function": { + "name": "quantile_over_time" + }, + "aggregation": { + "op": "avg", + "modifier": null + }, + "range_vector": { + "range": "15m" + } + } + }, + { + "id": "combined_sum_of_avg_over_time", + "description": "Sum aggregation of avg_over_time", + "query": "sum by (job) (avg_over_time(memory_bytes{env=\"prod\"}[1h]))", + "expected_pattern_type": "ONE_TEMPORAL_ONE_SPATIAL", + "expected_tokens": { + "metric": { + "name": "memory_bytes", + "labels": {"env": "prod"}, + "at_modifier": null + }, + "function": { + "name": "avg_over_time" + }, + "aggregation": { + "op": "sum", + "modifier": null + }, + "range_vector": { + "range": "1h" + } + } + } + ], + "pattern_builder_tests": [ + { + "id": "builder_metric_no_labels", + "description": "Build a simple metric selector with no labels", + "builder_call": "metric", + "parameters": { + "collect_as": "metric" + }, + "expected_pattern": { + "type": "metric", + "collect_as": "metric" + } + }, + { + "id": "builder_function_rate", + "description": "Build a rate function pattern", + 
"builder_call": "function", + "parameters": { + "names": ["rate", "increase"], + "collect_as": "function" + }, + "expected_pattern": { + "type": "function", + "names": ["rate", "increase"], + "collect_as": "function" + } + } + ] +} diff --git a/asap-common/tests/compare_patterns/Cargo.toml b/asap-common/tests/compare_patterns/Cargo.toml index 8d04a6f..3f94a69 100644 --- a/asap-common/tests/compare_patterns/Cargo.toml +++ b/asap-common/tests/compare_patterns/Cargo.toml @@ -1,3 +1,5 @@ +[workspace] + [package] name = "compare_patterns_runner" version = "0.1.0" diff --git a/asap-common/tests/compare_patterns/src/main.rs b/asap-common/tests/compare_patterns/src/main.rs index 6c429f1..d75fa8b 100644 --- a/asap-common/tests/compare_patterns/src/main.rs +++ b/asap-common/tests/compare_patterns/src/main.rs @@ -25,7 +25,6 @@ fn main() { let pattern_1 = PromQLPatternBuilder::function(vec!["rate", "increase"], func_args1, Some("function"), None); let pattern1 = PromQLPattern::new( pattern_1, - vec!["metric".to_string(), "function".to_string(), "range_vector".to_string()], ); if let Some(ast) = pattern1.ast_pattern { only_temporal_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -44,7 +43,6 @@ fn main() { let pattern_2 = PromQLPatternBuilder::function(vec!["quantile_over_time"], func_args2, Some("function"), Some("function_args")); let pattern2 = PromQLPattern::new( pattern_2, - vec!["metric".to_string(), "function".to_string(), "range_vector".to_string(), "function_args".to_string()], ); if let Some(ast) = pattern2.ast_pattern { only_temporal_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -66,7 +64,6 @@ fn main() { ); let pattern3 = PromQLPattern::new( pattern_3, - vec!["metric".to_string(), "aggregation".to_string()], ); if let Some(ast) = pattern3.ast_pattern { only_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -76,7 +73,6 @@ fn main() { let pattern_4 = PromQLPatternBuilder::metric(None, None, 
None, Some("metric")); let pattern4 = PromQLPattern::new( pattern_4, - vec!["metric".to_string()], ); if let Some(ast) = pattern4.ast_pattern { only_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -108,7 +104,6 @@ fn main() { ); let pattern5 = PromQLPattern::new( pattern_5, - vec!["metric".to_string(), "range_vector".to_string(), "function".to_string(), "function_args".to_string(), "aggregation".to_string()], ); if let Some(ast) = pattern5.ast_pattern { one_temporal_one_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); @@ -137,7 +132,6 @@ fn main() { ); let pattern6 = PromQLPattern::new( pattern_6, - vec!["metric".to_string(), "range_vector".to_string(), "function".to_string(), "aggregation".to_string()], ); if let Some(ast) = pattern6.ast_pattern { one_temporal_one_spatial_patterns.push(serde_json::Value::Object(ast.into_iter().collect())); diff --git a/asap-common/tests/rust_pattern_matching/Cargo.toml b/asap-common/tests/rust_pattern_matching/Cargo.toml index 14b2980..571641e 100644 --- a/asap-common/tests/rust_pattern_matching/Cargo.toml +++ b/asap-common/tests/rust_pattern_matching/Cargo.toml @@ -1,3 +1,5 @@ +[workspace] + [package] name = "promql_cross_lang_tests" version = "0.1.0" diff --git a/asap-common/tests/rust_pattern_matching/src/main.rs b/asap-common/tests/rust_pattern_matching/src/main.rs index 3a540af..4a29352 100644 --- a/asap-common/tests/rust_pattern_matching/src/main.rs +++ b/asap-common/tests/rust_pattern_matching/src/main.rs @@ -10,11 +10,6 @@ fn temporal_pattern( ) -> PromQLPattern { PromQLPattern::new( blocks[pattern_type].clone(), - vec![ - "metric".to_string(), - "function".to_string(), - "range_vector".to_string(), - ], ) } @@ -24,7 +19,6 @@ fn spatial_pattern( ) -> PromQLPattern { PromQLPattern::new( blocks[pattern_type].clone(), - vec!["metric".to_string(), "aggregation".to_string()], ) } @@ -39,12 +33,6 @@ fn spatial_of_temporal_pattern(temporal_block: &Option>) ); 
PromQLPattern::new( pattern, - vec![ - "metric".to_string(), - "function".to_string(), - "range_vector".to_string(), - "aggregation".to_string(), - ], ) } diff --git a/asap-summary-ingest/Dockerfile b/asap-summary-ingest/Dockerfile index 8432840..7a79036 100644 --- a/asap-summary-ingest/Dockerfile +++ b/asap-summary-ingest/Dockerfile @@ -1,4 +1,4 @@ -FROM sketchdb-base:latest +FROM ghcr.io/projectasap/asap-base:latest LABEL maintainer="SketchDB Team" LABEL description="ArroyoSketch pipeline configuration service" diff --git a/benchmarks/docker-compose.yml b/benchmarks/docker-compose.yml new file mode 100644 index 0000000..fea9921 --- /dev/null +++ b/benchmarks/docker-compose.yml @@ -0,0 +1,20 @@ +# CI image override: replaces quickstart's pinned release images with images +# built from the current branch. Intended for use as a Compose override: +# +# ASAP_IMAGE_TAG=sha- docker compose \ +# --project-directory . \ +# -f asap-quickstart/docker-compose.yml \ +# -f benchmarks/docker-compose.yml \ +# up -d +# +# ASAP_IMAGE_TAG is set automatically by the 'build' job in accuracy_performance.yml. 
+ +services: + asap-planner-rs: + image: ghcr.io/projectasap/asap-planner-rs:${ASAP_IMAGE_TAG} + + asap-summary-ingest: + image: ghcr.io/projectasap/asap-summary-ingest:${ASAP_IMAGE_TAG} + + queryengine: + image: ghcr.io/projectasap/asap-query-engine:${ASAP_IMAGE_TAG} diff --git a/benchmarks/golden/.gitkeep b/benchmarks/golden/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/benchmarks/queries/promql_suite.json b/benchmarks/queries/promql_suite.json new file mode 100644 index 0000000..52b525e --- /dev/null +++ b/benchmarks/queries/promql_suite.json @@ -0,0 +1,18 @@ +{ + "queries": [ + {"id": "avg_all", "expr": "avg(sensor_reading)", "asap_native": false}, + {"id": "sum_all", "expr": "sum(sensor_reading)", "asap_native": false}, + {"id": "max_all", "expr": "max(sensor_reading)", "asap_native": false}, + {"id": "min_all", "expr": "min(sensor_reading)", "asap_native": false}, + {"id": "q50_all", "expr": "quantile(0.50, sensor_reading)", "asap_native": true}, + {"id": "q90_all", "expr": "quantile(0.90, sensor_reading)", "asap_native": true}, + {"id": "q95_all", "expr": "quantile(0.95, sensor_reading)", "asap_native": true}, + {"id": "q99_all", "expr": "quantile(0.99, sensor_reading)", "asap_native": true}, + {"id": "q95_by_pattern", "expr": "quantile by (pattern) (0.95, sensor_reading)", "asap_native": true}, + {"id": "q99_by_pattern", "expr": "quantile by (pattern) (0.99, sensor_reading)", "asap_native": true}, + {"id": "q50_by_pattern", "expr": "quantile by (pattern) (0.50, sensor_reading)", "asap_native": true}, + {"id": "avg_by_pattern", "expr": "avg by (pattern) (sensor_reading)", "asap_native": false}, + {"id": "sum_by_region", "expr": "sum by (region) (sensor_reading)", "asap_native": false}, + {"id": "max_by_service", "expr": "max by (service) (sensor_reading)", "asap_native": false} + ] +} diff --git a/benchmarks/reports/.gitkeep b/benchmarks/reports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git 
a/benchmarks/scripts/compare.py b/benchmarks/scripts/compare.py new file mode 100644 index 0000000..2f901d9 --- /dev/null +++ b/benchmarks/scripts/compare.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +"""compare.py — compares baseline (Prometheus) vs ASAP query engine results. + +Applies pass/fail policy: + - FAIL if any query returned an error in ASAP (query failure) + - WARN if any ASAP-native query has relative error > max_error (default 5%) + - WARN (not FAIL) on >10% p95 latency regression (GH runner noise) + +Generates benchmarks/reports/eval_report.md and prints it to stdout. +Exits 1 if any PASS/FAIL check failed. + +Usage: + python benchmarks/scripts/compare.py \ + [--baseline FILE] \ + [--asap FILE] \ + [--output FILE] \ + [--max-error FLOAT] +""" + +import argparse +import json +import os +import sys +from datetime import datetime, timezone + +DEFAULT_BASELINE = os.path.join( + os.path.dirname(__file__), "..", "reports", "baseline_results.json" +) +DEFAULT_ASAP = os.path.join( + os.path.dirname(__file__), "..", "reports", "asap_results.json" +) +DEFAULT_OUTPUT = os.path.join( + os.path.dirname(__file__), "..", "reports", "eval_report.md" +) +DEFAULT_MAX_ERROR = 0.05 # 5 % +LATENCY_REGRESSION_THRESHOLD = 0.10 # 10 % — warn only + + +# ── helpers ────────────────────────────────────────────────────────────────── + +def percentile(values: list[float], pct: float) -> float: + """Compute percentile (0-100) from a list of floats using linear interpolation.""" + if not values: + return float("nan") + sorted_vals = sorted(values) + n = len(sorted_vals) + if n == 1: + return sorted_vals[0] + idx = (pct / 100.0) * (n - 1) + lo = int(idx) + hi = lo + 1 + if hi >= n: + return sorted_vals[-1] + frac = idx - lo + return sorted_vals[lo] * (1 - frac) + sorted_vals[hi] * frac + + +def valid_latencies(latencies: list) -> list[float]: + """Strip None entries (failed HTTP requests) from a latency list.""" + return [x for x in latencies if x is not None] + + +def 
label_key(metric: dict) -> str: + """Canonical sort key for a Prometheus metric label set.""" + return json.dumps(metric, sort_keys=True) + + +def extract_scalar_value(result: list) -> float | None: + """Return a single float from a scalar/single-entry vector result.""" + if not result: + return None + entry = result[0] + try: + return float(entry["value"][1]) + except (KeyError, IndexError, ValueError, TypeError): + return None + + +def compute_relative_error(baseline_val: float, asap_val: float) -> float: + """Relative error: |asap - baseline| / max(|baseline|, 1e-9).""" + denom = max(abs(baseline_val), 1e-9) + return abs(asap_val - baseline_val) / denom + + +def compare_results( + baseline_result: list, asap_result: list +) -> tuple[float | None, str]: + """ + Compare two Prometheus vector result arrays. + + Returns (max_relative_error, note). + For grouped results, matches entries by label set. + """ + if not baseline_result and not asap_result: + return 0.0, "both empty" + if not baseline_result: + return None, "baseline empty" + if not asap_result: + return None, "asap empty" + + # Build label-key → float maps + baseline_map: dict[str, float] = {} + for entry in baseline_result: + key = label_key(entry.get("metric", {})) + try: + baseline_map[key] = float(entry["value"][1]) + except (KeyError, IndexError, ValueError, TypeError): + pass + + asap_map: dict[str, float] = {} + for entry in asap_result: + key = label_key(entry.get("metric", {})) + try: + asap_map[key] = float(entry["value"][1]) + except (KeyError, IndexError, ValueError, TypeError): + pass + + # Single-value result (no labels / collapsed) + if len(baseline_map) == 1 and len(asap_map) == 1: + bv = next(iter(baseline_map.values())) + av = next(iter(asap_map.values())) + return compute_relative_error(bv, av), "" + + # Multi-value: match by label set + max_err = 0.0 + missing = [] + for key, bv in baseline_map.items(): + if key not in asap_map: + missing.append(key) + continue + err = 
compute_relative_error(bv, asap_map[key]) + max_err = max(max_err, err) + + note = f"{len(missing)} label set(s) missing from ASAP" if missing else "" + return max_err, note + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + parser = argparse.ArgumentParser(description="Compare baseline vs ASAP results") + parser.add_argument("--baseline", default=DEFAULT_BASELINE, help="Baseline JSON file") + parser.add_argument("--asap", default=DEFAULT_ASAP, help="ASAP results JSON file") + parser.add_argument("--output", default=DEFAULT_OUTPUT, help="Output Markdown file") + parser.add_argument( + "--max-error", + type=float, + default=DEFAULT_MAX_ERROR, + help="Max relative error for ASAP-native queries (default: 0.05 = 5%%)", + ) + args = parser.parse_args() + + with open(args.baseline) as f: + baseline_data = json.load(f) + with open(args.asap) as f: + asap_data = json.load(f) + + baseline_results = baseline_data["results"] + asap_results = asap_data["results"] + + # Collect all query IDs (union) + all_ids = sorted(set(list(baseline_results.keys()) + list(asap_results.keys()))) + + failures: list[str] = [] + warnings: list[str] = [] + rows: list[dict] = [] + + for qid in all_ids: + b = baseline_results.get(qid, {}) + a = asap_results.get(qid, {}) + + asap_native = a.get("asap_native", False) + asap_status = a.get("status", "missing") + asap_error = a.get("error") + + # Latency p95 + b_lats = valid_latencies(b.get("latencies_ms", [])) + a_lats = valid_latencies(a.get("latencies_ms", [])) + b_p95 = percentile(b_lats, 95) if b_lats else float("nan") + a_p95 = percentile(a_lats, 95) if a_lats else float("nan") + + # Relative error + rel_error: float | None = None + note = "" + if asap_status == "success" and b.get("status") == "success": + rel_error, note = compare_results( + b.get("data", []), a.get("data", []) + ) + + # Pass/fail determination + row_status = "PASS" + + if asap_status == "error" or asap_status == 
"missing": + row_status = "FAIL" + reason = asap_error or "query not present in ASAP results" + failures.append(f"{qid}: ASAP query failed — {reason}") + + elif asap_native and rel_error is not None and rel_error > args.max_error: + row_status = "WARN" + warnings.append( + f"{qid}: relative error {rel_error:.4f} > threshold {args.max_error:.4f} — informational only" + ) + + # Latency regression — warn only + if ( + not (b_p95 != b_p95) # not NaN + and not (a_p95 != a_p95) + and b_p95 > 0 + ): + latency_regression = (a_p95 - b_p95) / b_p95 + if latency_regression > LATENCY_REGRESSION_THRESHOLD: + warnings.append( + f"{qid}: p95 latency regression {latency_regression:.1%} " + f"(baseline={b_p95:.1f}ms, asap={a_p95:.1f}ms) — " + "GH runner noise expected; informational only" + ) + + rows.append( + { + "id": qid, + "asap_native": asap_native, + "b_p95": b_p95, + "a_p95": a_p95, + "rel_error": rel_error, + "status": row_status, + "note": note, + } + ) + + overall = "PASS" if not failures else "FAIL" + + # ── Build Markdown report ──────────────────────────────────────────────── + now = datetime.now(timezone.utc).isoformat() + lines: list[str] = [] + lines.append("# ASAP PR Evaluation Report") + lines.append("") + lines.append(f"**Generated:** {now}") + lines.append( + f"**Overall verdict:** {'✅ PASS' if overall == 'PASS' else '❌ FAIL'}" + ) + lines.append(f"**Max error threshold:** {args.max_error:.2%}") + lines.append("") + + # Summary table + lines.append( + "| Query | ASAP Native | Baseline p95 (ms) | ASAP p95 (ms) | Rel Error | Status |" + ) + lines.append("|-------|:-----------:|:-----------------:|:-------------:|:---------:|:------:|") + for row in rows: + native_mark = "yes" if row["asap_native"] else "no" + b_p95_s = f"{row['b_p95']:.1f}" if row["b_p95"] == row["b_p95"] else "n/a" + a_p95_s = f"{row['a_p95']:.1f}" if row["a_p95"] == row["a_p95"] else "n/a" + if row["rel_error"] is None: + rel_err_s = "n/a" + else: + rel_err_s = f"{row['rel_error']:.4f}" + 
status_s = {"PASS": "✅ PASS", "WARN": "⚠️ WARN"}.get(row["status"], "❌ FAIL")
+        note_s = f" ({row['note']})" if row["note"] else ""
+        lines.append(
+            f"| {row['id']}{note_s} | {native_mark} | {b_p95_s} | {a_p95_s} "
+            f"| {rel_err_s} | {status_s} |"
+        )
+
+    lines.append("")
+
+    if failures:
+        lines.append("## Failures")
+        lines.append("")
+        for f_msg in failures:
+            lines.append(f"- {f_msg}")
+        lines.append("")
+
+    if warnings:
+        lines.append("## Warnings (informational)")
+        lines.append("")
+        lines.append(
+            "> **Note:** GitHub-hosted runners exhibit significant timing noise. "
+            "Latency numbers are indicative only. For precise benchmarks, register "
+            "a self-hosted runner once asap-tools infra is decoupled from Cloudlab "
+            "(see PDF eval guide Phase 3)."
+        )
+        lines.append("")
+        for w in warnings:
+            lines.append(f"- {w}")
+        lines.append("")
+
+    lines.append("---")
+    lines.append(
+        "_This report was generated automatically by `benchmarks/scripts/compare.py`._"
+    )
+
+    report = "\n".join(lines)
+    print(report)
+
+    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
+    with open(args.output, "w") as f:
+        f.write(report + "\n")
+    print(f"\n[compare] Report saved to {args.output}", file=sys.stderr)
+
+    if overall == "FAIL":
+        print(
+            f"\n[compare] Evaluation FAILED. {len(failures)} check(s) failed:",
+            file=sys.stderr,
+        )
+        for msg in failures:
+            print(f"  - {msg}", file=sys.stderr)
+        sys.exit(1)
+    else:
+        print("\n[compare] Evaluation PASSED.", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/scripts/ingest_wait.sh b/benchmarks/scripts/ingest_wait.sh
new file mode 100644
index 0000000..8b12d3a
--- /dev/null
+++ b/benchmarks/scripts/ingest_wait.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# ingest_wait.sh — waits for the asap-demo Arroyo pipeline to reach RUNNING
+# state, then sleeps to allow sketches to accumulate before verifying that the
+# query engine has ingested data.
+ +ARROYO_URL="http://localhost:5115/api/v1/pipelines" +QE_URL="http://localhost:8088/api/v1/query" +PIPELINE_NAME="asap-demo" +MAX_PIPELINE_WAIT=600 # seconds — Arroyo must compile Rust UDFs; allow extra time +ACCUMULATE_SLEEP=90 # seconds after pipeline is running +SLEEP=5 + +# ── 1. Wait for asap-demo pipeline to reach RUNNING ───────────────────────── +echo "[ingest_wait] Waiting for Arroyo pipeline '${PIPELINE_NAME}' to reach RUNNING state ..." +elapsed=0 +while true; do + state=$(curl -sf --max-time 10 "${ARROYO_URL}" 2>/dev/null \ + | python3 -c " +import sys, json +try: + data = json.load(sys.stdin) + # 'data' key may be null when no pipelines exist; use 'or []' to handle that + pipelines = data if isinstance(data, list) else (data.get('data') or []) + for p in pipelines: + name = str(p.get('name') or p.get('id') or '') + # Normalise hyphens/underscores so 'asap-demo' matches 'asap_demo' + if '${PIPELINE_NAME}'.replace('-', '_') in name.replace('-', '_'): + state = p.get('state') + stop = p.get('stop', '') + # Arroyo signals a running pipeline via state=null and stop='none' + if state is None and stop == 'none': + print('Running') + elif state is not None: + print(str(state)) + else: + print('stopped') + break + else: + # No matching pipeline found yet — print nothing so caller retries + pass +except Exception: + pass +" 2>/dev/null || true) + + if [ "${state}" = "Running" ] || [ "${state}" = "RUNNING" ] || [ "${state}" = "running" ]; then + echo "[ingest_wait] Pipeline '${PIPELINE_NAME}' is RUNNING (${elapsed}s elapsed)" + break + fi + + if [ "${elapsed}" -ge "${MAX_PIPELINE_WAIT}" ]; then + echo "[ingest_wait] ERROR: Pipeline '${PIPELINE_NAME}' did not reach RUNNING within ${MAX_PIPELINE_WAIT}s (last state: '${state:-unknown}')" >&2 + # Dump pipeline list for diagnosis + echo "[ingest_wait] Current Arroyo pipeline list:" >&2 + curl -sf --max-time 10 "${ARROYO_URL}" 2>/dev/null | python3 -m json.tool 2>/dev/null >&2 || true + exit 1 + fi + + echo 
"[ingest_wait] Pipeline state: '${state:-not found}' — retrying in ${SLEEP}s (${elapsed}s elapsed) ..." + sleep "${SLEEP}" + elapsed=$(( elapsed + SLEEP )) +done + +# ── 2. Allow sketches to accumulate ───────────────────────────────────────── +echo "[ingest_wait] Pipeline running. Sleeping ${ACCUMULATE_SLEEP}s for sketches to accumulate ..." +sleep "${ACCUMULATE_SLEEP}" + +# ── 3. Verify query engine has data ───────────────────────────────────────── +echo "[ingest_wait] Verifying query engine has data ..." +response=$(curl -sf --max-time 10 \ + "${QE_URL}?query=avg%28sensor_reading%29" 2>/dev/null || true) + +if [ -z "${response}" ]; then + echo "[ingest_wait] ERROR: Query engine returned empty response." >&2 + exit 1 +fi + +result_count=$(echo "${response}" | python3 -c " +import sys, json +data = json.load(sys.stdin) +result = data.get('data', {}).get('result', []) +print(len(result)) +" 2>/dev/null || echo "0") + +if [ "${result_count}" -eq 0 ]; then + echo "[ingest_wait] ERROR: Query engine has no data yet (result array is empty)." >&2 + exit 1 +fi + +echo "[ingest_wait] Query engine has data (${result_count} result entries). Ready for benchmarking." diff --git a/benchmarks/scripts/run_asap.py b/benchmarks/scripts/run_asap.py new file mode 100644 index 0000000..10747cb --- /dev/null +++ b/benchmarks/scripts/run_asap.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +"""run_asap.py — queries ASAPQuery engine for each entry in promql_suite.json. + +Runs each query 3 times to gather latency samples, then writes results to +benchmarks/reports/asap_results.json. 
+ +Usage: + python benchmarks/scripts/run_asap.py \ + [--asap-url URL] \ + [--output FILE] +""" + +import argparse +import json +import os +import time +import urllib.parse +from datetime import datetime, timezone + +import requests + +SUITE_PATH = os.path.join( + os.path.dirname(__file__), "..", "queries", "promql_suite.json" +) +DEFAULT_OUTPUT = os.path.join( + os.path.dirname(__file__), "..", "reports", "asap_results.json" +) +RUNS_PER_QUERY = 3 + + +def query_asap(base_url: str, expr: str, ts: float) -> tuple[dict, float]: + """Issue a single instant query to the ASAP query engine, return (parsed_json, latency_ms).""" + encoded = urllib.parse.quote(expr, safe="") + url = f"{base_url}/api/v1/query?query={encoded}&time={ts}" + t0 = time.monotonic() + resp = requests.get(url, timeout=30) + latency_ms = (time.monotonic() - t0) * 1000.0 + resp.raise_for_status() + return resp.json(), latency_ms + + +def main() -> None: + parser = argparse.ArgumentParser(description="Run ASAP query engine benchmark queries") + parser.add_argument( + "--asap-url", + default="http://localhost:8088", + help="ASAP query engine base URL (default: http://localhost:8088)", + ) + parser.add_argument( + "--output", + default=DEFAULT_OUTPUT, + help="Output JSON file path", + ) + args = parser.parse_args() + + with open(SUITE_PATH) as f: + suite = json.load(f) + + results: dict[str, dict] = {} + now = time.time() + + for q in suite["queries"]: + qid = q["id"] + expr = q["expr"] + asap_native = q.get("asap_native", False) + latencies = [] + last_data = [] + last_error = None + last_status = "success" + + print(f"[asap] Running query '{qid}' (native={asap_native}): {expr}") + for run in range(1, RUNS_PER_QUERY + 1): + try: + payload, lat = query_asap(args.asap_url, expr, now) + latencies.append(lat) + if payload.get("status") == "success": + last_data = payload.get("data", {}).get("result", []) + last_status = "success" + last_error = None + else: + last_status = "error" + last_error = 
payload.get("error", "unknown error") + last_data = [] + print(f" run {run}/{RUNS_PER_QUERY}: {lat:.1f} ms status={last_status}") + except Exception as exc: # noqa: BLE001 + last_status = "error" + last_error = str(exc) + last_data = [] + latencies.append(None) + print(f" run {run}/{RUNS_PER_QUERY}: ERROR — {exc}") + + results[qid] = { + "status": last_status, + "asap_native": asap_native, + "latencies_ms": latencies, + "data": last_data, + "error": last_error, + } + + output = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "asap_url": args.asap_url, + "results": results, + } + + os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True) + with open(args.output, "w") as f: + json.dump(output, f, indent=2) + print(f"\n[asap] Results saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/scripts/run_baseline.py b/benchmarks/scripts/run_baseline.py new file mode 100644 index 0000000..6e09e41 --- /dev/null +++ b/benchmarks/scripts/run_baseline.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +"""run_baseline.py — queries Prometheus for each entry in promql_suite.json. + +Runs each query 3 times to gather latency samples, then writes results to +benchmarks/reports/baseline_results.json. 
+ +Usage: + python benchmarks/scripts/run_baseline.py \ + [--prometheus-url URL] \ + [--output FILE] +""" + +import argparse +import json +import os +import time +import urllib.parse +from datetime import datetime, timezone + +import requests + +SUITE_PATH = os.path.join( + os.path.dirname(__file__), "..", "queries", "promql_suite.json" +) +DEFAULT_OUTPUT = os.path.join( + os.path.dirname(__file__), "..", "reports", "baseline_results.json" +) +RUNS_PER_QUERY = 3 + + +def query_prometheus(base_url: str, expr: str, ts: float) -> tuple[dict, float]: + """Issue a single instant query, return (parsed_json, latency_ms).""" + encoded = urllib.parse.quote(expr, safe="") + url = f"{base_url}/api/v1/query?query={encoded}&time={ts}" + t0 = time.monotonic() + resp = requests.get(url, timeout=30) + latency_ms = (time.monotonic() - t0) * 1000.0 + resp.raise_for_status() + return resp.json(), latency_ms + + +def main() -> None: + parser = argparse.ArgumentParser(description="Run baseline Prometheus queries") + parser.add_argument( + "--prometheus-url", + default="http://localhost:9090", + help="Prometheus base URL (default: http://localhost:9090)", + ) + parser.add_argument( + "--output", + default=DEFAULT_OUTPUT, + help="Output JSON file path", + ) + args = parser.parse_args() + + with open(SUITE_PATH) as f: + suite = json.load(f) + + results: dict[str, dict] = {} + now = time.time() + + for q in suite["queries"]: + qid = q["id"] + expr = q["expr"] + latencies = [] + last_data = [] + last_error = None + last_status = "success" + + print(f"[baseline] Running query '{qid}': {expr}") + for run in range(1, RUNS_PER_QUERY + 1): + try: + payload, lat = query_prometheus(args.prometheus_url, expr, now) + latencies.append(lat) + if payload.get("status") == "success": + last_data = payload.get("data", {}).get("result", []) + last_status = "success" + last_error = None + else: + last_status = "error" + last_error = payload.get("error", "unknown error") + last_data = [] + print(f" run 
{run}/{RUNS_PER_QUERY}: {lat:.1f} ms status={last_status}") + except Exception as exc: # noqa: BLE001 + last_status = "error" + last_error = str(exc) + last_data = [] + latencies.append(None) + print(f" run {run}/{RUNS_PER_QUERY}: ERROR — {exc}") + + results[qid] = { + "status": last_status, + "latencies_ms": latencies, + "data": last_data, + "error": last_error, + } + + output = { + "timestamp": datetime.now(timezone.utc).isoformat(), + "prometheus_url": args.prometheus_url, + "results": results, + } + + os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True) + with open(args.output, "w") as f: + json.dump(output, f, indent=2) + print(f"\n[baseline] Results saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/scripts/wait_for_stack.sh b/benchmarks/scripts/wait_for_stack.sh new file mode 100644 index 0000000..672aa6d --- /dev/null +++ b/benchmarks/scripts/wait_for_stack.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +set -euo pipefail + +# wait_for_stack.sh — polls each service until healthy or times out. +# Used in CI after `docker compose up -d --build`. + +MAX_WAIT=180 # seconds per service +SLEEP=5 + +wait_for_url() { + local name="$1" + local url="$2" + local elapsed=0 + + echo "[wait_for_stack] Waiting for ${name} at ${url} ..." + while true; do + if curl -sf --max-time 5 "${url}" > /dev/null 2>&1; then + echo "[wait_for_stack] ${name} is healthy (${elapsed}s elapsed)" + return 0 + fi + if [ "${elapsed}" -ge "${MAX_WAIT}" ]; then + echo "[wait_for_stack] ERROR: ${name} did not become healthy within ${MAX_WAIT}s" >&2 + return 1 + fi + sleep "${SLEEP}" + elapsed=$(( elapsed + SLEEP )) + done +} + +wait_for_url "Prometheus" "http://localhost:9090/-/healthy" +wait_for_url "Arroyo API" "http://localhost:5115/api/v1/pipelines" +wait_for_url "QueryEngine" "http://localhost:8088/api/v1/query?query=vector(1)" + +echo "[wait_for_stack] All services are healthy."