diff --git a/asap-query-engine/Cargo.toml b/asap-query-engine/Cargo.toml
index 6c031ae..a5fca27 100644
--- a/asap-query-engine/Cargo.toml
+++ b/asap-query-engine/Cargo.toml
@@ -59,6 +59,14 @@ tracing-appender = "0.2"
 elastic_dsl_utilities.workspace = true
 sketchlib-rust = { git = "https://github.com/ProjectASAP/sketchlib-rust", rev = "440427438fdaf3ac2298b53ee148f9e12a64ffcc" }
 
+[[bin]]
+name = "precompute_engine"
+path = "src/bin/precompute_engine.rs"
+
+[[bin]]
+name = "test_e2e_precompute"
+path = "src/bin/test_e2e_precompute.rs"
+
 [dev-dependencies]
 ctor = "0.2"
 tempfile = "3.20.0"
diff --git a/asap-query-engine/Dockerfile b/asap-query-engine/Dockerfile
index d4c2f5a..890a065 100644
--- a/asap-query-engine/Dockerfile
+++ b/asap-query-engine/Dockerfile
@@ -25,10 +25,14 @@ COPY asap-query-engine/Cargo.toml ./asap-query-engine/
 COPY asap-planner-rs/Cargo.toml ./asap-planner-rs/
 
 # Create dummy source files so Cargo can resolve all workspace members
-RUN mkdir -p asap-query-engine/src && echo "fn main() {}" > asap-query-engine/src/main.rs && \
-    mkdir -p asap-query-engine/benches && echo "fn main() {}" > asap-query-engine/benches/simple_store_bench.rs && \
-    mkdir -p asap-planner-rs/src && echo "fn main() {}" > asap-planner-rs/src/main.rs && \
-    echo "pub fn placeholder() {}" >> asap-planner-rs/src/lib.rs
+# All explicit [[bin]] targets in Cargo.toml must have stubs here for the dependency cache layer
+RUN mkdir -p asap-query-engine/src/bin \
+    && echo "fn main() {}" > asap-query-engine/src/main.rs \
+    && echo "fn main() {}" > asap-query-engine/src/bin/precompute_engine.rs \
+    && echo "fn main() {}" > asap-query-engine/src/bin/test_e2e_precompute.rs \
+    && mkdir -p asap-query-engine/benches && echo "fn main() {}" > asap-query-engine/benches/simple_store_bench.rs \
+    && mkdir -p asap-planner-rs/src && echo "fn main() {}" > asap-planner-rs/src/main.rs \
+    && echo "pub fn placeholder() {}" >> asap-planner-rs/src/lib.rs
 
 # Build dependencies (this layer will be cached)
 WORKDIR /code/asap-query-engine
diff --git a/asap-query-engine/src/bin/precompute_engine.rs b/asap-query-engine/src/bin/precompute_engine.rs
new file mode 100644
index 0000000..e0ec1fe
--- /dev/null
+++ b/asap-query-engine/src/bin/precompute_engine.rs
@@ -0,0 +1,151 @@
+use clap::Parser;
+use query_engine_rust::data_model::QueryLanguage;
+use query_engine_rust::data_model::{
+    CleanupPolicy, InferenceConfig, LockStrategy, StreamingConfig,
+};
+use query_engine_rust::drivers::query::adapters::AdapterConfig;
+use query_engine_rust::engines::SimpleEngine;
+use query_engine_rust::precompute_engine::config::PrecomputeEngineConfig;
+use query_engine_rust::precompute_engine::output_sink::{RawPassthroughSink, StoreOutputSink};
+use query_engine_rust::precompute_engine::PrecomputeEngine;
+use query_engine_rust::stores::SimpleMapStore;
+use query_engine_rust::{HttpServer, HttpServerConfig};
+use std::sync::Arc;
+use tracing::info;
+use tracing_subscriber::fmt::format::FmtSpan;
+
+#[derive(Parser, Debug)]
+#[command(name = "precompute_engine")]
+#[command(about = "Standalone precompute engine for SketchDB")]
+struct Args { // `///` comments below are also what clap shows as --help text
+    /// Path to streaming config YAML file
+    #[arg(long)]
+    streaming_config: String,
+
+    /// Port for Prometheus remote write ingest
+    #[arg(long, default_value_t = 9090)]
+    ingest_port: u16,
+
+    /// Number of worker threads
+    #[arg(long, default_value_t = 4)]
+    num_workers: usize,
+
+    /// Maximum allowed lateness for out-of-order samples (ms)
+    #[arg(long, default_value_t = 5000)]
+    allowed_lateness_ms: i64,
+
+    /// Maximum buffered samples per series
+    #[arg(long, default_value_t = 10000)]
+    max_buffer_per_series: usize,
+
+    /// Flush interval for idle window detection (ms)
+    #[arg(long, default_value_t = 1000)]
+    flush_interval_ms: u64,
+
+    /// MPSC channel buffer size per worker
+    #[arg(long, default_value_t = 10000)]
+    channel_buffer_size: usize,
+
+    /// Port for the query HTTP server (0 to disable)
+    #[arg(long, default_value_t = 8080)]
+    query_port: u16, // main() only starts the query server when this is > 0
+
+    /// Lock strategy for the store
+    #[arg(long, value_enum, default_value_t = LockStrategy::PerKey)]
+    lock_strategy: LockStrategy, // handed to SimpleMapStore::new_with_strategy
+
+    /// Skip aggregation and pass each raw sample directly to the store
+    #[arg(long, default_value_t = false)]
+    pass_raw_samples: bool, // also makes main() pick RawPassthroughSink
+
+    /// Aggregation ID to stamp on each raw-mode output
+    #[arg(long, default_value_t = 0)]
+    raw_mode_aggregation_id: u64,
+}
+
+/// Wires up the store, the optional query server and the precompute engine, then runs it.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    // Log filter comes from the environment when available, else defaults to `info`.
+    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
+        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
+    tracing_subscriber::fmt()
+        .with_env_filter(env_filter)
+        .with_span_events(FmtSpan::CLOSE)
+        .init();
+
+    let args = Args::parse();
+
+    info!("Loading streaming config from: {}", args.streaming_config);
+    let loaded_config = StreamingConfig::from_yaml_file(&args.streaming_config)?;
+    let streaming_config = Arc::new(loaded_config);
+    info!(
+        "Loaded {} aggregation configs",
+        streaming_config.get_all_aggregation_configs().len()
+    );
+
+    // One store instance is shared by the query server and the output sink.
+    let map_store = SimpleMapStore::new_with_strategy(
+        streaming_config.clone(),
+        CleanupPolicy::CircularBuffer,
+        args.lock_strategy,
+    );
+    let store: Arc<dyn query_engine_rust::stores::Store> = Arc::new(map_store);
+
+    // A query port of 0 means "no query server".
+    if args.query_port != 0 {
+        let inference_config =
+            InferenceConfig::new(QueryLanguage::promql, CleanupPolicy::CircularBuffer);
+        let query_engine = Arc::new(SimpleEngine::new(
+            store.clone(),
+            inference_config,
+            streaming_config.clone(),
+            15, // default prometheus scrape interval
+            QueryLanguage::promql,
+        ));
+        let adapter_config = AdapterConfig {
+            protocol: query_engine_rust::data_model::QueryProtocol::PrometheusHttp,
+            language: QueryLanguage::promql,
+            fallback: None,
+        };
+        let http_config = HttpServerConfig {
+            port: args.query_port,
+            handle_http_requests: true,
+            adapter_config,
+        };
+        let http_server = HttpServer::new(http_config, query_engine, store.clone());
+        // The task is detached: a server error is only logged and does not stop ingestion.
+        tokio::spawn(async move {
+            if let Err(e) = http_server.run().await {
+                tracing::error!("Query server error: {}", e);
+            }
+        });
+        info!("Query server started on port {}", args.query_port);
+    }
+
+    // CLI flags map one-to-one onto the engine configuration.
+    let engine_config = PrecomputeEngineConfig {
+        num_workers: args.num_workers,
+        ingest_port: args.ingest_port,
+        allowed_lateness_ms: args.allowed_lateness_ms,
+        max_buffer_per_series: args.max_buffer_per_series,
+        flush_interval_ms: args.flush_interval_ms,
+        channel_buffer_size: args.channel_buffer_size,
+        pass_raw_samples: args.pass_raw_samples,
+        raw_mode_aggregation_id: args.raw_mode_aggregation_id,
+    };
+
+    // Both sink flavours are built on the shared store; raw mode skips aggregation.
+    let output_sink: Arc<dyn query_engine_rust::precompute_engine::output_sink::OutputSink> =
+        if args.pass_raw_samples {
+            Arc::new(RawPassthroughSink::new(store))
+        } else {
+            Arc::new(StoreOutputSink::new(store))
+        };
+
+    let engine = PrecomputeEngine::new(engine_config, streaming_config, output_sink);
+    info!("Starting precompute engine...");
+    // Runs the engine on this task; its error, if any, becomes the exit result.
+    engine.run().await?;
+    Ok(())
+}
diff --git a/asap-query-engine/src/bin/test_e2e_precompute.rs b/asap-query-engine/src/bin/test_e2e_precompute.rs
new file mode 100644
index 0000000..a8235d0
--- /dev/null
+++ b/asap-query-engine/src/bin/test_e2e_precompute.rs
@@ -0,0 +1,347 @@
+//! End-to-end test for the standalone precompute_engine binary.
+//!
+//! This binary:
+//! 1. Starts a PrecomputeEngine in-process (same as the precompute_engine binary)
+//! 2. Sends Prometheus remote write samples via HTTP
+//! 3. Queries the PromQL endpoint and prints results
+//!
+//! Usage:
+//!   cargo run --bin test_e2e_precompute
+
+use prost::Message;
+use query_engine_rust::data_model::{LockStrategy, QueryLanguage};
+use query_engine_rust::drivers::ingest::prometheus_remote_write::{
+    Label, Sample, TimeSeries, WriteRequest,
+};
+use query_engine_rust::drivers::query::adapters::AdapterConfig;
+use query_engine_rust::engines::SimpleEngine;
+use query_engine_rust::precompute_engine::config::PrecomputeEngineConfig;
+use query_engine_rust::precompute_engine::output_sink::{RawPassthroughSink, StoreOutputSink};
+use query_engine_rust::precompute_engine::PrecomputeEngine;
+use query_engine_rust::stores::SimpleMapStore;
+use query_engine_rust::utils::file_io::{read_inference_config, read_streaming_config};
+use query_engine_rust::{HttpServer, HttpServerConfig};
+use std::sync::Arc;
+
+const INGEST_PORT: u16 = 19090;
+const QUERY_PORT: u16 = 18080;
+const RAW_INGEST_PORT: u16 = 19091;
+const SCRAPE_INTERVAL: u64 = 1; // 1 second to match tumblingWindowSize
+
+/// Protobuf-encodes `timeseries` as a `WriteRequest` and snappy-compresses it.
+fn build_remote_write_body(timeseries: Vec<TimeSeries>) -> Vec<u8> {
+    let encoded = WriteRequest { timeseries }.encode_to_vec();
+    let mut compressor = snap::raw::Encoder::new();
+    let compressed = compressor.compress_vec(&encoded);
+    compressed.expect("snappy compress failed")
+}
+
+/// Builds a [`TimeSeries`] holding exactly one sample of `metric`.
+///
+/// Only the `label_0` value is caller-controlled; the `instance`, `job` and
+/// `label_1` labels always carry the same fixed values.
+fn make_sample(metric: &str, label_0: &str, timestamp_ms: i64, value: f64) -> TimeSeries {
+    // Kept in the same order as they end up in the series' label list.
+    let label_pairs = [
+        ("__name__", metric),
+        ("instance", "i1"),
+        ("job", "test"),
+        ("label_0", label_0),
+        ("label_1", "v1"),
+    ];
+
+    let labels = label_pairs
+        .iter()
+        .map(|&(key, val)| Label {
+            name: key.into(),
+            value: val.into(),
+        })
+        .collect();
+    let sample = Sample {
+        value,
+        timestamp: timestamp_ms,
+    };
+    TimeSeries {
+        labels,
+        samples: vec![sample],
+    }
+}
+
+/// POSTs `series` as a single snappy-compressed remote-write request to the
+/// ingest endpoint listening on `port`, returning the reply's HTTP status code.
+async fn post_series(
+    client: &reqwest::Client,
+    port: u16,
+    series: TimeSeries,
+) -> Result<u16, reqwest::Error> {
+    let response = client
+        .post(format!("http://localhost:{port}/api/v1/write"))
+        .header("Content-Type", "application/x-protobuf")
+        .header("Content-Encoding", "snappy")
+        .body(build_remote_write_body(vec![series]))
+        .send()
+        .await?;
+    Ok(response.status().as_u16())
+}
+
+/// GETs `path` from the query server, passing `params` as the URL query string,
+/// and returns the response body as text.
+async fn get_query_text(
+    client: &reqwest::Client,
+    path: &str,
+    params: &[(&str, &str)],
+) -> Result<String, reqwest::Error> {
+    let response = client
+        .get(format!("http://localhost:{QUERY_PORT}{path}"))
+        .query(params)
+        .send()
+        .await?;
+    response.text().await
+}
+
+/// Runs the whole scenario in one process:
+///
+/// 1. starts the query server and an aggregating precompute engine on a shared store,
+/// 2. ingests 20 one-second windows of samples and issues PromQL queries,
+/// 3. starts a raw-mode engine on the same store and asserts that its samples
+///    are visible through a direct store query.
+///
+/// PromQL responses are only printed; the raw-mode store check is the sole assertion.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+    let env_filter = tracing_subscriber::EnvFilter::try_from_default_env()
+        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info"));
+    tracing_subscriber::fmt()
+        .with_env_filter(env_filter)
+        .with_span_events(tracing_subscriber::fmt::format::FmtSpan::CLOSE)
+        .init();
+
+    // Configs come from the example YAML files, via the loaders main.rs also imports.
+    let inference_config = read_inference_config(
+        "examples/promql/inference_config.yaml",
+        QueryLanguage::promql,
+    )?;
+    println!(
+        "Loaded inference config with {} query configs",
+        inference_config.query_configs.len()
+    );
+    inference_config
+        .query_configs
+        .iter()
+        .for_each(|qc| println!("  Query: '{}' -> {:?}", qc.query, qc.aggregations));
+
+    let cleanup_policy = inference_config.cleanup_policy;
+    let loaded_streaming = read_streaming_config(
+        "examples/promql/streaming_config.yaml",
+        &inference_config,
+    )?;
+    let streaming_config = Arc::new(loaded_streaming);
+    println!(
+        "Loaded streaming config with {} aggregation configs",
+        streaming_config.get_all_aggregation_configs().len()
+    );
+
+    println!("\n=== Starting precompute engine (ingest={INGEST_PORT}, query={QUERY_PORT}) ===");
+
+    // One store shared by the query server, both engines and the final check.
+    let map_store = SimpleMapStore::new_with_strategy(
+        streaming_config.clone(),
+        cleanup_policy,
+        LockStrategy::PerKey,
+    );
+    let store: Arc<dyn query_engine_rust::stores::Store> = Arc::new(map_store);
+
+    // Query server: PromQL over the Prometheus HTTP protocol on QUERY_PORT.
+    let query_engine = Arc::new(SimpleEngine::new(
+        store.clone(),
+        inference_config,
+        streaming_config.clone(),
+        SCRAPE_INTERVAL,
+        QueryLanguage::promql,
+    ));
+    let adapter_config = AdapterConfig {
+        protocol: query_engine_rust::data_model::QueryProtocol::PrometheusHttp,
+        language: QueryLanguage::promql,
+        fallback: None,
+    };
+    let http_config = HttpServerConfig {
+        port: QUERY_PORT,
+        handle_http_requests: true,
+        adapter_config,
+    };
+    let http_server = HttpServer::new(http_config, query_engine, store.clone());
+    tokio::spawn(async move {
+        if let Err(e) = http_server.run().await {
+            eprintln!("Query server error: {e}");
+        }
+    });
+
+    // Aggregating engine: built on the shared store through StoreOutputSink.
+    let engine_config = PrecomputeEngineConfig {
+        num_workers: 2,
+        ingest_port: INGEST_PORT,
+        allowed_lateness_ms: 5000,
+        max_buffer_per_series: 10000,
+        flush_interval_ms: 200,
+        channel_buffer_size: 10000,
+        pass_raw_samples: false,
+        raw_mode_aggregation_id: 0,
+    };
+    let output_sink = Arc::new(StoreOutputSink::new(store.clone()));
+    let engine = PrecomputeEngine::new(engine_config, streaming_config.clone(), output_sink);
+    tokio::spawn(async move {
+        if let Err(e) = engine.run().await {
+            eprintln!("Precompute engine error: {e}");
+        }
+    });
+
+    // Give both spawned servers a moment to bind their ports.
+    tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
+
+    let client = reqwest::Client::new();
+
+    // -----------------------------------------------------------------------
+    // Send samples across multiple 1-second tumbling windows.
+    // tumblingWindowSize=1 means windows are [0,1000), [1000,2000), etc.
+    // We need enough windows of data so the query engine can find results.
+    // -----------------------------------------------------------------------
+    println!("\n=== Sending remote write samples ===");
+
+    // 20 windows, one mid-window sample each: t = 500ms, 1500ms, ..., 19500ms.
+    for window in 0..20i64 {
+        let ts = window * 1000 + 500;
+        let val = 10.0 + window as f64;
+        let sample = make_sample("fake_metric", "groupA", ts, val);
+        let status = post_series(&client, INGEST_PORT, sample).await?;
+        println!("  Sent t={ts}ms v={val} -> HTTP {status}");
+    }
+
+    // A sample well past the last window advances the watermark to close them all.
+    println!("\n=== Advancing watermark to close all windows ===");
+    let watermark_sample = make_sample("fake_metric", "groupA", 25000, 0.0);
+    let status = post_series(&client, INGEST_PORT, watermark_sample).await?;
+    println!("  Sent t=25000ms v=0 -> HTTP {status}");
+
+    // Wait for flush + processing
+    println!("\n  Waiting for flush...");
+    tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
+
+    // -----------------------------------------------------------------------
+    // Query the PromQL endpoint
+    // The inference_config has: "quantile by (label_0) (0.99, fake_metric)"
+    // which maps to aggregation_id 1.
+    // -----------------------------------------------------------------------
+    println!("\n=== Querying PromQL endpoint ===");
+
+    // (query, evaluation time, label); the first two use the exact configured query.
+    const CONFIGURED_QUERY: &str = "quantile by (label_0) (0.99, fake_metric)";
+    let queries_instant = [
+        (CONFIGURED_QUERY, "10", "Configured query at t=10"),
+        (CONFIGURED_QUERY, "15", "Configured query at t=15"),
+        (
+            "sum_over_time(fake_metric[1s])",
+            "10",
+            "Temporal: sum_over_time at t=10",
+        ),
+        ("sum(fake_metric)", "10", "Spatial: sum at t=10"),
+    ];
+
+    for (query, time, label) in queries_instant {
+        println!("\n--- Instant query: {label} ---");
+        let params = [("query", query), ("time", time)];
+        let body = get_query_text(&client, "/api/v1/query", &params).await?;
+        print_json(&body);
+    }
+
+    // Range query over the configured query.
+    println!("\n--- Range query: quantile by (label_0) (0.99, fake_metric) t=5..20 step=1 ---");
+    let range_params = [
+        ("query", CONFIGURED_QUERY),
+        ("start", "5"),
+        ("end", "20"),
+        ("step", "1"),
+    ];
+    let body = get_query_text(&client, "/api/v1/query_range", &range_params).await?;
+    print_json(&body);
+
+    // Runtime info (sent without any query parameters).
+    println!("\n--- Runtime info ---");
+    let runtime_info = client
+        .get(format!(
+            "http://localhost:{QUERY_PORT}/api/v1/status/runtimeinfo"
+        ))
+        .send()
+        .await?
+        .text()
+        .await?;
+    print_json(&runtime_info);
+
+    // -----------------------------------------------------------------------
+    // RAW MODE TEST
+    // -----------------------------------------------------------------------
+    println!("\n=== Starting raw-mode precompute engine (ingest={RAW_INGEST_PORT}) ===");
+
+    // The raw engine reuses the same store so we can query results directly.
+    // Pick aggregation_id = 1 to match the existing streaming config.
+    let raw_agg_id: u64 = 1;
+    let raw_engine_config = PrecomputeEngineConfig {
+        num_workers: 1,
+        ingest_port: RAW_INGEST_PORT,
+        allowed_lateness_ms: 5000,
+        max_buffer_per_series: 10000,
+        flush_interval_ms: 200,
+        channel_buffer_size: 10000,
+        pass_raw_samples: true,
+        raw_mode_aggregation_id: raw_agg_id,
+    };
+    let raw_sink = Arc::new(RawPassthroughSink::new(store.clone()));
+    let raw_engine = PrecomputeEngine::new(raw_engine_config, streaming_config.clone(), raw_sink);
+    tokio::spawn(async move {
+        if let Err(e) = raw_engine.run().await {
+            eprintln!("Raw precompute engine error: {e}");
+        }
+    });
+
+    // Give the raw-mode ingest server a moment to bind.
+    tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
+
+    // Send a few raw samples — no need to advance watermark.
+    println!("\n=== Sending raw-mode samples ===");
+    let raw_samples = [(100_000i64, 42.0f64), (101_000, 43.0), (102_000, 44.0)];
+    for (ts, val) in raw_samples {
+        let sample = make_sample("fake_metric", "groupA", ts, val);
+        let status = post_series(&client, RAW_INGEST_PORT, sample).await?;
+        println!("  Sent raw t={ts}ms v={val} -> HTTP {status}");
+    }
+
+    // Short wait for processing (no watermark advancement needed)
+    tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
+
+    // Verify through a direct store query that the raw samples arrived.
+    println!("\n=== Verifying raw samples in store ===");
+    let results = store.query_precomputed_output("fake_metric", raw_agg_id, 100_000, 103_000)?;
+    let total_buckets = results.values().fold(0usize, |sum, v| sum + v.len());
+    println!("  Found {total_buckets} buckets for aggregation_id={raw_agg_id} in [100000, 103000)");
+    assert!(
+        total_buckets >= 3,
+        "Expected at least 3 raw samples in store, got {total_buckets}"
+    );
+
+    for (key, buckets) in &results {
+        for ((start, end), _acc) in buckets {
+            println!("    key={key:?} start={start} end={end}");
+        }
+    }
+    println!("  Raw mode test PASSED");
+
+    println!("\n=== E2E test complete ===");
+
+    Ok(())
+}
+
+/// Pretty-prints `s` when it parses as JSON; otherwise prints it unchanged.
+fn print_json(s: &str) {
+    let pretty = serde_json::from_str::<serde_json::Value>(s)
+        .map(|parsed| serde_json::to_string_pretty(&parsed).unwrap());
+    println!("{}", pretty.as_deref().unwrap_or(s));
+}
diff --git a/asap-query-engine/src/drivers/ingest/prometheus_remote_write.rs b/asap-query-engine/src/drivers/ingest/prometheus_remote_write.rs
index 428c9de..dd45029 100644
--- a/asap-query-engine/src/drivers/ingest/prometheus_remote_write.rs
+++ b/asap-query-engine/src/drivers/ingest/prometheus_remote_write.rs
@@ -1,5 +1,6 @@
+use prost::Message;
+
 // use axum::{body::Bytes, extract::State, http::StatusCode, routing::post, Router};
-// use prost::Message;
 // use std::sync::Arc;
 // use tokio::net::TcpListener;
 // use tracing::{debug, error, info, warn};
@@ -7,137 +8,137 @@
 // // use crate::stores::promsketch_store::metrics as ps_metrics;
 // // use crate::stores::promsketch_store::PromSketchStore;
 
-// // ---------------------------------------------------------------------------
-// // Protobuf message types (Prometheus remote write wire format)
-// // ---------------------------------------------------------------------------
-// // These mirror the upstream proto definitions in prometheus/prompb but are
-// // defined inline via prost derive macros so we don't need a .proto file or
-// // build script.
-
-// #[derive(Clone, PartialEq, Message)]
-// pub struct WriteRequest {
-//     #[prost(message, repeated, tag = "1")]
-//     pub timeseries: Vec<TimeSeries>,
-// }
-
-// #[derive(Clone, PartialEq, Message)]
-// pub struct TimeSeries {
-//     #[prost(message, repeated, tag = "1")]
-//     pub labels: Vec<Label>,
-//     #[prost(message, repeated, tag = "2")]
-//     pub samples: Vec<Sample>,
-// }
-
-// #[derive(Clone, PartialEq, Message)]
-// pub struct Label {
-//     #[prost(string, tag = "1")]
-//     pub name: String,
-//     #[prost(string, tag = "2")]
-//     pub value: String,
-// }
-
-// #[derive(Clone, PartialEq, Message)]
-// pub struct Sample {
-//     #[prost(double, tag = "1")]
-//     pub value: f64,
-//     #[prost(int64, tag = "2")]
-//     pub timestamp: i64,
-// }
-
-// // ---------------------------------------------------------------------------
-// // Label helpers
-// // ---------------------------------------------------------------------------
-
-// /// Convert a slice of Prometheus [`Label`] pairs into the canonical
-// /// `metric_name{key1="val1",key2="val2"}` string format.
-// ///
-// /// The `__name__` label becomes the metric name prefix; the remaining labels
-// /// are sorted alphabetically by name.
-// pub fn labels_to_string(labels: &[Label]) -> String {
-//     let mut name: Option<&str> = None;
-//     let mut rest: Vec<(&str, &str)> = Vec::new();
-
-//     for l in labels {
-//         if l.name == "__name__" {
-//             name = Some(&l.value);
-//         } else {
-//             rest.push((&l.name, &l.value));
-//         }
-//     }
-
-//     rest.sort_by(|a, b| a.0.cmp(b.0));
-
-//     let metric = name.unwrap_or("");
-
-//     if rest.is_empty() {
-//         return metric.to_string();
-//     }
-
-//     let mut out = String::with_capacity(metric.len() + 2 + rest.len() * 16);
-//     out.push_str(metric);
-//     out.push('{');
-//     for (i, (k, v)) in rest.iter().enumerate() {
-//         if i > 0 {
-//             out.push(',');
-//         }
-//         out.push_str(k);
-//         out.push_str("=\"");
-//         out.push_str(v);
-//         out.push('"');
-//     }
-//     out.push('}');
-//     out
-// }
-
-// // ---------------------------------------------------------------------------
-// // Decoded sample — the output of this driver
-// // ---------------------------------------------------------------------------
-
-// /// A single decoded sample ready for downstream consumption.
-// #[derive(Debug, Clone)]
-// pub struct DecodedSample {
-//     pub labels: String,
-//     pub timestamp_ms: i64,
-//     pub value: f64,
-// }
-
-// // ---------------------------------------------------------------------------
-// // Decode helpers
-// // ---------------------------------------------------------------------------
-
-// /// Snappy-decompress and protobuf-decode a raw Prometheus remote write body
-// /// into a flat list of [`DecodedSample`]s.
-// pub fn decode_prometheus_remote_write(
-//     body: &[u8],
-// ) -> Result<Vec<DecodedSample>, PrometheusRemoteWriteError> {
-//     let decompressed = snap::raw::Decoder::new()
-//         .decompress_vec(body)
-//         .map_err(|e| PrometheusRemoteWriteError::SnappyDecompress(e.to_string()))?;
-
-//     let write_req = WriteRequest::decode(decompressed.as_slice())
-//         .map_err(|e| PrometheusRemoteWriteError::ProtobufDecode(e.to_string()))?;
-
-//     let mut samples = Vec::new();
-//     for ts in &write_req.timeseries {
-//         let labels_str = labels_to_string(&ts.labels);
-//         for s in &ts.samples {
-//             samples.push(DecodedSample {
-//                 labels: labels_str.clone(),
-//                 timestamp_ms: s.timestamp,
-//                 value: s.value,
-//             });
-//         }
-//     }
-//     Ok(samples)
-// }
-
-// #[derive(Debug, thiserror::Error)]
-// pub enum PrometheusRemoteWriteError {
-//     #[error("snappy decompression failed: {0}")]
-//     SnappyDecompress(String),
-//     #[error("protobuf decode failed: {0}")]
-//     ProtobufDecode(String),
-// }
+// ---------------------------------------------------------------------------
+// Protobuf message types (Prometheus remote write wire format)
+// ---------------------------------------------------------------------------
+// These mirror the upstream proto definitions in prometheus/prompb but are
+// defined inline via prost derive macros so we don't need a .proto file or
+// build script.
+
+#[derive(Clone, PartialEq, Message)]
+pub struct WriteRequest {
+    #[prost(message, repeated, tag = "1")]
+    pub timeseries: Vec<TimeSeries>, // all series carried by one write request
+}
+
+#[derive(Clone, PartialEq, Message)]
+pub struct TimeSeries {
+    #[prost(message, repeated, tag = "1")]
+    pub labels: Vec<Label>, // the metric name travels as the `__name__` label
+    #[prost(message, repeated, tag = "2")]
+    pub samples: Vec<Sample>, // samples belonging to this label set
+}
+
+#[derive(Clone, PartialEq, Message)]
+pub struct Label {
+    #[prost(string, tag = "1")]
+    pub name: String, // "__name__" is treated as the metric name by labels_to_string
+    #[prost(string, tag = "2")]
+    pub value: String,
+}
+
+#[derive(Clone, PartialEq, Message)]
+pub struct Sample {
+    #[prost(double, tag = "1")]
+    pub value: f64,
+    #[prost(int64, tag = "2")]
+    pub timestamp: i64, // surfaced downstream as `DecodedSample::timestamp_ms`
+}
+
+// ---------------------------------------------------------------------------
+// Label helpers
+// ---------------------------------------------------------------------------
+
+/// Render Prometheus [`Label`] pairs as the canonical
+/// `metric_name{key1="val1",key2="val2"}` string.
+///
+/// The value of the `__name__` label (the last one, if it occurs more than
+/// once) is used as the metric-name prefix, or the empty string when absent.
+/// All other labels follow, ordered by name with a stable sort. When there are
+/// no other labels the bare metric name is returned without braces. Label
+/// values are emitted verbatim, i.e. without any escaping.
+pub fn labels_to_string(labels: &[Label]) -> String {
+    let metric = labels
+        .iter()
+        .rev()
+        .find(|l| l.name == "__name__")
+        .map_or("", |l| l.value.as_str());
+
+    let mut pairs: Vec<(&str, &str)> = labels
+        .iter()
+        .filter(|l| l.name != "__name__")
+        .map(|l| (l.name.as_str(), l.value.as_str()))
+        .collect();
+    if pairs.is_empty() {
+        return metric.to_string();
+    }
+    pairs.sort_by_key(|&(key, _)| key);
+
+    let mut rendered = String::with_capacity(metric.len() + 2 + pairs.len() * 16);
+    rendered.push_str(metric);
+    rendered.push('{');
+    let mut separator = "";
+    for (key, val) in pairs {
+        rendered.push_str(separator);
+        rendered.push_str(key);
+        rendered.push_str("=\"");
+        rendered.push_str(val);
+        rendered.push('"');
+        separator = ",";
+    }
+    rendered.push('}');
+    rendered
+}
+
+// ---------------------------------------------------------------------------
+// Decoded sample — the output of this driver
+// ---------------------------------------------------------------------------
+
+/// A single decoded sample ready for downstream consumption.
+#[derive(Debug, Clone)]
+pub struct DecodedSample {
+    pub labels: String, // series identity as rendered by `labels_to_string`
+    pub timestamp_ms: i64, // copied from `Sample::timestamp`
+    pub value: f64, // copied from `Sample::value`
+}
+
+// ---------------------------------------------------------------------------
+// Decode helpers
+// ---------------------------------------------------------------------------
+
+/// Snappy-decompress (raw format) and protobuf-decode a Prometheus remote write
+/// body, flattening it into one [`DecodedSample`] per sample. A failure in
+/// either stage is reported through the matching
+/// [`PrometheusRemoteWriteError`] variant, carrying the underlying message.
+pub fn decode_prometheus_remote_write(
+    body: &[u8],
+) -> Result<Vec<DecodedSample>, PrometheusRemoteWriteError> {
+    let raw = snap::raw::Decoder::new()
+        .decompress_vec(body)
+        .map_err(|e| PrometheusRemoteWriteError::SnappyDecompress(e.to_string()))?;
+    let request = WriteRequest::decode(raw.as_slice())
+        .map_err(|e| PrometheusRemoteWriteError::ProtobufDecode(e.to_string()))?;
+    Ok(request
+        .timeseries
+        .iter()
+        .flat_map(|series| {
+            let labels = labels_to_string(&series.labels);
+            series.samples.iter().map(move |s| DecodedSample {
+                labels: labels.clone(),
+                timestamp_ms: s.timestamp,
+                value: s.value,
+            })
+        })
+        .collect())
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum PrometheusRemoteWriteError {
+    #[error("snappy decompression failed: {0}")]
+    SnappyDecompress(String), // payload: the snap decoder's error message
+    #[error("protobuf decode failed: {0}")]
+    ProtobufDecode(String), // payload: the prost decoder's error message
+}
 
 // // ---------------------------------------------------------------------------
 // // Config
diff --git a/asap-query-engine/src/lib.rs b/asap-query-engine/src/lib.rs
index e70ad09..a76efeb 100644
--- a/asap-query-engine/src/lib.rs
+++ b/asap-query-engine/src/lib.rs
@@ -14,6 +14,7 @@ fn init_sketch_backend_for_tests() {
 pub mod data_model;
 pub mod drivers;
 pub mod engines;
+pub mod precompute_engine;
 pub mod precompute_operators;
 pub mod stores;
 
@@ -42,6 +43,10 @@ pub use drivers::{
     OtlpReceiverConfig,
 };
 
+pub use precompute_engine::config::PrecomputeEngineConfig;
+pub use precompute_engine::output_sink::StoreOutputSink;
+pub use precompute_engine::PrecomputeEngine;
+
 pub use utils::{normalize_spatial_filter, read_inference_config, read_streaming_config};
 
 pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
diff --git a/asap-query-engine/src/main.rs b/asap-query-engine/src/main.rs
index 00be2fe..dcd953e 100644
--- a/asap-query-engine/src/main.rs
+++ b/asap-query-engine/src/main.rs
@@ -12,7 +12,8 @@ use query_engine_rust::drivers::AdapterConfig;
 use query_engine_rust::utils::file_io::{read_inference_config, read_streaming_config};
 use query_engine_rust::{
     HttpServer, HttpServerConfig, KafkaConsumer, KafkaConsumerConfig, OtlpReceiver,
-    OtlpReceiverConfig, Result, SimpleEngine, SimpleMapStore,
+    OtlpReceiverConfig, PrecomputeEngine, PrecomputeEngineConfig, Result, SimpleEngine,
+    SimpleMapStore, StoreOutputSink,
 };
 
 #[derive(Parser, Debug)]
@@ -102,10 +103,6 @@ struct Args {
     #[arg(long, default_value = "9090")]
     prometheus_remote_write_port: u16,
 
-    /// Automatically initialize all sketch types for newly seen series during ingestion
-    #[arg(long, default_value = "true")]
-    auto_init_sketches: bool,
-
     /// Path to promsketch configuration YAML file (optional; uses defaults if omitted)
     #[arg(long)]
     promsketch_config: Option<String>,
@@ -133,6 +130,26 @@ struct Args {
     /// OTLP HTTP listen port
     #[arg(long, default_value = "4318")]
     otel_http_port: u16,
+
+    /// Number of precompute engine worker threads
+    #[arg(long, default_value = "4")]
+    precompute_num_workers: usize,
+
+    /// Maximum allowed lateness for out-of-order samples (milliseconds)
+    #[arg(long, default_value = "5000")]
+    precompute_allowed_lateness_ms: i64,
+
+    /// Maximum buffered samples per series before eviction
+    #[arg(long, default_value = "10000")]
+    precompute_max_buffer_per_series: usize,
+
+    /// Interval at which the flush timer fires (milliseconds)
+    #[arg(long, default_value = "1000")]
+    precompute_flush_interval_ms: u64,
+
+    /// Capacity of the channel between router and each worker
+    #[arg(long, default_value = "10000")]
+    precompute_channel_buffer_size: usize,
 }
 
 #[tokio::main]
@@ -273,25 +290,33 @@ async fn main() -> Result<()> {
         None
     };
 
-    // Setup Prometheus remote write server
-    // let prometheus_remote_write_handle = if args.enable_prometheus_remote_write {
-    //     let prw_config = PrometheusRemoteWriteConfig {
-    //         port: args.prometheus_remote_write_port,
-    //         auto_init_sketches: args.auto_init_sketches,
-    //     };
-    //     let server = PrometheusRemoteWriteServer::new(prw_config, promsketch_store.clone());
-    //     info!(
-    //         "Starting Prometheus remote write server on port {}",
-    //         args.prometheus_remote_write_port
-    //     );
-    //     Some(tokio::spawn(async move {
-    //         if let Err(e) = server.run().await {
-    //             error!("Prometheus remote write server error: {}", e);
-    //         }
-    //     }))
-    // } else {
-    //     None
-    // };
+    // Setup precompute engine (replaces standalone Prometheus remote write server)
+    let precompute_handle = if args.enable_prometheus_remote_write {
+        let precompute_config = PrecomputeEngineConfig {
+            num_workers: args.precompute_num_workers,
+            ingest_port: args.prometheus_remote_write_port,
+            allowed_lateness_ms: args.precompute_allowed_lateness_ms,
+            max_buffer_per_series: args.precompute_max_buffer_per_series,
+            flush_interval_ms: args.precompute_flush_interval_ms,
+            channel_buffer_size: args.precompute_channel_buffer_size,
+            pass_raw_samples: false,
+            raw_mode_aggregation_id: 0,
+        };
+        let output_sink = Arc::new(StoreOutputSink::new(store.clone()));
+        let engine =
+            PrecomputeEngine::new(precompute_config, streaming_config.clone(), output_sink);
+        info!(
+            "Starting precompute engine on port {}",
+            args.prometheus_remote_write_port
+        );
+        Some(tokio::spawn(async move {
+            if let Err(e) = engine.run().await {
+                error!("Precompute engine error: {}", e);
+            }
+        }))
+    } else {
+        None
+    };
 
     //info!("=== TEMPORARY: Using ClickHouse HTTP adapter ===");
     //info!("ClickHouse endpoint will be available at: /clickhouse/query");
@@ -343,11 +368,11 @@ async fn main() -> Result<()> {
         let _ = handle.await;
     }
 
-    // if let Some(handle) = prometheus_remote_write_handle {
-    //     info!("Shutting down Prometheus remote write server...");
-    //     handle.abort();
-    //     let _ = handle.await;
-    // }
+    if let Some(handle) = precompute_handle {
+        info!("Shutting down precompute engine...");
+        handle.abort();
+        let _ = handle.await;
+    }
 
     info!("Shutdown complete");
     Ok(())
diff --git a/asap-query-engine/src/precompute_engine/accumulator_factory.rs b/asap-query-engine/src/precompute_engine/accumulator_factory.rs
new file mode 100644
index 0000000..7c181e6
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/accumulator_factory.rs
@@ -0,0 +1,707 @@
+use crate::data_model::{AggregateCore, KeyByLabelValues, Measurement};
+use crate::precompute_operators::{
+    CountMinSketchAccumulator, DatasketchesKLLAccumulator, HydraKllSketchAccumulator,
+    IncreaseAccumulator, MinMaxAccumulator, MultipleIncreaseAccumulator, MultipleMinMaxAccumulator,
+    MultipleSumAccumulator, SumAccumulator,
+};
+use sketch_db_common::aggregation_config::AggregationConfig;
+
+/// Trait for feeding samples into accumulators in the precompute engine.
+///
+/// This provides a uniform interface over all accumulator types so that the
+/// worker loop doesn't need to know which concrete type it's dealing with.
+pub trait AccumulatorUpdater: Send {
+    /// Feed one (value, timestamp_ms) pair; keyed updaters in this module ignore the call.
+    fn update_single(&mut self, value: f64, timestamp_ms: i64);
+
+    /// Feed one (key, value, timestamp_ms) triple; non-keyed updaters here ignore `key`.
+    fn update_keyed(&mut self, key: &KeyByLabelValues, value: f64, timestamp_ms: i64);
+
+    /// Hand out the accumulated state as a boxed `AggregateCore`, leaving the updater reset.
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore>;
+
+    /// Discard all accumulated state so the updater can be reused.
+    fn reset(&mut self);
+
+    /// Whether this updater is keyed (MultipleSubpopulation).
+    fn is_keyed(&self) -> bool;
+
+    /// Estimated memory usage in bytes.
+    fn memory_usage_bytes(&self) -> usize;
+}
+
+// ---------------------------------------------------------------------------
+// SumAccumulatorUpdater
+// ---------------------------------------------------------------------------
+
+/// Updater that feeds every value into one `SumAccumulator`; timestamps and keys are unused.
+pub struct SumAccumulatorUpdater {
+    acc: SumAccumulator,
+}
+
+impl SumAccumulatorUpdater {
+    pub fn new() -> Self {
+        let acc = SumAccumulator::new();
+        Self { acc }
+    }
+}
+
+impl Default for SumAccumulatorUpdater {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl AccumulatorUpdater for SumAccumulatorUpdater {
+    fn update_single(&mut self, value: f64, _timestamp_ms: i64) {
+        self.acc.update(value);
+    }
+
+    fn update_keyed(&mut self, _key: &KeyByLabelValues, value: f64, timestamp_ms: i64) {
+        self.update_single(value, timestamp_ms);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        // Move the state out and leave a fresh accumulator behind instead of cloning it.
+        let drained = std::mem::replace(&mut self.acc, SumAccumulator::new());
+        Box::new(drained)
+    }
+
+    fn reset(&mut self) {
+        self.acc = SumAccumulator::new();
+    }
+
+    fn is_keyed(&self) -> bool {
+        false
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        std::mem::size_of::<SumAccumulator>()
+    }
+}
+
+// ---------------------------------------------------------------------------
+// MinMaxAccumulatorUpdater
+// ---------------------------------------------------------------------------
+
+/// Updater wrapping one `MinMaxAccumulator`; `sub_type` is kept to rebuild it on reset.
+pub struct MinMaxAccumulatorUpdater {
+    acc: MinMaxAccumulator,
+    sub_type: String,
+}
+
+impl MinMaxAccumulatorUpdater {
+    /// `sub_type` is handed to `MinMaxAccumulator::new`; the factory passes "min" or "max".
+    pub fn new(sub_type: String) -> Self {
+        let acc = MinMaxAccumulator::new(sub_type.clone());
+        Self { acc, sub_type }
+    }
+}
+
+impl AccumulatorUpdater for MinMaxAccumulatorUpdater {
+    fn update_single(&mut self, value: f64, _timestamp_ms: i64) {
+        self.acc.update(value);
+    }
+
+    fn update_keyed(&mut self, _key: &KeyByLabelValues, value: f64, timestamp_ms: i64) {
+        self.update_single(value, timestamp_ms);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        // Move the state out and leave a fresh accumulator behind instead of cloning it.
+        let fresh = MinMaxAccumulator::new(self.sub_type.clone());
+        Box::new(std::mem::replace(&mut self.acc, fresh))
+    }
+
+    fn reset(&mut self) {
+        self.acc = MinMaxAccumulator::new(self.sub_type.clone());
+    }
+
+    fn is_keyed(&self) -> bool {
+        false
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        std::mem::size_of::<MinMaxAccumulator>()
+    }
+}
+
+// ---------------------------------------------------------------------------
+// IncreaseAccumulatorUpdater
+// ---------------------------------------------------------------------------
+
+/// Updater for a single `IncreaseAccumulator`. The accumulator is created lazily from the
+/// first sample, so the state stays `None` until a sample arrives.
+pub struct IncreaseAccumulatorUpdater {
+    acc: Option<IncreaseAccumulator>,
+}
+
+impl IncreaseAccumulatorUpdater {
+    /// Creates an updater that holds no accumulator yet.
+    pub fn new() -> Self {
+        Self { acc: None }
+    }
+}
+
+impl Default for IncreaseAccumulatorUpdater {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl AccumulatorUpdater for IncreaseAccumulatorUpdater {
+    fn update_single(&mut self, value: f64, timestamp_ms: i64) {
+        let sample = Measurement::new(value);
+        if let Some(existing) = self.acc.as_mut() {
+            existing.update(sample, timestamp_ms);
+            return;
+        }
+        // First sample: it seeds both measurement/timestamp pairs of the constructor.
+        let seeded = IncreaseAccumulator::new(sample.clone(), timestamp_ms, sample, timestamp_ms);
+        self.acc = Some(seeded);
+    }
+
+    fn update_keyed(&mut self, _key: &KeyByLabelValues, value: f64, timestamp_ms: i64) {
+        self.update_single(value, timestamp_ms);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        // `take()` already leaves `None` behind, which is exactly the reset state.
+        let drained = self.acc.take();
+        // With no samples seen, emit an accumulator built from zero values and timestamps.
+        Box::new(drained.unwrap_or_else(|| {
+            IncreaseAccumulator::new(Measurement::new(0.0), 0, Measurement::new(0.0), 0)
+        }))
+    }
+
+    fn reset(&mut self) {
+        self.acc = None;
+    }
+
+    fn is_keyed(&self) -> bool {
+        false
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        // Inline size of the optional accumulator; independent of how many samples were fed.
+        std::mem::size_of::<Option<IncreaseAccumulator>>()
+    }
+}
+
+// ---------------------------------------------------------------------------
+// KllAccumulatorUpdater
+// ---------------------------------------------------------------------------
+
+/// Updater wrapping one `DatasketchesKLLAccumulator`; `k` is kept to rebuild it on reset.
+pub struct KllAccumulatorUpdater {
+    acc: DatasketchesKLLAccumulator,
+    k: u16,
+}
+
+impl KllAccumulatorUpdater {
+    /// Creates an updater whose sketch is built with parameter `k`.
+    pub fn new(k: u16) -> Self {
+        let acc = DatasketchesKLLAccumulator::new(k);
+        Self { acc, k }
+    }
+}
+
+impl AccumulatorUpdater for KllAccumulatorUpdater {
+    fn update_single(&mut self, value: f64, _timestamp_ms: i64) {
+        self.acc._update(value);
+    }
+
+    fn update_keyed(&mut self, _key: &KeyByLabelValues, value: f64, timestamp_ms: i64) {
+        self.update_single(value, timestamp_ms);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        // Swap in an empty sketch and hand out the filled one instead of cloning it.
+        let filled = std::mem::replace(&mut self.acc, DatasketchesKLLAccumulator::new(self.k));
+        Box::new(filled)
+    }
+
+    fn reset(&mut self) {
+        self.acc = DatasketchesKLLAccumulator::new(self.k);
+    }
+
+    fn is_keyed(&self) -> bool {
+        false
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        // KLL sketch size is hard to estimate precisely; use a rough estimate
+        std::mem::size_of::<DatasketchesKLLAccumulator>() + 4096
+    }
+}
+
+// ---------------------------------------------------------------------------
+// MultipleSumUpdater
+// ---------------------------------------------------------------------------
+
+/// Keyed updater that forwards `(key, value)` pairs to a `MultipleSumAccumulator`.
+pub struct MultipleSumUpdater {
+    acc: MultipleSumAccumulator,
+}
+
+impl MultipleSumUpdater {
+    pub fn new() -> Self {
+        let acc = MultipleSumAccumulator::new();
+        Self { acc }
+    }
+}
+
+impl Default for MultipleSumUpdater {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl AccumulatorUpdater for MultipleSumUpdater {
+    fn update_single(&mut self, _value: f64, _timestamp_ms: i64) {
+        // Multiple-subpopulation — use update_keyed instead
+    }
+
+    fn update_keyed(&mut self, key: &KeyByLabelValues, value: f64, _timestamp_ms: i64) {
+        self.acc.update(key.clone(), value);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        // Move the per-key state out and leave a fresh accumulator behind (no clone).
+        let drained = std::mem::replace(&mut self.acc, MultipleSumAccumulator::new());
+        Box::new(drained)
+    }
+
+    fn reset(&mut self) {
+        self.acc = MultipleSumAccumulator::new();
+    }
+
+    fn is_keyed(&self) -> bool {
+        true
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        std::mem::size_of::<MultipleSumAccumulator>()
+            + self.acc.sums.len() * (std::mem::size_of::<KeyByLabelValues>() + 8)
+    }
+}
+
+// ---------------------------------------------------------------------------
+// MultipleMinMaxUpdater
+// ---------------------------------------------------------------------------
+
+/// Keyed updater wrapping a `MultipleMinMaxAccumulator`; `sub_type` rebuilds it on reset.
+pub struct MultipleMinMaxUpdater {
+    acc: MultipleMinMaxAccumulator,
+    sub_type: String,
+}
+
+impl MultipleMinMaxUpdater {
+    /// `sub_type` is handed to `MultipleMinMaxAccumulator::new`; the factory passes "min"/"max".
+    pub fn new(sub_type: String) -> Self {
+        let acc = MultipleMinMaxAccumulator::new(sub_type.clone());
+        Self { acc, sub_type }
+    }
+}
+
+impl AccumulatorUpdater for MultipleMinMaxUpdater {
+    fn update_single(&mut self, _value: f64, _timestamp_ms: i64) {
+        // Multiple-subpopulation — use update_keyed instead
+    }
+
+    fn update_keyed(&mut self, key: &KeyByLabelValues, value: f64, _timestamp_ms: i64) {
+        self.acc.update(key.clone(), value);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        // Move the per-key state out and leave a fresh accumulator behind (no clone).
+        let fresh = MultipleMinMaxAccumulator::new(self.sub_type.clone());
+        Box::new(std::mem::replace(&mut self.acc, fresh))
+    }
+
+    fn reset(&mut self) {
+        self.acc = MultipleMinMaxAccumulator::new(self.sub_type.clone());
+    }
+
+    fn is_keyed(&self) -> bool {
+        true
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        std::mem::size_of::<MultipleMinMaxAccumulator>()
+            + self.acc.values.len() * (std::mem::size_of::<KeyByLabelValues>() + 8)
+    }
+}
+
+// ---------------------------------------------------------------------------
+// MultipleIncreaseUpdater
+// ---------------------------------------------------------------------------
+
+/// Keyed updater that keeps one `IncreaseAccumulator` per key.
+///
+/// State lives in a `MultipleIncreaseAccumulator`; un-keyed samples are ignored.
+pub struct MultipleIncreaseUpdater {
+    acc: MultipleIncreaseAccumulator,
+}
+
+impl MultipleIncreaseUpdater {
+    /// Creates an updater around a fresh `MultipleIncreaseAccumulator`.
+    pub fn new() -> Self {
+        let acc = MultipleIncreaseAccumulator::new();
+        Self { acc }
+    }
+}
+
+impl Default for MultipleIncreaseUpdater {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl AccumulatorUpdater for MultipleIncreaseUpdater {
+    fn update_single(&mut self, _value: f64, _timestamp_ms: i64) {
+        // Multiple-subpopulation — use update_keyed instead
+    }
+
+    fn update_keyed(&mut self, key: &KeyByLabelValues, value: f64, timestamp_ms: i64) {
+        let sample = Measurement::new(value);
+        // Single map lookup: extend the key's accumulator in place when it already exists.
+        if let Some(existing) = self.acc.increases.get_mut(key) {
+            existing.update(sample, timestamp_ms);
+            return;
+        }
+        // First sample for this key: seed a new accumulator with it and register it
+        // through the wrapped accumulator's own `update`.
+        let seeded = IncreaseAccumulator::new(sample.clone(), timestamp_ms, sample, timestamp_ms);
+        self.acc.update(key.clone(), seeded);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        // Move the per-key state out and leave a fresh accumulator behind (no clone).
+        let drained = std::mem::replace(&mut self.acc, MultipleIncreaseAccumulator::new());
+        Box::new(drained)
+    }
+
+    fn reset(&mut self) {
+        self.acc = MultipleIncreaseAccumulator::new();
+    }
+
+    fn is_keyed(&self) -> bool {
+        true
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        // Struct size plus a fixed per-entry estimate; any heap data owned by the keys
+        // themselves is not counted.
+        std::mem::size_of::<MultipleIncreaseAccumulator>()
+            + self.acc.increases.len()
+                * (std::mem::size_of::<KeyByLabelValues>()
+                    + std::mem::size_of::<IncreaseAccumulator>())
+    }
+}
+
+// ---------------------------------------------------------------------------
+// CmsAccumulatorUpdater (CountMinSketch)
+// ---------------------------------------------------------------------------
+
+/// Keyed updater backed by a `CountMinSketchAccumulator` built from `row_num` and `col_num`.
+pub struct CmsAccumulatorUpdater {
+    acc: CountMinSketchAccumulator,
+    row_num: usize,
+    col_num: usize,
+}
+
+impl CmsAccumulatorUpdater {
+    pub fn new(row_num: usize, col_num: usize) -> Self {
+        Self {
+            acc: CountMinSketchAccumulator::new(row_num, col_num),
+            row_num,
+            col_num,
+        }
+    }
+}
+
+impl AccumulatorUpdater for CmsAccumulatorUpdater {
+    fn update_single(&mut self, _value: f64, _timestamp_ms: i64) {
+        // CMS is keyed — use update_keyed
+    }
+
+    fn update_keyed(&mut self, key: &KeyByLabelValues, value: f64, _timestamp_ms: i64) {
+        self.acc.inner.update(&key.to_semicolon_str(), value);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        let empty = CountMinSketchAccumulator::new(self.row_num, self.col_num);
+        Box::new(std::mem::replace(&mut self.acc, empty))
+    }
+
+    fn reset(&mut self) {
+        self.acc = CountMinSketchAccumulator::new(self.row_num, self.col_num);
+    }
+
+    fn is_keyed(&self) -> bool {
+        true
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        std::mem::size_of::<CountMinSketchAccumulator>()
+            + self.row_num * self.col_num * std::mem::size_of::<f64>()
+    }
+}
+
+// ---------------------------------------------------------------------------
+// HydraKllAccumulatorUpdater
+// ---------------------------------------------------------------------------
+
+/// Keyed updater backed by a `HydraKllSketchAccumulator` built from `row_num`, `col_num`, `k`.
+pub struct HydraKllAccumulatorUpdater {
+    acc: HydraKllSketchAccumulator,
+    row_num: usize,
+    col_num: usize,
+    k: u16,
+}
+
+impl HydraKllAccumulatorUpdater {
+    pub fn new(row_num: usize, col_num: usize, k: u16) -> Self {
+        Self {
+            acc: HydraKllSketchAccumulator::new(row_num, col_num, k),
+            row_num,
+            col_num,
+            k,
+        }
+    }
+}
+
+impl AccumulatorUpdater for HydraKllAccumulatorUpdater {
+    fn update_single(&mut self, _value: f64, _timestamp_ms: i64) {
+        // HydraKLL is keyed — use update_keyed
+    }
+
+    fn update_keyed(&mut self, key: &KeyByLabelValues, value: f64, _timestamp_ms: i64) {
+        self.acc.update(key, value);
+    }
+
+    fn take_accumulator(&mut self) -> Box<dyn AggregateCore> {
+        let empty = HydraKllSketchAccumulator::new(self.row_num, self.col_num, self.k);
+        Box::new(std::mem::replace(&mut self.acc, empty))
+    }
+
+    fn reset(&mut self) {
+        self.acc = HydraKllSketchAccumulator::new(self.row_num, self.col_num, self.k);
+    }
+
+    fn is_keyed(&self) -> bool {
+        true
+    }
+
+    fn memory_usage_bytes(&self) -> usize {
+        // Rough estimate: each cell is a KLL sketch
+        std::mem::size_of::<HydraKllSketchAccumulator>() + self.row_num * self.col_num * 4096
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Factory function
+// ---------------------------------------------------------------------------
+
+/// Create an appropriate `AccumulatorUpdater` from an `AggregationConfig`.
+///
+/// # Dispatch
+///
+/// * `aggregation_type == "SingleSubpopulation"`: `aggregation_sub_type` picks a non-keyed
+///   updater (Sum, Min, Max, Increase or DatasketchesKLL).
+/// * `aggregation_type == "MultipleSubpopulation"`: `aggregation_sub_type` picks a keyed
+///   updater (Sum, Min, Max, Increase, CountMinSketch or HydraKLL).
+/// * Any other `aggregation_type` is read as the kind itself, e.g. `"DatasketchesKLL"`; the
+///   non-keyed kinds plus CountMinSketch and HydraKLL are accepted.
+///
+/// Accepted spellings of each kind:
+///
+/// * Sum: `Sum`, `sum`
+/// * Min: `Min`, `min`
+/// * Max: `Max`, `max`
+/// * Increase: `Increase`, `increase`
+/// * DatasketchesKLL: `DatasketchesKLL`, `datasketches_kll`, `KLL`, `kll`
+/// * CountMinSketch: `CountMinSketch`, `count_min_sketch`, `CMS`, `cms`
+/// * HydraKLL: `HydraKLL`, `hydra_kll`
+///
+/// An unrecognised name logs a warning and yields a Sum updater (the keyed Sum updater
+/// under `"MultipleSubpopulation"`).
+///
+/// # Parameters read from `config.parameters`
+///
+/// * `K`, or `k` when `K` is absent: KLL size parameter, default 200.
+/// * `row_num`: first sketch dimension for CountMinSketch and HydraKLL, default 4.
+/// * `col_num`: second sketch dimension for CountMinSketch and HydraKLL, default 1000.
+///
+/// A parameter that is missing or is not an unsigned integer falls back to its default;
+/// so does a `K`/`k` that does not fit in `u16` (it is never truncated).
+pub fn create_accumulator_updater(config: &AggregationConfig) -> Box<dyn AccumulatorUpdater> {
+    const DEFAULT_K: u16 = 200;
+    const DEFAULT_ROW_NUM: usize = 4;
+    const DEFAULT_COL_NUM: usize = 1000;
+
+    // KLL `k`: "K" takes precedence over "k" on every dispatch path. Values above
+    // `u16::MAX` are rejected up front so the narrowing cast below cannot wrap.
+    let kll_k = || {
+        config
+            .parameters
+            .get("K")
+            .or_else(|| config.parameters.get("k"))
+            .and_then(|v| v.as_u64())
+            .filter(|raw| *raw <= u64::from(u16::MAX))
+            .map_or(DEFAULT_K, |raw| raw as u16)
+    };
+
+    // Sketch dimension stored under `name`, or `default` when absent or not an integer.
+    let dimension = |name: &str, default: usize| {
+        config
+            .parameters
+            .get(name)
+            .and_then(|v| v.as_u64())
+            .map(|raw| raw as usize)
+            .unwrap_or(default)
+    };
+
+    // Non-keyed updaters, shared by "SingleSubpopulation" and the top-level names.
+    let single = |kind: &str| -> Option<Box<dyn AccumulatorUpdater>> {
+        let updater: Box<dyn AccumulatorUpdater> = match kind {
+            "Sum" | "sum" => Box::new(SumAccumulatorUpdater::new()),
+            "Min" | "min" => Box::new(MinMaxAccumulatorUpdater::new("min".to_string())),
+            "Max" | "max" => Box::new(MinMaxAccumulatorUpdater::new("max".to_string())),
+            "Increase" | "increase" => Box::new(IncreaseAccumulatorUpdater::new()),
+            "DatasketchesKLL" | "datasketches_kll" | "KLL" | "kll" => {
+                Box::new(KllAccumulatorUpdater::new(kll_k()))
+            }
+            _ => return None,
+        };
+        Some(updater)
+    };
+
+    // Keyed sketches, shared by "MultipleSubpopulation" and the top-level names.
+    let keyed_sketch = |kind: &str| -> Option<Box<dyn AccumulatorUpdater>> {
+        let rows = dimension("row_num", DEFAULT_ROW_NUM);
+        let cols = dimension("col_num", DEFAULT_COL_NUM);
+        let updater: Box<dyn AccumulatorUpdater> = match kind {
+            "CountMinSketch" | "count_min_sketch" | "CMS" | "cms" => {
+                Box::new(CmsAccumulatorUpdater::new(rows, cols))
+            }
+            "HydraKLL" | "hydra_kll" => {
+                Box::new(HydraKllAccumulatorUpdater::new(rows, cols, kll_k()))
+            }
+            _ => return None,
+        };
+        Some(updater)
+    };
+
+    match config.aggregation_type.as_str() {
+        "SingleSubpopulation" => {
+            let sub_type = config.aggregation_sub_type.as_str();
+            single(sub_type).unwrap_or_else(|| {
+                tracing::warn!(
+                    "Unknown SingleSubpopulation sub_type '{}', defaulting to Sum",
+                    sub_type
+                );
+                Box::new(SumAccumulatorUpdater::new())
+            })
+        }
+        "MultipleSubpopulation" => match config.aggregation_sub_type.as_str() {
+            "Sum" | "sum" => Box::new(MultipleSumUpdater::new()),
+            "Min" | "min" => Box::new(MultipleMinMaxUpdater::new("min".to_string())),
+            "Max" | "max" => Box::new(MultipleMinMaxUpdater::new("max".to_string())),
+            "Increase" | "increase" => Box::new(MultipleIncreaseUpdater::new()),
+            other => keyed_sketch(other).unwrap_or_else(|| {
+                tracing::warn!(
+                    "Unknown MultipleSubpopulation sub_type '{}', defaulting to Sum",
+                    other
+                );
+                Box::new(MultipleSumUpdater::new())
+            }),
+        },
+        // Top-level aggregation types (e.g. "DatasketchesKLL" directly in aggregationType)
+        other => single(other)
+            .or_else(|| keyed_sketch(other))
+            .unwrap_or_else(|| {
+                tracing::warn!(
+                    "Unknown aggregation_type '{}', defaulting to SingleSubpopulation Sum",
+                    other
+                );
+                Box::new(SumAccumulatorUpdater::new())
+            }),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_sum_updater() {
+        let mut updater = SumAccumulatorUpdater::new();
+        assert!(!updater.is_keyed());
+
+        updater.update_single(1.0, 1000);
+        updater.update_single(2.0, 2000);
+        updater.update_single(3.0, 3000);
+        // Draining hands out the concrete accumulator; only its type name is checked.
+        let acc = updater.take_accumulator();
+        assert_eq!(acc.type_name(), "SumAccumulator");
+    }
+
+    #[test]
+    fn test_minmax_updater() {
+        let mut updater = MinMaxAccumulatorUpdater::new("max".to_string());
+        updater.update_single(5.0, 1000);
+        updater.update_single(3.0, 2000);
+        updater.update_single(7.0, 3000);
+
+        let acc = updater.take_accumulator();
+        assert_eq!(acc.type_name(), "MinMaxAccumulator");
+    }
+
+    #[test]
+    fn test_increase_updater() {
+        let mut updater = IncreaseAccumulatorUpdater::new();
+        updater.update_single(10.0, 1000);
+        updater.update_single(15.0, 2000);
+
+        let acc = updater.take_accumulator();
+        assert_eq!(acc.type_name(), "IncreaseAccumulator");
+    }
+
+    #[test]
+    fn test_kll_updater() {
+        let mut updater = KllAccumulatorUpdater::new(200);
+        for i in 1..=10 {
+            updater.update_single(i as f64, i * 1000);
+        }
+
+        let acc = updater.take_accumulator();
+        assert_eq!(acc.type_name(), "DatasketchesKLLAccumulator");
+    }
+
+    #[test]
+    fn test_multiple_sum_updater() {
+        let mut updater = MultipleSumUpdater::new();
+        assert!(updater.is_keyed());
+
+        let key_a = KeyByLabelValues::new_with_labels(vec!["a".to_string()]);
+        let key_b = KeyByLabelValues::new_with_labels(vec!["b".to_string()]);
+
+        updater.update_keyed(&key_a, 1.0, 1000);
+        updater.update_keyed(&key_b, 2.0, 2000);
+
+        let acc = updater.take_accumulator();
+        assert_eq!(acc.type_name(), "MultipleSumAccumulator");
+    }
+
+    #[test]
+    fn test_reset_clears_state() {
+        let mut updater = SumAccumulatorUpdater::new();
+        updater.update_single(100.0, 1000);
+        updater.reset();
+        // NOTE(review): only the type name is asserted; the sum after reset is not checked.
+        let acc = updater.take_accumulator();
+        assert_eq!(acc.type_name(), "SumAccumulator");
+    }
+}
diff --git a/asap-query-engine/src/precompute_engine/config.rs b/asap-query-engine/src/precompute_engine/config.rs
new file mode 100644
index 0000000..7bc8df9
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/config.rs
@@ -0,0 +1,57 @@
+use serde::{Deserialize, Serialize};
+
+/// Configuration for the precompute engine.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PrecomputeEngineConfig {
+    /// Number of worker threads for parallel processing (default 4).
+    pub num_workers: usize,
+    /// Port for the Prometheus remote write ingest endpoint (default 9090).
+    pub ingest_port: u16,
+    /// Maximum allowed lateness for out-of-order samples in milliseconds (default 5000).
+    /// Samples arriving later than this behind the watermark are dropped.
+    pub allowed_lateness_ms: i64,
+    /// Maximum number of buffered samples per series before oldest are evicted (default 10000).
+    pub max_buffer_per_series: usize,
+    /// Interval at which the flush timer fires to close idle windows, in ms (default 1000).
+    pub flush_interval_ms: u64,
+    /// Capacity of the MPSC channel between router and each worker (default 10000).
+    pub channel_buffer_size: usize,
+    /// When true, skip all aggregation and pass each raw sample directly to the
+    /// output sink as a `SumAccumulator::with_sum(value)` (default false).
+    pub pass_raw_samples: bool,
+    /// Aggregation ID to stamp on each raw-mode output (default 0).
+    pub raw_mode_aggregation_id: u64,
+}
+
+impl Default for PrecomputeEngineConfig {
+    fn default() -> Self {
+        Self {
+            num_workers: 4,
+            ingest_port: 9090,
+            allowed_lateness_ms: 5_000, // 5 seconds
+            max_buffer_per_series: 10_000,
+            flush_interval_ms: 1_000, // 1 second
+            channel_buffer_size: 10_000,
+            pass_raw_samples: false,
+            raw_mode_aggregation_id: 0,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // Pins every default value so that an accidental change shows up as a test failure.
+    #[test]
+    fn test_default_config() {
+        let defaults = PrecomputeEngineConfig::default();
+        assert_eq!(defaults.num_workers, 4);
+        assert_eq!(defaults.ingest_port, 9090);
+        assert_eq!(defaults.allowed_lateness_ms, 5_000);
+        assert_eq!(defaults.max_buffer_per_series, 10_000);
+        assert_eq!(defaults.flush_interval_ms, 1_000);
+        assert_eq!(defaults.channel_buffer_size, 10_000);
+        assert!(!defaults.pass_raw_samples);
+        assert_eq!(defaults.raw_mode_aggregation_id, 0);
+    }
+}
diff --git a/asap-query-engine/src/precompute_engine/mod.rs b/asap-query-engine/src/precompute_engine/mod.rs
new file mode 100644
index 0000000..c7fe7e9
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/mod.rs
@@ -0,0 +1,186 @@
+pub mod accumulator_factory;
+pub mod config;
+pub mod output_sink;
+pub mod series_buffer;
+pub mod series_router;
+pub mod window_manager;
+pub mod worker;
+
+use crate::data_model::StreamingConfig;
+use crate::drivers::ingest::prometheus_remote_write::decode_prometheus_remote_write;
+use crate::precompute_engine::config::PrecomputeEngineConfig;
+use crate::precompute_engine::output_sink::OutputSink;
+use crate::precompute_engine::series_router::{SeriesRouter, WorkerMessage};
+use crate::precompute_engine::worker::Worker;
+use axum::{body::Bytes, extract::State, http::StatusCode, routing::post, Router};
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Instant;
+use tokio::net::TcpListener;
+use tokio::sync::mpsc;
+use tracing::{debug_span, info, warn, Instrument};
+
+/// State shared by the ingest HTTP handler and the periodic flush task.
+struct IngestState {
+    router: SeriesRouter, // fans sample batches and flush signals out to the workers
+    samples_ingested: std::sync::atomic::AtomicU64, // running total of decoded samples
+}
+
+/// The top-level precompute engine orchestrator.
+///
+/// Creates worker tasks (not OS threads), the series router, and the Axum ingest server.
+pub struct PrecomputeEngine {
+    config: PrecomputeEngineConfig,
+    streaming_config: Arc<StreamingConfig>,
+    output_sink: Arc<dyn OutputSink>,
+}
+
+impl PrecomputeEngine {
+    /// Bundle the engine settings, aggregation definitions and output sink.
+    pub fn new(
+        config: PrecomputeEngineConfig,
+        streaming_config: Arc<StreamingConfig>,
+        output_sink: Arc<dyn OutputSink>,
+    ) -> Self {
+        Self {
+            config,
+            streaming_config,
+            output_sink,
+        }
+    }
+
+    /// Start the engine: bind the ingest port, spawn the workers and flush timer,
+    /// then serve `POST /api/v1/write` until the HTTP server stops.
+    ///
+    /// # Errors
+    /// Zero `num_workers`/`channel_buffer_size`, a failed port bind, or a server error.
+    pub async fn run(self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        let num_workers = self.config.num_workers;
+        let channel_size = self.config.channel_buffer_size;
+        // `mpsc::channel(0)` panics and a router without workers cannot route.
+        if num_workers == 0 || channel_size == 0 {
+            return Err("num_workers and channel_buffer_size must be non-zero".into());
+        }
+
+        // Bind first so a port conflict fails before any task is spawned.
+        let addr = format!("0.0.0.0:{}", self.config.ingest_port);
+        let listener = TcpListener::bind(&addr).await?;
+
+        // One bounded MPSC channel per worker.
+        let (senders, receivers): (Vec<_>, Vec<_>) = (0..num_workers)
+            .map(|_| mpsc::channel::<WorkerMessage>(channel_size))
+            .unzip();
+        let router = SeriesRouter::new(senders);
+
+        // Build aggregation config map from streaming config
+        let agg_configs: HashMap<u64, _> =
+            self.streaming_config.get_all_aggregation_configs().clone();
+
+        let mut worker_handles = Vec::with_capacity(num_workers);
+        for (id, rx) in receivers.into_iter().enumerate() {
+            let worker = Worker::new(
+                id,
+                rx,
+                self.output_sink.clone(),
+                agg_configs.clone(),
+                self.config.max_buffer_per_series,
+                self.config.allowed_lateness_ms,
+                self.config.pass_raw_samples,
+                self.config.raw_mode_aggregation_id,
+            );
+            let handle = tokio::spawn(async move {
+                worker.run().await;
+            });
+            worker_handles.push(handle);
+        }
+
+        info!(
+            "PrecomputeEngine started with {} workers on port {}",
+            num_workers, self.config.ingest_port
+        );
+
+        let ingest_state = Arc::new(IngestState {
+            router,
+            samples_ingested: std::sync::atomic::AtomicU64::new(0),
+        });
+
+        // Flush timer; `interval` panics on a zero period, so clamp 0 to 1 ms.
+        let flush_state = ingest_state.clone();
+        let flush_interval_ms = self.config.flush_interval_ms.max(1);
+        tokio::spawn(async move {
+            let mut interval =
+                tokio::time::interval(tokio::time::Duration::from_millis(flush_interval_ms));
+            loop {
+                interval.tick().await;
+                if let Err(e) = flush_state.router.broadcast_flush().await {
+                    warn!("Flush broadcast error: {}", e);
+                    break;
+                }
+            }
+        });
+
+        // Start the Axum HTTP server for Prometheus remote write ingest
+        let app = Router::new()
+            .route("/api/v1/write", post(handle_ingest))
+            .with_state(ingest_state);
+        info!("Ingest server listening on {}", addr);
+        axum::serve(listener, app).await?;
+
+        // Wait for workers to finish (this only happens on shutdown)
+        for handle in worker_handles {
+            let _ = handle.await;
+        }
+
+        Ok(())
+    }
+}
+
+/// Axum handler for Prometheus remote write.
+///
+/// Decodes the body, groups the samples by series, and routes each group to a
+/// worker. Responds 400 when the body cannot be decoded, 500 when a batch cannot
+/// be routed, and 204 otherwise (including for a write that carries no samples).
+async fn handle_ingest(State(state): State<Arc<IngestState>>, body: Bytes) -> StatusCode {
+    let span = debug_span!("ingest", body_len = body.len());
+    let received_at = Instant::now();
+
+    // The whole request runs inside `span`.
+    let work = async {
+        // Undecodable payloads are a client error; an empty write is a no-op.
+        let samples = match decode_prometheus_remote_write(&body) {
+            Err(e) => {
+                warn!("Failed to decode remote write: {}", e);
+                return StatusCode::BAD_REQUEST;
+            }
+            Ok(decoded) if decoded.is_empty() => return StatusCode::NO_CONTENT,
+            Ok(decoded) => decoded,
+        };
+
+        // Counted as soon as decoded, i.e. before routing is attempted.
+        let decoded_total = samples.len() as u64;
+        state
+            .samples_ingested
+            .fetch_add(decoded_total, std::sync::atomic::Ordering::Relaxed);
+
+        // Collect each series' points into one batch, keyed by its label string.
+        let mut batches: HashMap<&str, Vec<(i64, f64)>> = HashMap::new();
+        for sample in samples.iter() {
+            let key: &str = &sample.labels;
+            let point = (sample.timestamp_ms, sample.value);
+            batches.entry(key).or_default().push(point);
+        }
+
+        // Hand every batch to its worker; the first send failure aborts with 500.
+        for (series_key, batch) in batches {
+            let routed = state.router.route(series_key, batch, received_at).await;
+            if let Err(e) = routed {
+                warn!("Routing error for {}: {}", series_key, e);
+                return StatusCode::INTERNAL_SERVER_ERROR;
+            }
+        }
+
+        StatusCode::NO_CONTENT
+    };
+
+    work.instrument(span).await
+}
diff --git a/asap-query-engine/src/precompute_engine/output_sink.rs b/asap-query-engine/src/precompute_engine/output_sink.rs
new file mode 100644
index 0000000..69bd3a1
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/output_sink.rs
@@ -0,0 +1,92 @@
+use crate::data_model::{AggregateCore, PrecomputedOutput};
+use crate::stores::Store;
+use std::sync::Arc;
+use tracing::debug_span;
+
+/// Trait for emitting completed window outputs, one batch per call (shared by all workers).
+pub trait OutputSink: Send + Sync {
+    fn emit_batch(
+        &self,
+        outputs: Vec<(PrecomputedOutput, Box<dyn AggregateCore>)>,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>>;
+}
+
+/// Output sink that persists each batch through the wrapped `Store`.
+pub struct StoreOutputSink {
+    store: Arc<dyn Store>,
+}
+
+impl StoreOutputSink {
+    pub fn new(store: Arc<dyn Store>) -> Self {
+        Self { store }
+    }
+}
+
+impl OutputSink for StoreOutputSink {
+    fn emit_batch(
+        &self,
+        outputs: Vec<(PrecomputedOutput, Box<dyn AggregateCore>)>,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        if !outputs.is_empty() {
+            let _span = debug_span!("store_insert", batch_size = outputs.len()).entered();
+            return self.store.insert_precomputed_output_batch(outputs);
+        }
+        Ok(()) // an empty batch never reaches the store
+    }
+}
+
+/// Output sink used in raw passthrough mode. The batches it receives hold raw
+/// samples wrapped as SumAccumulators (one per sample) rather than sketches; it
+/// writes them to the store unchanged.
+pub struct RawPassthroughSink {
+    store: Arc<dyn Store>,
+}
+
+impl RawPassthroughSink {
+    pub fn new(store: Arc<dyn Store>) -> Self {
+        Self { store }
+    }
+}
+
+impl OutputSink for RawPassthroughSink {
+    fn emit_batch(
+        &self,
+        outputs: Vec<(PrecomputedOutput, Box<dyn AggregateCore>)>,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        if !outputs.is_empty() {
+            let _span = debug_span!("store_insert_raw", batch_size = outputs.len()).entered();
+            return self.store.insert_precomputed_output_batch(outputs);
+        }
+        Ok(()) // an empty batch never reaches the store
+    }
+}
+
+/// A no-op sink for testing that only counts emitted outputs (batch sizes are summed).
+pub struct NoopOutputSink {
+    pub emit_count: std::sync::atomic::AtomicU64,
+}
+
+impl NoopOutputSink {
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+impl Default for NoopOutputSink {
+    fn default() -> Self {
+        Self {
+            emit_count: std::sync::atomic::AtomicU64::new(0),
+        }
+    }
+}
+
+impl OutputSink for NoopOutputSink {
+    fn emit_batch(
+        &self,
+        outputs: Vec<(PrecomputedOutput, Box<dyn AggregateCore>)>,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        self.emit_count
+            .fetch_add(outputs.len() as u64, std::sync::atomic::Ordering::Relaxed);
+        Ok(()) // the outputs themselves are dropped here
+    }
+}
diff --git a/asap-query-engine/src/precompute_engine/precompute_engine_design_doc.md b/asap-query-engine/src/precompute_engine/precompute_engine_design_doc.md
new file mode 100644
index 0000000..471bb34
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/precompute_engine_design_doc.md
@@ -0,0 +1,423 @@
+# Precompute Engine Design Document
+
+## 1. Overview
+
+### Why this PR is needed
+
+ASAPQuery already has a query path over precomputed summaries, but before this PR
+there was no standalone runtime inside `asap-query-engine` that could continuously:
+
+- accept raw metric samples,
+- turn them into windowed precomputed outputs, and
+- write those outputs into the same store that the query engine reads.
+
+PR #228 fills that gap by introducing a first working version of a **precompute
+engine**. The engine runs as a separate binary, accepts Prometheus remote write
+traffic, partitions incoming series across workers, computes windowed
+accumulators, and stores the results for later query-time retrieval.
+
+### Why not ArroyoSketch?
+
+The existing precompute path — **ArroyoSketch** (`asap-summary-ingest/run_arroyosketch.py`)
+— already performs windowed sketch aggregation, but it does so through an
+entirely separate operational stack:
+
+| Dimension | ArroyoSketch | Precompute Engine (this PR) |
+|---|---|---|
+| **Runtime** | External Arroyo cluster (separate process, separate binary) | Native Rust binary (`precompute_engine`) that can also host the query HTTP server in the same process |
+| **Orchestration language** | Python (Jinja2 SQL templates deployed via REST API to Arroyo) | Native Rust, driven directly by `StreamingConfig` |
+| **Ingest transport** | Kafka topic or Prometheus remote write → Arroyo pipeline | Prometheus remote write directly to the engine |
+| **Output transport** | Kafka topic → consumed by a separate pipeline stage | Direct write to the store already read by the query engine |
+| **Operational dependencies** | Arroyo cluster + Kafka brokers must be running and healthy | None beyond the query engine process itself |
+| **Configuration coupling** | Arroyo pipeline SQL is rendered from `streaming_config.yaml` by a Python script; any config change requires re-deploying pipelines via the Arroyo REST API | Engine reads `StreamingConfig` directly at startup; same structs used throughout `asap-query-engine` |
+| **Failure boundary** | Arroyo crash or Kafka lag is invisible to the query engine until queries begin returning stale results | Precompute workers and query engine share the same process and store; failures surface immediately |
+
+In short, ArroyoSketch trades simplicity for power: it is a general-purpose
+streaming SQL engine that can express complex multi-stage pipelines, but it
+requires standing up and operating Arroyo and Kafka as separate infrastructure.
+That operational overhead is the main barrier to running the precompute path in
+development, in CI, or in environments where Kafka is not already present.
+
+This PR replaces the ingest-and-aggregate role of ArroyoSketch with a
+self-contained Rust implementation that has no external service dependencies,
+shares the same store and configuration types as the rest of `asap-query-engine`,
+and can be validated end to end in a single process. ArroyoSketch remains useful
+as a production deployment option when Arroyo and Kafka are already available,
+but the precompute engine is the path forward for native integration within the
+Rust codebase.
+
+This PR is primarily about establishing the end-to-end execution path and the
+core abstractions:
+
+- ingest endpoint,
+- worker sharding model,
+- window management,
+- accumulator construction and update,
+- output sink abstraction, and
+- integration with the existing store and query engine.
+
+### Requirements
+
+The implementation in this PR is driven by the following requirements:
+
+1. ASAPQuery needs a native precompute path inside the Rust query engine codebase.
+2. The system must ingest a high volume of time-series samples without forcing
+   cross-worker coordination on every sample.
+3. Samples for the same series must be processed consistently by the same worker
+   so per-series state can stay local.
+4. The engine must support windowed precomputation for the aggregation
+   configurations already defined in `StreamingConfig`.
+5. The output must be written in the same `PrecomputedOutput` form already
+   consumed by the store and query engine.
+6. The design must stay simple enough to validate correctness end to end before
+   adding more advanced features such as richer late-data policies or multi-stage
+   aggregation.
+
+### Scope of this PR
+
+This PR delivers a pragmatic v1:
+
+- single-process, multi-worker execution,
+- Prometheus remote write ingest,
+- store-backed output,
+- watermark-based window closing,
+- bounded per-series buffering,
+- best-effort handling of out-of-order data via a lateness threshold,
+- optional raw passthrough mode.
+
+It does **not** try to solve every future concern yet. In particular, it does
+not add multi-stage aggregation, explicit late-data re-emission policies,
+cross-worker merge coordination, or pane-based sliding-window optimization.
+
+## 2. Architecture
+
+### High-level data flow
+
+```text
+Prometheus Remote Write
+        |
+        v
+POST /api/v1/write (Axum)
+        |
+        v
+decode_prometheus_remote_write()
+        |
+        v
+group samples by series key
+        |
+        v
+SeriesRouter (xxhash(series_key) % num_workers)
+        |
+        +-------------------+-------------------+-------------------+
+        |                   |                   |                   |
+        v                   v                   v                   v
+    Worker 0            Worker 1            Worker 2          Worker N-1
+        |                   |                   |                   |
+        |  per-series buffer + per-aggregation active windows       |
+        +-------------------+-------------------+-------------------+
+                                |
+                                v
+                        OutputSink::emit_batch()
+                                |
+                                v
+                               Store
+                                |
+                                v
+                           Query Engine
+```
+
+### Main components
+
+#### `PrecomputeEngine` (`mod.rs`)
+
+`PrecomputeEngine` is the top-level orchestrator. It:
+
+- loads aggregation configs from `StreamingConfig`,
+- creates one bounded MPSC channel per worker,
+- builds a `SeriesRouter`,
+- spawns worker tasks,
+- starts the ingest HTTP server, and
+- starts a periodic flush loop.
+
+The engine keeps the worker model intentionally simple: workers are symmetric,
+and routing is deterministic.
+
+#### `SeriesRouter` (`series_router.rs`)
+
+The router computes:
+
+```text
+worker_idx = xxhash64(series_key) % num_workers
+```
+
+This guarantees that all samples for one exact series key go to the same worker.
+That is the main design decision that keeps worker-local state lock-free.
+
+#### `Worker` (`worker.rs`)
+
+Each worker owns a shard of the series space. For each series it stores:
+
+- a `SeriesBuffer`,
+- the previous watermark seen for that series,
+- one `AggregationState` per matching aggregation config.
+
+Each `AggregationState` contains:
+
+- the copied `AggregationConfig`,
+- a `WindowManager`,
+- a map of active window accumulators.
+
+Workers receive `Samples`, `Flush`, and `Shutdown` messages. On samples, the
+worker inserts data into the series buffer, applies lateness filtering, updates
+active window accumulators, detects newly closed windows, and emits completed
+accumulators to the sink.
+
+#### `SeriesBuffer` (`series_buffer.rs`)
+
+The buffer stores timestamped samples per series in timestamp order and tracks a
+monotonic watermark. It is bounded by `max_buffer_per_series`, which prevents a
+single hot or stalled series from growing unbounded in memory.
+
+#### `WindowManager` (`window_manager.rs`)
+
+`WindowManager` encapsulates window boundary logic:
+
+- map a timestamp to an aligned window start,
+- decide which windows became closed after watermark advancement,
+- return `[window_start, window_end)` bounds.
+
+The current implementation supports both tumbling and slide-aligned window
+closure logic. Window close is driven by event-time watermark progression, not
+wall-clock time.
+
+#### `AccumulatorUpdater` factory (`accumulator_factory.rs`)
+
+Workers do not hardcode sketch logic. Instead, they construct accumulator
+updaters from the aggregation config. This keeps the precompute engine generic
+across supported aggregation types and lets it emit the same accumulator objects
+already used elsewhere in ASAPQuery.
+
+#### `OutputSink` (`output_sink.rs`)
+
+`OutputSink` separates computation from persistence. This PR ships three useful
+implementations:
+
+- `StoreOutputSink` for normal precompute writes,
+- `RawPassthroughSink` for writing raw samples as `SumAccumulator`s,
+- `NoopOutputSink` for tests.
+
+### Execution model
+
+The execution model is:
+
+1. Decode one remote-write request.
+2. Group samples by exact series key.
+3. Route each grouped batch to one worker.
+4. Process series state only on that worker.
+5. Emit completed windows in batches to the sink.
+
+This design avoids per-sample cross-worker synchronization and keeps the first
+version operationally understandable.
+
+## 3. Key Features Derived From the Requirements
+
+### Deterministic per-series routing
+
+Requirement: samples for one series must share local state.
+
+Derived feature: the hash-based router always sends the same series key to the
+same worker. This means:
+
+- no shared mutable state across workers for a given series,
+- no locking around per-series accumulators,
+- predictable ownership of series-local watermarks and buffers.
+
+### Config-driven aggregation matching
+
+Requirement: reuse aggregation definitions already present in the system.
+
+Derived feature: each worker matches a series against the loaded
+`AggregationConfig`s and creates aggregation state only for the configs relevant
+to that series. The engine therefore stays driven by `StreamingConfig` instead
+of inventing a separate configuration model.
+
+### Windowed precomputation with watermark closure
+
+Requirement: emit queryable precomputed windows rather than raw streams only.
+
+Derived feature: each aggregation uses a `WindowManager` to:
+
+- align samples to windows,
+- detect when watermark movement closes a window,
+- emit `PrecomputedOutput` records with exact window bounds.
+
+This gives the query engine stable window ranges to read later.
+
+### Bounded memory for series-local state
+
+Requirement: the engine must remain safe under continuous ingestion.
+
+Derived feature: each series uses a bounded `SeriesBuffer`, and each worker uses
+bounded channels from the router. This does not solve every overload scenario,
+but it prevents the obvious unbounded growth cases in the v1 design.
+
+### Optional raw passthrough mode
+
+Requirement: support bring-up, debugging, and staged rollout.
+
+Derived feature: when `pass_raw_samples=true`, the worker bypasses windowed
+aggregation and emits one `SumAccumulator` per sample. This is useful for
+testing the ingest-to-store plumbing independently from sketch behavior.
+
+### Direct integration with the existing store and query engine
+
+Requirement: the precompute path must fit ASAPQuery's existing runtime.
+
+Derived feature: the standalone `precompute_engine` binary can be launched with:
+
+- a `StreamingConfig`,
+- a store implementation,
+- an optional query HTTP server in the same process.
+
+That makes the PR immediately testable end to end.
+
+## 4. System Implementation Corner Cases
+
+### Late and out-of-order samples
+
+The current policy is intentionally simple:
+
+- if `timestamp < watermark - allowed_lateness_ms`, the sample is dropped;
+- otherwise it is accepted.
+
+This means the PR chooses predictability over replay complexity. There is no
+secondary path yet for re-opening or patching already emitted windows.
+
+### Idle series and flush behavior
+
+The engine has a periodic flush loop, but the current implementation does **not**
+advance watermarks on its own. As a result, a flush only emits windows that have
+become closable due to prior event-time progress. If a series stops receiving
+samples, its open windows stay unemitted until a later sample advances the
+watermark; the worker does not invent time progress just because wall-clock time passed.
+
+This is an important behavior boundary for this PR.
+
+### Sliding-window semantics in v1
+
+`WindowManager` understands slide intervals, and tests cover slide-aligned
+window closing. However, this PR keeps the worker update path simple: samples
+are placed into the accumulator keyed by `window_start_for(ts)`, and the design
+does not yet implement the more advanced pane-sharing or multi-window fan-out
+approach described in earlier discussion branches.
+
+So the current PR establishes the reusable windowing abstraction first, while
+leaving richer sliding-window execution strategies for follow-up work.
+
+### Cross-series aggregation across workers
+
+Routing is based on the full series key, not on the final grouping key. That
+keeps ingestion simple, but it also means different source series that
+contribute to the same logical grouped result may be processed on different
+workers. This PR does not introduce a second-tier reduce stage; it relies on the
+existing downstream model of storing precomputed outputs and reading them later.
+
+### Series-key parsing assumptions
+
+Grouping-label extraction currently parses series keys in the expected Prometheus
+text form:
+
+```text
+metric_name{label1="value1",label2="value2"}
+```
+
+Missing grouping labels are converted to empty strings. This keeps the worker
+path robust, but it is worth documenting because output keys depend on this
+parsing behavior.
+
+### Raw mode loses label-group semantics
+
+In raw passthrough mode, the engine emits one point output per sample with
+`key=None`. That is acceptable for the intended debugging and plumbing use case,
+but it is deliberately not equivalent to fully configured grouped aggregation.
+
+## 5. Examples
+
+### Example 1: basic tumbling-window flow
+
+Assume:
+
+- metric: `fake_metric`
+- window size: 60 seconds
+- slide interval: 0 (tumbling)
+- one sample arrives at `t=12_000 ms`
+
+The worker computes:
+
+- `window_start = 0`
+- `window_end = 60_000`
+
+The sample updates the active accumulator for window `[0, 60_000)`. Once the
+watermark later reaches at least `60_000`, the worker emits:
+
+- `PrecomputedOutput(start=0, end=60_000, aggregation_id=...)`
+- the finished accumulator for that window
+
+### Example 2: out-of-order sample handling
+
+Assume:
+
+- current series watermark is `100_000 ms`
+- `allowed_lateness_ms = 5_000`
+
+Then:
+
+- sample at `97_000 ms` is accepted,
+- sample at `94_999 ms` is dropped.
+
+This keeps the lateness rule easy to reason about.
+
+### Example 3: deterministic sharding
+
+Assume two incoming series:
+
+- `cpu_usage{host="a",job="node"}`
+- `cpu_usage{host="b",job="node"}`
+
+The router hashes each full series key independently. Each series is assigned to
+one worker, and every later batch for that same series goes back to that same
+worker. The benefit is that each worker can maintain series-local state without
+coordination.
+
+### Example 4: raw passthrough mode
+
+If `pass_raw_samples=true` and a sample arrives:
+
+```text
+series_key = fake_metric{instance="i1"}
+timestamp  = 25_000
+value      = 42.0
+```
+
+The worker emits one point output immediately:
+
+- `PrecomputedOutput(start=25_000, end=25_000, key=None, aggregation_id=raw_mode_aggregation_id)`
+- `SumAccumulator::with_sum(42.0)`
+
+This mode is useful when validating the ingest path independently from
+windowed aggregation correctness.
+
+## 6. Summary
+
+PR #228 introduces the first integrated precompute engine inside
+`asap-query-engine`. The design deliberately favors a clear and testable v1:
+
+- one process,
+- deterministic worker sharding,
+- config-driven accumulator creation,
+- watermark-based window emission,
+- direct store integration.
+
+That foundation is the reason this PR is needed. It creates the runtime path
+that later PRs can extend with more sophisticated window execution, richer late
+data handling, and more advanced cross-worker aggregation strategies.
diff --git a/asap-query-engine/src/precompute_engine/series_buffer.rs b/asap-query-engine/src/precompute_engine/series_buffer.rs
new file mode 100644
index 0000000..a663142
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/series_buffer.rs
@@ -0,0 +1,161 @@
+use std::collections::BTreeMap;
+
+/// Per-series sample buffer backed by a `BTreeMap<i64, f64>` for automatic
+/// ordering by timestamp. Tracks a per-series watermark.
+pub struct SeriesBuffer {
+    /// Samples keyed by timestamp_ms, kept sorted; a repeated timestamp replaces the old value.
+    samples: BTreeMap<i64, f64>,
+    /// High-watermark: the maximum timestamp seen so far (`i64::MIN` until the first insert).
+    watermark_ms: i64,
+    /// Maximum number of samples to retain. When exceeded, oldest are evicted.
+    max_buffer_size: usize,
+}
+
+impl SeriesBuffer {
+    /// Create an empty buffer that retains at most `max_buffer_size` samples.
+    pub fn new(max_buffer_size: usize) -> Self {
+        Self {
+            samples: BTreeMap::new(),
+            watermark_ms: i64::MIN,
+            max_buffer_size,
+        }
+    }
+
+    /// Insert a sample. Updates the watermark if `timestamp_ms` is the new max.
+    /// Returns `false` only for an exact duplicate (same timestamp, equal value; NaN
+    /// never compares equal); a differing value overwrites and returns `true`.
+    pub fn insert(&mut self, timestamp_ms: i64, value: f64) -> bool {
+        self.watermark_ms = self.watermark_ms.max(timestamp_ms);
+        let previous = self.samples.insert(timestamp_ms, value);
+        // Enforce max buffer size by evicting oldest entries
+        while self.samples.len() > self.max_buffer_size {
+            self.samples.pop_first();
+        }
+        previous != Some(value)
+    }
+
+    /// Return current watermark.
+    pub fn watermark_ms(&self) -> i64 {
+        self.watermark_ms
+    }
+
+    /// Read all samples in `[start_ms, end_ms)` — inclusive start, exclusive end.
+    /// Returns them in timestamp order; an empty or inverted range yields no samples.
+    pub fn read_range(&self, start_ms: i64, end_ms: i64) -> Vec<(i64, f64)> {
+        // `BTreeMap::range` panics when start > end, so reject such ranges up front.
+        if start_ms >= end_ms {
+            return Vec::new();
+        }
+        self.samples
+            .range(start_ms..end_ms)
+            .map(|(&ts, &val)| (ts, val))
+            .collect()
+    }
+
+    /// Drain (remove and return) all samples with `timestamp_ms < up_to_ms`.
+    pub fn drain_up_to(&mut self, up_to_ms: i64) -> Vec<(i64, f64)> {
+        // `split_off` leaves keys < up_to_ms in place and returns those >= up_to_ms.
+        let retained = self.samples.split_off(&up_to_ms);
+        // Swap the retained tail back in and move the older entries out by value.
+        std::mem::replace(&mut self.samples, retained)
+            .into_iter()
+            .collect()
+    }
+
+    /// Number of buffered samples.
+    pub fn len(&self) -> usize {
+        self.samples.len()
+    }
+
+    /// Whether the buffer is empty.
+    pub fn is_empty(&self) -> bool {
+        self.samples.is_empty()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_insert_and_watermark() {
+        let mut series = SeriesBuffer::new(100);
+        assert_eq!(series.watermark_ms(), i64::MIN);
+
+        series.insert(1000, 1.0);
+        assert_eq!(series.watermark_ms(), 1000);
+
+        series.insert(500, 0.5); // arrives late
+        assert_eq!(series.watermark_ms(), 1000); // watermark never moves backwards
+
+        series.insert(2000, 2.0);
+        assert_eq!(series.watermark_ms(), 2000);
+    }
+
+    #[test]
+    fn test_sorted_order() {
+        let mut series = SeriesBuffer::new(100);
+        series.insert(3000, 3.0);
+        series.insert(1000, 1.0);
+        series.insert(2000, 2.0);
+
+        let contents = series.read_range(0, 4000);
+        assert_eq!(contents, vec![(1000, 1.0), (2000, 2.0), (3000, 3.0)]);
+    }
+
+    #[test]
+    fn test_read_range() {
+        let mut series = SeriesBuffer::new(100);
+        for ts in [1000, 2000, 3000, 4000, 5000] {
+            series.insert(ts, ts as f64);
+        }
+
+        // Half-open window [2000, 4000): 4000 itself is excluded
+        let window = series.read_range(2000, 4000);
+        assert_eq!(window, vec![(2000, 2000.0), (3000, 3000.0)]);
+    }
+
+    #[test]
+    fn test_drain_up_to() {
+        let mut series = SeriesBuffer::new(100);
+        for ts in [1000, 2000, 3000, 4000, 5000] {
+            series.insert(ts, ts as f64);
+        }
+
+        let removed = series.drain_up_to(3000);
+        assert_eq!(removed, vec![(1000, 1000.0), (2000, 2000.0)]);
+        assert_eq!(series.len(), 3); // 3000, 4000 and 5000 are still buffered
+    }
+
+    #[test]
+    fn test_max_buffer_enforcement() {
+        let mut series = SeriesBuffer::new(3);
+        series.insert(1000, 1.0);
+        series.insert(2000, 2.0);
+        series.insert(3000, 3.0);
+        series.insert(4000, 4.0); // capacity is 3, so the oldest (1000) is evicted
+        assert_eq!(series.len(), 3);
+
+        let contents = series.read_range(0, 5000);
+        assert_eq!(contents, vec![(2000, 2.0), (3000, 3.0), (4000, 4.0)]);
+    }
+
+    #[test]
+    fn test_dedup_by_timestamp() {
+        let mut series = SeriesBuffer::new(100);
+        series.insert(1000, 1.0);
+        series.insert(1000, 2.0); // same key: the later value wins
+        assert_eq!(series.len(), 1);
+
+        let contents = series.read_range(0, 2000);
+        assert_eq!(contents, vec![(1000, 2.0)]);
+    }
+
+    #[test]
+    fn test_empty_operations() {
+        let series = SeriesBuffer::new(100);
+        assert_eq!(series.len(), 0);
+        assert!(series.is_empty());
+        assert_eq!(series.read_range(0, 1000), vec![]);
+    }
+}
diff --git a/asap-query-engine/src/precompute_engine/series_router.rs b/asap-query-engine/src/precompute_engine/series_router.rs
new file mode 100644
index 0000000..1f8533c
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/series_router.rs
@@ -0,0 +1,106 @@
+use std::time::Instant;
+use tokio::sync::mpsc;
+use xxhash_rust::xxh64::xxh64;
+
+/// A message sent from the router to a worker over that worker's channel.
+#[derive(Debug)]
+pub enum WorkerMessage {
+    /// A batch of samples that all belong to the series named by `series_key`.
+    Samples {
+        series_key: String, // the label string the router hashed to pick the worker
+        samples: Vec<(i64, f64)>, // (timestamp_ms, value)
+        ingest_received_at: Instant, // handed through unchanged by `SeriesRouter::route`
+    },
+    /// Signal the worker to flush/check idle windows.
+    Flush,
+    /// Graceful shutdown.
+    Shutdown,
+}
+
+/// Routes incoming samples to one of N workers by hashing the series label
+/// string, so a given series key always maps to the same worker index.
+pub struct SeriesRouter {
+    senders: Vec<mpsc::Sender<WorkerMessage>>, // one channel per worker, indexed by worker
+    num_workers: usize, // `senders.len()` captured in `new`; the modulus used for routing
+}
+
+impl SeriesRouter {
+    pub fn new(senders: Vec<mpsc::Sender<WorkerMessage>>) -> Self {
+        // Record the worker count before `senders` is moved into the struct.
+        Self {
+            num_workers: senders.len(),
+            senders,
+        }
+    }
+
+    /// Send one series' batch of samples to the worker selected by `worker_for`.
+    pub async fn route(
+        &self,
+        series_key: &str,
+        samples: Vec<(i64, f64)>,
+        ingest_received_at: Instant,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        let worker_idx = self.worker_for(series_key);
+        let msg = WorkerMessage::Samples {
+            series_key: series_key.to_string(),
+            samples,
+            ingest_received_at,
+        };
+        match self.senders[worker_idx].send(msg).await {
+            Ok(()) => Ok(()),
+            Err(e) => Err(format!("Failed to send to worker {}: {}", worker_idx, e).into()),
+        }
+    }
+
+    /// Send `WorkerMessage::Flush` to every worker in index order; stop at the first failure.
+    pub async fn broadcast_flush(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        for (worker_id, tx) in self.senders.iter().enumerate() {
+            if let Err(e) = tx.send(WorkerMessage::Flush).await {
+                let reason = format!("Failed to send flush to worker {}: {}", worker_id, e);
+                return Err(reason.into());
+            }
+        }
+        Ok(())
+    }
+
+    /// Send `WorkerMessage::Shutdown` to every worker in index order; stop at the first failure.
+    pub async fn broadcast_shutdown(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        for (worker_id, tx) in self.senders.iter().enumerate() {
+            if let Err(e) = tx.send(WorkerMessage::Shutdown).await {
+                let reason = format!("Failed to send shutdown to worker {}: {}", worker_id, e);
+                return Err(reason.into());
+            }
+        }
+        Ok(())
+    }
+
+    /// Map a series key to a worker index: xxh64 of its bytes (seed 0) modulo the worker count.
+    fn worker_for(&self, series_key: &str) -> usize {
+        let bucket = xxh64(series_key.as_bytes(), 0) as usize;
+        bucket % self.num_workers
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_consistent_routing() {
+        // Four throwaway channels: only the hashing is exercised, nothing is sent.
+        let mut senders = Vec::new();
+        let mut receivers = Vec::new();
+        for _ in 0..4 {
+            let (tx, rx) = mpsc::channel::<WorkerMessage>(10);
+            senders.push(tx);
+            receivers.push(rx);
+        }
+        let router = SeriesRouter::new(senders);
+        // Hashing the same key twice must pick the same worker.
+        let key = "cpu{host=\"a\"}";
+        assert_eq!(router.worker_for(key), router.worker_for(key));
+        // Other keys only need to route without panicking and stay in range.
+        let _ = router.worker_for("cpu{host=\"b\"}");
+        assert!(router.worker_for("mem{host=\"a\"}") < 4);
+    }
+}
diff --git a/asap-query-engine/src/precompute_engine/window_manager.rs b/asap-query-engine/src/precompute_engine/window_manager.rs
new file mode 100644
index 0000000..4d329da
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/window_manager.rs
@@ -0,0 +1,153 @@
+/// Computes epoch-aligned window boundaries and reports which windows close
+/// when a watermark moves forward.
+pub struct WindowManager {
+    /// Length of every window, in milliseconds.
+    window_size_ms: i64,
+    /// Distance between consecutive window starts, in milliseconds.
+    slide_interval_ms: i64,
+}
+
+impl WindowManager {
+    /// Build a manager from a window size and a slide interval given in seconds.
+    ///
+    /// Both are stored in milliseconds. A `slide_interval_secs` of zero selects a
+    /// tumbling window: the slide interval is set equal to the window size.
+    pub fn new(window_size_secs: u64, slide_interval_secs: u64) -> Self {
+        let to_ms = |secs: u64| (secs * 1000) as i64;
+        let window_size_ms = to_ms(window_size_secs);
+        let slide_interval_ms = match slide_interval_secs {
+            0 => window_size_ms,
+            secs => to_ms(secs),
+        };
+        Self {
+            window_size_ms,
+            slide_interval_ms,
+        }
+    }
+
+    /// Window length in milliseconds.
+    pub fn window_size_ms(&self) -> i64 {
+        self.window_size_ms
+    }
+
+    /// Start of the slide-aligned slot that contains `timestamp_ms`.
+    ///
+    /// Slots begin at multiples of the slide interval counted from the epoch;
+    /// `div_euclid` makes negative timestamps round down as well.
+    pub fn window_start_for(&self, timestamp_ms: i64) -> i64 {
+        timestamp_ms.div_euclid(self.slide_interval_ms) * self.slide_interval_ms
+    }
+
+    /// List the starts of windows that closed while the watermark moved from
+    /// `previous_wm` to `current_wm`, in ascending order.
+    ///
+    /// A window `[start, start + window_size_ms)` counts as closed once the
+    /// watermark is at or past its end. Nothing is reported when the watermark
+    /// did not advance, or when `previous_wm` is `i64::MIN` (no earlier
+    /// watermark, so the window holding the first sample is still open).
+    pub fn closed_windows(&self, previous_wm: i64, current_wm: i64) -> Vec<i64> {
+        let advanced = current_wm > previous_wm && previous_wm != i64::MIN;
+        if !advanced {
+            return Vec::new();
+        }
+
+        // Oldest window that could still have been open at `previous_wm`: its end
+        // must lie beyond `previous_wm`, so its start is at least
+        // `previous_wm - window_size_ms + 1`. Clamp at zero, then align downward.
+        let oldest_candidate = (previous_wm - self.window_size_ms + 1).max(0);
+        let mut start = self.window_start_for(oldest_candidate);
+        let mut newly_closed = Vec::new();
+        loop {
+            let end = start + self.window_size_ms;
+            if end > current_wm {
+                break; // first window that is still open at `current_wm`
+            }
+            // Keep only windows that were not already closed at `previous_wm`.
+            if end > previous_wm {
+                newly_closed.push(start);
+            }
+            start += self.slide_interval_ms;
+        }
+        newly_closed
+    }
+
+    /// Return `(start, end)` for the window beginning at `window_start`; `end` is exclusive.
+    pub fn window_bounds(&self, window_start: i64) -> (i64, i64) {
+        let window_end = window_start + self.window_size_ms;
+        (window_start, window_end)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_tumbling_window_start() {
+        // One-minute tumbling windows: starts fall on multiples of 60_000 ms.
+        let manager = WindowManager::new(60, 0);
+        let cases = [(0, 0), (59_999, 0), (60_000, 60_000), (119_999, 60_000)];
+        for (timestamp_ms, expected_start) in cases {
+            assert_eq!(manager.window_start_for(timestamp_ms), expected_start);
+        }
+        // A timestamp exactly on a boundary belongs to the window that starts there.
+        assert_eq!(manager.window_start_for(120_000), 120_000);
+    }
+
+    #[test]
+    fn test_no_closed_windows_on_first_sample() {
+        // `i64::MIN` as the previous watermark means no earlier watermark exists.
+        let manager = WindowManager::new(60, 0);
+        assert!(manager.closed_windows(i64::MIN, 30_000).is_empty());
+    }
+
+    #[test]
+    fn test_tumbling_window_close() {
+        let manager = WindowManager::new(60, 0);
+
+        // The watermark moves from 30_000 to 70_000 and so crosses 60_000,
+        // which is the end of window [0, 60_000); [60_000, 120_000) stays open.
+        let newly_closed = manager.closed_windows(30_000, 70_000);
+        let expected: Vec<i64> = vec![0];
+        assert_eq!(newly_closed, expected);
+    }
+
+    #[test]
+    fn test_multiple_window_closes() {
+        let manager = WindowManager::new(10, 0);
+
+        // A jump from 5_000 to 35_000 passes the ends of three 10 s windows
+        // (10_000, 20_000 and 30_000); they are reported oldest first.
+        let newly_closed = manager.closed_windows(5_000, 35_000);
+        assert_eq!(newly_closed, vec![0, 10_000, 20_000]);
+    }
+
+    #[test]
+    fn test_no_close_when_watermark_stagnant() {
+        // Previous and current watermark are equal, so nothing can have closed.
+        let manager = WindowManager::new(60, 0);
+        assert!(manager.closed_windows(30_000, 30_000).is_empty());
+    }
+
+    #[test]
+    fn test_window_bounds() {
+        let manager = WindowManager::new(60, 0);
+        let bounds = (manager.window_bounds(0), manager.window_bounds(60_000));
+        assert_eq!(bounds, ((0, 60_000), (60_000, 120_000)));
+    }
+
+    #[test]
+    fn test_sliding_window() {
+        // 30 s windows that start every 10 s.
+        let manager = WindowManager::new(30, 10);
+
+        // Starts are aligned to the slide interval, not to the window size.
+        for (timestamp_ms, expected_start) in [(0, 0), (9_999, 0), (10_000, 10_000)] {
+            assert_eq!(manager.window_start_for(timestamp_ms), expected_start);
+        }
+        // Moving the watermark from 15_000 to 35_000 passes 30_000, the end of
+        // window [0, 30_000), which was still open at 15_000.
+        let newly_closed = manager.closed_windows(15_000, 35_000);
+        assert_eq!(newly_closed, vec![0]);
+    }
+}
diff --git a/asap-query-engine/src/precompute_engine/worker.rs b/asap-query-engine/src/precompute_engine/worker.rs
new file mode 100644
index 0000000..fac2d23
--- /dev/null
+++ b/asap-query-engine/src/precompute_engine/worker.rs
@@ -0,0 +1,516 @@
+use crate::data_model::{AggregateCore, KeyByLabelValues, PrecomputedOutput};
+use crate::precompute_engine::accumulator_factory::{
+    create_accumulator_updater, AccumulatorUpdater,
+};
+use crate::precompute_engine::output_sink::OutputSink;
+use crate::precompute_engine::series_buffer::SeriesBuffer;
+use crate::precompute_engine::series_router::WorkerMessage;
+use crate::precompute_engine::window_manager::WindowManager;
+use crate::precompute_operators::sum_accumulator::SumAccumulator;
+use sketch_db_common::aggregation_config::AggregationConfig;
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::mpsc;
+use tracing::{debug, debug_span, info, warn};
+
+/// State one series keeps for one matching aggregation config: the config, its
+/// window manager, and the accumulators of windows that were not emitted yet.
+struct AggregationState {
+    config: AggregationConfig, // cloned from the worker's `agg_configs` entry
+    window_manager: WindowManager, // built from `config.window_size` and `config.slide_interval`
+    /// Active windows keyed by window_start_ms; an entry is removed when its window is emitted.
+    active_windows: HashMap<i64, Box<dyn AccumulatorUpdater>>,
+}
+
+/// Per-series state owned by the worker; created the first time a series key is seen.
+struct SeriesState {
+    buffer: SeriesBuffer, // receives accepted samples; its watermark drives window closing
+    previous_watermark_ms: i64, // watermark after the last batch or flush; starts at `i64::MIN`
+    /// One AggregationState per matching aggregation config (empty if no config matches).
+    aggregations: Vec<AggregationState>,
+}
+
+/// Worker that processes samples for a shard of the series space.
+pub struct Worker {
+    id: usize, // appears in this worker's log messages
+    receiver: mpsc::Receiver<WorkerMessage>, // inbound messages consumed by `run`
+    output_sink: Arc<dyn OutputSink>, // every emitted batch goes through `emit_batch`
+    /// Map from series key to per-series state, filled lazily as series are first seen.
+    series_map: HashMap<String, SeriesState>,
+    /// Aggregation configs, keyed by aggregation_id.
+    agg_configs: HashMap<u64, AggregationConfig>,
+    /// Max buffer size per series; passed to `SeriesBuffer::new` for each new series.
+    max_buffer_per_series: usize,
+    /// Allowed lateness in ms: samples older than watermark minus this value are dropped.
+    allowed_lateness_ms: i64,
+    /// When true, skip aggregation and pass raw samples through.
+    pass_raw_samples: bool,
+    /// Aggregation ID stamped on each raw-mode output.
+    raw_mode_aggregation_id: u64,
+}
+
+impl Worker {
+    #[allow(clippy::too_many_arguments)] // plain constructor: one argument per configured field
+    pub fn new(
+        id: usize,
+        receiver: mpsc::Receiver<WorkerMessage>,
+        output_sink: Arc<dyn OutputSink>,
+        agg_configs: HashMap<u64, AggregationConfig>,
+        max_buffer_per_series: usize,
+        allowed_lateness_ms: i64,
+        pass_raw_samples: bool,
+        raw_mode_aggregation_id: u64,
+    ) -> Self {
+        Self {
+            series_map: HashMap::new(), // filled lazily by `get_or_create_series_state`
+            id,
+            receiver,
+            output_sink,
+            agg_configs,
+            max_buffer_per_series,
+            allowed_lateness_ms,
+            pass_raw_samples,
+            raw_mode_aggregation_id,
+        }
+    }
+
+    /// Run the worker loop until a `Shutdown` message arrives or `recv()` returns `None`.
+    pub async fn run(mut self) {
+        info!("Worker {} started", self.id);
+
+        while let Some(msg) = self.receiver.recv().await {
+            match msg {
+                WorkerMessage::Samples {
+                    series_key,
+                    samples,
+                    ingest_received_at,
+                } => {
+                    let sample_count = samples.len(); // read before `samples` is moved below
+                    let _span = debug_span!(
+                        "worker_process",
+                        worker_id = self.id,
+                        series = %series_key,
+                        sample_count,
+                    )
+                    .entered();
+                    if let Err(e) = self.process_samples(&series_key, samples) {
+                        warn!("Worker {} error processing {}: {}", self.id, series_key, e);
+                    }
+                    debug!(
+                        e2e_latency_us = ingest_received_at.elapsed().as_micros() as u64,
+                        "e2e: ingest->worker complete"
+                    );
+                }
+                WorkerMessage::Flush => {
+                    if let Err(e) = self.flush_all() {
+                        warn!("Worker {} flush error: {}", self.id, e);
+                    }
+                }
+                WorkerMessage::Shutdown => {
+                    info!("Worker {} shutting down", self.id);
+                    // Final flush before shutdown; a failure is logged, not propagated.
+                    if let Err(e) = self.flush_all() {
+                        warn!("Worker {} final flush error: {}", self.id, e);
+                    }
+                    break;
+                }
+            }
+        } // NOTE(review): if `recv()` yields `None` without a `Shutdown`, no final flush runs
+
+        info!(
+            "Worker {} stopped, {} active series",
+            self.id,
+            self.series_map.len()
+        );
+    }
+
+    /// Collect `(aggregation_id, config)` for every config whose `metric`,
+    /// `spatial_filter_normalized`, or `spatial_filter` equals the series' metric name.
+    fn matching_agg_configs(&self, series_key: &str) -> Vec<(u64, &AggregationConfig)> {
+        let name = extract_metric_name(series_key);
+        let mut hits = Vec::new();
+        for (&agg_id, cfg) in &self.agg_configs {
+            let applies = cfg.metric == name
+                || cfg.spatial_filter_normalized == name
+                || cfg.spatial_filter == name;
+            if applies {
+                hits.push((agg_id, cfg));
+            }
+        }
+        hits
+    }
+
+    /// Return the state for `series_key`, creating it on first use.
+    ///
+    /// New state gets a fresh buffer, a previous watermark of `i64::MIN`, and one
+    /// `AggregationState` per config returned by `matching_agg_configs`.
+    fn get_or_create_series_state(&mut self, series_key: &str) -> &mut SeriesState {
+        if !self.series_map.contains_key(series_key) {
+            // One aggregation state per matching config, each with no active windows.
+            let mut aggregations = Vec::new();
+            for (_, cfg) in self.matching_agg_configs(series_key) {
+                aggregations.push(AggregationState {
+                    config: cfg.clone(),
+                    window_manager: WindowManager::new(cfg.window_size, cfg.slide_interval),
+                    active_windows: HashMap::new(),
+                });
+            }
+            let fresh = SeriesState {
+                buffer: SeriesBuffer::new(self.max_buffer_per_series),
+                previous_watermark_ms: i64::MIN,
+                aggregations,
+            };
+            self.series_map.insert(series_key.to_string(), fresh);
+        }
+
+        self.series_map.get_mut(series_key).unwrap()
+    }
+
+    /// Process one batch for `series_key`. Samples older than the buffer watermark minus
+    /// `allowed_lateness_ms` are dropped; the rest are buffered and accumulated per matching
+    /// aggregation, and windows closed by the watermark advance are emitted to the sink.
+    fn process_samples(
+        &mut self,
+        series_key: &str,
+        samples: Vec<(i64, f64)>,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        if self.pass_raw_samples {
+            return self.process_samples_raw(series_key, samples);
+        }
+
+        // Copy scalars out of self before taking &mut self.series_map
+        let worker_id = self.id;
+        let allowed_lateness_ms = self.allowed_lateness_ms;
+
+        // Ensure state exists
+        self.get_or_create_series_state(series_key);
+
+        let state = self.series_map.get_mut(series_key).unwrap();
+
+        if state.aggregations.is_empty() {
+            return Ok(());
+        }
+
+        // Buffer samples, dropping late arrivals. Lateness is judged against the watermark
+        // at the time each sample is seen; only accepted samples reach the accumulators.
+        let mut accepted: Vec<(i64, f64)> = Vec::with_capacity(samples.len());
+        for &(ts, val) in &samples {
+            let watermark = state.buffer.watermark_ms();
+            if watermark != i64::MIN && ts < watermark - allowed_lateness_ms {
+                debug!(
+                    "Worker {} dropping late sample for {}: ts={} watermark={}",
+                    worker_id,
+                    series_key,
+                    ts,
+                    watermark
+                );
+                continue;
+            }
+            state.buffer.insert(ts, val);
+            accepted.push((ts, val));
+        }
+
+        let current_wm = state.buffer.watermark_ms();
+        let previous_wm = state.previous_watermark_ms;
+
+        let mut emit_batch: Vec<(PrecomputedOutput, Box<dyn AggregateCore>)> = Vec::new();
+
+        for agg_state in &mut state.aggregations {
+            let closed = agg_state
+                .window_manager
+                .closed_windows(previous_wm, current_wm);
+
+            // Feed each accepted sample to the accumulator of the window starting at
+            // `window_start_for(ts)`. NOTE(review): with a slide shorter than the window
+            // size, other windows overlapping `ts` are not updated — confirm intended.
+            for &(ts, val) in &accepted {
+                let window_start = agg_state.window_manager.window_start_for(ts);
+                let updater = agg_state
+                    .active_windows
+                    .entry(window_start)
+                    .or_insert_with(|| create_accumulator_updater(&agg_state.config));
+
+                if updater.is_keyed() {
+                    let key = extract_key_from_series(series_key, &agg_state.config);
+                    updater.update_keyed(&key, val, ts);
+                } else {
+                    updater.update_single(val, ts);
+                }
+            }
+
+            // Emit closed windows
+            for window_start in &closed {
+                if let Some(mut updater) = agg_state.active_windows.remove(window_start) {
+                    let (_, window_end) = agg_state.window_manager.window_bounds(*window_start);
+                    let key = if updater.is_keyed() {
+                        Some(extract_key_from_series(series_key, &agg_state.config))
+                    } else {
+                        None
+                    };
+
+                    let output = PrecomputedOutput::new(
+                        *window_start as u64,
+                        window_end as u64,
+                        key,
+                        agg_state.config.aggregation_id,
+                    );
+                    let accumulator = updater.take_accumulator();
+                    emit_batch.push((output, accumulator));
+                }
+            }
+        }
+
+        state.previous_watermark_ms = current_wm;
+
+        // Emit to output sink
+        if !emit_batch.is_empty() {
+            debug!(
+                "Worker {} emitting {} outputs for {}",
+                worker_id,
+                emit_batch.len(),
+                series_key
+            );
+            self.output_sink.emit_batch(emit_batch)?;
+        }
+
+        Ok(())
+    }
+
+    /// Raw fast-path: no windowing. Every sample becomes its own output whose start and
+    /// end are both the sample timestamp, carrying a `SumAccumulator` of the value.
+    fn process_samples_raw(
+        &self,
+        series_key: &str,
+        samples: Vec<(i64, f64)>,
+    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        if samples.is_empty() {
+            return Ok(());
+        }
+        let agg_id = self.raw_mode_aggregation_id;
+        let outputs: Vec<(PrecomputedOutput, Box<dyn AggregateCore>)> = samples
+            .into_iter()
+            .map(|(ts, val)| {
+                let header = PrecomputedOutput::new(ts as u64, ts as u64, None, agg_id);
+                let sum: Box<dyn AggregateCore> = Box::new(SumAccumulator::with_sum(val));
+                (header, sum)
+            })
+            .collect();
+        debug!(
+            "Worker {} raw-emitting {} samples for {}",
+            self.id,
+            outputs.len(),
+            series_key
+        );
+        self.output_sink.emit_batch(outputs)?;
+        Ok(())
+    }
+
+    /// Flush all series: emit and remove every active window that is past due,
+    /// i.e. whose end is at or before the series watermark. The active windows are
+    /// scanned directly because `closed_windows(previous, current)` reports nothing
+    /// once `process_samples` has set `previous_watermark_ms` to the watermark,
+    /// which it does after every batch. Does nothing in raw mode.
+    fn flush_all(&mut self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+        if self.pass_raw_samples {
+            return Ok(());
+        }
+
+        let mut emit_batch: Vec<(PrecomputedOutput, Box<dyn AggregateCore>)> = Vec::new();
+
+        for (series_key, state) in &mut self.series_map {
+            let current_wm = state.buffer.watermark_ms();
+
+            for agg_state in &mut state.aggregations {
+                let manager = &agg_state.window_manager;
+                let mut due: Vec<i64> = agg_state.active_windows.keys().copied().collect();
+                due.retain(|&start| manager.window_bounds(start).1 <= current_wm);
+                due.sort_unstable(); // emit in ascending window order
+
+                for window_start in due {
+                    if let Some(mut updater) = agg_state.active_windows.remove(&window_start) {
+                        let (_, window_end) = manager.window_bounds(window_start);
+                        let key = if updater.is_keyed() {
+                            Some(extract_key_from_series(series_key, &agg_state.config))
+                        } else {
+                            None
+                        };
+                        let output = PrecomputedOutput::new(
+                            window_start as u64,
+                            window_end as u64,
+                            key,
+                            agg_state.config.aggregation_id,
+                        );
+                        emit_batch.push((output, updater.take_accumulator()));
+                    }
+                }
+            }
+
+            state.previous_watermark_ms = current_wm;
+        }
+
+        if !emit_batch.is_empty() {
+            debug!(
+                "Worker {} flush emitting {} outputs",
+                self.id,
+                emit_batch.len()
+            );
+            self.output_sink.emit_batch(emit_batch)?;
+        }
+
+        Ok(())
+    }
+}
+
+/// Return the part of a series key that precedes the first `{`, or the whole key
+/// when it contains no `{` (`"cpu{host=\"a\"}"` gives `"cpu"`, `"up"` gives `"up"`).
+pub fn extract_metric_name(series_key: &str) -> &str {
+    series_key
+        .split_once('{')
+        .map_or(series_key, |(name, _labels)| name)
+}
+
+/// Build the grouping key for a series: one value per entry of
+/// `config.grouping_labels`, in that order, read from the labels of `series_key`.
+///
+/// A grouping label that the series does not carry contributes an empty string.
+/// The series key format is: `metric_name{label1="val1",label2="val2",...}`
+pub fn extract_key_from_series(series_key: &str, config: &AggregationConfig) -> KeyByLabelValues {
+    let parsed = parse_labels_from_series_key(series_key);
+    let values: Vec<String> = config
+        .grouping_labels
+        .labels
+        .iter()
+        .map(|wanted| match parsed.get(wanted.as_str()) {
+            Some(found) => found.to_string(),
+            None => String::new(),
+        })
+        .collect();
+    KeyByLabelValues::new_with_labels(values)
+}
+
+/// Parse the `key="value"` pairs found between the braces of a series key.
+///
+/// `"metric{a=\"b\",c=\"d\"}"` → `{("a", "b"), ("c", "d")}`
+///
+/// The label section runs from just after the first `{` to the last `}`. Keys
+/// are trimmed of surrounding whitespace; values are the text between the
+/// quotes, returned as-is. All returned slices borrow from `series_key`.
+///
+/// Parsing is lenient: it stops at the first malformed pair and returns the
+/// pairs read so far. A key without braces, or with nothing between them,
+/// yields an empty map. If a label name repeats, the last value is kept.
+///
+/// NOTE(review): a value ends at the first `"` after its opening quote, so a
+/// value containing a quote character (even backslash-escaped) is cut short —
+/// confirm series keys never contain such values.
+fn parse_labels_from_series_key(series_key: &str) -> HashMap<&str, &str> {
+    let mut labels = HashMap::new();
+
+    // Locate the label section; bail out when the braces are missing or empty.
+    let (open, close) = match (series_key.find('{'), series_key.rfind('}')) {
+        (Some(open), Some(close)) if open + 1 < close => (open, close),
+        _ => return labels,
+    };
+
+    // Consume one pair per iteration until the section is exhausted or a pair
+    // turns out to be malformed.
+    let mut rest = &series_key[open + 1..close];
+    loop {
+        // Everything up to the next `=` is the label name.
+        let (raw_key, after_eq) = match rest.split_once('=') {
+            Some(parts) => parts,
+            None => break,
+        };
+
+        // The value has to open with a double quote...
+        let quoted = match after_eq.strip_prefix('"') {
+            Some(body) => body,
+            None => break,
+        };
+
+        // ...and runs up to the next double quote.
+        let (value, tail) = match quoted.split_once('"') {
+            Some(parts) => parts,
+            None => break,
+        };
+
+        labels.insert(raw_key.trim(), value);
+
+        // Skip the single comma separating this pair from the next, if present.
+        rest = match tail.strip_prefix(',') {
+            Some(next_pair) => next_pair,
+            None => tail,
+        };
+    }
+
+    labels
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_metric_name() {
+        // (series key, expected metric name)
+        let cases = [
+            ("http_requests_total{method=\"GET\"}", "http_requests_total"),
+            ("up", "up"),
+            ("cpu_usage{host=\"a\",zone=\"us\"}", "cpu_usage"),
+        ];
+        for (series_key, expected) in cases {
+            assert_eq!(extract_metric_name(series_key), expected);
+        }
+    }
+
+    #[test]
+    fn test_parse_labels() {
+        let parsed = parse_labels_from_series_key("metric{method=\"GET\",status=\"200\"}");
+        let seen = (parsed.get("method").copied(), parsed.get("status").copied());
+        assert_eq!(seen, (Some("GET"), Some("200")));
+    }
+
+    #[test]
+    fn test_parse_labels_no_labels() {
+        // A bare metric name has no brace section, hence no labels.
+        assert!(parse_labels_from_series_key("metric").is_empty());
+    }
+
+    #[test]
+    fn test_parse_labels_empty_braces() {
+        // Braces with nothing between them also produce no labels.
+        assert!(parse_labels_from_series_key("metric{}").is_empty());
+    }
+
+    #[test]
+    fn test_extract_key_from_series() {
+        let config = AggregationConfig::new(
+            1,
+            "SingleSubpopulation".to_string(),
+            "Sum".to_string(),
+            HashMap::new(),
+            promql_utilities::data_model::key_by_label_names::KeyByLabelNames::new(vec![
+                "method".to_string(),
+                "status".to_string(),
+            ]),
+            promql_utilities::data_model::key_by_label_names::KeyByLabelNames::new(vec![]),
+            promql_utilities::data_model::key_by_label_names::KeyByLabelNames::new(vec![]),
+            String::new(),
+            60,
+            30,
+            "tumbling".to_string(),
+            "http_requests_total".to_string(),
+            "http_requests_total".to_string(),
+            Some(60),
+            Some(0),
+            None,
+            None,
+        );
+
+        // Values come back in grouping-label order: `method` first, then `status`.
+        let series_key = "http_requests_total{method=\"GET\",status=\"200\"}";
+        let key = extract_key_from_series(series_key, &config);
+        let expected: Vec<String> = vec!["GET".into(), "200".into()];
+        assert_eq!(key.labels, expected);
+    }
+}