Skip to content

Commit ef13416

Browse files
committed
Add backend abstraction foundation for sketch implementations
1 parent 0226a98 commit ef13416

File tree

10 files changed

+336
-163
lines changed

10 files changed

+336
-163
lines changed

Cargo.lock

Lines changed: 28 additions & 162 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

asap-common/sketch-core/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,7 @@ serde.workspace = true
99
rmp-serde = "1.1"
1010
xxhash-rust = { version = "0.8", features = ["xxh32"] }
1111
dsrs = { git = "https://github.com/ProjectASAP/datasketches-rs" }
12+
clap = { version = "4.0", features = ["derive"] }
13+
14+
[dev-dependencies]
15+
ctor = "0.2"

asap-common/sketch-core/report.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Sketchlib Fidelity Report
2+
3+
Compares the **legacy** sketch implementations in `sketch-core` vs the new **sketchlib-rust** backends for:
4+
5+
- `CountMinSketch`
6+
- `CountMinSketchWithHeap` (Count-Min portion)
7+
- `KllSketch`
8+
- `HydraKllSketch` (via `KllSketch`)
9+
10+
## Running Fidelity Tests
11+
12+
The fidelity binary selects backends via CLI flags instead of environment variables.
13+
14+
| Goal | Command |
15+
|--------------------------|--------------------------------------------------------------------------------------------------------------|
16+
| Default (all sketchlib) | `cargo run -p sketch-core --bin sketchlib_fidelity` |
17+
| All legacy | `cargo run -p sketch-core --bin sketchlib_fidelity -- --cms-impl legacy --kll-impl legacy --cmwh-impl legacy` |
18+
| Legacy KLL only | `cargo run -p sketch-core --bin sketchlib_fidelity -- --cms-impl sketchlib --kll-impl legacy --cmwh-impl sketchlib` |
19+
20+
## Unit Tests
21+
22+
Unit tests always run with **legacy** backends enabled (the test ctor calls
23+
`force_legacy_mode_for_tests()`), so you only need:
24+
25+
```bash
26+
cargo test -p sketch-core
27+
```
28+
29+
## Results
30+
31+
Fidelity results will be added as sketch implementations are integrated in subsequent PRs.
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
use clap::Parser;
2+
use sketch_core::config::{self, ImplMode};
3+
4+
#[derive(Clone)]
5+
struct Lcg64 {
6+
state: u64,
7+
}
8+
9+
impl Lcg64 {
10+
fn new(seed: u64) -> Self {
11+
Self { state: seed }
12+
}
13+
14+
fn next_u64(&mut self) -> u64 {
15+
self.state = self
16+
.state
17+
.wrapping_mul(6364136223846793005)
18+
.wrapping_add(1442695040888963407);
19+
self.state
20+
}
21+
22+
fn next_f64_0_1(&mut self) -> f64 {
23+
let x = self.next_u64() >> 11;
24+
(x as f64) / ((1u64 << 53) as f64)
25+
}
26+
}
27+
28+
fn pearson_corr(exact: &[f64], est: &[f64]) -> f64 {
29+
let n = exact.len().min(est.len());
30+
if n == 0 {
31+
return f64::NAN;
32+
}
33+
let (mut sum_x, mut sum_y) = (0.0, 0.0);
34+
for i in 0..n {
35+
sum_x += exact[i];
36+
sum_y += est[i];
37+
}
38+
let mean_x = sum_x / (n as f64);
39+
let mean_y = sum_y / (n as f64);
40+
let (mut num, mut den_x, mut den_y) = (0.0, 0.0, 0.0);
41+
for i in 0..n {
42+
let dx = exact[i] - mean_x;
43+
let dy = est[i] - mean_y;
44+
num += dx * dy;
45+
den_x += dx * dx;
46+
den_y += dy * dy;
47+
}
48+
if den_x == 0.0 || den_y == 0.0 {
49+
return f64::NAN;
50+
}
51+
num / (den_x.sqrt() * den_y.sqrt())
52+
}
53+
54+
fn mape(exact: &[f64], est: &[f64]) -> f64 {
55+
let n = exact.len().min(est.len());
56+
let mut num = 0.0;
57+
let mut denom = 0.0;
58+
for i in 0..n {
59+
if exact[i] == 0.0 {
60+
continue;
61+
}
62+
num += ((exact[i] - est[i]) / exact[i]).abs();
63+
denom += 1.0;
64+
}
65+
if denom == 0.0 {
66+
return if exact == est { 0.0 } else { f64::INFINITY };
67+
}
68+
(num / denom) * 100.0
69+
}
70+
71+
fn rmse_percentage(exact: &[f64], est: &[f64]) -> f64 {
72+
let n = exact.len().min(est.len());
73+
let mut sum_sq = 0.0;
74+
let mut denom = 0.0;
75+
for i in 0..n {
76+
if exact[i] == 0.0 {
77+
continue;
78+
}
79+
let rel = (exact[i] - est[i]) / exact[i];
80+
sum_sq += rel * rel;
81+
denom += 1.0;
82+
}
83+
if denom == 0.0 {
84+
return if exact == est { 0.0 } else { f64::INFINITY };
85+
}
86+
(sum_sq / denom).sqrt() * 100.0
87+
}
88+
89+
fn rank_fraction(sorted: &[f64], x: f64) -> f64 {
90+
if sorted.is_empty() {
91+
return 0.0;
92+
}
93+
let idx = sorted.partition_point(|v| *v <= x);
94+
(idx as f64) / (sorted.len() as f64)
95+
}
96+
97+
#[derive(Parser)]
98+
struct Args {
99+
#[arg(long, value_enum, default_value = "sketchlib")]
100+
cms_impl: ImplMode,
101+
#[arg(long, value_enum, default_value = "sketchlib")]
102+
kll_impl: ImplMode,
103+
#[arg(long, value_enum, default_value = "sketchlib")]
104+
cmwh_impl: ImplMode,
105+
}
106+
107+
fn main() {
108+
let args = Args::parse();
109+
config::configure(args.cms_impl, args.kll_impl, args.cmwh_impl)
110+
.expect("sketch backend already initialised");
111+
112+
let mode = if matches!(args.cms_impl, ImplMode::Legacy)
113+
|| matches!(args.kll_impl, ImplMode::Legacy)
114+
|| matches!(args.cmwh_impl, ImplMode::Legacy)
115+
{
116+
"Legacy"
117+
} else {
118+
"sketchlib-rust"
119+
};
120+
121+
println!("# Sketchlib Fidelity Report ({})", mode);
122+
println!();
123+
println!("Fidelity tests will be added as sketch implementations are integrated.");
124+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
use std::sync::OnceLock;
2+
3+
/// Implementation mode for sketch-core internals.
4+
#[derive(Debug, Clone, Copy, PartialEq, Eq, clap::ValueEnum)]
5+
pub enum ImplMode {
6+
/// Use the original hand-written implementations.
7+
Legacy,
8+
/// Use sketchlib-rust backed implementations.
9+
Sketchlib,
10+
}
11+
12+
static COUNTMIN_MODE: OnceLock<ImplMode> = OnceLock::new();
13+
14+
/// Returns true if Count-Min operations should use sketchlib-rust internally.
15+
pub fn use_sketchlib_for_count_min() -> bool {
16+
*COUNTMIN_MODE.get_or_init(|| ImplMode::Sketchlib) == ImplMode::Sketchlib
17+
}
18+
19+
static KLL_MODE: OnceLock<ImplMode> = OnceLock::new();
20+
21+
/// Returns true if KLL operations should use sketchlib-rust internally.
22+
pub fn use_sketchlib_for_kll() -> bool {
23+
*KLL_MODE.get_or_init(|| ImplMode::Sketchlib) == ImplMode::Sketchlib
24+
}
25+
26+
static COUNTMIN_WITH_HEAP_MODE: OnceLock<ImplMode> = OnceLock::new();
27+
28+
/// Returns true if Count-Min-With-Heap operations should use sketchlib-rust internally for the
29+
/// Count-Min portion.
30+
pub fn use_sketchlib_for_count_min_with_heap() -> bool {
31+
*COUNTMIN_WITH_HEAP_MODE.get_or_init(|| ImplMode::Sketchlib) == ImplMode::Sketchlib
32+
}
33+
34+
/// Set backend modes for all sketch types. Call once at process startup,
35+
/// before any sketch operation. Returns Err if any OnceLock was already set.
36+
pub fn configure(cms: ImplMode, kll: ImplMode, cmwh: ImplMode) -> Result<(), &'static str> {
37+
let a = COUNTMIN_MODE.set(cms);
38+
let b = KLL_MODE.set(kll);
39+
let c = COUNTMIN_WITH_HEAP_MODE.set(cmwh);
40+
if a.is_err() || b.is_err() || c.is_err() {
41+
Err("configure() called after sketch backends were already initialised")
42+
} else {
43+
Ok(())
44+
}
45+
}
46+
47+
pub fn force_legacy_mode_for_tests() {
48+
let _ = COUNTMIN_MODE.set(ImplMode::Legacy);
49+
let _ = KLL_MODE.set(ImplMode::Legacy);
50+
let _ = COUNTMIN_WITH_HEAP_MODE.set(ImplMode::Legacy);
51+
}
52+
53+
/// Helper used by UDF templates and documentation examples to parse implementation mode
54+
/// from environment variables in a robust way. This is not used in the hot path.
55+
pub fn parse_mode(var: Result<String, std::env::VarError>) -> ImplMode {
56+
match var {
57+
Ok(v) => match v.to_ascii_lowercase().as_str() {
58+
"legacy" => ImplMode::Legacy,
59+
"sketchlib" => ImplMode::Sketchlib,
60+
other => {
61+
eprintln!(
62+
"sketch-core: unrecognised IMPL value {other:?}, defaulting to Sketchlib"
63+
);
64+
ImplMode::Sketchlib
65+
}
66+
},
67+
Err(std::env::VarError::NotPresent) => ImplMode::Sketchlib,
68+
Err(std::env::VarError::NotUnicode(v)) => {
69+
eprintln!(
70+
"sketch-core: IMPL env var has invalid UTF-8 ({v:?}), defaulting to Sketchlib"
71+
);
72+
ImplMode::Sketchlib
73+
}
74+
}
75+
}

asap-common/sketch-core/src/lib.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
#[cfg(test)]
2+
#[ctor::ctor]
3+
fn init_sketch_legacy_for_tests() {
4+
crate::config::force_legacy_mode_for_tests();
5+
}
6+
7+
pub mod config;
18
pub mod count_min;
29
pub mod count_min_with_heap;
310
pub mod delta_set_aggregator;

asap-query-engine/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ flate2 = "1.0"
3939
async-trait = "0.1"
4040
xxhash-rust = { version = "0.8", features = ["xxh32", "xxh64"] }
4141
dsrs = { git = "https://github.com/ProjectASAP/datasketches-rs" }
42-
sketchlib-rust = { git = "https://github.com/ProjectASAP/sketchlib-rust" }
4342
base64 = "0.21"
4443
hex = "0.4"
4544
sqlparser = "0.59.0"
@@ -59,6 +58,7 @@ reqwest = { version = "0.11", features = ["json"] }
5958
tracing-appender = "0.2"
6059

6160
[dev-dependencies]
61+
ctor = "0.2"
6262
tempfile = "3.20.0"
6363

6464
[features]
@@ -68,3 +68,4 @@ default = []
6868
lock_profiling = []
6969
# Enable extra debugging output
7070
extra_debugging = []
71+
sketchlib-tests = []

asap-query-engine/src/lib.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
#[cfg(test)]
2+
#[ctor::ctor]
3+
fn init_sketch_backend_for_tests() {
4+
#[cfg(feature = "sketchlib-tests")]
5+
let _ = sketch_core::config::configure(
6+
sketch_core::config::ImplMode::Sketchlib,
7+
sketch_core::config::ImplMode::Sketchlib,
8+
sketch_core::config::ImplMode::Sketchlib,
9+
);
10+
#[cfg(not(feature = "sketchlib-tests"))]
11+
sketch_core::config::force_legacy_mode_for_tests();
12+
}
13+
114
pub mod data_model;
215
pub mod drivers;
316
pub mod engines;

asap-query-engine/src/main.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ use std::sync::Arc;
55
use tokio::signal;
66
use tracing::{error, info};
77

8+
use sketch_core::config::{self, ImplMode};
9+
810
use query_engine_rust::data_model::enums::{InputFormat, LockStrategy, StreamingEngine};
911
use query_engine_rust::drivers::AdapterConfig;
1012
use query_engine_rust::utils::file_io::{read_inference_config, read_streaming_config};
@@ -108,6 +110,18 @@ struct Args {
108110
#[arg(long)]
109111
promsketch_config: Option<String>,
110112

113+
/// Backend implementation for Count-Min Sketch (legacy | sketchlib)
114+
#[arg(long, value_enum, default_value = "sketchlib")]
115+
sketch_cms_impl: ImplMode,
116+
117+
/// Backend implementation for KLL Sketch (legacy | sketchlib)
118+
#[arg(long, value_enum, default_value = "sketchlib")]
119+
sketch_kll_impl: ImplMode,
120+
121+
/// Backend implementation for Count-Min-With-Heap (legacy | sketchlib)
122+
#[arg(long, value_enum, default_value = "sketchlib")]
123+
sketch_cmwh_impl: ImplMode,
124+
111125
/// Enable OTLP metrics ingest (gRPC + HTTP)
112126
#[arg(long)]
113127
enable_otel_ingest: bool,
@@ -125,6 +139,14 @@ struct Args {
125139
async fn main() -> Result<()> {
126140
let args = Args::parse();
127141

142+
// Configure sketch-core backends before any sketch operations.
143+
config::configure(
144+
args.sketch_cms_impl,
145+
args.sketch_kll_impl,
146+
args.sketch_cmwh_impl,
147+
)
148+
.expect("sketch backend already initialised");
149+
128150
// Create output directory
129151
fs::create_dir_all(&args.output_dir)?;
130152

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
//! Integration test that runs the library test suite with the sketchlib backend.
2+
//!
3+
//! When you run `cargo test -p query_engine_rust` (without --features sketchlib-tests),
4+
//! the lib tests run with the legacy backend. This test spawns a second run with the
5+
//! sketchlib backend so both modes are exercised in one `cargo test` invocation.
6+
//!
7+
//! This test is only compiled when sketchlib-tests is NOT enabled, to avoid recursion.
8+
9+
#[cfg(not(feature = "sketchlib-tests"))]
10+
#[test]
11+
fn test_sketchlib_backend() {
12+
use std::process::Command;
13+
14+
let status = Command::new(env!("CARGO"))
15+
.args([
16+
"test",
17+
"-p",
18+
"query_engine_rust",
19+
"--lib",
20+
"--features",
21+
"sketchlib-tests",
22+
])
23+
.status()
24+
.expect("failed to spawn cargo test");
25+
26+
assert!(
27+
status.success(),
28+
"sketchlib backend tests failed (run `cargo test -p query_engine_rust --lib --features sketchlib-tests` for details)"
29+
);
30+
}

0 commit comments

Comments
 (0)