From 4f2f549a14a5faeaf371f22875e5bb3989a99396 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 12:40:37 -0400 Subject: [PATCH 01/25] step 1: adding pagination parameters --- api/src/helpers/mod.rs | 5 ++++- api/src/main.rs | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/api/src/helpers/mod.rs b/api/src/helpers/mod.rs index cde8b86..736061f 100644 --- a/api/src/helpers/mod.rs +++ b/api/src/helpers/mod.rs @@ -8,4 +8,7 @@ pub mod schema; pub use schema::*; pub mod filters; -pub use filters::*; \ No newline at end of file +pub use filters::*; + +pub mod dataset_config; +pub use dataset_config::*; \ No newline at end of file diff --git a/api/src/main.rs b/api/src/main.rs index 42df6ec..374a5d5 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -17,6 +17,7 @@ use api::helpers::filters; use api::helpers::transforms; use api::helpers::schema; use api::helpers::helpers; +use api::helpers::dataset_config; use mongodb::{options::FindOptions, bson::Document, error::Result}; use actix_web::{get, web, App, HttpResponse, HttpServer, Responder}; @@ -37,6 +38,11 @@ static TIMESERIES: Lazy>>> = Lazy::new(|| Mutex::new( async fn search_data_schema(query_params: web::Query) -> impl Responder { let params = query_params.into_inner(); + // Dataset-specific request-size policy. Step 1 of the pagination work + // just binds this; later steps will consume `tile_degrees` (for tile + // generation) and `max_radius_meters` (for center+radius caps). 
+ let _config = &dataset_config::BSOSE_CONFIG; + // validate query params //////////////////////////////////////// match helpers::validate_query_params(&params) { Ok(_) => {}, From dde38481a5375d4354091283be12e37ddb04b393 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 12:41:10 -0400 Subject: [PATCH 02/25] step 1: adding pagination parameters --- api/src/helpers/dataset_config.rs | 67 +++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 api/src/helpers/dataset_config.rs diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs new file mode 100644 index 0000000..3811335 --- /dev/null +++ b/api/src/helpers/dataset_config.rs @@ -0,0 +1,67 @@ +//! Per-dataset configuration governing request-size limits. +//! +//! This module is the seam where pagination decisions hang off the dataset +//! identity. Future steps will consult `tile_degrees` to generate spatial +//! pagination tiles, and `max_radius_meters` to reject oversize `center + +//! radius` queries (which go through MongoDB `$near` / `$geoNear` and aren't +//! paginated). +//! +//! Step 1 (current): introduce the type and a BSOSE-specific instance. No +//! behaviour change yet — the handler binds the config but does not act on it. + +/// Per-dataset request-size policy. +/// +/// `tile_degrees`: edge length (degrees of longitude and latitude) of one +/// spatial pagination tile. For grid-uniform datasets, choose this so that +/// one (tile × single level) page contains at most ~1600 documents. For +/// BSOSE (1/4° grid) that means 10° tiles. +/// +/// `max_radius_meters`: hard upper bound on the `radius` query parameter for +/// `center + radius` requests. These bypass tile pagination because Mongo's +/// `$near` enforces its own bound; we cap the bound so a malicious or naive +/// caller can't ask for a half-globe disk. 
+pub struct DatasetConfig { + pub tile_degrees: f64, + pub max_radius_meters: f64, +} + +/// Configuration for the BSOSE timeseries dataset. +/// +/// 10° tiles × 4 grid cells/degree = 40 × 40 = 1600 cells per (tile, level). +/// `max_radius_meters` is a placeholder; revisit with a real +/// operational limit once we have request-distribution data. +pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig { + tile_degrees: 10.0, + max_radius_meters: 2_000_000.0, // 2000 km — placeholder +}; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn bsose_tile_degrees_is_positive_and_divides_a_hemisphere() { + assert!(BSOSE_CONFIG.tile_degrees > 0.0); + // We don't strictly require integer-divisibility of 180/360 by + // tile_degrees (the tile generator will handle ragged remainders), + // but a divisor is a useful invariant to flag if someone bumps the + // value to something exotic like 7.0. + assert!( + (180.0_f64 % BSOSE_CONFIG.tile_degrees).abs() < 1e-9, + "tile_degrees should evenly divide 180° for clean global coverage" + ); + assert!( + (360.0_f64 % BSOSE_CONFIG.tile_degrees).abs() < 1e-9, + "tile_degrees should evenly divide 360° for clean global coverage" + ); + } + + #[test] + fn bsose_max_radius_is_positive_and_subhemispheric() { + assert!(BSOSE_CONFIG.max_radius_meters > 0.0); + // Earth's mean radius is ~6.371e6 m; a half-circumference is ~2.0e7 m. + // We want our cap well under that so we never approach the + // antipode-degenerate case that breaks Mongo geo queries. 
+ assert!(BSOSE_CONFIG.max_radius_meters < 1.0e7); + } +} From 0d46f6ab55e82d6e273f0805baa9c0a0660cafa0 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 12:49:31 -0400 Subject: [PATCH 03/25] step 2: level awareness --- api/src/helpers/dataset_config.rs | 54 +++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 3811335..79b561b 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -20,11 +20,31 @@ /// `center + radius` requests. These bypass tile pagination because Mongo's /// `$near` enforces its own bound; we cap the bound so a malicious or naive /// caller can't ask for a half-globe disk. +/// +/// `levels`: the discrete vertical levels the dataset is sampled at, in +/// strictly increasing order (shallowest first). Pagination treats each +/// level as a separate page within a spatial tile. Datasets without a +/// vertical dimension can pass a single-element slice (effectively a single +/// "level" per tile). pub struct DatasetConfig { pub tile_degrees: f64, pub max_radius_meters: f64, + pub levels: &'static [f64], } +/// Placeholder BSOSE level spectrum. +/// +/// These are *not* the real BSOSE levels — Katie will overwrite them with +/// the actual depths once we have them in hand. The shape (roughly: +/// near-surface dense, deep-ocean coarse, ~5500 m bottom) is representative +/// of typical Southern Ocean gridded products so the rest of the pagination +/// machinery sees plausible input. +pub const BSOSE_LEVELS: &[f64] = &[ + 5.0, 15.0, 25.0, 40.0, 60.0, 85.0, 120.0, 165.0, 220.0, 290.0, + 380.0, 490.0, 625.0, 790.0, 990.0, 1230.0, 1520.0, 1870.0, + 2290.0, 2790.0, 3380.0, 4070.0, 4870.0, 5575.0, +]; + /// Configuration for the BSOSE timeseries dataset. /// /// 10° tiles × 4 grid cells/degree = 40 × 40 = 1600 cells per (tile, level). 
@@ -33,6 +53,7 @@ pub struct DatasetConfig { pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig { tile_degrees: 10.0, max_radius_meters: 2_000_000.0, // 2000 km — placeholder + levels: BSOSE_LEVELS, }; #[cfg(test)] @@ -64,4 +85,37 @@ mod tests { // antipode-degenerate case that breaks Mongo geo queries. assert!(BSOSE_CONFIG.max_radius_meters < 1.0e7); } + + #[test] + fn bsose_levels_is_non_empty() { + // Pagination treats each level as a page; an empty level list would + // produce a dataset with zero pages, which is almost certainly a + // misconfiguration rather than an intentional state. + assert!(!BSOSE_CONFIG.levels.is_empty()); + } + + #[test] + fn bsose_levels_is_strictly_increasing() { + // The tile generator will rely on level order to map a level index + // to a (lower, upper) depth bracket. If two levels collide or the + // sequence reverses, that mapping is ambiguous. + for w in BSOSE_CONFIG.levels.windows(2) { + assert!( + w[0] < w[1], + "levels must be strictly increasing; found {} not < {}", + w[0], + w[1] + ); + } + } + + #[test] + fn bsose_levels_are_all_non_negative() { + // Depth is conventionally positive-downward in oceanographic data; + // a negative value would indicate a sign-convention bug we'd want + // to catch early. 
+ for &d in BSOSE_CONFIG.levels { + assert!(d >= 0.0, "level depths should be non-negative; got {}", d); + } + } } From 7901160d8e94f1c0a2c0aada6a39b9ce0c99d768 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 13:18:08 -0400 Subject: [PATCH 04/25] step 3: tile generation --- api/src/helpers/mod.rs | 5 +- api/src/helpers/tile_generator.rs | 502 ++++++++++++++++++++++++++++++ 2 files changed, 506 insertions(+), 1 deletion(-) create mode 100644 api/src/helpers/tile_generator.rs diff --git a/api/src/helpers/mod.rs b/api/src/helpers/mod.rs index 736061f..795d6dc 100644 --- a/api/src/helpers/mod.rs +++ b/api/src/helpers/mod.rs @@ -11,4 +11,7 @@ pub mod filters; pub use filters::*; pub mod dataset_config; -pub use dataset_config::*; \ No newline at end of file +pub use dataset_config::*; + +pub mod tile_generator; +pub use tile_generator::*; \ No newline at end of file diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs new file mode 100644 index 0000000..624d85e --- /dev/null +++ b/api/src/helpers/tile_generator.rs @@ -0,0 +1,502 @@ +//! Pure-function tile generator for paginated geo+depth queries. +//! +//! Given the user's query params and a per-dataset config, produce the +//! ordered sequence of `TileSpec`s that pagination will walk through. Each +//! `TileSpec` carries an optional grid-aligned bounding box (the spatial +//! page boundary) and an optional level index (the depth page). +//! +//! Iteration order is **spatial outer, level inner**: for each spatial tile, +//! all dataset levels are emitted before moving to the next spatial tile. +//! This means a client paginating linearly sees the full water column at +//! one location before getting any data from the next location — friendlier +//! for vertical-profile analysis than horizontal-slab analysis. +//! +//! The tiles are *grid-aligned* multiples of `config.tile_degrees`. We do +//! NOT clip tiles to the user's box or polygon: the user's geo filter stays +//! 
in the Mongo query, and Mongo intersects it with the tile bbox at query +//! time. This means a 10°×10° tile gives the BSOSE-class upper bound of +//! ~1600 docs even when the user's query covers only a sliver of it. +//! +//! Step 3 scope: this is a pure function. It does not look up the database, +//! it does not skip empty tiles, and it does not compose with the existing +//! Mongo filter. Those are Steps 4–6. +//! +//! Known limitation: polygons that cross the antimeridian produce a naive +//! bbox that spans most of the globe (min_lon ≈ -180, max_lon ≈ +180). The +//! existing filter code doesn't handle antimeridian polygons either, so we +//! match that behaviour for now and leave a proper fix for later. + +use serde_json::Value; + +use super::dataset_config::DatasetConfig; + +/// A longitude/latitude bounding box. `sw` is the south-west corner +/// (min lon, min lat); `ne` is the north-east corner (max lon, max lat). +/// The box is *half-open* in both dimensions: `[sw_lon, ne_lon) × [sw_lat, +/// ne_lat)`. Documents on the south or west edge belong to the tile; +/// documents on the north or east edge belong to the next tile over. This +/// matters at tile boundaries for grid-aligned datasets like BSOSE — see +/// the box construction in `grid_aligned_tiles`. +#[derive(Debug, Clone, PartialEq)] +pub struct BoundingBox { + pub sw: [f64; 2], + pub ne: [f64; 2], +} + +/// One unit of pagination. Both fields are `Option` because some query +/// shapes naturally suppress one or the other: +/// +/// - `id` lookups have no spatial or level constraint beyond the id +/// itself, so both fields are `None`. +/// - `center + radius` is level-paginated but not spatially tiled, so +/// `tile_bbox` is `None` and `level_index` walks the dataset's levels. +/// - polygon / box / whole-globe requests produce a full grid of +/// `Some(bbox) + Some(level_index)` tiles. 
+#[derive(Debug, Clone, PartialEq)] +pub struct TileSpec { + pub tile_bbox: Option<BoundingBox>, + pub level_index: Option<usize>, +} + +/// Produce the ordered tile sequence for one request. +/// +/// Precondition: the caller has already validated `params` via +/// `helpers::validate_query_params`, so polygon strings are parseable, box +/// arrays have the right shape, and center/radius are paired. This function +/// is defensive about parse failures (returns an empty vec) but does not +/// re-validate semantics. +pub fn generate_tiles(params: &Value, config: &DatasetConfig) -> Vec<TileSpec> { + // id wins over everything else: it's a primary-key lookup, no tiling + // adds value. One TileSpec with no extra constraints. + if params.get("id").is_some() { + return vec![TileSpec { + tile_bbox: None, + level_index: None, + }]; + } + + // center + radius: level-only pagination. The `$near` query is bounded + // by `max_radius_meters` (enforced in a later step), so we don't tile + // it spatially. + if params.get("center").is_some() { + return level_only_tiles(config); + } + + // polygon: tile the polygon's naive bbox. The polygon itself remains in + // the Mongo filter; Mongo computes (polygon ∩ tile_bbox) per page. + if let Some(polygon) = params.get("polygon").and_then(|v| v.as_str()) { + return polygon_tiles(polygon, config); + } + + // box: tile the box, splitting at the antimeridian if needed. + if let Some(boxregion) = params.get("box").and_then(|v| v.as_str()) { + return box_tiles(boxregion, config); + } + + // no spatial filter: tile the whole globe. 
+ whole_globe_tiles(config) +} + +// --------------------------------------------------------------------------- + +fn level_only_tiles(config: &DatasetConfig) -> Vec<TileSpec> { + (0..config.levels.len()) + .map(|i| TileSpec { + tile_bbox: None, + level_index: Some(i), + }) + .collect() +} + +fn whole_globe_tiles(config: &DatasetConfig) -> Vec<TileSpec> { + cross_levels( + grid_aligned_tiles( + BoundingBox { + sw: [-180.0, -90.0], + ne: [180.0, 90.0], + }, + config.tile_degrees, + ), + config, + ) +} + +fn box_tiles(boxregion: &str, config: &DatasetConfig) -> Vec<TileSpec> { + let parsed: Vec<Vec<f64>> = match serde_json::from_str(boxregion) { + Ok(v) => v, + Err(_) => return Vec::new(), + }; + if parsed.len() != 2 || parsed[0].len() != 2 || parsed[1].len() != 2 { + return Vec::new(); + } + let sw = [parsed[0][0], parsed[0][1]]; + let ne = [parsed[1][0], parsed[1][1]]; + + // Dateline-crossing box: sw lon > ne lon means the box wraps the + // antimeridian. We split it into two grid-aligned pieces, one ending at + // +180 and one starting at -180. This mirrors what the existing + // `box_filter` does for the Mongo predicate. 
+ let sub_boxes: Vec<BoundingBox> = if sw[0] > ne[0] { + vec![ + BoundingBox { + sw, + ne: [180.0, ne[1]], + }, + BoundingBox { + sw: [-180.0, sw[1]], + ne, + }, + ] + } else { + vec![BoundingBox { sw, ne }] + }; + + let mut tiles = Vec::new(); + for bbox in sub_boxes { + tiles.extend(grid_aligned_tiles(bbox, config.tile_degrees)); + } + cross_levels(tiles, config) +} + +fn polygon_tiles(polygon: &str, config: &DatasetConfig) -> Vec<TileSpec> { + let coords: Vec<Vec<f64>> = match serde_json::from_str(polygon) { + Ok(v) => v, + Err(_) => return Vec::new(), + }; + + let mut min_lon = f64::INFINITY; + let mut max_lon = f64::NEG_INFINITY; + let mut min_lat = f64::INFINITY; + let mut max_lat = f64::NEG_INFINITY; + for pt in &coords { + if pt.len() < 2 { + continue; + } + if pt[0] < min_lon { + min_lon = pt[0]; + } + if pt[0] > max_lon { + max_lon = pt[0]; + } + if pt[1] < min_lat { + min_lat = pt[1]; + } + if pt[1] > max_lat { + max_lat = pt[1]; + } + } + if !min_lon.is_finite() || !max_lat.is_finite() { + return Vec::new(); + } + + cross_levels( + grid_aligned_tiles( + BoundingBox { + sw: [min_lon, min_lat], + ne: [max_lon, max_lat], + }, + config.tile_degrees, + ), + config, + ) +} + +/// Tile a bbox into grid-aligned cells of side `tile_degrees`. The grid is +/// anchored at integer multiples of `tile_degrees` from the origin (so for +/// tile_degrees=10, edges are at ...,-20, -10, 0, 10, 20,...). Each emitted +/// tile is half-open: `[lon, lon+T) × [lat, lat+T)`. +fn grid_aligned_tiles(bbox: BoundingBox, tile_degrees: f64) -> Vec<BoundingBox> { + let mut out = Vec::new(); + // Snap the bbox's SW corner down to the nearest grid line. These are + // the *first* tile's SW corner. Subsequent tiles step by tile_degrees. 
+ let lat_start = (bbox.sw[1] / tile_degrees).floor() * tile_degrees; + let lon_start = (bbox.sw[0] / tile_degrees).floor() * tile_degrees; + + let mut lat = lat_start; + while lat < bbox.ne[1] { + let mut lon = lon_start; + while lon < bbox.ne[0] { + out.push(BoundingBox { + sw: [lon, lat], + ne: [lon + tile_degrees, lat + tile_degrees], + }); + lon += tile_degrees; + } + lat += tile_degrees; + } + + out +} + +/// Cross a list of spatial tiles with the dataset's levels in +/// spatial-outer / level-inner order. +fn cross_levels(spatial: Vec<BoundingBox>, config: &DatasetConfig) -> Vec<TileSpec> { + let n_levels = config.levels.len(); + let mut out = Vec::with_capacity(spatial.len() * n_levels.max(1)); + for bbox in spatial { + if n_levels == 0 { + // Defensive: a dataset with zero levels shouldn't pass our + // config tests, but if it did we'd still emit one tile per + // spatial cell so the request returns *something*. + out.push(TileSpec { + tile_bbox: Some(bbox), + level_index: None, + }); + } else { + for i in 0..n_levels { + out.push(TileSpec { + tile_bbox: Some(bbox.clone()), + level_index: Some(i), + }); + } + } + } + out +} + +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + /// A small, hand-checkable config. Two levels keeps the level multiplier + /// out of the way; 10° tiles match BSOSE. + const TEST_CONFIG: DatasetConfig = DatasetConfig { + tile_degrees: 10.0, + max_radius_meters: 1.0e6, + levels: &[0.0, 100.0], + }; + + // ---- top-level dispatch -------------------------------------------------- + + #[test] + fn id_short_circuits_to_single_null_tile() { + let tiles = generate_tiles(&json!({"id": "doc1"}), &TEST_CONFIG); + assert_eq!( + tiles, + vec![TileSpec { + tile_bbox: None, + level_index: None + }] + ); + } + + #[test] + fn id_wins_over_other_geo_params() { + // If the caller passes id alongside a box, id should still + // short-circuit. 
(Validate-step normally forbids combining geo + // filters, but `id` isn't in that mutual-exclusion check.) + let tiles = generate_tiles( + &json!({"id": "doc1", "box": "[[0,0],[10,10]]"}), + &TEST_CONFIG, + ); + assert_eq!(tiles.len(), 1); + assert_eq!(tiles[0].tile_bbox, None); + } + + #[test] + fn center_radius_is_level_only() { + let tiles = generate_tiles( + &json!({"center": "[0,0]", "radius": "1000"}), + &TEST_CONFIG, + ); + assert_eq!(tiles.len(), TEST_CONFIG.levels.len()); + for (i, t) in tiles.iter().enumerate() { + assert_eq!(t.tile_bbox, None, "no spatial tile for center+radius"); + assert_eq!(t.level_index, Some(i), "level pages in order"); + } + } + + // ---- whole-globe --------------------------------------------------------- + + #[test] + fn empty_params_tile_whole_globe() { + let tiles = generate_tiles(&json!({}), &TEST_CONFIG); + // 18 lat rows × 36 lon cols × 2 levels = 1296 + assert_eq!(tiles.len(), 18 * 36 * TEST_CONFIG.levels.len()); + } + + #[test] + fn whole_globe_first_tile_is_southwest_corner_level_zero() { + let tiles = generate_tiles(&json!({}), &TEST_CONFIG); + assert_eq!( + tiles[0], + TileSpec { + tile_bbox: Some(BoundingBox { + sw: [-180.0, -90.0], + ne: [-170.0, -80.0], + }), + level_index: Some(0), + } + ); + } + + #[test] + fn whole_globe_last_tile_is_northeast_corner_top_level() { + let tiles = generate_tiles(&json!({}), &TEST_CONFIG); + let last = tiles.last().unwrap(); + assert_eq!( + last, + &TileSpec { + tile_bbox: Some(BoundingBox { + sw: [170.0, 80.0], + ne: [180.0, 90.0], + }), + level_index: Some(TEST_CONFIG.levels.len() - 1), + } + ); + } + + // ---- ordering: spatial outer, level inner -------------------------------- + + #[test] + fn ordering_is_spatial_outer_level_inner() { + let tiles = generate_tiles(&json!({"box": "[[0,0],[20,10]]"}), &TEST_CONFIG); + // 2 spatial tiles × 2 levels = 4 specs, in order: + // (tile0, lvl0), (tile0, lvl1), (tile1, lvl0), (tile1, lvl1) + assert_eq!(tiles.len(), 4); + 
assert_eq!(tiles[0].level_index, Some(0)); + assert_eq!(tiles[1].level_index, Some(1)); + assert_eq!(tiles[2].level_index, Some(0)); + assert_eq!(tiles[3].level_index, Some(1)); + // Same spatial tile for indices 0,1; same for 2,3; different + // between the pairs. + assert_eq!(tiles[0].tile_bbox, tiles[1].tile_bbox); + assert_eq!(tiles[2].tile_bbox, tiles[3].tile_bbox); + assert_ne!(tiles[0].tile_bbox, tiles[2].tile_bbox); + } + + // ---- box ----------------------------------------------------------------- + + #[test] + fn single_cell_box_grid_aligned() { + let tiles = generate_tiles(&json!({"box": "[[0,0],[10,10]]"}), &TEST_CONFIG); + assert_eq!(tiles.len(), TEST_CONFIG.levels.len()); + assert_eq!( + tiles[0].tile_bbox, + Some(BoundingBox { + sw: [0.0, 0.0], + ne: [10.0, 10.0] + }) + ); + } + + #[test] + fn multi_cell_box_emits_grid_tiles() { + // User box (3.5, 7.2) → (24.8, 19.1). Snaps to grid lines at 0, 10, + // 20, 30 longitude and 0, 10, 20 latitude. Tiles covering the box: + // 3 cols × 2 rows = 6 tiles. + let tiles = generate_tiles( + &json!({"box": "[[3.5,7.2],[24.8,19.1]]"}), + &TEST_CONFIG, + ); + assert_eq!(tiles.len(), 6 * TEST_CONFIG.levels.len()); + + // First spatial tile should be the SW grid cell [0..10, 0..10]. + // Tiles are NOT clipped to the user box — the user box stays in the + // Mongo filter. + assert_eq!( + tiles[0].tile_bbox, + Some(BoundingBox { + sw: [0.0, 0.0], + ne: [10.0, 10.0] + }) + ); + + // Distinct spatial tiles + let spatial: std::collections::HashSet<_> = + tiles.iter().map(|t| t.tile_bbox.clone()).collect(); + assert_eq!(spatial.len(), 6); + } + + #[test] + fn dateline_crossing_box_splits_into_two_bands() { + // SW lon 170 > NE lon -170 — the box wraps the antimeridian. + let tiles = generate_tiles( + &json!({"box": "[[170,10],[-170,20]]"}), + &TEST_CONFIG, + ); + // Two sub-boxes, each one 10°×10° = one grid cell, × 2 levels = 4. 
+ assert_eq!(tiles.len(), 2 * TEST_CONFIG.levels.len()); + + let bboxes: Vec<_> = tiles.iter().map(|t| t.tile_bbox.clone()).collect(); + // East band tile + assert!(bboxes.contains(&Some(BoundingBox { + sw: [170.0, 10.0], + ne: [180.0, 20.0], + }))); + // West band tile + assert!(bboxes.contains(&Some(BoundingBox { + sw: [-180.0, 10.0], + ne: [-170.0, 20.0], + }))); + } + + #[test] + fn malformed_box_returns_empty() { + // Wrong nesting depth + let tiles = generate_tiles(&json!({"box": "[1,2,3,4]"}), &TEST_CONFIG); + assert!(tiles.is_empty()); + } + + // ---- polygon ------------------------------------------------------------- + + #[test] + fn polygon_uses_naive_bbox() { + // Diamond polygon centred at (15, 15). Bbox is [10,10]→[20,20]. + let tiles = generate_tiles( + &json!({"polygon": "[[10,15],[15,10],[20,15],[15,20],[10,15]]"}), + &TEST_CONFIG, + ); + // bbox spans one 10° cell, so 1 spatial tile × 2 levels = 2 specs. + assert_eq!(tiles.len(), TEST_CONFIG.levels.len()); + assert_eq!( + tiles[0].tile_bbox, + Some(BoundingBox { + sw: [10.0, 10.0], + ne: [20.0, 20.0] + }) + ); + } + + #[test] + fn polygon_spanning_multiple_cells() { + // L-shaped-ish polygon. Bbox is [0,0]→[25,15]. + let tiles = generate_tiles( + &json!({"polygon": + "[[0,0],[25,0],[25,5],[15,5],[15,15],[0,15],[0,0]]" + }), + &TEST_CONFIG, + ); + // Bbox tile count: 3 cols (0,10,20) × 2 rows (0,10) = 6 cells. + assert_eq!(tiles.len(), 6 * TEST_CONFIG.levels.len()); + } + + #[test] + fn malformed_polygon_returns_empty() { + let tiles = generate_tiles(&json!({"polygon": "not json"}), &TEST_CONFIG); + assert!(tiles.is_empty()); + } + + // ---- defensive: dataset with no levels ---------------------------------- + + #[test] + fn empty_level_list_still_emits_per_spatial_tile() { + // Pathological config (the production configs have non-empty + // levels guarded by a test), but the tile generator should still + // not crash and should emit one spec per spatial tile. 
+ const EMPTY_LEVELS_CONFIG: DatasetConfig = DatasetConfig { + tile_degrees: 10.0, + max_radius_meters: 1.0e6, + levels: &[], + }; + let tiles = generate_tiles( + &json!({"box": "[[0,0],[10,10]]"}), + &EMPTY_LEVELS_CONFIG, + ); + assert_eq!(tiles.len(), 1); + assert_eq!(tiles[0].level_index, None); + } +} From df926c48d47d49fdb282f2a395e44c967680ed0d Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 13:25:58 -0400 Subject: [PATCH 05/25] test patching --- api/src/helpers/tile_generator.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs index 624d85e..1da844a 100644 --- a/api/src/helpers/tile_generator.rs +++ b/api/src/helpers/tile_generator.rs @@ -405,9 +405,15 @@ mod tests { }) ); - // Distinct spatial tiles - let spatial: std::collections::HashSet<_> = - tiles.iter().map(|t| t.tile_bbox.clone()).collect(); + // Distinct spatial tiles. BoundingBox holds f64 so we can't put it + // in a HashSet (no Eq/Hash); a linear dedup against PartialEq is + // fine for a 12-element vector. 
+ let mut spatial: Vec<Option<BoundingBox>> = Vec::new(); + for t in &tiles { + if !spatial.iter().any(|b| b == &t.tile_bbox) { + spatial.push(t.tile_bbox.clone()); + } + } assert_eq!(spatial.len(), 6); } From 850915e2e11e9c117f500a97fcbeadbb22a649b8 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 13:51:27 -0400 Subject: [PATCH 06/25] step 4 - compose user filter with tiling filter --- api/src/helpers/mod.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/api/src/helpers/mod.rs b/api/src/helpers/mod.rs index 795d6dc..fa6c371 100644 --- a/api/src/helpers/mod.rs +++ b/api/src/helpers/mod.rs @@ -14,4 +14,7 @@ pub mod dataset_config; pub use dataset_config::*; pub mod tile_generator; -pub use tile_generator::*; \ No newline at end of file +pub use tile_generator::*; + +pub mod filter_composer; +pub use filter_composer::*; \ No newline at end of file From bcf2d3053cd2cb66c1de79a16640dbbc748056df Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 13:54:47 -0400 Subject: [PATCH 07/25] step 4 - compose user filter with tiling filter --- api/src/helpers/filter_composer.rs | 311 +++++++++++++++++++++++++++++ 1 file changed, 311 insertions(+) create mode 100644 api/src/helpers/filter_composer.rs diff --git a/api/src/helpers/filter_composer.rs b/api/src/helpers/filter_composer.rs new file mode 100644 index 0000000..84be3e6 --- /dev/null +++ b/api/src/helpers/filter_composer.rs @@ -0,0 +1,311 @@ +//! Compose a tile-aware Mongo filter from query params + a `TileSpec`. +//! +//! The composer is the seam between three pieces: +//! +//! - `filters::filter_timeseries` builds the *user* filter from the raw +//! query params (id, polygon, box, center+radius, verticalRange, …). +//! - `tile_generator::generate_tiles` produces a sequence of `TileSpec`s +//! describing one page of the result. +//! - `dataset_config::DatasetConfig` knows the discrete level values that +//! a `level_index` resolves to. +//! +//! 
This module's job is to translate a `TileSpec` into BSON predicates and +//! AND them with the user filter. Tile bbox becomes +//! `geolocation.coordinates: { $geoWithin: { $box: [[sw], [ne]] } }`, which +//! matches the cartesian-box shape the existing user-box filter already +//! uses. Level index becomes `level: { $gte: L_i, $lt: L_{i+1} }`, with the +//! upper bound omitted for the deepest level so the bracket extends to +//! +infinity (catches anything past the configured maximum). +//! +//! Combination rule: if either side is empty, return the non-empty side +//! directly; otherwise wrap both sides in `$and`. This keeps the resulting +//! query document compact for common cases (id lookups, whole-globe tiles +//! with no user filter) and only nests when we actually need to. +//! +//! Defensive on bounds: a `level_index` past the end of `config.levels` is +//! a programming error (the tile generator should never emit one), but if +//! one slips through we drop the level clause silently rather than panic. + +use mongodb::bson::{doc, Bson, Document}; +use serde_json::Value; + +use super::dataset_config::DatasetConfig; +use super::filters::filter_timeseries; +use super::tile_generator::TileSpec; + +/// Build the Mongo filter document for one page of a paginated request. +pub fn compose_filter_with_tile( + params: Value, + tile: &TileSpec, + config: &DatasetConfig, +) -> Document { + let user = filter_timeseries(params); + let tile_doc = build_tile_filter(tile, config); + combine_user_and_tile(user, tile_doc) +} + +/// Translate a `TileSpec` into the additional Mongo predicates it imposes +/// (spatial bounding box, level bracket). Returns an empty `Document` if +/// the tile is fully null (both fields `None`) — e.g. id lookups. 
+fn build_tile_filter(tile: &TileSpec, config: &DatasetConfig) -> Document { + let mut out = Document::new(); + + if let Some(bbox) = &tile.tile_bbox { + // Match the format used by the existing user-box filter: + // geolocation.coordinates: { $geoWithin: { $box: [[sw], [ne]] } } + // Using cartesian $box (rather than spherical $geometry: Polygon) + // is fine at 10° tile scale where planar approximation is accurate + // enough, and it's cheap for Mongo to evaluate. + let box_array: Vec<Vec<f64>> = vec![ + vec![bbox.sw[0], bbox.sw[1]], + vec![bbox.ne[0], bbox.ne[1]], + ]; + out.insert( + "geolocation.coordinates", + doc! { "$geoWithin": { "$box": box_array } }, + ); + } + + if let Some(i) = tile.level_index { + let levels = config.levels; + if i < levels.len() { + let lower = levels[i]; + let mut level_clause = doc! { "$gte": lower }; + if i + 1 < levels.len() { + // Half-open bracket: [L_i, L_{i+1}). + level_clause.insert("$lt", levels[i + 1]); + } + // For the final level we omit `$lt`, leaving the bracket as + // [L_last, +∞) — anything at or below the configured deepest + // level lands in this page. + out.insert("level", level_clause); + } + // i out of bounds: silently drop the clause. The tile generator + // shouldn't ever produce this; treating it as "no level filter" is + // the most forgiving recovery if something upstream goes wrong. + } + + out +} + +/// Combine the user filter and the tile filter. Avoids `$and` wrapping +/// when one side is empty, both to keep query docs readable and to give +/// Mongo the simplest possible shape to plan against. 
+fn combine_user_and_tile(user: Document, tile: Document) -> Document { + if user.is_empty() { + return tile; + } + if tile.is_empty() { + return user; + } + let mut combined = Document::new(); + combined.insert( + "$and", + Bson::Array(vec![Bson::Document(user), Bson::Document(tile)]), + ); + combined +} + +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use crate::helpers::tile_generator::BoundingBox; + use serde_json::json; + + /// Two-level dataset config: keeps test math simple and exercises both + /// the bracketed-level case and the open-ended final-level case. + const TEST_CONFIG: DatasetConfig = DatasetConfig { + tile_degrees: 10.0, + max_radius_meters: 1.0e6, + levels: &[100.0, 500.0], + }; + + fn null_tile() -> TileSpec { + TileSpec { + tile_bbox: None, + level_index: None, + } + } + + fn full_tile(level_index: usize) -> TileSpec { + TileSpec { + tile_bbox: Some(BoundingBox { + sw: [0.0, 0.0], + ne: [10.0, 10.0], + }), + level_index: Some(level_index), + } + } + + // ---- empty user filter shortcut ---------------------------------------- + + #[test] + fn empty_params_and_null_tile_yields_empty_filter() { + let f = compose_filter_with_tile(json!({}), &null_tile(), &TEST_CONFIG); + assert!(f.is_empty()); + } + + #[test] + fn empty_params_with_bbox_only_returns_bbox_directly_no_and() { + let tile = TileSpec { + tile_bbox: Some(BoundingBox { + sw: [0.0, 0.0], + ne: [10.0, 10.0], + }), + level_index: None, + }; + let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); + // No $and wrapping — empty user filter means we return tile alone. 
+ assert!(f.get_array("$and").is_err()); + let geo = f.get_document("geolocation.coordinates").unwrap(); + let within = geo.get_document("$geoWithin").unwrap(); + assert!(within.get_array("$box").is_ok()); + } + + #[test] + fn empty_params_with_level_only_returns_level_directly() { + let tile = TileSpec { + tile_bbox: None, + level_index: Some(0), + }; + let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); + let level = f.get_document("level").unwrap(); + assert!((level.get_f64("$gte").unwrap() - 100.0).abs() < 1e-9); + assert!((level.get_f64("$lt").unwrap() - 500.0).abs() < 1e-9); + } + + #[test] + fn empty_params_with_full_tile_has_both_clauses_no_and() { + let f = compose_filter_with_tile(json!({}), &full_tile(0), &TEST_CONFIG); + assert!(f.get_array("$and").is_err()); + assert!(f.get_document("geolocation.coordinates").is_ok()); + assert!(f.get_document("level").is_ok()); + } + + // ---- $and wrapping when user filter is non-empty ----------------------- + + #[test] + fn user_box_plus_tile_bbox_wraps_in_and() { + let f = compose_filter_with_tile( + json!({"box": "[[5.0, 5.0], [15.0, 15.0]]"}), + &full_tile(0), + &TEST_CONFIG, + ); + let parts = f.get_array("$and").expect("should be $and-wrapped"); + assert_eq!(parts.len(), 2); + // User filter has $or (from box_filter); tile filter has + // geolocation.coordinates. Both should appear, one per element. 
+ let p0 = parts[0].as_document().unwrap(); + let p1 = parts[1].as_document().unwrap(); + assert!( + p0.get_array("$or").is_ok() || p1.get_array("$or").is_ok(), + "user box $or should land in one of the $and clauses" + ); + assert!( + p0.get_document("geolocation.coordinates").is_ok() + || p1.get_document("geolocation.coordinates").is_ok(), + "tile bbox should land in one of the $and clauses" + ); + } + + #[test] + fn user_polygon_plus_tile_bbox_wraps_in_and() { + let f = compose_filter_with_tile( + json!({"polygon": "[[0,0],[10,0],[10,10],[0,10],[0,0]]"}), + &full_tile(1), + &TEST_CONFIG, + ); + let parts = f.get_array("$and").expect("should be $and-wrapped"); + assert_eq!(parts.len(), 2); + } + + #[test] + fn user_vertical_range_plus_tile_level_wraps_in_and_and_keeps_both() { + let f = compose_filter_with_tile( + json!({"verticalRange": "[0.0, 1000.0]"}), + &TileSpec { + tile_bbox: None, + level_index: Some(0), + }, + &TEST_CONFIG, + ); + let parts = f.get_array("$and").expect("should be $and-wrapped"); + assert_eq!(parts.len(), 2); + // Both clauses must constrain `level`. Mongo will intersect them. 
+ let p0 = parts[0].as_document().unwrap(); + let p1 = parts[1].as_document().unwrap(); + assert!(p0.get_document("level").is_ok()); + assert!(p1.get_document("level").is_ok()); + } + + #[test] + fn user_id_with_null_tile_returns_user_filter_alone() { + let f = + compose_filter_with_tile(json!({"id": "doc1"}), &null_tile(), &TEST_CONFIG); + assert!(f.get_array("$and").is_err()); + assert_eq!(f.get_str("_id").unwrap(), "doc1"); + } + + // ---- final-level open-ended bracket ------------------------------------ + + #[test] + fn final_level_omits_upper_bound() { + let f = compose_filter_with_tile( + json!({}), + &TileSpec { + tile_bbox: None, + level_index: Some(TEST_CONFIG.levels.len() - 1), + }, + &TEST_CONFIG, + ); + let level = f.get_document("level").unwrap(); + assert!((level.get_f64("$gte").unwrap() - 500.0).abs() < 1e-9); + assert!( + level.get_f64("$lt").is_err(), + "final level should have no $lt — bracket extends to +∞" + ); + } + + // ---- defensive: out-of-bounds level_index ------------------------------ + + #[test] + fn out_of_bounds_level_index_drops_level_clause() { + let f = compose_filter_with_tile( + json!({}), + &TileSpec { + tile_bbox: None, + level_index: Some(TEST_CONFIG.levels.len() + 5), + }, + &TEST_CONFIG, + ); + // No level clause emitted, and since user filter is empty too, + // the whole filter is empty. 
+ assert!(f.is_empty()); + } + + // ---- bbox shape sanity -------------------------------------------------- + + #[test] + fn tile_bbox_emits_corner_pair_in_box_array() { + let tile = TileSpec { + tile_bbox: Some(BoundingBox { + sw: [-10.0, -5.0], + ne: [20.0, 15.0], + }), + level_index: None, + }; + let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); + let geo = f.get_document("geolocation.coordinates").unwrap(); + let within = geo.get_document("$geoWithin").unwrap(); + let box_array = within.get_array("$box").unwrap(); + assert_eq!(box_array.len(), 2); + let sw = box_array[0].as_array().unwrap(); + let ne = box_array[1].as_array().unwrap(); + assert!((sw[0].as_f64().unwrap() - -10.0).abs() < 1e-9); + assert!((sw[1].as_f64().unwrap() - -5.0).abs() < 1e-9); + assert!((ne[0].as_f64().unwrap() - 20.0).abs() < 1e-9); + assert!((ne[1].as_f64().unwrap() - 15.0).abs() < 1e-9); + } +} From 2d4bff2bc7bda468f71f8062f1841cd892fe3bcd Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 14:19:10 -0400 Subject: [PATCH 08/25] step 5: query on composed filters --- api/src/main.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/api/src/main.rs b/api/src/main.rs index 374a5d5..f3537d0 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -13,11 +13,12 @@ nice to have someday transform logic as traits? */ -use api::helpers::filters; use api::helpers::transforms; use api::helpers::schema; use api::helpers::helpers; use api::helpers::dataset_config; +use api::helpers::tile_generator; +use api::helpers::filter_composer; use mongodb::{options::FindOptions, bson::Document, error::Result}; use actix_web::{get, web, App, HttpResponse, HttpServer, Responder}; @@ -38,10 +39,9 @@ static TIMESERIES: Lazy>>> = Lazy::new(|| Mutex::new( async fn search_data_schema(query_params: web::Query) -> impl Responder { let params = query_params.into_inner(); - // Dataset-specific request-size policy. 
Step 1 of the pagination work - // just binds this; later steps will consume `tile_degrees` (for tile - // generation) and `max_radius_meters` (for center+radius caps). - let _config = &dataset_config::BSOSE_CONFIG; + // Dataset-specific request-size policy. Drives tile sizing in Step 6 + // and radius capping in Step 7. + let config = &dataset_config::BSOSE_CONFIG; // validate query params //////////////////////////////////////// match helpers::validate_query_params(¶ms) { @@ -50,7 +50,17 @@ async fn search_data_schema(query_params: web::Query) -> impl } // construct filter from query params ////////////////////////// - let filter = filters::filter_timeseries(params.clone()); + // + // Step 5: route the filter through the tile-aware composer. The tile + // is a *passthrough* — no spatial bbox, no level bracket — so the + // composer returns the user filter unchanged and behavior is identical + // to the pre-pagination handler. Step 6 will replace this with a loop + // over real tiles generated by `tile_generator::generate_tiles`. 
+ let tile = tile_generator::TileSpec { + tile_bbox: None, + level_index: None, + }; + let filter = filter_composer::compose_filter_with_tile(params.clone(), &tile, config); // open the cursor ////////////////////////////////////////////// let options = FindOptions::builder().build(); From c645d00b1f3b219839a6321add618bae62c5028f Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 14:47:56 -0400 Subject: [PATCH 09/25] step 6: wire up new pagination in main --- api/src/helpers/mod.rs | 5 +- api/src/helpers/pagination.rs | 218 +++++++++++++++++++++++ api/src/main.rs | 327 +++++++++++++++++++++------------- api/tests/integration.rs | 274 +++++++++++++++++++--------- 4 files changed, 616 insertions(+), 208 deletions(-) create mode 100644 api/src/helpers/pagination.rs diff --git a/api/src/helpers/mod.rs b/api/src/helpers/mod.rs index fa6c371..c473eb4 100644 --- a/api/src/helpers/mod.rs +++ b/api/src/helpers/mod.rs @@ -17,4 +17,7 @@ pub mod tile_generator; pub use tile_generator::*; pub mod filter_composer; -pub use filter_composer::*; \ No newline at end of file +pub use filter_composer::*; + +pub mod pagination; +pub use pagination::*; \ No newline at end of file diff --git a/api/src/helpers/pagination.rs b/api/src/helpers/pagination.rs new file mode 100644 index 0000000..ee5f17f --- /dev/null +++ b/api/src/helpers/pagination.rs @@ -0,0 +1,218 @@ +//! Pure helpers for tile-based pagination URLs and query-param parsing. +//! +//! This module is intentionally tiny and free of MongoDB/Actix dependencies +//! so that it can be unit-tested without standing up a server or a DB. The +//! handler in `main.rs` consumes these helpers when it needs to (a) read +//! the requested tile index off the query, and (b) build the next page's +//! URL after a successful serve. + +use serde_json::Value; + +/// Read `tile_index` from the query params. +/// +/// Returns `Ok(0)` if the param is absent (default behaviour: start at the +/// first tile). 
Returns `Err(...)` if the param is present but doesn't +/// parse as a non-negative integer — the handler turns this into a 400. +/// +/// Why strict-on-present-but-invalid: tile_index is almost always supplied +/// by a previous response's `next_url`, so a malformed value is a client +/// bug worth surfacing rather than silently restarting pagination at zero. +pub fn parse_tile_index(params: &Value) -> Result { + match params.get("tile_index") { + None => Ok(0), + Some(v) => v + .as_str() + .ok_or_else(|| "tile_index must be a string-encoded integer".to_string()) + .and_then(|s| { + s.parse::() + .map_err(|_| format!("tile_index '{}' is not a non-negative integer", s)) + }), + } +} + +/// Build the URL for the next page of a paginated request. +/// +/// The result preserves every query param from the current request except +/// `tile_index`, which is overwritten with `next_index`. Values are +/// percent-encoded so that polygon/box payloads (which contain `[`, `]`, +/// `,`) round-trip cleanly. The output is a path + query string with no +/// scheme/host — clients are expected to resolve it against the original +/// request's origin. +/// +/// Param order in the output is alphabetical, not insertion-order. This is +/// intentional: it makes the URL deterministic for a given (path, params, +/// next_index) tuple, which makes tests easy to write and caches behave +/// predictably. +pub fn build_next_url(path: &str, params: &Value, next_index: usize) -> String { + let mut pairs: Vec<(String, String)> = Vec::new(); + + if let Some(obj) = params.as_object() { + for (k, v) in obj { + if k == "tile_index" { + // The caller's tile_index gets overwritten with next_index + // below. Skipping it here also handles the case where the + // caller passed tile_index=N and we now serve tile N+M. + continue; + } + // All Actix query params arrive as strings, but a non-string + // value would just serialize via Display. Cheap and forgiving. 
+ let s = match v { + Value::String(s) => s.clone(), + other => other.to_string(), + }; + pairs.push((k.clone(), s)); + } + } + pairs.push(("tile_index".to_string(), next_index.to_string())); + + pairs.sort_by(|a, b| a.0.cmp(&b.0)); + + let query: String = pairs + .iter() + .map(|(k, v)| format!("{}={}", url_encode(k), url_encode(v))) + .collect::>() + .join("&"); + + if query.is_empty() { + path.to_string() + } else { + format!("{}?{}", path, query) + } +} + +/// Minimal RFC 3986 unreserved-set percent-encoder. +/// +/// We could pull in `percent-encoding` or `urlencoding` for this, but it's +/// 15 lines and we don't want a new crate dependency for one helper. The +/// unreserved set (A-Z a-z 0-9 - . _ ~) passes through; everything else is +/// encoded as `%XX` of its UTF-8 byte representation. +fn url_encode(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for b in s.bytes() { + match b { + b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~' => { + out.push(b as char); + } + _ => out.push_str(&format!("%{:02X}", b)), + } + } + out +} + +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + // ---- parse_tile_index --------------------------------------------------- + + #[test] + fn parse_tile_index_defaults_to_zero_when_absent() { + assert_eq!(parse_tile_index(&json!({})).unwrap(), 0); + } + + #[test] + fn parse_tile_index_parses_string_integer() { + assert_eq!(parse_tile_index(&json!({"tile_index": "42"})).unwrap(), 42); + } + + #[test] + fn parse_tile_index_zero_is_allowed() { + assert_eq!(parse_tile_index(&json!({"tile_index": "0"})).unwrap(), 0); + } + + #[test] + fn parse_tile_index_rejects_non_numeric() { + assert!(parse_tile_index(&json!({"tile_index": "abc"})).is_err()); + } + + #[test] + fn parse_tile_index_rejects_negative() { + // usize::parse rejects negatives, so this falls through the same + // error path as 
garbage input. + assert!(parse_tile_index(&json!({"tile_index": "-1"})).is_err()); + } + + // ---- build_next_url ----------------------------------------------------- + + #[test] + fn build_next_url_just_tile_index_when_no_other_params() { + let out = build_next_url("/timeseries/bsose", &json!({}), 1); + assert_eq!(out, "/timeseries/bsose?tile_index=1"); + } + + #[test] + fn build_next_url_preserves_other_params() { + let out = build_next_url( + "/timeseries/bsose", + &json!({"data": "all"}), + 3, + ); + assert_eq!(out, "/timeseries/bsose?data=all&tile_index=3"); + } + + #[test] + fn build_next_url_overwrites_existing_tile_index() { + let out = build_next_url( + "/timeseries/bsose", + &json!({"data": "all", "tile_index": "0"}), + 5, + ); + // The 0 should not appear; the new value is 5. + assert_eq!(out, "/timeseries/bsose?data=all&tile_index=5"); + } + + #[test] + fn build_next_url_percent_encodes_brackets_and_commas() { + // Box payloads contain `[`, `]`, `,` — all reserved characters. + let out = build_next_url( + "/timeseries/bsose", + &json!({"box": "[[0,0],[10,10]]"}), + 1, + ); + assert!( + out.contains("box=%5B%5B0%2C0%5D%2C%5B10%2C10%5D%5D"), + "expected percent-encoded brackets and commas, got: {}", + out + ); + } + + #[test] + fn build_next_url_sorts_params_alphabetically() { + let out = build_next_url( + "/timeseries/bsose", + &json!({"data": "all", "id": "doc1"}), + 7, + ); + // Alphabetical: data, id, tile_index. + assert_eq!(out, "/timeseries/bsose?data=all&id=doc1&tile_index=7"); + } + + #[test] + fn build_next_url_handles_iso_dates() { + // ISO 8601 dates contain `:` which is reserved. Encoder should + // turn it into %3A. 
+ let out = build_next_url( + "/timeseries/bsose", + &json!({"startDate": "2020-01-01T00:00:00Z"}), + 0, + ); + assert!(out.contains("startDate=2020-01-01T00%3A00%3A00Z"), "got: {}", out); + } + + #[test] + fn url_encode_passes_through_unreserved_set() { + assert_eq!(url_encode("abcXYZ-._~0123"), "abcXYZ-._~0123"); + } + + #[test] + fn url_encode_handles_multibyte_utf8() { + // The percent-encoding spec operates on UTF-8 bytes. A single + // emoji is 4 bytes → 4 %XX groups. + let out = url_encode("🐱"); + assert_eq!(out.len(), 12); // 4 bytes × 3 chars per byte + assert!(out.starts_with("%")); + } +} diff --git a/api/src/main.rs b/api/src/main.rs index f3537d0..cb4a774 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -19,9 +19,10 @@ use api::helpers::helpers; use api::helpers::dataset_config; use api::helpers::tile_generator; use api::helpers::filter_composer; +use api::helpers::pagination; use mongodb::{options::FindOptions, bson::Document, error::Result}; -use actix_web::{get, web, App, HttpResponse, HttpServer, Responder}; +use actix_web::{get, web, App, HttpRequest, HttpResponse, HttpServer, Responder}; use once_cell::sync::Lazy; use std::sync::Mutex; use futures::stream::StreamExt; @@ -31,178 +32,252 @@ use serde::de::DeserializeOwned; use mongodb::bson::DateTime; use std::collections::HashSet; use async_stream::stream; +use serde_json::{json, Value}; static CLIENT: Lazy>> = Lazy::new(|| Mutex::new(None)); static TIMESERIES: Lazy>>> = Lazy::new(|| Mutex::new(None)); #[get("/timeseries/bsose")] -async fn search_data_schema(query_params: web::Query) -> impl Responder { +async fn search_data_schema( + req: HttpRequest, + query_params: web::Query, +) -> impl Responder { let params = query_params.into_inner(); - // Dataset-specific request-size policy. Drives tile sizing in Step 6 - // and radius capping in Step 7. + // Dataset-specific request-size policy: tile size, level set, radius cap. 
let config = &dataset_config::BSOSE_CONFIG; - // validate query params //////////////////////////////////////// - match helpers::validate_query_params(¶ms) { - Ok(_) => {}, - Err(response) => return response, + // The next_url we emit on success uses this request's own path, so the + // generated URL stays correct even if the route is re-mounted later. + let path = req.path().to_string(); + + // ---- validation --------------------------------------------------- + if let Err(response) = helpers::validate_query_params(¶ms) { + return response; } - // construct filter from query params ////////////////////////// - // - // Step 5: route the filter through the tile-aware composer. The tile - // is a *passthrough* — no spatial bbox, no level bracket — so the - // composer returns the user filter unchanged and behavior is identical - // to the pre-pagination handler. Step 6 will replace this with a loop - // over real tiles generated by `tile_generator::generate_tiles`. - let tile = tile_generator::TileSpec { - tile_bbox: None, - level_index: None, - }; - let filter = filter_composer::compose_filter_with_tile(params.clone(), &tile, config); - - // open the cursor ////////////////////////////////////////////// - let options = FindOptions::builder().build(); - let mut cursor = match generate_cursor::("argo", "bsose", filter, Some(options)).await { - Ok(c) => c, - Err(e) => { - eprintln!("Error opening cursor: {}", e); - return HttpResponse::InternalServerError().finish(); - } + let start_idx = match pagination::parse_tile_index(¶ms) { + Ok(i) => i, + Err(e) => return HttpResponse::BadRequest().json(json!({"error": e})), }; - // grab the cached timeseries vector once + // ---- tile sequence + cached startup data -------------------------- + let tiles = tile_generator::generate_tiles(¶ms, config); + let timeseries = { let ts = TIMESERIES.lock().unwrap(); ts.clone().unwrap() }; - let compression: Option = params.get("compression") + let compression: Option = params + 
.get("compression") .and_then(|v| v.as_str()) .map(|s| s.to_string()); - let batchmeta: Option = params.get("batchmeta") + let batchmeta: Option = params + .get("batchmeta") .and_then(|v| v.as_str()) .map(|s| s.to_string()); - // ------------------------------------------------------------------- - // batchmeta: drain the bsose cursor, but only keep the (small) set of - // unique metadata IDs in memory. Then fetch the metadata documents and - // return them as a normal JSON array. Worst-case memory is bounded by - // the number of distinct metadata ids, not the number of bsose hits. - // ------------------------------------------------------------------- - if batchmeta.is_some() { - let mut unique_metadata: HashSet = HashSet::new(); - while let Some(result) = cursor.next().await { - match result { - Ok(doc) => { - if let Some(t) = transforms::transform_timeseries(¶ms, ×eries, doc) { - for m in t.metadata.iter() { - unique_metadata.insert(m.clone()); - } - } - } - Err(e) => { - eprintln!("Cursor error: {}", e); - return HttpResponse::InternalServerError().finish(); - } - } - } - if unique_metadata.is_empty() { - return helpers::create_response::(vec![]); - } - let meta_filter = mongodb::bson::doc! { - "_id": { "$in": unique_metadata.into_iter().collect::>() } - }; - let meta_cursor = match generate_cursor::("argo", "timeseriesMeta", meta_filter, None).await { + let is_minimal = matches!(compression.as_deref(), Some("minimal")); + + // ---- probe-forward loop ------------------------------------------- + // + // For each candidate tile (starting at the caller-supplied tile_index), + // open a cursor and look for output. The flavour of "look" differs by + // branch: streaming peeks for the first doc that survives transformation + // (and keeps the cursor so the rest can be streamed straight through); + // batchmeta drains the entire cursor to collect unique metadata IDs + // (no streaming, but per-tile bounded by tile size). 
Either way, an + // empty tile drops through to the next iteration. We plod through + // empty tiles one at a time; a future land-mask short-circuit could + // replace this with a smarter skip. + for tile_idx in start_idx..tiles.len() { + let tile = &tiles[tile_idx]; + let filter = filter_composer::compose_filter_with_tile(params.clone(), tile, config); + let options = FindOptions::builder().build(); + + let mut cursor = match generate_cursor::( + "argo", "bsose", filter, Some(options), + ) + .await + { Ok(c) => c, Err(e) => { - eprintln!("Error opening metadata cursor: {}", e); + eprintln!("Error opening cursor for tile {}: {}", tile_idx, e); return HttpResponse::InternalServerError().finish(); } }; - let results: Vec<_> = meta_cursor.map(|doc| doc.unwrap()).collect().await; - return helpers::create_response(results); - } - // ------------------------------------------------------------------- - // Default and compression=minimal both stream the bsose cursor through - // the per-document transforms straight to the HTTP response, never - // materializing the full result set in memory. - // - // We do still need to peek ahead until we've found at least one doc - // that survives transformation, so we can preserve the existing - // 404-on-empty contract. Once we have one survivor in hand, we open - // the streamed JSON array `[`, emit it, and continue draining the - // cursor doc-by-doc. 
- // ------------------------------------------------------------------- - let is_minimal = matches!(compression.as_deref(), Some("minimal")); - - let mut first_doc: Option = None; - while let Some(result) = cursor.next().await { - match result { - Ok(doc) => { - if let Some(t) = transforms::transform_timeseries(¶ms, ×eries, doc) { - first_doc = Some(t); - break; + if batchmeta.is_some() { + // ---- batchmeta branch -------------------------------------- + let mut unique_metadata: HashSet = HashSet::new(); + while let Some(result) = cursor.next().await { + match result { + Ok(doc) => { + if let Some(t) = + transforms::transform_timeseries(¶ms, ×eries, doc) + { + for m in t.metadata.iter() { + unique_metadata.insert(m.clone()); + } + } + } + Err(e) => { + eprintln!("Cursor error during batchmeta drain: {}", e); + break; + } } } - Err(e) => { - eprintln!("Cursor error: {}", e); - return HttpResponse::InternalServerError().finish(); + + if unique_metadata.is_empty() { + continue; // tile produced no metadata — try the next one. } - } - } - let first_doc = match first_doc { - Some(d) => d, - None => return helpers::create_response::(vec![]), - }; + let meta_filter = mongodb::bson::doc! { + "_id": { "$in": unique_metadata.into_iter().collect::>() } + }; + let meta_cursor = match generate_cursor::( + "argo", "timeseriesMeta", meta_filter, None, + ) + .await + { + Ok(c) => c, + Err(e) => { + eprintln!("Error opening metadata cursor: {}", e); + return HttpResponse::InternalServerError().finish(); + } + }; + let docs: Vec<_> = meta_cursor.map(|d| d.unwrap()).collect().await; - // Stream owns: cursor, params, timeseries, first_doc, is_minimal. - // Cursor errors mid-stream are logged and end the stream; we cannot - // change the HTTP status after bytes have been sent, so we close the - // JSON array cleanly and let the caller see whatever they already got. - let body = stream! 
{ - yield Ok::<_, Infallible>(web::Bytes::from_static(b"[")); - - // Project + serialize the buffered first doc. - let first_bytes = if is_minimal { - let stub = transforms::timeseries_stub(&first_doc); - serde_json::to_vec(&stub).expect("serializing one stub should not fail") - } else { - serde_json::to_vec(&first_doc).expect("serializing one bsose doc should not fail") - }; - yield Ok(web::Bytes::from(first_bytes)); + return HttpResponse::Ok().json(json!({ + "docs": docs, + "next_url": next_url_value(&path, ¶ms, tile_idx, tiles.len()), + "message": format!("page {}", tile_idx), + })); + } + // ---- streaming branch ------------------------------------------ + // + // Peek-ahead until we find a doc that survives transformation. + // If none, advance to the next tile. If found, keep the cursor — + // we'll continue draining it from inside the response body. + let mut first_doc: Option = None; while let Some(result) = cursor.next().await { match result { Ok(doc) => { - if let Some(t) = transforms::transform_timeseries(¶ms, ×eries, doc) { - let bytes = if is_minimal { - let stub = transforms::timeseries_stub(&t); - serde_json::to_vec(&stub).expect("serializing one stub should not fail") - } else { - serde_json::to_vec(&t).expect("serializing one bsose doc should not fail") - }; - yield Ok(web::Bytes::from_static(b",")); - yield Ok(web::Bytes::from(bytes)); + if let Some(t) = + transforms::transform_timeseries(¶ms, ×eries, doc) + { + first_doc = Some(t); + break; } } Err(e) => { - eprintln!("Cursor error during stream: {}", e); - break; + eprintln!("Cursor error during peek for tile {}: {}", tile_idx, e); + return HttpResponse::InternalServerError().finish(); } } } - yield Ok(web::Bytes::from_static(b"]")); - }; + let first_doc = match first_doc { + Some(d) => d, + None => continue, // tile contributed no surviving docs — try next. 
+ }; + + let next_url = next_url_value(&path, ¶ms, tile_idx, tiles.len()); + let page_message = format!("page {}", tile_idx); + // Each of these gets moved into the stream! generator. params and + // timeseries are needed to transform each subsequent doc; the rest + // are emitted at the end of the response. + let params_for_stream = params.clone(); + let ts_for_stream = timeseries.clone(); + + let body = stream! { + yield Ok::<_, Infallible>(web::Bytes::from_static(b"{\"docs\":[")); - HttpResponse::Ok() - .content_type("application/json") - .streaming(body) + // Serialize the peeked first doc. + let first_bytes = if is_minimal { + let stub = transforms::timeseries_stub(&first_doc); + serde_json::to_vec(&stub).expect("serializing one stub should not fail") + } else { + serde_json::to_vec(&first_doc) + .expect("serializing one bsose doc should not fail") + }; + yield Ok(web::Bytes::from(first_bytes)); + + while let Some(result) = cursor.next().await { + match result { + Ok(doc) => { + if let Some(t) = transforms::transform_timeseries( + ¶ms_for_stream, &ts_for_stream, doc, + ) { + let bytes = if is_minimal { + let stub = transforms::timeseries_stub(&t); + serde_json::to_vec(&stub) + .expect("serializing one stub should not fail") + } else { + serde_json::to_vec(&t) + .expect("serializing one bsose doc should not fail") + }; + yield Ok(web::Bytes::from_static(b",")); + yield Ok(web::Bytes::from(bytes)); + } + } + Err(e) => { + // Mid-stream error: status is already 200, so we + // can only close the JSON cleanly and stop. + eprintln!("Cursor error during stream: {}", e); + break; + } + } + } + + // Close the docs array and emit the trailer fields. The + // serialize calls only fail on non-finite floats inside the + // value; for the strings/Null we use here that's impossible, + // but we fall back to safe bytes if for some reason it does. 
+ yield Ok(web::Bytes::from_static(b"],\"next_url\":")); + yield Ok(web::Bytes::from( + serde_json::to_vec(&next_url).unwrap_or_else(|_| b"null".to_vec()), + )); + yield Ok(web::Bytes::from_static(b",\"message\":")); + yield Ok(web::Bytes::from( + serde_json::to_vec(&page_message) + .unwrap_or_else(|_| b"\"\"".to_vec()), + )); + yield Ok(web::Bytes::from_static(b"}")); + }; + + return HttpResponse::Ok() + .content_type("application/json") + .streaming(body); + } + + // ---- no non-empty tile in the requested range --------------------- + // + // Empty results no longer return 404 — the paginated contract is that + // `next_url: null` means "no more data", so empty must remain 200. + HttpResponse::Ok().json(json!({ + "docs": [], + "next_url": Value::Null, + "message": format!("no non-empty tiles from index {}", start_idx), + })) +} + +/// Build the JSON value emitted as `next_url`. Returns `Value::Null` when +/// the just-served tile was the last one (no further pages exist). +fn next_url_value( + path: &str, + params: &serde_json::Value, + current_idx: usize, + total_tiles: usize, +) -> Value { + if current_idx + 1 < total_tiles { + Value::String(pagination::build_next_url(path, params, current_idx + 1)) + } else { + Value::Null + } } #[actix_web::main] diff --git a/api/tests/integration.rs b/api/tests/integration.rs index db44e8a..9e5ca4d 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -14,18 +14,23 @@ // API_URL=http://localhost:8080 MONGODB_URI=mongodb://localhost:27017 \ // cargo test --test integration -- --test-threads=1 // -// Tests are kept independent and read-only, so they could in principle run -// in parallel; we serialize them above just to keep ordering stable in CI -// logs. +// The response shape is the paginated envelope introduced in Step 6: +// { "docs": [...], "next_url": "" | null, "message": "..." 
} +// Multi-tile queries (anything spanning multiple grid cells or vertical +// levels) span multiple pages — use `get_paged` to follow `next_url` and +// accumulate docs across pages. mod common; use common::url_with_query; use serde_json::Value; +/// Generous timeout: the naive plod-forward through empty tiles can take a +/// few seconds on the first page of a whole-globe request, even with the +/// tiny seeded corpus. We can tighten this once we have a land-mask shortcut. fn client() -> reqwest::Client { reqwest::Client::builder() - .timeout(std::time::Duration::from_secs(10)) + .timeout(std::time::Duration::from_secs(30)) .build() .expect("reqwest client should build") } @@ -39,19 +44,67 @@ async fn get(path: &str, params: &[(&str, &str)]) -> reqwest::Response { .unwrap_or_else(|e| panic!("GET {} failed: {}", url, e)) } +/// One paginated response: assert 200, parse as Value, return the body. +async fn get_envelope(path: &str, params: &[(&str, &str)]) -> Value { + let resp = get(path, params).await; + assert_eq!(resp.status(), 200, "expected 200 OK from {}", path); + resp.json().await.expect("response should be JSON") +} + +/// Follow `next_url` across pages, accumulating every doc returned. Stops +/// once a page returns `next_url: null`. The relative `next_url` is +/// resolved against `API_URL`. +async fn get_paged(path: &str, params: &[(&str, &str)]) -> Vec { + let mut all_docs: Vec = Vec::new(); + let mut url = url_with_query(path, params); + + loop { + let resp = client() + .get(&url) + .send() + .await + .unwrap_or_else(|e| panic!("GET {} failed: {}", url, e)); + assert_eq!( + resp.status(), + 200, + "expected 200, got {} for {}", + resp.status(), + url + ); + let body: Value = resp.json().await.expect("response should be JSON"); + + let docs = body["docs"] + .as_array() + .expect("response.docs should be an array"); + all_docs.extend(docs.iter().cloned()); + + match body["next_url"].as_str() { + // `null` next_url means we've reached the end. 
(`as_str()` returns + // None for both Value::Null and missing keys; both should + // terminate.) + None => break, + Some(rel) => { + url = format!("{}{}", common::api_url(), rel); + } + } + } + + all_docs +} + // --------------------------------------------------------------------------- // Basic shape & happy path // --------------------------------------------------------------------------- #[tokio::test] -async fn no_filters_returns_all_seeded_documents() { - // Without `data` set, slice_data drops the data field on each row but - // keeps the rows themselves — so we should get 4 entries. - let resp = get("/timeseries/bsose", &[]).await; - assert_eq!(resp.status(), 200, "expected 200 OK with seeded DB"); - let body: Vec = resp.json().await.expect("body should be JSON array"); - assert_eq!(body.len(), 4, "expected all 4 seeded bsose docs"); - for row in &body { +async fn no_filters_returns_all_seeded_documents_across_pages() { + // Whole-globe queries paginate through many tiles. After walking the + // full sequence we should get every seeded doc back exactly once. + let docs = get_paged("/timeseries/bsose", &[]).await; + assert_eq!(docs.len(), 4, "expected all 4 seeded docs across pages"); + + // Without `data` set, slice_data clears the data field but keeps rows. + for row in &docs { let data = row.get("data").expect("each row should have a data field"); let outer = data.as_array().expect("data should be an array"); assert!( @@ -62,14 +115,11 @@ async fn no_filters_returns_all_seeded_documents() { } #[tokio::test] -async fn data_all_returns_full_timeseries() { - let resp = get("/timeseries/bsose", &[("data", "all")]).await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - assert_eq!(body.len(), 4); - - // Every row should have 2 variables × 4 timesteps. 
- for row in &body { +async fn data_all_returns_full_timeseries_across_pages() { + let docs = get_paged("/timeseries/bsose", &[("data", "all")]).await; + assert_eq!(docs.len(), 4); + + for row in &docs { let outer = row["data"].as_array().expect("data array"); assert_eq!(outer.len(), 2, "expected 2 variables per row"); for inner in outer { @@ -83,11 +133,10 @@ async fn data_all_returns_full_timeseries() { } #[tokio::test] -async fn data_specific_field_filters_columns() { - let resp = get("/timeseries/bsose", &[("data", "salinity")]).await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - for row in &body { +async fn data_specific_field_filters_columns_across_pages() { + let docs = get_paged("/timeseries/bsose", &[("data", "salinity")]).await; + assert!(!docs.is_empty()); + for row in &docs { let names = &row["data_info"][0]; assert_eq!( names.as_array().unwrap(), @@ -103,21 +152,30 @@ async fn data_specific_field_filters_columns() { #[tokio::test] async fn id_filter_returns_single_document() { - let resp = get( + // id lookups produce a single passthrough tile, so the response fits in + // one page with a null next_url. + let body = get_envelope( "/timeseries/bsose", &[("id", "bsose_doc_001"), ("data", "all")], ) .await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - assert_eq!(body.len(), 1); - assert_eq!(body[0]["_id"], "bsose_doc_001"); + let docs = body["docs"].as_array().unwrap(); + assert_eq!(docs.len(), 1); + assert_eq!(docs[0]["_id"], "bsose_doc_001"); + assert!( + body["next_url"].is_null(), + "single-page result should report no further pages" + ); } #[tokio::test] -async fn unknown_id_returns_404() { - let resp = get("/timeseries/bsose", &[("id", "nope")]).await; - assert_eq!(resp.status(), 404); +async fn unknown_id_returns_empty_docs_and_null_next_url() { + // Pre-pagination this returned 404. The paginated contract is that + // empty results return 200 + empty docs + null next_url. 
+ let body = get_envelope("/timeseries/bsose", &[("id", "nope")]).await; + let docs = body["docs"].as_array().unwrap(); + assert!(docs.is_empty(), "no docs for an unknown id"); + assert!(body["next_url"].is_null(), "no further pages either"); } // --------------------------------------------------------------------------- @@ -125,18 +183,17 @@ async fn unknown_id_returns_404() { // --------------------------------------------------------------------------- #[tokio::test] -async fn vertical_range_filters_by_level() { +async fn vertical_range_filters_by_level_across_pages() { // levels in fixtures: 10, 10, 20, 50 — [0, 30) keeps the three with - // level < 30. - let resp = get( + // level < 30. Pagination iterates the tile×level grid; verticalRange + // restricts which level brackets contribute docs. + let docs = get_paged( "/timeseries/bsose", &[("verticalRange", "[0, 30]"), ("data", "all")], ) .await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - assert_eq!(body.len(), 3); - for row in &body { + assert_eq!(docs.len(), 3); + for row in &docs { let level = row["level"].as_f64().unwrap(); assert!(level >= 0.0 && level < 30.0, "unexpected level: {}", level); } @@ -147,18 +204,17 @@ async fn vertical_range_filters_by_level() { // --------------------------------------------------------------------------- #[tokio::test] -async fn box_filter_matches_seeded_points() { +async fn box_filter_matches_seeded_points_across_pages() { // Box covers (lon 15..45, lat 5..35) — should hit docs at (20,10) and // (40,30), which is doc_001, doc_002, doc_004 (doc_001 and doc_004 - // share coords but different levels). - let resp = get( + // share coords but different levels — they land in different + // level-brackets, so they show up on different pages). 
+ let docs = get_paged( "/timeseries/bsose", &[("box", "[[15,5],[45,35]]"), ("data", "all")], ) .await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - let ids: Vec<&str> = body.iter().map(|r| r["_id"].as_str().unwrap()).collect(); + let ids: Vec<&str> = docs.iter().map(|r| r["_id"].as_str().unwrap()).collect(); assert!(ids.contains(&"bsose_doc_001"), "ids: {:?}", ids); assert!(ids.contains(&"bsose_doc_002"), "ids: {:?}", ids); assert!(ids.contains(&"bsose_doc_004"), "ids: {:?}", ids); @@ -166,9 +222,9 @@ async fn box_filter_matches_seeded_points() { } #[tokio::test] -async fn polygon_filter_matches_seeded_points() { +async fn polygon_filter_matches_seeded_points_across_pages() { // Polygon around (20, 10) — small square enclosing doc_001 / doc_004. - let resp = get( + let docs = get_paged( "/timeseries/bsose", &[ ("polygon", "[[15,5],[25,5],[25,15],[15,15],[15,5]]"), @@ -176,20 +232,18 @@ async fn polygon_filter_matches_seeded_points() { ], ) .await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - let ids: Vec<&str> = body.iter().map(|r| r["_id"].as_str().unwrap()).collect(); + let ids: Vec<&str> = docs.iter().map(|r| r["_id"].as_str().unwrap()).collect(); assert!(ids.contains(&"bsose_doc_001"), "ids: {:?}", ids); assert!(ids.contains(&"bsose_doc_004"), "ids: {:?}", ids); assert!(!ids.contains(&"bsose_doc_002")); } #[tokio::test] -async fn center_radius_filter_matches_nearby_points() { - // 5000 km radius around (20, 10) — should find doc_001/doc_004 and - // possibly doc_002 (about 3100 km away). doc_003 sits on the other side - // of the planet and should be excluded. - let resp = get( +async fn center_radius_filter_matches_nearby_points_across_pages() { + // 5000 km radius around (20, 10). center+radius gets level-only + // pagination (no spatial tiling), so we still need to walk pages to + // hit each level bracket that contains data. 
+ let docs = get_paged( "/timeseries/bsose", &[ ("center", "[20.0, 10.0]"), @@ -198,9 +252,7 @@ async fn center_radius_filter_matches_nearby_points() { ], ) .await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - let ids: Vec<&str> = body.iter().map(|r| r["_id"].as_str().unwrap()).collect(); + let ids: Vec<&str> = docs.iter().map(|r| r["_id"].as_str().unwrap()).collect(); assert!(ids.contains(&"bsose_doc_001"), "ids: {:?}", ids); assert!(!ids.contains(&"bsose_doc_003"), "ids: {:?}", ids); } @@ -212,8 +264,8 @@ async fn center_radius_filter_matches_nearby_points() { #[tokio::test] async fn date_range_slices_timeseries_columns() { // Seeded timeseries: Jan, Apr, Jul, Oct (2020). Asking for Apr → Sep - // should keep Apr and Jul (end is exclusive, < Oct works too here). - let resp = get( + // should keep Apr and Jul (end is exclusive). id lookup is single-page. + let body = get_envelope( "/timeseries/bsose", &[ ("id", "bsose_doc_001"), @@ -223,10 +275,9 @@ async fn date_range_slices_timeseries_columns() { ], ) .await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - assert_eq!(body.len(), 1); - let outer = body[0]["data"].as_array().unwrap(); + let docs = body["docs"].as_array().unwrap(); + assert_eq!(docs.len(), 1); + let outer = docs[0]["data"].as_array().unwrap(); for inner in outer { assert_eq!( inner.as_array().unwrap().len(), @@ -234,8 +285,7 @@ async fn date_range_slices_timeseries_columns() { "Apr + Jul should be 2 timesteps" ); } - // The transformed `timeseries` field should also reflect the slice. 
- let ts = body[0]["timeseries"].as_array().unwrap(); + let ts = docs[0]["timeseries"].as_array().unwrap(); assert_eq!(ts.len(), 2); assert!(ts[0].as_str().unwrap().starts_with("2020-04-15")); assert!(ts[1].as_str().unwrap().starts_with("2020-07-15")); @@ -246,17 +296,15 @@ async fn date_range_slices_timeseries_columns() { // --------------------------------------------------------------------------- #[tokio::test] -async fn compression_minimal_returns_stub_arrays() { - let resp = get( +async fn compression_minimal_returns_stub_arrays_across_pages() { + let docs = get_paged( "/timeseries/bsose", &[("compression", "minimal"), ("data", "all")], ) .await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - assert!(!body.is_empty()); + assert!(!docs.is_empty()); // Stubs serialize as 5-element arrays: [_id, lon, lat, level, metadata]. - for row in &body { + for row in &docs { let arr = row.as_array().expect("each stub should be an array"); assert_eq!(arr.len(), 5); assert!(arr[0].is_string()); // _id @@ -268,18 +316,82 @@ async fn compression_minimal_returns_stub_arrays() { } #[tokio::test] -async fn batchmeta_returns_metadata_documents() { - let resp = get( +async fn batchmeta_returns_metadata_documents_across_pages() { + // batchmeta aggregates per-page; across pages, the same metadata id may + // appear multiple times (once per non-empty tile referencing it). We + // dedupe by `_id` here. + let docs = get_paged( "/timeseries/bsose", &[("batchmeta", "true"), ("data", "all")], ) .await; - assert_eq!(resp.status(), 200); - let body: Vec = resp.json().await.unwrap(); - // Our seeded bsose docs all reference one meta doc. 
- assert_eq!(body.len(), 1); - assert_eq!(body[0]["_id"], "bsose-profile-meta-2020"); - assert_eq!(body[0]["data_type"], "BSOSE-profile"); + let mut unique_ids: std::collections::HashSet = std::collections::HashSet::new(); + for d in &docs { + unique_ids.insert(d["_id"].as_str().unwrap().to_string()); + } + assert_eq!(unique_ids.len(), 1, "all seeded docs share one metadata id"); + assert!(unique_ids.contains("bsose-profile-meta-2020")); + // And the data_type came through on at least one returned copy. + assert!(docs.iter().any(|d| d["data_type"] == "BSOSE-profile")); +} + +// --------------------------------------------------------------------------- +// Pagination protocol +// --------------------------------------------------------------------------- + +#[tokio::test] +async fn tile_index_beyond_end_returns_empty_with_null_next_url() { + // Tile sequence for a small box is short; an absurdly large tile_index + // is past the end. The server should return 200 + empty docs + + // null next_url, not an error. + let body = get_envelope( + "/timeseries/bsose", + &[ + ("box", "[[0,0],[10,10]]"), + ("tile_index", "9999999"), + ], + ) + .await; + let docs = body["docs"].as_array().unwrap(); + assert!(docs.is_empty()); + assert!(body["next_url"].is_null()); +} + +#[tokio::test] +async fn invalid_tile_index_returns_400() { + let resp = get( + "/timeseries/bsose", + &[("box", "[[0,0],[10,10]]"), ("tile_index", "not-a-number")], + ) + .await; + assert_eq!(resp.status(), 400); +} + +#[tokio::test] +async fn negative_tile_index_returns_400() { + let resp = get( + "/timeseries/bsose", + &[("box", "[[0,0],[10,10]]"), ("tile_index", "-1")], + ) + .await; + assert_eq!(resp.status(), 400); +} + +#[tokio::test] +async fn first_page_carries_a_next_url_when_more_pages_remain() { + // The (20,10)/(40,30) box has docs at multiple level brackets, so the + // first page should not be the last. 
+ let body = get_envelope( + "/timeseries/bsose", + &[("box", "[[15,5],[45,35]]"), ("data", "all")], + ) + .await; + assert!( + body["next_url"].is_string(), + "first page of a multi-tile request should advertise next_url" + ); + let next = body["next_url"].as_str().unwrap(); + assert!(next.contains("tile_index="), "next_url: {}", next); } // --------------------------------------------------------------------------- From e1ffbfd02518ab7aeab9f1dca4d91b57f8d71333 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 15:08:30 -0400 Subject: [PATCH 10/25] test remediation --- api/src/helpers/filter_composer.rs | 98 +++++++++++++++++++----------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/api/src/helpers/filter_composer.rs b/api/src/helpers/filter_composer.rs index 84be3e6..0450f4b 100644 --- a/api/src/helpers/filter_composer.rs +++ b/api/src/helpers/filter_composer.rs @@ -10,10 +10,14 @@ //! a `level_index` resolves to. //! //! This module's job is to translate a `TileSpec` into BSON predicates and -//! AND them with the user filter. Tile bbox becomes -//! `geolocation.coordinates: { $geoWithin: { $box: [[sw], [ne]] } }`, which -//! matches the cartesian-box shape the existing user-box filter already -//! uses. Level index becomes `level: { $gte: L_i, $lt: L_{i+1} }`, with the +//! AND them with the user filter. Tile bbox becomes a GeoJSON Polygon on +//! `geolocation: { $geoWithin: { $geometry: Polygon } }` — same shape the +//! existing `polygon_filter` uses, so Mongo evaluates it via the 2dsphere +//! index that's actually present on `geolocation`. We deliberately do NOT +//! use the legacy `$box` shape on `geolocation.coordinates`, because that +//! path doesn't hit the 2dsphere index and can return duplicates under +//! multikey-array semantics when used alone (without an enclosing `$or`). +//! Level index becomes `level: { $gte: L_i, $lt: L_{i+1} }`, with the //! 
upper bound omitted for the deepest level so the bracket extends to //! +infinity (catches anything past the configured maximum). //! @@ -26,8 +30,8 @@ //! a programming error (the tile generator should never emit one), but if //! one slips through we drop the level clause silently rather than panic. -use mongodb::bson::{doc, Bson, Document}; -use serde_json::Value; +use mongodb::bson::{self, doc, Bson, Document}; +use serde_json::{json, Value}; use super::dataset_config::DatasetConfig; use super::filters::filter_timeseries; @@ -51,18 +55,30 @@ fn build_tile_filter(tile: &TileSpec, config: &DatasetConfig) -> Document { let mut out = Document::new(); if let Some(bbox) = &tile.tile_bbox { - // Match the format used by the existing user-box filter: - // geolocation.coordinates: { $geoWithin: { $box: [[sw], [ne]] } } - // Using cartesian $box (rather than spherical $geometry: Polygon) - // is fine at 10° tile scale where planar approximation is accurate - // enough, and it's cheap for Mongo to evaluate. - let box_array: Vec> = vec![ - vec![bbox.sw[0], bbox.sw[1]], - vec![bbox.ne[0], bbox.ne[1]], - ]; + // Build the tile as a 5-point GeoJSON Polygon ring (closed), + // matching the same shape the existing `polygon_filter` uses for + // user polygons. Mongo treats this via the 2dsphere index on + // `geolocation` (a GeoJSON Point field) and avoids the legacy + // `$box`-on-coordinates path, which can multiply matches when the + // coordinate array is treated as multikey. + // + // Ring winding order is counter-clockwise (SW → SE → NE → NW → SW), + // which is the GeoJSON spec direction for the outer ring of a + // small polygon. 
+ let polygon_geom = bson::to_bson(&json!({ + "type": "Polygon", + "coordinates": [[ + [bbox.sw[0], bbox.sw[1]], + [bbox.ne[0], bbox.sw[1]], + [bbox.ne[0], bbox.ne[1]], + [bbox.sw[0], bbox.ne[1]], + [bbox.sw[0], bbox.sw[1]], + ]], + })) + .expect("polygon geometry serialization is infallible for finite floats"); out.insert( - "geolocation.coordinates", - doc! { "$geoWithin": { "$box": box_array } }, + "geolocation", + doc! { "$geoWithin": { "$geometry": polygon_geom } }, ); } @@ -159,9 +175,10 @@ mod tests { let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); // No $and wrapping — empty user filter means we return tile alone. assert!(f.get_array("$and").is_err()); - let geo = f.get_document("geolocation.coordinates").unwrap(); + let geo = f.get_document("geolocation").unwrap(); let within = geo.get_document("$geoWithin").unwrap(); - assert!(within.get_array("$box").is_ok()); + let geom = within.get_document("$geometry").unwrap(); + assert_eq!(geom.get_str("type").unwrap(), "Polygon"); } #[test] @@ -180,7 +197,7 @@ mod tests { fn empty_params_with_full_tile_has_both_clauses_no_and() { let f = compose_filter_with_tile(json!({}), &full_tile(0), &TEST_CONFIG); assert!(f.get_array("$and").is_err()); - assert!(f.get_document("geolocation.coordinates").is_ok()); + assert!(f.get_document("geolocation").is_ok()); assert!(f.get_document("level").is_ok()); } @@ -195,8 +212,8 @@ mod tests { ); let parts = f.get_array("$and").expect("should be $and-wrapped"); assert_eq!(parts.len(), 2); - // User filter has $or (from box_filter); tile filter has - // geolocation.coordinates. Both should appear, one per element. + // User filter has $or (from box_filter); tile filter has a + // geolocation Polygon clause. Both should appear, one per element. 
let p0 = parts[0].as_document().unwrap(); let p1 = parts[1].as_document().unwrap(); assert!( @@ -204,8 +221,8 @@ mod tests { "user box $or should land in one of the $and clauses" ); assert!( - p0.get_document("geolocation.coordinates").is_ok() - || p1.get_document("geolocation.coordinates").is_ok(), + p0.get_document("geolocation").is_ok() + || p1.get_document("geolocation").is_ok(), "tile bbox should land in one of the $and clauses" ); } @@ -288,7 +305,7 @@ mod tests { // ---- bbox shape sanity -------------------------------------------------- #[test] - fn tile_bbox_emits_corner_pair_in_box_array() { + fn tile_bbox_emits_closed_ccw_ring() { let tile = TileSpec { tile_bbox: Some(BoundingBox { sw: [-10.0, -5.0], @@ -297,15 +314,28 @@ mod tests { level_index: None, }; let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); - let geo = f.get_document("geolocation.coordinates").unwrap(); + let geo = f.get_document("geolocation").unwrap(); let within = geo.get_document("$geoWithin").unwrap(); - let box_array = within.get_array("$box").unwrap(); - assert_eq!(box_array.len(), 2); - let sw = box_array[0].as_array().unwrap(); - let ne = box_array[1].as_array().unwrap(); - assert!((sw[0].as_f64().unwrap() - -10.0).abs() < 1e-9); - assert!((sw[1].as_f64().unwrap() - -5.0).abs() < 1e-9); - assert!((ne[0].as_f64().unwrap() - 20.0).abs() < 1e-9); - assert!((ne[1].as_f64().unwrap() - 15.0).abs() < 1e-9); + let geom = within.get_document("$geometry").unwrap(); + assert_eq!(geom.get_str("type").unwrap(), "Polygon"); + let rings = geom.get_array("coordinates").unwrap(); + assert_eq!(rings.len(), 1, "expected a single outer ring"); + let ring = rings[0].as_array().unwrap(); + assert_eq!(ring.len(), 5, "ring should be 5 points (closed)"); + + // Check the four distinct corners are present in CCW order: + // SW, SE, NE, NW. 
+ let expected: [[f64; 2]; 5] = [ + [-10.0, -5.0], + [20.0, -5.0], + [20.0, 15.0], + [-10.0, 15.0], + [-10.0, -5.0], + ]; + for (i, exp) in expected.iter().enumerate() { + let pt = ring[i].as_array().unwrap(); + assert!((pt[0].as_f64().unwrap() - exp[0]).abs() < 1e-9); + assert!((pt[1].as_f64().unwrap() - exp[1]).abs() < 1e-9); + } } } From d318df28db23472dbc66280b144ef6b83bd8824f Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 15:22:24 -0400 Subject: [PATCH 11/25] error spelunking --- api/tests/integration.rs | 67 ++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 9e5ca4d..4434d87 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -54,39 +54,88 @@ async fn get_envelope(path: &str, params: &[(&str, &str)]) -> Value { /// Follow `next_url` across pages, accumulating every doc returned. Stops /// once a page returns `next_url: null`. The relative `next_url` is /// resolved against `API_URL`. +/// +/// Currently instrumented with println!s so that when an assertion fails in +/// CI, the per-page state surfaces in the captured test output. Once +/// pagination behaviour stabilises these can be removed. 
async fn get_paged(path: &str, params: &[(&str, &str)]) -> Vec { let mut all_docs: Vec = Vec::new(); let mut url = url_with_query(path, params); + let mut page_count: usize = 0; loop { + page_count += 1; + println!("[get_paged] page {} GET {}", page_count, url); + let resp = client() .get(&url) .send() .await .unwrap_or_else(|e| panic!("GET {} failed: {}", url, e)); - assert_eq!( - resp.status(), - 200, - "expected 200, got {} for {}", - resp.status(), - url - ); - let body: Value = resp.json().await.expect("response should be JSON"); + let status = resp.status(); + let body_text = resp + .text() + .await + .expect("response body should be readable"); + if !status.is_success() { + panic!( + "[get_paged] non-200 status {} for {}: {}", + status, url, body_text + ); + } + let body: Value = serde_json::from_str(&body_text).unwrap_or_else(|e| { + panic!( + "[get_paged] invalid JSON for {}: {} (body: {})", + url, e, body_text + ) + }); let docs = body["docs"] .as_array() .expect("response.docs should be an array"); + let doc_ids: Vec<&str> = docs + .iter() + .map(|d| d["_id"].as_str().unwrap_or("")) + .collect(); + println!( + "[get_paged] page {} -> docs.len={} ids={:?} next_url={} message={}", + page_count, + docs.len(), + doc_ids, + body["next_url"], + body["message"] + ); + all_docs.extend(docs.iter().cloned()); match body["next_url"].as_str() { // `null` next_url means we've reached the end. (`as_str()` returns // None for both Value::Null and missing keys; both should // terminate.) - None => break, + None => { + println!( + "[get_paged] DONE after {} pages, {} total docs", + page_count, + all_docs.len() + ); + break; + } Some(rel) => { url = format!("{}{}", common::api_url(), rel); } } + + // Runaway-safety: a buggy server that always emits a next_url could + // loop forever. Cap at 20 pages so CI fails fast with a useful + // trace instead of timing out. + if page_count >= 20 { + panic!( + "[get_paged] aborting after {} pages — pagination likely looping. 
\ + accumulated {} docs so far", + page_count, + all_docs.len() + ); + } } all_docs From 8eb3fc83d3a72a0d51e9e9ef075b40205c3cf3e0 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 15:36:56 -0400 Subject: [PATCH 12/25] dont quad-count points on grid vertexes --- api/src/helpers/filter_composer.rs | 77 ++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/api/src/helpers/filter_composer.rs b/api/src/helpers/filter_composer.rs index 0450f4b..86d0956 100644 --- a/api/src/helpers/filter_composer.rs +++ b/api/src/helpers/filter_composer.rs @@ -56,22 +56,32 @@ fn build_tile_filter(tile: &TileSpec, config: &DatasetConfig) -> Document { if let Some(bbox) = &tile.tile_bbox { // Build the tile as a 5-point GeoJSON Polygon ring (closed), - // matching the same shape the existing `polygon_filter` uses for - // user polygons. Mongo treats this via the 2dsphere index on - // `geolocation` (a GeoJSON Point field) and avoids the legacy - // `$box`-on-coordinates path, which can multiply matches when the - // coordinate array is treated as multikey. + // matching the shape `polygon_filter` uses for user polygons. + // Mongo evaluates this via the 2dsphere index on `geolocation`. // - // Ring winding order is counter-clockwise (SW → SE → NE → NW → SW), - // which is the GeoJSON spec direction for the outer ring of a - // small polygon. + // Half-open boundary handling: `$geoWithin` is boundary-inclusive + // by GeoJSON spec — a point on a polygon edge counts as "within". + // That means a doc sitting exactly on a four-corner meeting point + // of the tile grid matches all four neighbouring tiles, and a + // pagination walk emits it four times. We make tile membership + // half-open ([sw, ne) on both axes) by shrinking the NE corner + // inward by `TILE_EDGE_EPSILON`. SW is left at the raw boundary, + // so each grid point ends up owned by exactly one tile — the one + // whose SW corner it sits at. 
The epsilon is far smaller than any + // realistic data resolution, so no real doc falls into the gap. + // + // Ring winding is CCW (SW → SE → NE → NW → SW), the GeoJSON + // convention for the outer ring of a small polygon. + const TILE_EDGE_EPSILON: f64 = 1.0e-6; // ~11 cm at the equator + let ne_lon = bbox.ne[0] - TILE_EDGE_EPSILON; + let ne_lat = bbox.ne[1] - TILE_EDGE_EPSILON; let polygon_geom = bson::to_bson(&json!({ "type": "Polygon", "coordinates": [[ [bbox.sw[0], bbox.sw[1]], - [bbox.ne[0], bbox.sw[1]], - [bbox.ne[0], bbox.ne[1]], - [bbox.sw[0], bbox.ne[1]], + [ne_lon, bbox.sw[1]], + [ne_lon, ne_lat], + [bbox.sw[0], ne_lat], [bbox.sw[0], bbox.sw[1]], ]], })) @@ -305,7 +315,7 @@ mod tests { // ---- bbox shape sanity -------------------------------------------------- #[test] - fn tile_bbox_emits_closed_ccw_ring() { + fn tile_bbox_emits_half_open_closed_ccw_ring() { let tile = TileSpec { tile_bbox: Some(BoundingBox { sw: [-10.0, -5.0], @@ -323,19 +333,34 @@ mod tests { let ring = rings[0].as_array().unwrap(); assert_eq!(ring.len(), 5, "ring should be 5 points (closed)"); - // Check the four distinct corners are present in CCW order: - // SW, SE, NE, NW. - let expected: [[f64; 2]; 5] = [ - [-10.0, -5.0], - [20.0, -5.0], - [20.0, 15.0], - [-10.0, 15.0], - [-10.0, -5.0], - ]; - for (i, exp) in expected.iter().enumerate() { - let pt = ring[i].as_array().unwrap(); - assert!((pt[0].as_f64().unwrap() - exp[0]).abs() < 1e-9); - assert!((pt[1].as_f64().unwrap() - exp[1]).abs() < 1e-9); - } + // SW corner is the raw bbox SW (inclusive). The ring starts here + // and ends here (closed). 
+ let sw = ring[0].as_array().unwrap(); + assert!((sw[0].as_f64().unwrap() - -10.0).abs() < 1e-12); + assert!((sw[1].as_f64().unwrap() - -5.0).abs() < 1e-12); + let last = ring[4].as_array().unwrap(); + assert_eq!(last[0].as_f64().unwrap(), sw[0].as_f64().unwrap()); + assert_eq!(last[1].as_f64().unwrap(), sw[1].as_f64().unwrap()); + + // NE corner has been shrunk inward by a tiny epsilon so that tile + // membership is half-open. We don't assert the exact epsilon + // (it's a private constant), only that the NE corner is strictly + // less than the raw bbox NE and not absurdly shrunk. + let ne = ring[2].as_array().unwrap(); + let ne_lon = ne[0].as_f64().unwrap(); + let ne_lat = ne[1].as_f64().unwrap(); + assert!(ne_lon < 20.0, "NE lon should be shrunk: {}", ne_lon); + assert!(ne_lon > 20.0 - 1.0e-3, "NE lon shouldn't be wildly shrunk: {}", ne_lon); + assert!(ne_lat < 15.0, "NE lat should be shrunk: {}", ne_lat); + assert!(ne_lat > 15.0 - 1.0e-3, "NE lat shouldn't be wildly shrunk: {}", ne_lat); + + // CCW corners (the SE and NW corners use one shrunk axis and one + // raw axis — verify the pairing is right). 
+ let se = ring[1].as_array().unwrap(); + assert!((se[0].as_f64().unwrap() - ne_lon).abs() < 1e-12); + assert!((se[1].as_f64().unwrap() - -5.0).abs() < 1e-12); + let nw = ring[3].as_array().unwrap(); + assert!((nw[0].as_f64().unwrap() - -10.0).abs() < 1e-12); + assert!((nw[1].as_f64().unwrap() - ne_lat).abs() < 1e-12); } } From 78912043f5fe0537186e800cdf170bd8396d644e Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 15:47:35 -0400 Subject: [PATCH 13/25] thinking about dateline edgecase --- api/src/helpers/filter_composer.rs | 124 ++++++++++++++++++++++++++--- api/tests/integration.rs | 67 +++------------- 2 files changed, 123 insertions(+), 68 deletions(-) diff --git a/api/src/helpers/filter_composer.rs b/api/src/helpers/filter_composer.rs index 86d0956..670b2fe 100644 --- a/api/src/helpers/filter_composer.rs +++ b/api/src/helpers/filter_composer.rs @@ -61,20 +61,43 @@ fn build_tile_filter(tile: &TileSpec, config: &DatasetConfig) -> Document { // // Half-open boundary handling: `$geoWithin` is boundary-inclusive // by GeoJSON spec — a point on a polygon edge counts as "within". - // That means a doc sitting exactly on a four-corner meeting point - // of the tile grid matches all four neighbouring tiles, and a - // pagination walk emits it four times. We make tile membership - // half-open ([sw, ne) on both axes) by shrinking the NE corner - // inward by `TILE_EDGE_EPSILON`. SW is left at the raw boundary, - // so each grid point ends up owned by exactly one tile — the one - // whose SW corner it sits at. The epsilon is far smaller than any - // realistic data resolution, so no real doc falls into the gap. + // For an interior grid corner, that means a doc at the meeting + // point of four tiles matches all four polygons and pagination + // emits it four times. We make tile membership half-open by + // shrinking each tile's NE corner inward by `TILE_EDGE_EPSILON`. 
+ // The SW is left raw, so each grid point is owned by exactly one + // tile — the one whose SW corner it sits at. + // + // Global east/north exception: at lon=180 (the antimeridian) and + // lat=90 (the north pole), the tile has no eastern/northern + // neighbour to claim that boundary, so shrinking would create a + // gap that swallows docs sitting exactly on the antimeridian or + // at the pole. We leave those edges inclusive. SW edges at + // lon=-180 / lat=-90 are already inclusive by construction. // // Ring winding is CCW (SW → SE → NE → NW → SW), the GeoJSON // convention for the outer ring of a small polygon. + // + // Known limitation (not handled here): a user-supplied bounding + // box whose NE corner lies exactly on a tile grid line and is + // also closed (e.g. `box=[[20,10],[40,30]]`) will lose docs at + // that NE corner, because the rightmost/topmost tile's NE is + // shrunk away from the user's NE. Fixable by passing the user + // box's NE into the tile filter and skipping shrinkage when they + // coincide; deferred until a real test exercises it. const TILE_EDGE_EPSILON: f64 = 1.0e-6; // ~11 cm at the equator - let ne_lon = bbox.ne[0] - TILE_EDGE_EPSILON; - let ne_lat = bbox.ne[1] - TILE_EDGE_EPSILON; + const GLOBAL_EAST: f64 = 180.0; + const GLOBAL_NORTH: f64 = 90.0; + let ne_lon = if bbox.ne[0] >= GLOBAL_EAST { + bbox.ne[0] + } else { + bbox.ne[0] - TILE_EDGE_EPSILON + }; + let ne_lat = if bbox.ne[1] >= GLOBAL_NORTH { + bbox.ne[1] + } else { + bbox.ne[1] - TILE_EDGE_EPSILON + }; let polygon_geom = bson::to_bson(&json!({ "type": "Polygon", "coordinates": [[ @@ -314,6 +337,87 @@ mod tests { // ---- bbox shape sanity -------------------------------------------------- + /// Read back the NE corner of the polygon ring from a composed filter. + /// Returns (ne_lon, ne_lat). 
+ fn ne_of_composed(f: &Document) -> (f64, f64) { + let geo = f.get_document("geolocation").unwrap(); + let within = geo.get_document("$geoWithin").unwrap(); + let geom = within.get_document("$geometry").unwrap(); + let rings = geom.get_array("coordinates").unwrap(); + let ring = rings[0].as_array().unwrap(); + let ne = ring[2].as_array().unwrap(); + (ne[0].as_f64().unwrap(), ne[1].as_f64().unwrap()) + } + + #[test] + fn interior_tile_ne_is_shrunk() { + // An interior tile (neither edge touches a global meridian) gets + // its NE corner shrunk inward so adjacent tiles can't both claim + // a corner-meeting doc. + let tile = TileSpec { + tile_bbox: Some(BoundingBox { + sw: [0.0, 0.0], + ne: [10.0, 10.0], + }), + level_index: None, + }; + let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); + let (ne_lon, ne_lat) = ne_of_composed(&f); + assert!(ne_lon < 10.0, "interior NE lon should be shrunk: {}", ne_lon); + assert!(ne_lat < 10.0, "interior NE lat should be shrunk: {}", ne_lat); + } + + #[test] + fn easternmost_tile_keeps_ne_lon_at_180() { + // A tile that abuts the antimeridian (ne_lon = 180) must NOT be + // shrunk in longitude — otherwise docs sitting exactly on the + // antimeridian fall in the gap with no tile to claim them. + let tile = TileSpec { + tile_bbox: Some(BoundingBox { + sw: [170.0, 0.0], + ne: [180.0, 10.0], + }), + level_index: None, + }; + let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); + let (ne_lon, ne_lat) = ne_of_composed(&f); + assert_eq!(ne_lon, 180.0, "antimeridian tile NE lon must remain 180"); + // ne_lat is interior — still shrunk. 
+ assert!(ne_lat < 10.0, "interior NE lat is still shrunk: {}", ne_lat); + } + + #[test] + fn northernmost_tile_keeps_ne_lat_at_90() { + let tile = TileSpec { + tile_bbox: Some(BoundingBox { + sw: [0.0, 80.0], + ne: [10.0, 90.0], + }), + level_index: None, + }; + let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); + let (ne_lon, ne_lat) = ne_of_composed(&f); + assert!(ne_lon < 10.0, "interior NE lon is still shrunk: {}", ne_lon); + assert_eq!(ne_lat, 90.0, "north-pole tile NE lat must remain 90"); + } + + #[test] + fn ne_pole_meridian_corner_tile_keeps_both_unshrunk() { + // The single tile in the global grid that sits at both ne_lon=180 + // AND ne_lat=90. Both axes must remain inclusive. + let tile = TileSpec { + tile_bbox: Some(BoundingBox { + sw: [170.0, 80.0], + ne: [180.0, 90.0], + }), + level_index: None, + }; + let f = compose_filter_with_tile(json!({}), &tile, &TEST_CONFIG); + let (ne_lon, ne_lat) = ne_of_composed(&f); + assert_eq!(ne_lon, 180.0); + assert_eq!(ne_lat, 90.0); + } + #[test] fn tile_bbox_emits_half_open_closed_ccw_ring() { let tile = TileSpec { diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 4434d87..9e5ca4d 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -54,88 +54,39 @@ async fn get_envelope(path: &str, params: &[(&str, &str)]) -> Value { /// Follow `next_url` across pages, accumulating every doc returned. Stops /// once a page returns `next_url: null`. The relative `next_url` is /// resolved against `API_URL`. -/// -/// Currently instrumented with println!s so that when an assertion fails in -/// CI, the per-page state surfaces in the captured test output. Once -/// pagination behaviour stabilises these can be removed. 
async fn get_paged(path: &str, params: &[(&str, &str)]) -> Vec<Value> { let mut all_docs: Vec<Value> = Vec::new(); let mut url = url_with_query(path, params); - let mut page_count: usize = 0; loop { - page_count += 1; - println!("[get_paged] page {} GET {}", page_count, url); - let resp = client() .get(&url) .send() .await .unwrap_or_else(|e| panic!("GET {} failed: {}", url, e)); - let status = resp.status(); - let body_text = resp - .text() - .await - .expect("response body should be readable"); - if !status.is_success() { - panic!( - "[get_paged] non-200 status {} for {}: {}", - status, url, body_text - ); - } - let body: Value = serde_json::from_str(&body_text).unwrap_or_else(|e| { - panic!( - "[get_paged] invalid JSON for {}: {} (body: {})", - url, e, body_text - ) - }); + assert_eq!( + resp.status(), + 200, + "expected 200, got {} for {}", + resp.status(), + url + ); + let body: Value = resp.json().await.expect("response should be JSON"); let docs = body["docs"] .as_array() .expect("response.docs should be an array"); - let doc_ids: Vec<&str> = docs - .iter() - .map(|d| d["_id"].as_str().unwrap_or("")) - .collect(); - println!( - "[get_paged] page {} -> docs.len={} ids={:?} next_url={} message={}", - page_count, - docs.len(), - doc_ids, - body["next_url"], - body["message"] - ); - all_docs.extend(docs.iter().cloned()); match body["next_url"].as_str() { // `null` next_url means we've reached the end. (`as_str()` returns // None for both Value::Null and missing keys; both should // terminate.) - None => { - println!( - "[get_paged] DONE after {} pages, {} total docs", - page_count, - all_docs.len() - ); - break; - } + None => break, Some(rel) => { url = format!("{}{}", common::api_url(), rel); } } - - // Runaway-safety: a buggy server that always emits a next_url could - // loop forever. Cap at 20 pages so CI fails fast with a useful - // trace instead of timing out. - if page_count >= 20 { - panic!( - "[get_paged] aborting after {} pages — pagination likely looping.
\ - accumulated {} docs so far", - page_count, - all_docs.len() - ); - } } all_docs From 608ff5396adedd9ee1496f98242e9a432fd775c7 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 16:00:34 -0400 Subject: [PATCH 14/25] step 7: enforce max proximity radius --- api/src/helpers/dataset_config.rs | 8 ++- api/src/helpers/helpers.rs | 109 ++++++++++++++++++++++++++++++ api/src/main.rs | 3 + api/tests/integration.rs | 22 ++++++ 4 files changed, 139 insertions(+), 3 deletions(-) diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 79b561b..1bf203d 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -48,11 +48,13 @@ pub const BSOSE_LEVELS: &[f64] = &[ /// Configuration for the BSOSE timeseries dataset. /// /// 10° tiles × 4 grid cells/degree = 40 × 40 = 1600 cells per (tile, level). -/// `max_radius_meters` is a placeholder; revisit with a real -/// operational limit once we have request-distribution data. +/// `max_radius_meters` is set to roughly one major-basin radius. It exists +/// so a naive caller can't ask for a half-globe disk via `center + radius` +/// (which goes through Mongo `$near` and isn't spatially tiled). Revisit +/// when we have operational request-distribution data. 
pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig { tile_degrees: 10.0, - max_radius_meters: 2_000_000.0, // 2000 km — placeholder + max_radius_meters: 5_000_000.0, // 5000 km — one-basin radius levels: BSOSE_LEVELS, }; diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs index 9ce273b..d8d7929 100644 --- a/api/src/helpers/helpers.rs +++ b/api/src/helpers/helpers.rs @@ -4,6 +4,8 @@ use serde::{Serialize}; use actix_web::{HttpResponse}; use serde_json::{json, from_str}; +use super::dataset_config::DatasetConfig; + pub fn validlonlat(coords: Vec>) -> Vec> { coords.into_iter().map(|mut pair| { if pair.len() == 2 { @@ -108,6 +110,56 @@ pub fn validate_query_params(params: &serde_json::Value) -> Result<(), HttpRespo Ok(()) } +/// Enforce the dataset's `max_radius_meters` for `center + radius` queries. +/// +/// `center + radius` is the one geo mode that *isn't* spatially tiled +/// (Mongo's `$near` handles the bounding internally), so a request with an +/// unbounded radius could ask for a half-globe disk and return millions of +/// docs. We cap the radius in config and reject anything past it here. +/// +/// Only runs when `center` is present in the params: if center is absent, +/// this mode doesn't apply and the cap is irrelevant. When center IS +/// present, `validate_query_params` has already required `radius` to be +/// present as well; we additionally require it to parse as a non-negative +/// finite f64 (a pre-existing bug elsewhere unwraps that parse — catching +/// it here makes the error path graceful for callers). 
+pub fn validate_radius_cap( + params: &serde_json::Value, + config: &DatasetConfig, +) -> Result<(), HttpResponse> { + if params.get("center").is_none() { + return Ok(()); + } + let radius_str = match params.get("radius").and_then(|v| v.as_str()) { + Some(s) => s, + // validate_query_params should have caught a missing radius + // already; if it didn't, treating this as "no cap to enforce" + // hands the request through to the existing error path rather + // than masking it with a different 400. + None => return Ok(()), + }; + let radius: f64 = match radius_str.parse() { + Ok(r) if r.is_finite() && r >= 0.0 => r, + _ => { + return Err(HttpResponse::BadRequest().json(json!({ + "error": format!( + "'radius' must be a non-negative finite number, got '{}'", + radius_str + ) + }))); + } + }; + if radius > config.max_radius_meters { + return Err(HttpResponse::BadRequest().json(json!({ + "error": format!( + "radius {} m exceeds the dataset's maximum allowed radius of {} m", + radius, config.max_radius_meters + ) + }))); + } + Ok(()) +} + #[cfg(test)] mod tests { use super::*; @@ -283,4 +335,61 @@ mod tests { }); assert!(validate_query_params(&params).is_ok()); } + + // ---- validate_radius_cap ------------------------------------------------- + + /// Minimal config for radius-cap testing. tile_degrees / levels are + /// irrelevant for this validator. + const RADIUS_TEST_CONFIG: DatasetConfig = DatasetConfig { + tile_degrees: 10.0, + max_radius_meters: 1_000_000.0, // 1000 km + levels: &[0.0], + }; + + #[test] + fn radius_cap_skipped_when_no_center() { + // No center → mode doesn't apply, cap is moot.
+ let params = json!({}); + assert!(validate_radius_cap(&params, &RADIUS_TEST_CONFIG).is_ok()); + } + + #[test] + fn radius_cap_accepts_radius_at_exactly_the_cap() { + let params = json!({"center": "[0,0]", "radius": "1000000"}); + assert!(validate_radius_cap(&params, &RADIUS_TEST_CONFIG).is_ok()); + } + + #[test] + fn radius_cap_accepts_radius_below_cap() { + let params = json!({"center": "[0,0]", "radius": "500000"}); + assert!(validate_radius_cap(&params, &RADIUS_TEST_CONFIG).is_ok()); + } + + #[test] + fn radius_cap_rejects_radius_above_cap() { + let params = json!({"center": "[0,0]", "radius": "1000001"}); + let err = validate_radius_cap(&params, &RADIUS_TEST_CONFIG).unwrap_err(); + assert_eq!(err.status(), 400); + } + + #[test] + fn radius_cap_rejects_non_numeric_radius() { + let params = json!({"center": "[0,0]", "radius": "huge"}); + let err = validate_radius_cap(&params, &RADIUS_TEST_CONFIG).unwrap_err(); + assert_eq!(err.status(), 400); + } + + #[test] + fn radius_cap_rejects_negative_radius() { + let params = json!({"center": "[0,0]", "radius": "-1"}); + let err = validate_radius_cap(&params, &RADIUS_TEST_CONFIG).unwrap_err(); + assert_eq!(err.status(), 400); + } + + #[test] + fn radius_cap_rejects_non_finite_radius() { + let params = json!({"center": "[0,0]", "radius": "inf"}); + let err = validate_radius_cap(&params, &RADIUS_TEST_CONFIG).unwrap_err(); + assert_eq!(err.status(), 400); + } } \ No newline at end of file diff --git a/api/src/main.rs b/api/src/main.rs index cb4a774..0255f4d 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -55,6 +55,9 @@ async fn search_data_schema( if let Err(response) = helpers::validate_query_params(&params) { return response; } + if let Err(response) = helpers::validate_radius_cap(&params, config) { + return response; + } let start_idx = match pagination::parse_tile_index(&params) { Ok(i) => i, diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 9e5ca4d..59b1fc0 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ 
-432,3 +432,25 @@ async fn rejects_unparseable_start_date() { let resp = get("/timeseries/bsose", &[("startDate", "yesterday")]).await; assert_eq!(resp.status(), 400); } + +#[tokio::test] +async fn rejects_radius_above_cap() { + // BSOSE_CONFIG.max_radius_meters is 5_000_000. Asking for 10_000_000 + // should be rejected before the cursor opens. + let resp = get( + "/timeseries/bsose", + &[("center", "[0.0, 0.0]"), ("radius", "10000000")], + ) + .await; + assert_eq!(resp.status(), 400); +} + +#[tokio::test] +async fn rejects_non_numeric_radius() { + let resp = get( + "/timeseries/bsose", + &[("center", "[0.0, 0.0]"), ("radius", "huge")], + ) + .await; + assert_eq!(resp.status(), 400); +} From 5fb7de13a7d444b2c740916e0dbc57418f688dd5 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 16:05:41 -0400 Subject: [PATCH 15/25] test fiddling --- api/src/helpers/dataset_config.rs | 10 +++++----- api/src/helpers/helpers.rs | 5 ++++- api/tests/integration.rs | 11 +++++++---- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 1bf203d..c736d14 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -48,13 +48,13 @@ pub const BSOSE_LEVELS: &[f64] = &[ /// Configuration for the BSOSE timeseries dataset. /// /// 10° tiles × 4 grid cells/degree = 40 × 40 = 1600 cells per (tile, level). -/// `max_radius_meters` is set to roughly one major-basin radius. It exists -/// so a naive caller can't ask for a half-globe disk via `center + radius` -/// (which goes through Mongo `$near` and isn't spatially tiled). Revisit -/// when we have operational request-distribution data. +/// `max_radius_meters` is intentionally tight: BSOSE at 1/4° resolution +/// produces ~16 docs per 25 km × 25 km cell, so even a small disk pulls +/// thousands of docs out of `$near` (which isn't spatially tiled). 
100 km +/// is a conservative starting point — easy to bump up if users complain. pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig { tile_degrees: 10.0, - max_radius_meters: 5_000_000.0, // 5000 km — one-basin radius + max_radius_meters: 100_000.0, // 100 km — bump if users complain levels: BSOSE_LEVELS, }; diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs index d8d7929..6bcc924 100644 --- a/api/src/helpers/helpers.rs +++ b/api/src/helpers/helpers.rs @@ -138,7 +138,10 @@ pub fn validate_radius_cap( // than masking it with a different 400. None => return Ok(()), }; - let radius: f64 = match radius_str.parse() { + // Explicit turbofish: the match guard runs before the let-binding's + // type annotation flows back to `parse()`, so type inference can't + // resolve `F` otherwise. + let radius: f64 = match radius_str.parse::<f64>() { Ok(r) if r.is_finite() && r >= 0.0 => r, _ => { return Err(HttpResponse::BadRequest().json(json!({ diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 59b1fc0..4689743 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -240,14 +240,17 @@ async fn polygon_filter_matches_seeded_points_across_pages() { #[tokio::test] async fn center_radius_filter_matches_nearby_points_across_pages() { - // 5000 km radius around (20, 10). center+radius gets level-only - // pagination (no spatial tiling), so we still need to walk pages to - // hit each level bracket that contains data. + // 100 km radius around (20, 10) — at the BSOSE radius cap. + // center+radius gets level-only pagination (no spatial tiling), so + // we still need to walk pages to hit each level bracket that + // contains data. doc_001 / doc_004 sit exactly at the center so any + // positive radius catches them; doc_003 is on the other side of the + // planet and is excluded by any sane radius.
let docs = get_paged( "/timeseries/bsose", &[ ("center", "[20.0, 10.0]"), - ("radius", "5000000"), // 5000 km in metres + ("radius", "100000"), // 100 km — at the cap ("data", "all"), ], ) From 29e23235a0c6646d3490869507b4bcaff4fd4241 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 16:14:02 -0400 Subject: [PATCH 16/25] more tests! --- api/tests/integration.rs | 94 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 6 deletions(-) diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 4689743..05fb07e 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -305,7 +305,9 @@ async fn compression_minimal_returns_stub_arrays_across_pages() { &[("compression", "minimal"), ("data", "all")], ) .await; - assert!(!docs.is_empty()); + // The 4 seeded docs live in 4 distinct (spatial, level) tiles, so + // pagination should yield exactly 4 stubs total. + assert_eq!(docs.len(), 4, "expected one stub per seeded doc, got {:?}", docs); // Stubs serialize as 5-element arrays: [_id, lon, lat, level, metadata]. for row in &docs { let arr = row.as_array().expect("each stub should be an array"); @@ -318,24 +320,104 @@ async fn compression_minimal_returns_stub_arrays_across_pages() { } } +#[tokio::test] +async fn compression_minimal_with_id_returns_single_page_stub() { + // id lookup produces a passthrough tile; combining it with minimal + // should yield exactly one stub in a single-page response. 
+ let body = get_envelope( + "/timeseries/bsose", + &[ + ("id", "bsose_doc_001"), + ("compression", "minimal"), + ("data", "all"), + ], + ) + .await; + let docs = body["docs"].as_array().unwrap(); + assert_eq!(docs.len(), 1); + let stub = docs[0].as_array().expect("minimal response is a stub array"); + assert_eq!(stub.len(), 5); + assert_eq!(stub[0].as_str().unwrap(), "bsose_doc_001"); + assert!(body["next_url"].is_null(), "id lookups don't paginate further"); +} + +#[tokio::test] +async fn batchmeta_with_id_returns_single_page_metadata() { + // id + batchmeta also passthrough-tiled: one metadata doc, single page. + let body = get_envelope( + "/timeseries/bsose", + &[ + ("id", "bsose_doc_001"), + ("batchmeta", "true"), + ("data", "all"), + ], + ) + .await; + let docs = body["docs"].as_array().unwrap(); + assert_eq!(docs.len(), 1); + assert_eq!(docs[0]["_id"], "bsose-profile-meta-2020"); + assert_eq!(docs[0]["data_type"], "BSOSE-profile"); + assert!(body["next_url"].is_null()); +} + +#[tokio::test] +async fn batchmeta_takes_precedence_over_minimal() { + // The handler dispatches into the batchmeta branch before the + // streaming branch consults compression=minimal. With both set, + // batchmeta wins and the returned docs are metadata objects, not + // 5-element stubs. + let body = get_envelope( + "/timeseries/bsose", + &[ + ("id", "bsose_doc_001"), + ("batchmeta", "true"), + ("compression", "minimal"), + ("data", "all"), + ], + ) + .await; + let docs = body["docs"].as_array().unwrap(); + assert_eq!(docs.len(), 1); + // A metadata doc is a JSON object with a data_type field; a stub is + // a 5-element array. Verify we got the object form. 
+ let first = &docs[0]; + assert!( + first.is_object(), + "expected metadata object when batchmeta is set, got {:?}", + first + ); + assert_eq!(first["data_type"], "BSOSE-profile"); +} + #[tokio::test] async fn batchmeta_returns_metadata_documents_across_pages() { - // batchmeta aggregates per-page; across pages, the same metadata id may - // appear multiple times (once per non-empty tile referencing it). We - // dedupe by `_id` here. + // batchmeta aggregates per-page: each non-empty (spatial, level) tile + // returns the metadata docs referenced by that tile's bsose docs. + // Our 4 seeded docs each live in their own tile and all reference the + // same metadata id, so we get 4 page-level metadata responses each + // containing that one metadata doc — 4 returned docs, 1 unique id. let docs = get_paged( "/timeseries/bsose", &[("batchmeta", "true"), ("data", "all")], ) .await; + assert_eq!( + docs.len(), + 4, + "expected one batchmeta response per non-empty tile, got {:?}", + docs + ); + let mut unique_ids: std::collections::HashSet<String> = std::collections::HashSet::new(); for d in &docs { unique_ids.insert(d["_id"].as_str().unwrap().to_string()); } assert_eq!(unique_ids.len(), 1, "all seeded docs share one metadata id"); assert!(unique_ids.contains("bsose-profile-meta-2020")); - // And the data_type came through on at least one returned copy. - assert!(docs.iter().any(|d| d["data_type"] == "BSOSE-profile")); + // Every returned doc should be a metadata document, not a bsose doc.
+ for d in &docs { + assert_eq!(d["data_type"], "BSOSE-profile"); + } } // --------------------------------------------------------------------------- From d41231ac489050e41d381c905a40637d7fd9a6c1 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 16:25:29 -0400 Subject: [PATCH 17/25] yet more tests --- api/tests/integration.rs | 119 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 05fb07e..eeabe58 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -224,6 +224,11 @@ async fn box_filter_matches_seeded_points_across_pages() { #[tokio::test] async fn polygon_filter_matches_seeded_points_across_pages() { // Polygon around (20, 10) — small square enclosing doc_001 / doc_004. + // The polygon's bbox spans four spatial tiles ([10-20, 0-10], + // [20-30, 0-10], [10-20, 10-20], [20-30, 10-20]) — multi-tile case. + // doc_001 (level 10 → L0) and doc_004 (level 50 → L3) share the same + // spatial tile but land in different level pages, so we expect + // exactly 2 docs across 2 non-empty pages. let docs = get_paged( "/timeseries/bsose", &[ @@ -233,11 +238,37 @@ async fn polygon_filter_matches_seeded_points_across_pages() { ) .await; let ids: Vec<&str> = docs.iter().map(|r| r["_id"].as_str().unwrap()).collect(); + assert_eq!(docs.len(), 2, "expected exactly 2 docs, got {:?}", ids); assert!(ids.contains(&"bsose_doc_001"), "ids: {:?}", ids); assert!(ids.contains(&"bsose_doc_004"), "ids: {:?}", ids); assert!(!ids.contains(&"bsose_doc_002")); } +#[tokio::test] +async fn box_crossing_dateline_finds_antimeridian_docs() { + // Dateline-crossing box: sw_lon (170) > ne_lon (-160), so the box + // wraps the antimeridian. Tile generation splits it into an eastern + // sub-box (170..180) and a western sub-box (-180..-160). 
doc_003 at + // (-170, 50) lives in the western band; the other seeded docs are + // far from this box and should be excluded. + let docs = get_paged( + "/timeseries/bsose", + &[("box", "[[170,40],[-160,60]]"), ("data", "all")], + ) + .await; + let ids: Vec<&str> = docs.iter().map(|r| r["_id"].as_str().unwrap()).collect(); + assert_eq!( + docs.len(), + 1, + "expected exactly one doc (doc_003), got {:?}", + ids + ); + assert!(ids.contains(&"bsose_doc_003"), "ids: {:?}", ids); + assert!(!ids.contains(&"bsose_doc_001")); + assert!(!ids.contains(&"bsose_doc_002")); + assert!(!ids.contains(&"bsose_doc_004")); +} + #[tokio::test] async fn center_radius_filter_matches_nearby_points_across_pages() { // 100 km radius around (20, 10) — at the BSOSE radius cap. @@ -424,6 +455,29 @@ async fn batchmeta_returns_metadata_documents_across_pages() { // Pagination protocol // --------------------------------------------------------------------------- +#[tokio::test] +async fn polygon_over_empty_region_returns_empty_envelope() { + // Polygon in the Indian Ocean (60-70°E, 5-15°N) — far from any + // seeded doc. Probe-forward should walk every candidate tile, find + // none non-empty, and return a 200 envelope with an empty docs array + // and null next_url instead of 404 or any other error code. + let body = get_envelope( + "/timeseries/bsose", + &[ + ("polygon", "[[60,5],[70,5],[70,15],[60,15],[60,5]]"), + ("data", "all"), + ], + ) + .await; + let docs = body["docs"].as_array().unwrap(); + assert!( + docs.is_empty(), + "expected no docs in empty region, got {:?}", + docs + ); + assert!(body["next_url"].is_null()); +} + #[tokio::test] async fn tile_index_beyond_end_returns_empty_with_null_next_url() { // Tile sequence for a small box is short; an absurdly large tile_index @@ -462,10 +516,71 @@ async fn negative_tile_index_returns_400() { assert_eq!(resp.status(), 400); } +/// Pull `tile_index` out of a next_url query string. 
Panics if absent — +/// only used in tests where the URL was just emitted by the server, so a +/// missing index is itself a bug worth surfacing. +fn tile_index_from(url: &str) -> usize { + let query = url.split('?').nth(1).unwrap_or(""); + for pair in query.split('&') { + if let Some(("tile_index", v)) = pair.split_once('=') { + return v + .parse() + .unwrap_or_else(|e| panic!("tile_index in {} failed to parse: {}", url, e)); + } + } + panic!("no tile_index in url: {}", url); +} + +#[tokio::test] +async fn next_url_round_trips_cleanly() { + // Issue a multi-page request, GET its next_url directly (not via + // get_paged), and verify the server returns a valid envelope and the + // tile_index has advanced. Confirms that build_next_url's output + // survives the round-trip through the URL parser back into the + // handler — catches percent-encoding bugs, param dropping, etc. + let body = get_envelope( + "/timeseries/bsose", + &[("box", "[[15,5],[45,35]]"), ("data", "all")], + ) + .await; + let next = body["next_url"] + .as_str() + .expect("first page of multi-tile request should advertise next_url"); + let initial_idx = tile_index_from(next); + + let url = format!("{}{}", common::api_url(), next); + let resp = client() + .get(&url) + .send() + .await + .unwrap_or_else(|e| panic!("GET {} failed: {}", url, e)); + assert_eq!(resp.status(), 200, "next_url should yield 200"); + let next_body: Value = resp.json().await.expect("response should be JSON"); + + // Envelope shape preserved. + assert!(next_body["docs"].is_array()); + assert!(next_body["message"].is_string()); + + // If there are still more pages after this one, the new next_url + // should reference a tile_index strictly greater than the one we just + // requested (server probed forward to find a non-empty tile). 
+ if let Some(further) = next_body["next_url"].as_str() { + let further_idx = tile_index_from(further); + assert!( + further_idx > initial_idx, + "further next_url tile_index ({}) should advance past {}", + further_idx, + initial_idx + ); + } +} + #[tokio::test] async fn first_page_carries_a_next_url_when_more_pages_remain() { // The (20,10)/(40,30) box has docs at multiple level brackets, so the - // first page should not be the last. + // first page should not be the last. next_url must carry both the + // user's params (so the next request hits the same filter) and an + // advanced tile_index. let body = get_envelope( "/timeseries/bsose", &[("box", "[[15,5],[45,35]]"), ("data", "all")], @@ -477,6 +592,8 @@ async fn first_page_carries_a_next_url_when_more_pages_remain() { ); let next = body["next_url"].as_str().unwrap(); assert!(next.contains("tile_index="), "next_url: {}", next); + assert!(next.contains("box="), "next_url should preserve box param: {}", next); + assert!(next.contains("data="), "next_url should preserve data param: {}", next); } // --------------------------------------------------------------------------- From e8a2c44992d8555a61746994c659cfb57d142687 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 16:35:05 -0400 Subject: [PATCH 18/25] comments and docs cleanup --- api/PAGINATION.md | 130 ++++++++++++++++++++++++++++++ api/src/helpers/dataset_config.rs | 14 ++-- api/src/helpers/tile_generator.rs | 18 +++-- api/tests/integration.rs | 5 +- 4 files changed, 149 insertions(+), 18 deletions(-) create mode 100644 api/PAGINATION.md diff --git a/api/PAGINATION.md b/api/PAGINATION.md new file mode 100644 index 0000000..9d9a8be --- /dev/null +++ b/api/PAGINATION.md @@ -0,0 +1,130 @@ +# Pagination + +Every response from the `/timeseries/...` endpoints is a JSON envelope: + +```json +{ + "docs": [...], + "next_url": "/timeseries/bsose?...&tile_index=N", + "message": "page N" +} +``` + +- `docs` — array of result documents (or stubs / 
metadata documents, + depending on the mode flags). May be empty. +- `next_url` — relative path + query for the next page. `null` when this + is the last page. Clients resolve it against the original request's + origin and follow it until they see `null`. +- `message` — human-readable status, currently the served tile index. + +There is no separate "no results" status code: an empty response is +`200 + {docs: [], next_url: null, ...}`, never `404`. + +## How pagination walks + +Server-side, each request's spatial parameters define a sequence of +**tiles**. A tile is one spatial sub-region paired with one discrete +depth level. Tiles are ordered *spatial outer, level inner*: all levels +for one (lon, lat) cell come out before moving to the next cell. + +Tile size is per-dataset: + +- Spatial extent is `DatasetConfig::tile_degrees` (10° for BSOSE). +- Depth pages are the dataset's discrete `levels` (24 brackets for BSOSE). + +For BSOSE that's up to 1600 docs per (tile × level) page, with the actual +count clipped by land, the user's filter, and the dataset's coverage. + +Each HTTP request serves at most **one** non-empty tile. The server +**probes forward** from the requested `tile_index`, opening a small +cursor per candidate tile and advancing past empties until it finds one +that yields output (or runs out of tiles). `next_url` carries +`tile_index = served_idx + 1`, so the next request resumes one tile past +the one we just emitted. When the server runs out of tiles, `next_url` +is `null`. + +This is naive plod-forward — there's no land-mask shortcut yet, so +whole-globe requests do walk a lot of empty tiles server-side. Clients +don't see that work; they only get one HTTP response per non-empty +tile. + +## Tile membership + +Tiles are **half-open** — `[sw, ne)` on both lon and lat axes — so each +grid point is owned by exactly one tile (the one whose SW corner it sits +at). Without this, a doc at the corner where four tiles meet would be +emitted four times. 
The half-open behaviour is implemented by shrinking +each tile's NE corner inward by a sub-cm epsilon, *except* at the global +east meridian (`lon=180`) and the north pole (`lat=90`), where there's +no neighbouring tile to overlap with — those edges remain inclusive so +antimeridian / north-pole docs aren't lost. + +## Special spatial modes + +| Mode | Tile sequence | +|------|---------------| +| `id` | A single passthrough tile — no spatial or level constraint is added. | +| `center + radius` | No spatial tiling; pagination is level-only. Radius must satisfy `radius ≤ max_radius_meters` (100 km for BSOSE today). | +| `polygon` | Tile the polygon's bounding box. Mongo `$geoWithin` does the actual polygon intersection per tile. | +| `box` | Tile the box. A dateline-crossing box (`sw_lon > ne_lon`) is split into east and west sub-boxes; tile generation runs on each. | +| no spatial param | Tile the whole globe. | + +## Mode flags + +- `compression=minimal` — each doc is serialized as a compact 5-element + array `[_id, lon, lat, level, metadata]` rather than the full + measurement document. +- `batchmeta` — instead of measurement docs, return the *metadata* + documents referenced by the matching docs (looked up in + `timeseriesMeta`). Aggregates per-page; clients union across pages. + Takes precedence over `compression=minimal` if both are set. + +## Query parameters + +| Param | Type | Notes | +|-------|------|-------| +| `id` | string | Exact match on `_id`. | +| `box` | JSON `[[sw_lon, sw_lat], [ne_lon, ne_lat]]` | Bounding box. Wraps the dateline if `sw_lon > ne_lon`. | +| `polygon` | JSON `[[lon, lat], ...]` | Closed ring of vertices (first = last), ≥ 4 points. | +| `center` + `radius` | JSON `[lon, lat]` + meters | Disk query. Radius capped at the dataset's `max_radius_meters`. | +| `verticalRange` | JSON `[lo, hi]` | Half-open depth range applied on top of tile-level pagination. 
| +| `startDate` / `endDate` | RFC-3339 string | Slices each doc's timeseries to this window. | +| `data` | comma-separated | Variables to include. `all` keeps everything. `except_data_values` keeps the schema but clears values. | +| `compression` | `minimal` | See mode flags. | +| `batchmeta` | any | See mode flags. | +| `tile_index` | non-negative integer | Pagination cursor. Default `0`. Almost always supplied by the previous response's `next_url`. | + +## Validation errors (HTTP 400) + +- More than one of `polygon` / `box` / `center` set. +- `center` set without `radius`, or vice versa. +- `radius` non-numeric, negative, non-finite, or above the dataset's cap. +- `polygon` malformed: fewer than 4 points, not closed, or any vertex + that isn't a 2-element pair. +- `startDate` or `endDate` not RFC-3339. +- `tile_index` present but not a non-negative integer. + +## Known limitations + +- **Antimeridian polygons.** A polygon whose vertices straddle the + antimeridian (some near `lon=+180`, others near `lon=-180`) computes + a naive bounding box spanning most of the globe and generates a huge + tile sequence. Mongo's `$geoWithin` doesn't handle these cleanly + either — for now, treat as user input bug. +- **Grid-aligned user box NE corner.** A user-supplied box whose NE + corner sits exactly on a tile grid line (e.g. + `box=[[20,10],[40,30]]`) will lose docs at that exact NE corner, + because the rightmost/topmost tile's NE is shrunk by the half-open + mechanism. Workaround on the client: pad the NE by a tiny amount. +- **Antimeridian docs at `lon=±180` stored as distinct values.** Docs at + `lon=+180` land in the easternmost tile, docs at `lon=-180` in the + westernmost — same physical meridian, two different pages. Data + providers should normalise to one convention on insertion.
+ +## Per-dataset configuration + +`api/src/helpers/dataset_config.rs` defines a `DatasetConfig` struct +with the dataset's `tile_degrees`, `max_radius_meters`, and the discrete +`levels` array. The BSOSE handler binds `BSOSE_CONFIG` directly; adding +a new dataset means defining its config there and wiring its handler +through the same `tile_generator` / `filter_composer` machinery. diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index c736d14..37c3472 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -1,13 +1,11 @@ //! Per-dataset configuration governing request-size limits. //! -//! This module is the seam where pagination decisions hang off the dataset -//! identity. Future steps will consult `tile_degrees` to generate spatial -//! pagination tiles, and `max_radius_meters` to reject oversize `center + -//! radius` queries (which go through MongoDB `$near` / `$geoNear` and aren't -//! paginated). -//! -//! Step 1 (current): introduce the type and a BSOSE-specific instance. No -//! behaviour change yet — the handler binds the config but does not act on it. +//! This is the seam where pagination decisions hang off the dataset +//! identity. `tile_degrees` drives spatial tile generation; `levels` +//! defines the discrete depth pages within each spatial tile; +//! `max_radius_meters` caps `center + radius` queries (which go through +//! MongoDB `$near` and aren't paginated, so the cap is the only thing +//! preventing a runaway disk-of-most-of-the-globe). /// Per-dataset request-size policy. /// diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs index 1da844a..5d539d4 100644 --- a/api/src/helpers/tile_generator.rs +++ b/api/src/helpers/tile_generator.rs @@ -17,14 +17,16 @@ //! time. This means a 10°×10° tile gives the BSOSE-class upper bound of //! ~1600 docs even when the user's query covers only a sliver of it. //! -//! Step 3 scope: this is a pure function. 
It does not look up the database, -//! it does not skip empty tiles, and it does not compose with the existing -//! Mongo filter. Those are Steps 4–6. +//! This module is a pure function over its inputs. It doesn't touch +//! MongoDB, doesn't decide which tiles are non-empty, and doesn't compose +//! its output with the user filter. `filter_composer` does the BSON +//! composition, and the handler in `main.rs` drives the probe-forward +//! walk that skips empty tiles. //! //! Known limitation: polygons that cross the antimeridian produce a naive -//! bbox that spans most of the globe (min_lon ≈ -180, max_lon ≈ +180). The -//! existing filter code doesn't handle antimeridian polygons either, so we -//! match that behaviour for now and leave a proper fix for later. +//! bbox spanning most of the globe (min_lon ≈ -180, max_lon ≈ +180), +//! which produces an excessive tile sequence. The existing user-polygon +//! filter has the same issue, so we match its behaviour for now. use serde_json::Value; @@ -76,8 +78,8 @@ pub fn generate_tiles(params: &Value, config: &DatasetConfig) -> Vec { } // center + radius: level-only pagination. The `$near` query is bounded - // by `max_radius_meters` (enforced in a later step), so we don't tile - // it spatially. + // by `max_radius_meters` (enforced in helpers::validate_radius_cap), + // so we don't tile it spatially. if params.get("center").is_some() { return level_only_tiles(config); } diff --git a/api/tests/integration.rs b/api/tests/integration.rs index eeabe58..330f60b 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -14,11 +14,12 @@ // API_URL=http://localhost:8080 MONGODB_URI=mongodb://localhost:27017 \ // cargo test --test integration -- --test-threads=1 // -// The response shape is the paginated envelope introduced in Step 6: +// Every response is the paginated envelope: // { "docs": [...], "next_url": "" | null, "message": "..." 
} // Multi-tile queries (anything spanning multiple grid cells or vertical // levels) span multiple pages — use `get_paged` to follow `next_url` and -// accumulate docs across pages. +// accumulate docs across pages. See api/PAGINATION.md for the full +// contract. mod common; From 2a5d94c22f3b05856bfca4c0b4b7b24d130f7f14 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 12 May 2026 16:44:12 -0400 Subject: [PATCH 19/25] dateline polygons --- api/PAGINATION.md | 12 +- api/src/helpers/tile_generator.rs | 221 +++++++++++++++++++++++++++--- api/tests/integration.rs | 27 ++++ 3 files changed, 234 insertions(+), 26 deletions(-) diff --git a/api/PAGINATION.md b/api/PAGINATION.md index 9d9a8be..c82c8cd 100644 --- a/api/PAGINATION.md +++ b/api/PAGINATION.md @@ -106,11 +106,13 @@ antimeridian / north-pole docs aren't lost. ## Known limitations -- **Antimeridian polygons.** A polygon whose vertices straddle the - antimeridian (some near `lon=+180`, others near `lon=-180`) computes - a naive bounding box spanning most of the globe and generates a huge - tile sequence. Mongo's `$geoWithin` doesn't handle these cleanly - either — for now, treat as user input bug. +- **Polygons spanning more than a hemisphere or with multiple + antimeridian crossings.** Single-crossing antimeridian polygons are + detected and split into east + west sub-bboxes (no globe-spanning + over-tile). Polygons with two or more antimeridian crossings, or + polygons covering more than half the sphere, may over-tile — the + result is still correct (Mongo's `$geoWithin` does the actual + polygon intersection), just slower than ideal. - **Grid-aligned user box NE corner.** A user-supplied box whose NE corner sits exactly on a tile grid line (e.g. 
`box=[[20,10],[40,30]]`) will lose docs at that exact NE corner, diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs index 5d539d4..f1e143f 100644 --- a/api/src/helpers/tile_generator.rs +++ b/api/src/helpers/tile_generator.rs @@ -23,10 +23,12 @@ //! composition, and the handler in `main.rs` drives the probe-forward //! walk that skips empty tiles. //! -//! Known limitation: polygons that cross the antimeridian produce a naive -//! bbox spanning most of the globe (min_lon ≈ -180, max_lon ≈ +180), -//! which produces an excessive tile sequence. The existing user-polygon -//! filter has the same issue, so we match its behaviour for now. +//! Antimeridian-crossing polygons (any edge spanning > 180° in longitude) +//! are detected and split into an east sub-bbox and a west sub-bbox so we +//! don't over-tile a thin strip across the dateline. Polygons with +//! multiple antimeridian crossings or those spanning more than a +//! hemisphere may still over-tile — the unwrapping is intentionally +//! simple-minded and assumes the polygon has at most one crossing. use serde_json::Value; @@ -166,20 +168,49 @@ fn polygon_tiles(polygon: &str, config: &DatasetConfig) -> Vec { Err(_) => return Vec::new(), }; - let mut min_lon = f64::INFINITY; - let mut max_lon = f64::NEG_INFINITY; + let mut spatial = Vec::new(); + for bbox in polygon_bboxes(&coords) { + spatial.extend(grid_aligned_tiles(bbox, config.tile_degrees)); + } + cross_levels(spatial, config) +} + +/// Compute the bounding box(es) covering a polygon's vertices, handling +/// antimeridian-crossing polygons by emitting two sub-bboxes (one east of +/// the dateline, one west) instead of one bbox that naively spans most of +/// the globe. +/// +/// Detection: any edge of the polygon with `|lon_diff| > 180°` must be the +/// short way around the sphere — i.e. it crosses the antimeridian. (An +/// edge of exactly 180° is ambiguous and treated as non-crossing.) 
+/// +/// Splitting: when crossing is detected, longitudes are *unwrapped* by +/// adding 360° to negative values, putting everything in `[0°, 360°)`. +/// The polygon then has a contiguous lon range `[min, max]` in that +/// space. If `max > 180°` the polygon crosses, and we split into: +/// - east bbox: `[min, 180°]` +/// - west bbox: `[-180°, max - 360°]` +/// Both sub-bboxes share the same lat range (computed across all vertices). +/// +/// This assumes at most one antimeridian crossing per polygon. Polygons +/// with multiple crossings or those covering more than a hemisphere will +/// over-tile but stay correct (Mongo's `$geoWithin` does the actual +/// polygon intersection per tile). +fn polygon_bboxes(coords: &[Vec]) -> Vec { + if coords.is_empty() { + return Vec::new(); + } + + let crosses_antimeridian = coords.windows(2).any(|w| { + w[0].len() >= 2 && w[1].len() >= 2 && (w[0][0] - w[1][0]).abs() > 180.0 + }); + let mut min_lat = f64::INFINITY; let mut max_lat = f64::NEG_INFINITY; - for pt in &coords { + for pt in coords { if pt.len() < 2 { continue; } - if pt[0] < min_lon { - min_lon = pt[0]; - } - if pt[0] > max_lon { - max_lon = pt[0]; - } if pt[1] < min_lat { min_lat = pt[1]; } @@ -187,20 +218,76 @@ fn polygon_tiles(polygon: &str, config: &DatasetConfig) -> Vec { max_lat = pt[1]; } } - if !min_lon.is_finite() || !max_lat.is_finite() { + if !min_lat.is_finite() { return Vec::new(); } - cross_levels( - grid_aligned_tiles( + if !crosses_antimeridian { + let mut min_lon = f64::INFINITY; + let mut max_lon = f64::NEG_INFINITY; + for pt in coords { + if pt.len() < 2 { + continue; + } + if pt[0] < min_lon { + min_lon = pt[0]; + } + if pt[0] > max_lon { + max_lon = pt[0]; + } + } + if !min_lon.is_finite() { + return Vec::new(); + } + return vec![BoundingBox { + sw: [min_lon, min_lat], + ne: [max_lon, max_lat], + }]; + } + + // Unwrapped longitudes: shift negatives into [180°, 360°] so the + // polygon becomes contiguous in lon space. 
+ let mut min_lon = f64::INFINITY; + let mut max_lon = f64::NEG_INFINITY; + for pt in coords { + if pt.len() < 2 { + continue; + } + let lon = if pt[0] < 0.0 { pt[0] + 360.0 } else { pt[0] }; + if lon < min_lon { + min_lon = lon; + } + if lon > max_lon { + max_lon = lon; + } + } + if !min_lon.is_finite() { + return Vec::new(); + } + + if max_lon > 180.0 && min_lon < 180.0 { + // Genuine antimeridian crossing — split. + vec![ BoundingBox { sw: [min_lon, min_lat], - ne: [max_lon, max_lat], + ne: [180.0, max_lat], }, - config.tile_degrees, - ), - config, - ) + BoundingBox { + sw: [-180.0, min_lat], + ne: [max_lon - 360.0, max_lat], + }, + ] + } else { + // Detection fired but the unwrapped range doesn't straddle 180°. + // With in-range input this needs every vertex on one side to sit + // exactly on lon = ±180 (e.g. lon = -180, all others positive). + // Fall back to the unwrapped bbox un-split. NOTE(review): if the + // lone side is lon = +180, this bbox has lons above 180° — confirm. + vec![BoundingBox { + sw: [min_lon, min_lat], + ne: [max_lon, max_lat], + }] + } } /// Tile a bbox into grid-aligned cells of side `tile_degrees`. The grid is @@ -488,6 +575,98 @@ mod tests { assert!(tiles.is_empty()); } + #[test] + fn polygon_crossing_antimeridian_splits_into_two_sub_bboxes() { + // Polygon straddles the antimeridian: vertices at 170°E and 170°W, + // i.e. the polygon is a thin strip across the dateline. Naive + // bbox would give [-170, 0]→[170, 10] (340° wide); we want two + // bboxes covering [170, 0]→[180, 10] and [-180, 0]→[-170, 10]. + let tiles = generate_tiles( + &json!({"polygon": "[[170,0],[-170,0],[-170,10],[170,10],[170,0]]"}), + &TEST_CONFIG, + ); + + // 2 spatial sub-bboxes × 1 spatial tile each × 2 levels = 4 specs. + // (Each sub-bbox is exactly 10°×10°, so one tile.) 
+ assert_eq!(tiles.len(), 2 * TEST_CONFIG.levels.len()); + + let bboxes: Vec<_> = tiles.iter().map(|t| t.tile_bbox.clone()).collect(); + assert!(bboxes.contains(&Some(BoundingBox { + sw: [170.0, 0.0], + ne: [180.0, 10.0], + }))); + assert!(bboxes.contains(&Some(BoundingBox { + sw: [-180.0, 0.0], + ne: [-170.0, 10.0], + }))); + } + + #[test] + fn polygon_entirely_east_of_dateline_is_not_split() { + // All vertices positive, no edge spans > 180°. Single bbox. + let tiles = generate_tiles( + &json!({"polygon": "[[10,0],[20,0],[20,10],[10,10],[10,0]]"}), + &TEST_CONFIG, + ); + assert_eq!(tiles.len(), TEST_CONFIG.levels.len()); + assert_eq!( + tiles[0].tile_bbox, + Some(BoundingBox { + sw: [10.0, 0.0], + ne: [20.0, 10.0], + }) + ); + } + + #[test] + fn polygon_entirely_west_of_dateline_is_not_split() { + // All negatives, no edge spans > 180°. Single bbox. + let tiles = generate_tiles( + &json!({"polygon": "[[-20,0],[-10,0],[-10,10],[-20,10],[-20,0]]"}), + &TEST_CONFIG, + ); + assert_eq!(tiles.len(), TEST_CONFIG.levels.len()); + assert_eq!( + tiles[0].tile_bbox, + Some(BoundingBox { + sw: [-20.0, 0.0], + ne: [-10.0, 10.0], + }) + ); + } + + #[test] + fn polygon_edge_at_exactly_180_lon_diff_is_treated_as_non_crossing() { + // Edge from (90, 0) to (-90, 0) has |lon_diff| = 180 exactly, + // which is ambiguous (the polygon could go the short way around + // either hemisphere). We treat exact-180 as non-crossing, which + // gives a 180°-wide bbox centred on the prime meridian; Mongo's + // $geoWithin will pick its own interpretation. + let tiles = generate_tiles( + &json!({"polygon": "[[90,0],[-90,0],[-90,10],[90,10],[90,0]]"}), + &TEST_CONFIG, + ); + // One bbox spanning [-90, 0]→[90, 10] = 18 lon cells × 1 lat cell (10° tiles). + let distinct_bboxes: Vec<_> = { + let mut v: Vec> = Vec::new(); + for t in &tiles { + if !v.iter().any(|b| b == &t.tile_bbox) { + v.push(t.tile_bbox.clone()); + } + } + v + }; + // Some number of distinct tiles, but no SW negative-to-positive split. 
+ // We just assert we didn't accidentally split (would produce a + // BoundingBox with sw=[-180, _]). + assert!( + !distinct_bboxes.iter().any(|b| { + b.as_ref().map(|bb| bb.sw[0] == -180.0).unwrap_or(false) + }), + "exact-180° edge shouldn't have triggered an antimeridian split" + ); + } + // ---- defensive: dataset with no levels ---------------------------------- #[test] diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 330f60b..65b3899 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -270,6 +270,33 @@ async fn box_crossing_dateline_finds_antimeridian_docs() { assert!(!ids.contains(&"bsose_doc_004")); } +#[tokio::test] +async fn polygon_crossing_antimeridian_finds_seeded_doc() { + // Polygon straddles the dateline: vertices at (170, 45), (-160, 45), + // (-160, 55), (170, 55). The tile generator should detect the + // antimeridian crossing and emit east + west sub-bboxes covering the + // narrow band rather than the naive 330°-wide bbox. + // + // doc_003 at (-170, 50) sits inside the western piece. The other + // seeded docs are far from this band and should be excluded. + let docs = get_paged( + "/timeseries/bsose", + &[ + ("polygon", "[[170,45],[-160,45],[-160,55],[170,55],[170,45]]"), + ("data", "all"), + ], + ) + .await; + let ids: Vec<&str> = docs.iter().map(|r| r["_id"].as_str().unwrap()).collect(); + assert_eq!( + docs.len(), + 1, + "expected exactly doc_003 in antimeridian polygon, got {:?}", + ids + ); + assert!(ids.contains(&"bsose_doc_003"), "ids: {:?}", ids); +} + #[tokio::test] async fn center_radius_filter_matches_nearby_points_across_pages() { // 100 km radius around (20, 10) — at the BSOSE radius cap. 
From 466a54ea96df45c916d7804ba7aefc670a51623c Mon Sep 17 00:00:00 2001 From: katieannemills Date: Wed, 13 May 2026 14:27:34 -0400 Subject: [PATCH 20/25] actual bsose levels --- api/src/helpers/dataset_config.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 37c3472..2fd913e 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -37,11 +37,7 @@ pub struct DatasetConfig { /// near-surface dense, deep-ocean coarse, ~5500 m bottom) is representative /// of typical Southern Ocean gridded products so the rest of the pagination /// machinery sees plausible input. -pub const BSOSE_LEVELS: &[f64] = &[ - 5.0, 15.0, 25.0, 40.0, 60.0, 85.0, 120.0, 165.0, 220.0, 290.0, - 380.0, 490.0, 625.0, 790.0, 990.0, 1230.0, 1520.0, 1870.0, - 2290.0, 2790.0, 3380.0, 4070.0, 4870.0, 5575.0, -]; +pub const BSOSE_LEVELS: &[f64] = &[2.1, 6.7, 12.15, 18.55, 26.25, 35.25, 45.0, 55.0, 65.0, 75.0, 85.0, 95.0, 105.0, 115.0, 125.0, 135.0, 146.5, 161.5, 180.0, 200.0, 220.0, 240.0, 260.0, 280.0, 301.0, 327.0, 361.0, 402.5, 450.0, 500.0, 551.5, 614.0, 700.0, 800.0, 900.0, 1000.0, 1100.0, 1225.0, 1400.0, 1600.0, 1800.0, 2010.0, 2270.0, 2610.0, 3000.0, 3400.0, 3800.0, 4200.0, 4600.0, 5000.0, 5400.0, 5800.0]; /// Configuration for the BSOSE timeseries dataset. 
/// From 8ef3f3e88aa4ce7242cc71265d4b44ab65eadf7b Mon Sep 17 00:00:00 2001 From: katieannemills Date: Thu, 14 May 2026 23:34:18 -0400 Subject: [PATCH 21/25] better error logging and looser timeouts --- api/tests/integration.rs | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 65b3899..e36a57f 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -26,23 +26,38 @@ mod common; use common::url_with_query; use serde_json::Value; -/// Generous timeout: the naive plod-forward through empty tiles can take a -/// few seconds on the first page of a whole-globe request, even with the -/// tiny seeded corpus. We can tighten this once we have a land-mask shortcut. +/// Generous timeout: the naive plod-forward through empty tiles can take +/// most of a minute on the first page of a whole-globe request once the +/// dataset has many depth levels (BSOSE has ~52). We can tighten this +/// once probe-forward gets smarter than per-tile cursor opens. fn client() -> reqwest::Client { reqwest::Client::builder() - .timeout(std::time::Duration::from_secs(30)) + .timeout(std::time::Duration::from_secs(120)) .build() .expect("reqwest client should build") } +/// Render a reqwest error with its full source chain. `reqwest::Error`'s +/// Display impl drops the cause unless you use `{:#}` (alternate format), +/// which is easy to forget — without the chain, every transport-level +/// failure looks like a vague "error sending request for url (...)". 
+fn render_error(e: &dyn std::error::Error) -> String { + let mut msg = e.to_string(); + let mut current = e.source(); + while let Some(src) = current { + msg.push_str(&format!("\n caused by: {}", src)); + current = src.source(); + } + msg +} + async fn get(path: &str, params: &[(&str, &str)]) -> reqwest::Response { let url = url_with_query(path, params); client() .get(&url) .send() .await - .unwrap_or_else(|e| panic!("GET {} failed: {}", url, e)) + .unwrap_or_else(|e| panic!("GET {} failed:\n {}", url, render_error(&e))) } /// One paginated response: assert 200, parse as Value, return the body. @@ -64,7 +79,7 @@ async fn get_paged(path: &str, params: &[(&str, &str)]) -> Vec { .get(&url) .send() .await - .unwrap_or_else(|e| panic!("GET {} failed: {}", url, e)); + .unwrap_or_else(|e| panic!("GET {} failed:\n {}", url, render_error(&e))); assert_eq!( resp.status(), 200, From 1333bd036c261d1d3f7c07f08d5abdfe99237f22 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Sun, 17 May 2026 00:06:36 -0400 Subject: [PATCH 22/25] correction for dateline crosses --- api/src/helpers/dataset_config.rs | 2 +- api/src/helpers/tile_generator.rs | 96 +++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 2fd913e..a8f2a00 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -47,7 +47,7 @@ pub const BSOSE_LEVELS: &[f64] = &[2.1, 6.7, 12.15, 18.55, 26.25, 35.25, 45.0, 5 /// thousands of docs out of `$near` (which isn't spatially tiled). 100 km /// is a conservative starting point — easy to bump up if users complain. 
pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig { - tile_degrees: 10.0, + tile_degrees: 5.0, max_radius_meters: 100_000.0, // 100 km — bump if users complain levels: BSOSE_LEVELS, }; diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs index f1e143f..dcd7f46 100644 --- a/api/src/helpers/tile_generator.rs +++ b/api/src/helpers/tile_generator.rs @@ -33,6 +33,7 @@ use serde_json::Value; use super::dataset_config::DatasetConfig; +use super::helpers::validlonlat; /// A longitude/latitude bounding box. `sw` is the south-west corner /// (min lon, min lat); `ne` is the north-east corner (max lon, max lat). @@ -133,6 +134,11 @@ fn box_tiles(boxregion: &str, config: &DatasetConfig) -> Vec { if parsed.len() != 2 || parsed[0].len() != 2 || parsed[1].len() != 2 { return Vec::new(); } + // Normalize coords into [-180, 180] / [-90, 90] so they match what + // filters::box_filter sends to Mongo. Without this, out-of-range + // inputs like lon=181 stay raw here and produce tile bboxes outside + // the valid coordinate range, which Mongo then rejects. + let parsed = validlonlat(parsed); let sw = [parsed[0][0], parsed[0][1]]; let ne = [parsed[1][0], parsed[1][1]]; @@ -167,6 +173,12 @@ fn polygon_tiles(polygon: &str, config: &DatasetConfig) -> Vec { Ok(v) => v, Err(_) => return Vec::new(), }; + // Same normalization filters::polygon_filter applies — keeps our tile + // bboxes aligned with the user filter Mongo actually evaluates, and + // ensures the antimeridian-detection edges below see wrapped values + // (e.g. lon=181 becomes -179, so an edge that straddles the dateline + // genuinely has |lon_diff| > 180°). 
+ let coords = validlonlat(coords); let mut spatial = Vec::new(); for bbox in polygon_bboxes(&coords) { @@ -635,6 +647,90 @@ mod tests { ); } + #[test] + fn polygon_with_out_of_range_longitude_gets_normalized_and_split() { + // The user's exact failure shape: a thin strip straddling the + // antimeridian, expressed with lon=181 instead of lon=-179. Before + // the validlonlat call, this looked like a non-crossing polygon + // with bbox [179, -60]→[181, -58], and tile generation walked off + // the right edge of the world. After normalization, lon=181 + // becomes lon=-179, the edge 179→-179 is detected as crossing, + // and we get a sane east/west split. + // + // 2° wide strip at 5° tile_degrees → one tile each side. + let tiles = generate_tiles( + &json!({"polygon": "[[179,-60],[181,-60],[181,-58],[179,-58],[179,-60]]"}), + &TEST_CONFIG, + ); + + // 2 spatial sub-bboxes × 1 spatial tile each × 2 levels = 4 specs. + assert_eq!(tiles.len(), 2 * TEST_CONFIG.levels.len()); + + let bboxes: Vec<_> = tiles.iter().map(|t| t.tile_bbox.clone()).collect(); + // East tile catches the 179..180 sliver. + assert!( + bboxes.contains(&Some(BoundingBox { + sw: [175.0, -60.0], + ne: [180.0, -55.0], + })), + "expected east tile [175,-60]→[180,-55] in {:?}", + bboxes + ); + // West tile catches the -180..-179 sliver. No tile should have + // lon outside [-180, 180] — that was the symptom. + assert!( + bboxes.contains(&Some(BoundingBox { + sw: [-180.0, -60.0], + ne: [-175.0, -55.0], + })), + "expected west tile [-180,-60]→[-175,-55] in {:?}", + bboxes + ); + // And nothing pathological in either direction. + for b in &bboxes { + if let Some(bb) = b { + assert!( + bb.sw[0] >= -180.0 && bb.ne[0] <= 180.0, + "tile bbox out of range: {:?}", + bb + ); + } + } + } + + #[test] + fn box_with_out_of_range_longitude_gets_normalized_and_split() { + // Same diagnosis for the box mode: a 2° wide strip across the + // antimeridian, expressed with lon=181. 
Before normalization, + // sw_lon=179 < ne_lon=181 looks like an ordinary non-crossing + // box. After normalization, lon=181 → -179, sw_lon=179 > -179, + // dateline split fires. + let tiles = generate_tiles( + &json!({"box": "[[179,-60],[181,-58]]"}), + &TEST_CONFIG, + ); + assert_eq!(tiles.len(), 2 * TEST_CONFIG.levels.len()); + + let bboxes: Vec<_> = tiles.iter().map(|t| t.tile_bbox.clone()).collect(); + assert!(bboxes.contains(&Some(BoundingBox { + sw: [175.0, -60.0], + ne: [180.0, -55.0], + }))); + assert!(bboxes.contains(&Some(BoundingBox { + sw: [-180.0, -60.0], + ne: [-175.0, -55.0], + }))); + for b in &bboxes { + if let Some(bb) = b { + assert!( + bb.sw[0] >= -180.0 && bb.ne[0] <= 180.0, + "tile bbox out of range: {:?}", + bb + ); + } + } + } + #[test] fn polygon_edge_at_exactly_180_lon_diff_is_treated_as_non_crossing() { // Edge from (90, 0) to (-90, 0) has |lon_diff| = 180 exactly, From a8d8f2e78cbf6b983754ca81d512a8abe27ff26c Mon Sep 17 00:00:00 2001 From: katieannemills Date: Sun, 17 May 2026 15:42:10 -0400 Subject: [PATCH 23/25] test cleanup --- api/src/helpers/tile_generator.rs | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs index dcd7f46..214f51f 100644 --- a/api/src/helpers/tile_generator.rs +++ b/api/src/helpers/tile_generator.rs @@ -657,17 +657,23 @@ mod tests { // becomes lon=-179, the edge 179→-179 is detected as crossing, // and we get a sane east/west split. // - // 2° wide strip at 5° tile_degrees → one tile each side. + // Uses BSOSE_CONFIG so the test reflects the deployment's actual + // grid alignment — the bbox assertions below assume the current + // tile_degrees (5°) and will fail with a useful "expected this + // bbox, got these instead" message if BSOSE_CONFIG changes. 
let tiles = generate_tiles( &json!({"polygon": "[[179,-60],[181,-60],[181,-58],[179,-58],[179,-60]]"}), - &TEST_CONFIG, + &crate::helpers::dataset_config::BSOSE_CONFIG, ); - // 2 spatial sub-bboxes × 1 spatial tile each × 2 levels = 4 specs. - assert_eq!(tiles.len(), 2 * TEST_CONFIG.levels.len()); + // 2 spatial sub-bboxes × 1 spatial tile each × N levels. + assert_eq!( + tiles.len(), + 2 * crate::helpers::dataset_config::BSOSE_CONFIG.levels.len() + ); let bboxes: Vec<_> = tiles.iter().map(|t| t.tile_bbox.clone()).collect(); - // East tile catches the 179..180 sliver. + // East tile catches the 179..180 sliver (5° grid cell [175,180]). assert!( bboxes.contains(&Some(BoundingBox { sw: [175.0, -60.0], @@ -676,8 +682,9 @@ mod tests { "expected east tile [175,-60]→[180,-55] in {:?}", bboxes ); - // West tile catches the -180..-179 sliver. No tile should have - // lon outside [-180, 180] — that was the symptom. + // West tile catches the -180..-179 sliver (5° cell [-180,-175]). + // No tile should have lon outside [-180, 180] — that was the + // pre-fix symptom. assert!( bboxes.contains(&Some(BoundingBox { sw: [-180.0, -60.0], @@ -707,9 +714,12 @@ mod tests { // dateline split fires. 
let tiles = generate_tiles( &json!({"box": "[[179,-60],[181,-58]]"}), - &TEST_CONFIG, + &crate::helpers::dataset_config::BSOSE_CONFIG, + ); + assert_eq!( + tiles.len(), + 2 * crate::helpers::dataset_config::BSOSE_CONFIG.levels.len() ); - assert_eq!(tiles.len(), 2 * TEST_CONFIG.levels.len()); let bboxes: Vec<_> = tiles.iter().map(|t| t.tile_bbox.clone()).collect(); assert!(bboxes.contains(&Some(BoundingBox { From 9bd9bb232119d28eaafe3115910219c9394553c7 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 13:26:49 -0400 Subject: [PATCH 24/25] bounding box for tile walk --- api/PAGINATION.md | 37 +++++--- api/fixtures/bsose.json | 8 +- api/src/helpers/dataset_config.rs | 56 ++++++----- api/src/helpers/filter_composer.rs | 3 +- api/src/helpers/helpers.rs | 1 + api/src/helpers/mod.rs | 3 + api/src/helpers/tile_generator.rs | 147 ++++++++++++++++++++++++----- api/tests/integration.rs | 71 +++++++------- 8 files changed, 224 insertions(+), 102 deletions(-) diff --git a/api/PAGINATION.md b/api/PAGINATION.md index c82c8cd..071a1a2 100644 --- a/api/PAGINATION.md +++ b/api/PAGINATION.md @@ -27,13 +27,15 @@ Server-side, each request's spatial parameters define a sequence of depth level. Tiles are ordered *spatial outer, level inner*: all levels for one (lon, lat) cell come out before moving to the next cell. -Tile size is per-dataset: +Tile size and extent are per-dataset: -- Spatial extent is `DatasetConfig::tile_degrees` (10° for BSOSE). -- Depth pages are the dataset's discrete `levels` (24 brackets for BSOSE). - -For BSOSE that's up to 1600 docs per (tile × level) page, with the actual -count clipped by land, the user's filter, and the dataset's coverage. +- Spatial extent is `DatasetConfig::tile_degrees` (5° for BSOSE). +- Depth pages are the dataset's discrete `levels` (52 brackets for BSOSE). +- The tile sequence is clipped to `DatasetConfig::coverage_bbox`, an + optional rectangle that tells the generator where the dataset has + data. 
For BSOSE that's `[-180,-90]→[180,-30]` (south of 30°S); for + datasets without an a-priori coverage bound, it can be `None` and + the generator walks the whole globe. Each HTTP request serves at most **one** non-empty tile. The server **probes forward** from the requested `tile_index`, opening a small @@ -43,10 +45,14 @@ that yields output (or runs out of tiles). `next_url` carries the one we just emitted. When the server runs out of tiles, `next_url` is `null`. -This is naive plod-forward — there's no land-mask shortcut yet, so -whole-globe requests do walk a lot of empty tiles server-side. Clients -don't see that work; they only get one HTTP response per non-empty -tile. +The coverage bbox is the cheap way to keep probe-forward sane: tiles +that fall entirely outside the coverage are never probed at all, so +e.g. a BSOSE whole-globe walk doesn't have to confirm that the entire +Northern Hemisphere is empty before terminating. Probe-forward is still +linear in the number of *candidate* tiles after coverage filtering, so +sparse datasets within their coverage area can still incur empty +probes — a denser secondary mask (e.g. land/ocean per cell) would help +here but isn't implemented. ## Tile membership @@ -126,7 +132,10 @@ antimeridian / north-pole docs aren't lost. ## Per-dataset configuration `api/src/helpers/dataset_config.rs` defines a `DatasetConfig` struct -with the dataset's `tile_degrees`, `max_radius_meters`, and the discrete -`levels` array. The BSOSE handler binds `BSOSE_CONFIG` directly; adding -a new dataset means defining its config there and wiring its handler -through the same `tile_generator` / `filter_composer` machinery. +with the dataset's `tile_degrees`, `max_radius_meters`, the discrete +`levels` array, and an optional `coverage_bbox`. The BSOSE handler +binds `BSOSE_CONFIG` directly; adding a new dataset means defining its +config there and wiring its handler through the same `tile_generator` / +`filter_composer` machinery. 
`coverage_bbox: None` for a new dataset +gives global-walk semantics; setting it to a bounding rectangle tells +the tile generator to skip everything outside the rectangle. diff --git a/api/fixtures/bsose.json b/api/fixtures/bsose.json index e1199d4..77dd0f6 100644 --- a/api/fixtures/bsose.json +++ b/api/fixtures/bsose.json @@ -3,7 +3,7 @@ "_id": "bsose_doc_001", "metadata": ["bsose-profile-meta-2020"], "basin": 1.0, - "geolocation": { "type": "Point", "coordinates": [20.0, 10.0] }, + "geolocation": { "type": "Point", "coordinates": [20.0, -50.0] }, "level": 10.0, "cell_vertical_fraction": 1.0, "sea_binary_mask_at_t_locaiton": true, @@ -23,7 +23,7 @@ "_id": "bsose_doc_002", "metadata": ["bsose-profile-meta-2020"], "basin": 1.0, - "geolocation": { "type": "Point", "coordinates": [40.0, 30.0] }, + "geolocation": { "type": "Point", "coordinates": [40.0, -40.0] }, "level": 10.0, "cell_vertical_fraction": 1.0, "sea_binary_mask_at_t_locaiton": true, @@ -43,7 +43,7 @@ "_id": "bsose_doc_003", "metadata": ["bsose-profile-meta-2020"], "basin": 2.0, - "geolocation": { "type": "Point", "coordinates": [-170.0, 50.0] }, + "geolocation": { "type": "Point", "coordinates": [-170.0, -55.0] }, "level": 20.0, "cell_vertical_fraction": 1.0, "sea_binary_mask_at_t_locaiton": true, @@ -63,7 +63,7 @@ "_id": "bsose_doc_004", "metadata": ["bsose-profile-meta-2020"], "basin": 1.0, - "geolocation": { "type": "Point", "coordinates": [20.0, 10.0] }, + "geolocation": { "type": "Point", "coordinates": [20.0, -50.0] }, "level": 50.0, "cell_vertical_fraction": 1.0, "sea_binary_mask_at_t_locaiton": true, diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index a8f2a00..909ce1b 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -5,51 +5,63 @@ //! defines the discrete depth pages within each spatial tile; //! `max_radius_meters` caps `center + radius` queries (which go through //! 
MongoDB `$near` and aren't paginated, so the cap is the only thing -//! preventing a runaway disk-of-most-of-the-globe). +//! preventing a runaway disk-of-most-of-the-globe); `coverage_bbox` +//! tells the tile generator the lat/lon rectangle the dataset's data +//! actually lives inside, so we skip probing tiles outside it. + +use super::geometry::BoundingBox; /// Per-dataset request-size policy. /// /// `tile_degrees`: edge length (degrees of longitude and latitude) of one -/// spatial pagination tile. For grid-uniform datasets, choose this so that -/// one (tile × single level) page contains at most ~1600 documents. For -/// BSOSE (1/4° grid) that means 10° tiles. +/// spatial pagination tile. /// -/// `max_radius_meters`: hard upper bound on the `radius` query parameter for -/// `center + radius` requests. These bypass tile pagination because Mongo's -/// `$near` enforces its own bound; we cap the bound so a malicious or naive -/// caller can't ask for a half-globe disk. +/// `max_radius_meters`: hard upper bound on the `radius` query parameter +/// for `center + radius` requests. These bypass tile pagination because +/// Mongo's `$near` enforces its own bound; we cap the bound so a +/// malicious or naive caller can't ask for a half-globe disk. /// /// `levels`: the discrete vertical levels the dataset is sampled at, in /// strictly increasing order (shallowest first). Pagination treats each /// level as a separate page within a spatial tile. Datasets without a -/// vertical dimension can pass a single-element slice (effectively a single -/// "level" per tile). +/// vertical dimension can pass a single-element slice (effectively a +/// single "level" per tile). +/// +/// `coverage_bbox`: optional rectangle the dataset's data is known to +/// live inside. The tile generator drops any spatial tile that doesn't +/// overlap this rectangle, so probe-forward never has to walk through +/// regions that *can't* contain data. 
`None` means "no a-priori bound" +/// — tile generation falls back to walking the whole globe. The +/// rectangle is treated as inclusive on its edges; a doc lying exactly +/// on the coverage boundary is preserved. pub struct DatasetConfig { pub tile_degrees: f64, pub max_radius_meters: f64, pub levels: &'static [f64], + pub coverage_bbox: Option, } -/// Placeholder BSOSE level spectrum. -/// -/// These are *not* the real BSOSE levels — Katie will overwrite them with -/// the actual depths once we have them in hand. The shape (roughly: -/// near-surface dense, deep-ocean coarse, ~5500 m bottom) is representative -/// of typical Southern Ocean gridded products so the rest of the pagination -/// machinery sees plausible input. +/// BSOSE's 52 vertical levels, in metres (positive-downward), shallowest +/// first. From the dataset's published grid; should be updated if BSOSE +/// re-releases with a different vertical discretisation. pub const BSOSE_LEVELS: &[f64] = &[2.1, 6.7, 12.15, 18.55, 26.25, 35.25, 45.0, 55.0, 65.0, 75.0, 85.0, 95.0, 105.0, 115.0, 125.0, 135.0, 146.5, 161.5, 180.0, 200.0, 220.0, 240.0, 260.0, 280.0, 301.0, 327.0, 361.0, 402.5, 450.0, 500.0, 551.5, 614.0, 700.0, 800.0, 900.0, 1000.0, 1100.0, 1225.0, 1400.0, 1600.0, 1800.0, 2010.0, 2270.0, 2610.0, 3000.0, 3400.0, 3800.0, 4200.0, 4600.0, 5000.0, 5400.0, 5800.0]; /// Configuration for the BSOSE timeseries dataset. /// -/// 10° tiles × 4 grid cells/degree = 40 × 40 = 1600 cells per (tile, level). -/// `max_radius_meters` is intentionally tight: BSOSE at 1/4° resolution -/// produces ~16 docs per 25 km × 25 km cell, so even a small disk pulls -/// thousands of docs out of `$near` (which isn't spatially tiled). 100 km -/// is a conservative starting point — easy to bump up if users complain. +/// 5° tiles × 12 grid cells/degree = 60 × 60 = 3600 cells per (tile, +/// level), most less due to land/coastlines. 
`max_radius_meters` is +/// intentionally tight: BSOSE produces many docs even in a small disk +/// since `$near` isn't spatially tiled. `coverage_bbox` reflects that +/// BSOSE only has data south of 30°S — no point in probing northern +/// tiles that will never contain anything. pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig { tile_degrees: 5.0, max_radius_meters: 100_000.0, // 100 km — bump if users complain levels: BSOSE_LEVELS, + coverage_bbox: Some(BoundingBox { + sw: [-180.0, -90.0], + ne: [180.0, -30.0], + }), }; #[cfg(test)] diff --git a/api/src/helpers/filter_composer.rs b/api/src/helpers/filter_composer.rs index 670b2fe..834d7aa 100644 --- a/api/src/helpers/filter_composer.rs +++ b/api/src/helpers/filter_composer.rs @@ -160,7 +160,7 @@ fn combine_user_and_tile(user: Document, tile: Document) -> Document { #[cfg(test)] mod tests { use super::*; - use crate::helpers::tile_generator::BoundingBox; + use crate::helpers::geometry::BoundingBox; use serde_json::json; /// Two-level dataset config: keeps test math simple and exercises both @@ -169,6 +169,7 @@ mod tests { tile_degrees: 10.0, max_radius_meters: 1.0e6, levels: &[100.0, 500.0], + coverage_bbox: None, }; fn null_tile() -> TileSpec { diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs index 6bcc924..862d65f 100644 --- a/api/src/helpers/helpers.rs +++ b/api/src/helpers/helpers.rs @@ -347,6 +347,7 @@ mod tests { tile_degrees: 10.0, max_radius_meters: 1_000_000.0, // 1000 km levels: &[0.0], + coverage_bbox: None, }; #[test] diff --git a/api/src/helpers/mod.rs b/api/src/helpers/mod.rs index c473eb4..37ecd15 100644 --- a/api/src/helpers/mod.rs +++ b/api/src/helpers/mod.rs @@ -10,6 +10,9 @@ pub use schema::*; pub mod filters; pub use filters::*; +pub mod geometry; +pub use geometry::*; + pub mod dataset_config; pub use dataset_config::*; diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs index 214f51f..7997130 100644 --- 
a/api/src/helpers/tile_generator.rs +++ b/api/src/helpers/tile_generator.rs @@ -33,21 +33,9 @@ use serde_json::Value; use super::dataset_config::DatasetConfig; +use super::geometry::BoundingBox; use super::helpers::validlonlat; -/// A longitude/latitude bounding box. `sw` is the south-west corner -/// (min lon, min lat); `ne` is the north-east corner (max lon, max lat). -/// The box is *half-open* in both dimensions: `[sw_lon, ne_lon) × [sw_lat, -/// ne_lat)`. Documents on the south or west edge belong to the tile; -/// documents on the north or east edge belong to the next tile over. This -/// matters at tile boundaries for grid-aligned datasets like BSOSE — see -/// the box construction in `grid_aligned_tiles`. -#[derive(Debug, Clone, PartialEq)] -pub struct BoundingBox { - pub sw: [f64; 2], - pub ne: [f64; 2], -} - /// One unit of pagination. Both fields are `Option` because some query /// shapes naturally suppress one or the other: /// @@ -114,16 +102,14 @@ fn level_only_tiles(config: &DatasetConfig) -> Vec { } fn whole_globe_tiles(config: &DatasetConfig) -> Vec { - cross_levels( - grid_aligned_tiles( - BoundingBox { - sw: [-180.0, -90.0], - ne: [180.0, 90.0], - }, - config.tile_degrees, - ), - config, - ) + let spatial = grid_aligned_tiles( + BoundingBox { + sw: [-180.0, -90.0], + ne: [180.0, 90.0], + }, + config.tile_degrees, + ); + cross_levels(apply_coverage(spatial, config), config) } fn box_tiles(boxregion: &str, config: &DatasetConfig) -> Vec { @@ -165,7 +151,7 @@ fn box_tiles(boxregion: &str, config: &DatasetConfig) -> Vec { for bbox in sub_boxes { tiles.extend(grid_aligned_tiles(bbox, config.tile_degrees)); } - cross_levels(tiles, config) + cross_levels(apply_coverage(tiles, config), config) } fn polygon_tiles(polygon: &str, config: &DatasetConfig) -> Vec { @@ -184,7 +170,20 @@ fn polygon_tiles(polygon: &str, config: &DatasetConfig) -> Vec { for bbox in polygon_bboxes(&coords) { spatial.extend(grid_aligned_tiles(bbox, config.tile_degrees)); } - 
cross_levels(spatial, config) + cross_levels(apply_coverage(spatial, config), config) +} + +/// Drop tiles that don't overlap the dataset's known coverage region. +/// `config.coverage_bbox = None` means "no a-priori bound" — every tile +/// passes through. With a coverage set, this is where pagination saves +/// the most work: we never even probe regions that *can't* contain data +/// (latitudes north of BSOSE's domain, far-from-the-mooring tiles for +/// some hypothetical regional dataset, etc.). +fn apply_coverage(spatial: Vec, config: &DatasetConfig) -> Vec { + match &config.coverage_bbox { + None => spatial, + Some(coverage) => spatial.into_iter().filter(|t| t.overlaps(coverage)).collect(), + } } /// Compute the bounding box(es) covering a polygon's vertices, handling @@ -368,6 +367,7 @@ mod tests { tile_degrees: 10.0, max_radius_meters: 1.0e6, levels: &[0.0, 100.0], + coverage_bbox: None, }; // ---- top-level dispatch -------------------------------------------------- @@ -647,6 +647,102 @@ mod tests { ); } + // ---- coverage_bbox filtering -------------------------------------------- + + /// Test config restricted to a southern band, matching BSOSE's shape. + const COVERAGE_TEST_CONFIG: DatasetConfig = DatasetConfig { + tile_degrees: 10.0, + max_radius_meters: 1.0e6, + levels: &[0.0, 100.0], + coverage_bbox: Some(BoundingBox { + sw: [-180.0, -90.0], + ne: [180.0, -30.0], + }), + }; + + #[test] + fn coverage_bbox_drops_tiles_entirely_outside() { + // Whole-globe walk against a southern-band coverage. Every emitted + // tile must overlap the coverage region — no tiles north of -30°. + let tiles = generate_tiles(&json!({}), &COVERAGE_TEST_CONFIG); + assert!(!tiles.is_empty(), "coverage band should still produce tiles"); + for t in &tiles { + let bb = t.tile_bbox.as_ref().expect("whole-globe → bbox tiles"); + // Tile must have at least some range at lat ≤ -30°. 
+ assert!( + bb.sw[1] <= -30.0, + "tile {:?} should be entirely south of -30° (sw_lat <= -30)", + bb + ); + } + // Expected count: 7 lat rows × 36 lon cols × 2 levels = 504 specs. + // Six rows have sw_lat in {-90,-80,-70,-60,-50,-40} and lie inside + // the coverage band [-90,-30]. The seventh row (sw_lat=-30) only + // touches the band's northern edge (sw_lat == ne_lat of cov) but is + // still kept by the permissive overlap test. + assert_eq!(tiles.len(), 7 * 36 * COVERAGE_TEST_CONFIG.levels.len()); + } + + #[test] + fn coverage_bbox_keeps_tile_at_coverage_boundary() { + // The southernmost "non-covered" tile is the one whose sw_lat + // equals the coverage's ne_lat. Our permissive overlap test keeps + // it so that data sitting exactly on the boundary lat doesn't + // fall in a gap. + let tiles = generate_tiles(&json!({}), &COVERAGE_TEST_CONFIG); + let bboxes: Vec<_> = tiles.iter().filter_map(|t| t.tile_bbox.clone()).collect(); + // Boundary tile starts at lat=-30, runs to lat=-20. Should be + // present in the sequence. + let boundary_present = bboxes.iter().any(|b| b.sw[1] == -30.0); + assert!( + boundary_present, + "tile whose SW touches the coverage NE should be kept; got {:?}", + bboxes + ); + } + + #[test] + fn coverage_bbox_none_preserves_global_walk() { + // Sanity: the TEST_CONFIG (coverage_bbox: None) still produces the + // full whole-globe walk asserted elsewhere (648 spatial tiles × 2 levels). + let tiles = generate_tiles(&json!({}), &TEST_CONFIG); + assert_eq!(tiles.len(), 18 * 36 * TEST_CONFIG.levels.len()); + } + + #[test] + fn polygon_entirely_outside_coverage_produces_zero_tiles() { + // Polygon over the Sahara — well north of BSOSE's coverage. + // Every candidate spatial tile is dropped by the coverage filter, + // so generate_tiles returns an empty sequence. The handler will + // fall through to its empty-response path.
+ let tiles = generate_tiles( + &json!({"polygon": "[[0,15],[20,15],[20,25],[0,25],[0,15]]"}), + &COVERAGE_TEST_CONFIG, + ); + assert!( + tiles.is_empty(), + "polygon outside coverage should produce no tiles, got {:?}", + tiles + ); + } + + #[test] + fn box_partially_outside_coverage_keeps_only_overlapping_tiles() { + // Box straddling the coverage boundary: lat range [-40, -20]. + // The southern half (lat in [-40,-30]) is inside coverage; the + // northern half (lat in [-30,-20]) only touches it at lat=-30. + // No tile starting north of -30° may survive. + let tiles = generate_tiles( + &json!({"box": "[[10,-40],[20,-20]]"}), + &COVERAGE_TEST_CONFIG, + ); + assert!(!tiles.is_empty()); + for t in &tiles { + let bb = t.tile_bbox.as_ref().unwrap(); + assert!(bb.sw[1] <= -30.0, "tile {:?} should be at or south of -30°", bb); + } + } + #[test] fn polygon_with_out_of_range_longitude_gets_normalized_and_split() { // The user's exact failure shape: a thin strip straddling the @@ -784,6 +880,7 @@ mod tests { tile_degrees: 10.0, max_radius_meters: 1.0e6, levels: &[], + coverage_bbox: None, }; let tiles = generate_tiles( &json!({"box": "[[0,0],[10,10]]"}), diff --git a/api/tests/integration.rs b/api/tests/integration.rs index e36a57f..d8e8f3d 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -221,13 +221,13 @@ async fn vertical_range_filters_by_level_across_pages() { #[tokio::test] async fn box_filter_matches_seeded_points_across_pages() { - // Box covers (lon 15..45, lat 5..35) — should hit docs at (20,10) and - // (40,30), which is doc_001, doc_002, doc_004 (doc_001 and doc_004 - // share coords but different levels — they land in different + // Box covers (lon 15..45, lat -55..-35) — should hit docs at (20,-50) + // and (40,-40), which is doc_001, doc_002, doc_004 (doc_001 and + // doc_004 share coords but different levels — they land in different // level-brackets, so they show up on different pages.
let docs = get_paged( "/timeseries/bsose", - &[("box", "[[15,5],[45,35]]"), ("data", "all")], + &[("box", "[[15,-55],[45,-35]]"), ("data", "all")], ) .await; let ids: Vec<&str> = docs.iter().map(|r| r["_id"].as_str().unwrap()).collect(); @@ -239,16 +239,15 @@ async fn box_filter_matches_seeded_points_across_pages() { #[tokio::test] async fn polygon_filter_matches_seeded_points_across_pages() { - // Polygon around (20, 10) — small square enclosing doc_001 / doc_004. - // The polygon's bbox spans four spatial tiles ([10-20, 0-10], - // [20-30, 0-10], [10-20, 10-20], [20-30, 10-20]) — multi-tile case. - // doc_001 (level 10 → L0) and doc_004 (level 50 → L3) share the same - // spatial tile but land in different level pages, so we expect - // exactly 2 docs across 2 non-empty pages. + // Polygon around (20, -50) — small square enclosing doc_001 / doc_004. + // The polygon's bbox spans multiple spatial tiles (multi-tile case). + // doc_001 (level 10) and doc_004 (level 50) share the same spatial + // tile but land in different level pages, so we expect exactly 2 docs + // across 2 non-empty pages. let docs = get_paged( "/timeseries/bsose", &[ - ("polygon", "[[15,5],[25,5],[25,15],[15,15],[15,5]]"), + ("polygon", "[[15,-55],[25,-55],[25,-45],[15,-45],[15,-55]]"), ("data", "all"), ], ) @@ -265,11 +264,11 @@ async fn box_crossing_dateline_finds_antimeridian_docs() { // Dateline-crossing box: sw_lon (170) > ne_lon (-160), so the box // wraps the antimeridian. Tile generation splits it into an eastern // sub-box (170..180) and a western sub-box (-180..-160). doc_003 at - // (-170, 50) lives in the western band; the other seeded docs are + // (-170, -55) lives in the western band; the other seeded docs are // far from this box and should be excluded. 
let docs = get_paged( "/timeseries/bsose", - &[("box", "[[170,40],[-160,60]]"), ("data", "all")], + &[("box", "[[170,-60],[-160,-40]]"), ("data", "all")], ) .await; let ids: Vec<&str> = docs.iter().map(|r| r["_id"].as_str().unwrap()).collect(); @@ -287,17 +286,17 @@ async fn box_crossing_dateline_finds_antimeridian_docs() { #[tokio::test] async fn polygon_crossing_antimeridian_finds_seeded_doc() { - // Polygon straddles the dateline: vertices at (170, 45), (-160, 45), - // (-160, 55), (170, 55). The tile generator should detect the + // Polygon straddles the dateline: vertices at (170, -60), (-160, -60), + // (-160, -50), (170, -50). The tile generator should detect the // antimeridian crossing and emit east + west sub-bboxes covering the // narrow band rather than the naive 330°-wide bbox. // - // doc_003 at (-170, 50) sits inside the western piece. The other + // doc_003 at (-170, -55) sits inside the western piece. The other // seeded docs are far from this band and should be excluded. let docs = get_paged( "/timeseries/bsose", &[ - ("polygon", "[[170,45],[-160,45],[-160,55],[170,55],[170,45]]"), + ("polygon", "[[170,-60],[-160,-60],[-160,-50],[170,-50],[170,-60]]"), ("data", "all"), ], ) @@ -314,7 +313,7 @@ async fn polygon_crossing_antimeridian_finds_seeded_doc() { #[tokio::test] async fn center_radius_filter_matches_nearby_points_across_pages() { - // 100 km radius around (20, 10) — at the BSOSE radius cap. + // 100 km radius around (20, -50) — at the BSOSE radius cap. // center+radius gets level-only pagination (no spatial tiling), so // we still need to walk pages to hit each level bracket that // contains data. 
doc_001 / doc_004 sit exactly at the center so any @@ -323,7 +322,7 @@ async fn center_radius_filter_matches_nearby_points_across_pages() { let docs = get_paged( "/timeseries/bsose", &[ - ("center", "[20.0, 10.0]"), + ("center", "[20.0, -50.0]"), ("radius", "100000"), // 100 km — at the cap ("data", "all"), ], @@ -500,14 +499,14 @@ async fn batchmeta_returns_metadata_documents_across_pages() { #[tokio::test] async fn polygon_over_empty_region_returns_empty_envelope() { - // Polygon in the Indian Ocean (60-70°E, 5-15°N) — far from any - // seeded doc. Probe-forward should walk every candidate tile, find - // none non-empty, and return a 200 envelope with an empty docs array - // and null next_url instead of 404 or any other error code. + // Polygon in the south Indian Ocean (80-90°E, -55..-45°) — inside + // BSOSE's coverage region but far from any seeded doc. Probe-forward + // walks the candidate tiles, finds none non-empty, and returns a + // 200 envelope with empty docs and null next_url instead of 404. let body = get_envelope( "/timeseries/bsose", &[ - ("polygon", "[[60,5],[70,5],[70,15],[60,15],[60,5]]"), + ("polygon", "[[80,-55],[90,-55],[90,-45],[80,-45],[80,-55]]"), ("data", "all"), ], ) @@ -523,13 +522,13 @@ async fn polygon_over_empty_region_returns_empty_envelope() { #[tokio::test] async fn tile_index_beyond_end_returns_empty_with_null_next_url() { - // Tile sequence for a small box is short; an absurdly large tile_index - // is past the end. The server should return 200 + empty docs + - // null next_url, not an error. + // Tile sequence for a small box inside BSOSE coverage is short; an + // absurdly large tile_index is past the end. The server should + // return 200 + empty docs + null next_url, not an error. 
let body = get_envelope( "/timeseries/bsose", &[ - ("box", "[[0,0],[10,10]]"), + ("box", "[[0,-40],[10,-30]]"), ("tile_index", "9999999"), ], ) @@ -543,7 +542,7 @@ async fn tile_index_beyond_end_returns_empty_with_null_next_url() { async fn invalid_tile_index_returns_400() { let resp = get( "/timeseries/bsose", - &[("box", "[[0,0],[10,10]]"), ("tile_index", "not-a-number")], + &[("box", "[[0,-40],[10,-30]]"), ("tile_index", "not-a-number")], ) .await; assert_eq!(resp.status(), 400); @@ -553,7 +552,7 @@ async fn invalid_tile_index_returns_400() { async fn negative_tile_index_returns_400() { let resp = get( "/timeseries/bsose", - &[("box", "[[0,0],[10,10]]"), ("tile_index", "-1")], + &[("box", "[[0,-40],[10,-30]]"), ("tile_index", "-1")], ) .await; assert_eq!(resp.status(), 400); @@ -583,7 +582,7 @@ async fn next_url_round_trips_cleanly() { // handler — catches percent-encoding bugs, param dropping, etc. let body = get_envelope( "/timeseries/bsose", - &[("box", "[[15,5],[45,35]]"), ("data", "all")], + &[("box", "[[15,-55],[45,-35]]"), ("data", "all")], ) .await; let next = body["next_url"] @@ -620,13 +619,13 @@ async fn next_url_round_trips_cleanly() { #[tokio::test] async fn first_page_carries_a_next_url_when_more_pages_remain() { - // The (20,10)/(40,30) box has docs at multiple level brackets, so the - // first page should not be the last. next_url must carry both the - // user's params (so the next request hits the same filter) and an - // advanced tile_index. + // The (20,-50)/(40,-40) box has docs at multiple level brackets, so + // the first page should not be the last. next_url must carry both + // the user's params (so the next request hits the same filter) and + // an advanced tile_index. 
let body = get_envelope( "/timeseries/bsose", - &[("box", "[[15,5],[45,35]]"), ("data", "all")], + &[("box", "[[15,-55],[45,-35]]"), ("data", "all")], ) .await; assert!( From 686126b8bf2f978487bd984f088f0f86fa2720a3 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 13:58:46 -0400 Subject: [PATCH 25/25] dont forget --- api/src/helpers/geometry.rs | 94 +++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 api/src/helpers/geometry.rs diff --git a/api/src/helpers/geometry.rs b/api/src/helpers/geometry.rs new file mode 100644 index 0000000..793ac42 --- /dev/null +++ b/api/src/helpers/geometry.rs @@ -0,0 +1,94 @@ +//! Shared spatial primitives used across the tile generator, the filter +//! composer, and the dataset config. +//! +//! Kept in its own module so that `dataset_config` can express coverage +//! regions without depending on `tile_generator`, and vice versa — both +//! reach for `BoundingBox` for unrelated reasons (one to describe a tile, +//! one to describe where a dataset has data), and neither should be a +//! parent of the other in the module graph. + +/// A longitude/latitude bounding box. `sw` is the south-west corner +/// (min lon, min lat); `ne` is the north-east corner (max lon, max lat). +/// +/// Tile bboxes use this type as half-open intervals: `[sw, ne)` on both +/// axes (the half-open behaviour is enforced by `filter_composer` shrinking +/// the NE corner of the underlying GeoJSON polygon, not by this type). +/// +/// Dataset coverage bboxes use this type as inclusive intervals: a doc at +/// lat == cov.ne[1] is considered "covered" so it isn't dropped at the +/// dataset's edge. +#[derive(Debug, Clone, PartialEq)] +pub struct BoundingBox { + pub sw: [f64; 2], + pub ne: [f64; 2], +} + +impl BoundingBox { + /// Permissive overlap test using closed-interval semantics on both + /// boxes. Returns true if the two bboxes share any region, including + /// touching only at an edge or a corner. 
+ /// + /// Closed semantics are deliberately chosen for the + /// tile-vs-coverage-bbox use case: a tile whose SW edge sits exactly + /// on the coverage's NE edge gets *kept*, so a grid-aligned dataset + /// whose data extends to (and includes) the coverage boundary doesn't + /// lose its edge cells. The cost is one extra tile worth of probing + /// at the boundary, which is negligible. + pub fn overlaps(&self, other: &BoundingBox) -> bool { + self.sw[0] <= other.ne[0] + && self.ne[0] >= other.sw[0] + && self.sw[1] <= other.ne[1] + && self.ne[1] >= other.sw[1] + } +} + +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + fn bb(sw: [f64; 2], ne: [f64; 2]) -> BoundingBox { + BoundingBox { sw, ne } + } + + #[test] + fn overlaps_disjoint_returns_false() { + let a = bb([0.0, 0.0], [10.0, 10.0]); + let b = bb([20.0, 20.0], [30.0, 30.0]); + assert!(!a.overlaps(&b)); + assert!(!b.overlaps(&a)); + } + + #[test] + fn overlaps_fully_contained_returns_true() { + let outer = bb([0.0, 0.0], [100.0, 100.0]); + let inner = bb([10.0, 10.0], [20.0, 20.0]); + assert!(outer.overlaps(&inner)); + assert!(inner.overlaps(&outer)); + } + + #[test] + fn overlaps_partial_returns_true() { + let a = bb([0.0, 0.0], [10.0, 10.0]); + let b = bb([5.0, 5.0], [15.0, 15.0]); + assert!(a.overlaps(&b)); + } + + #[test] + fn overlaps_touching_at_edge_returns_true() { + // The whole point of the permissive ≤/≥ test: bboxes that share + // exactly an edge count as overlapping. Used by the coverage + // filter so tiles at the coverage boundary aren't dropped. + let a = bb([0.0, 0.0], [10.0, 10.0]); + let b = bb([10.0, 0.0], [20.0, 10.0]); + assert!(a.overlaps(&b)); + } + + #[test] + fn overlaps_touching_at_corner_returns_true() { + let a = bb([0.0, 0.0], [10.0, 10.0]); + let b = bb([10.0, 10.0], [20.0, 20.0]); + assert!(a.overlaps(&b)); + } +}