From c81c28da5a1e481c146fd6fbcec462c7d1daddb4 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 18:34:49 -0400 Subject: [PATCH 01/13] refactoring to accommodate arbitrary timeseries grids --- api/src/helpers/dataset_config.rs | 52 ++++++++ api/src/helpers/schema.rs | 48 ++++++- api/src/helpers/transforms.rs | 95 +++++++++++++- api/src/main.rs | 207 +++++++++++++++++++++++------- 4 files changed, 347 insertions(+), 55 deletions(-) diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 909ce1b..702d95b 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -8,8 +8,19 @@ //! preventing a runaway disk-of-most-of-the-globe); `coverage_bbox` //! tells the tile generator the lat/lon rectangle the dataset's data //! actually lives inside, so we skip probing tiles outside it. +//! +//! `DatasetSource` is the sibling struct that names *where* each dataset +//! lives in Mongo (db, collection, meta collection, meta discriminator) +//! and caches startup-loaded metadata (the timeseries axis, the meta +//! default `data_info`). One static `*_SOURCE` per dataset; the +//! generic handler in `main.rs` consumes `(&DatasetConfig, &DatasetSource)` +//! plus the dataset's schema generic to serve a request. + +use mongodb::bson::DateTime as BsonDateTime; +use once_cell::sync::OnceCell; use super::geometry::BoundingBox; +use super::schema::DataInfo; /// Per-dataset request-size policy. /// @@ -41,6 +52,47 @@ pub struct DatasetConfig { pub coverage_bbox: Option, } +/// Per-dataset *identity*: where the dataset lives in Mongo, and a place +/// to stash the values we read once at startup so requests don't have to. +/// +/// `db_name` / `collection` name the data collection the handler queries. +/// `meta_collection` is the per-dataset (or shared) metadata collection; +/// `meta_data_type` is the value of the `data_type` discriminator that +/// picks this dataset's meta doc out of the meta collection. +/// +/// `timeseries` and `data_info` are populated once at server startup by +/// the generic loader in `main.rs` and read on every request. They're +/// behind `OnceCell` rather than `Mutex>` because they're +/// write-once: the loader fills them in `main()` and nothing else ever +/// writes again, so handlers can read without locking. +/// +/// `data_info` is the *meta-level default* for the dataset. Per the +/// precedence rule documented on `transforms::transform_timeseries`, a +/// data doc that carries its own `data_info` wins over this default; if +/// the doc's `data_info` is empty, the cached default is stamped on +/// before column filtering runs. Datasets that put `data_info` on every +/// data doc (e.g. BSOSE today) leave this cache as the empty tuple — it +/// gets populated but never consulted. +pub struct DatasetSource { + pub db_name: &'static str, + pub collection: &'static str, + pub meta_collection: &'static str, + pub meta_data_type: &'static str, + pub timeseries: OnceCell>, + pub data_info: OnceCell, +} + +/// Mongo identity for the BSOSE timeseries dataset. The OnceCells are +/// filled at startup by `main.rs::populate_dataset_caches`. +pub static BSOSE_SOURCE: DatasetSource = DatasetSource { + db_name: "argo", + collection: "bsose", + meta_collection: "timeseriesMeta", + meta_data_type: "BSOSE-profile", + timeseries: OnceCell::new(), + data_info: OnceCell::new(), +}; + /// BSOSE's 52 vertical levels, in metres (positive-downward), shallowest /// first. From the dataset's published grid; should be updated if BSOSE /// re-releases with a different vertical discretisation. diff --git a/api/src/helpers/schema.rs b/api/src/helpers/schema.rs index a37ce5b..a8cabfb 100644 --- a/api/src/helpers/schema.rs +++ b/api/src/helpers/schema.rs @@ -17,6 +17,15 @@ pub struct SourceMeta { pub(crate) iter: String, } +// type aliases /////////////////////////////////////////////////////////////// + +/// Per-variable descriptor carried alongside a timeseries doc: +/// `(variable_names, info_fields, per_variable_info)`. The shape is +/// preserved as a tuple for backward-compatible JSON serialization (the +/// public response format encodes it as a 3-tuple), but giving it a name +/// makes function signatures and the per-dataset cache easier to read. +pub type DataInfo = (Vec, Vec, Vec>); + // categroical traits ///////////////////////////////////////////////////////// pub trait IsTimeseries { @@ -25,8 +34,8 @@ pub trait IsTimeseries { fn set_data(&mut self, data: Vec>); fn timeseries(&mut self) -> Option<&mut Vec>; fn set_timeseries(&mut self, timeseries: Vec); - fn data_info(&mut self) -> (Vec, Vec, Vec>); - fn set_data_info(&mut self, data_info: (Vec, Vec, Vec>)); + fn data_info(&mut self) -> DataInfo; + fn set_data_info(&mut self, data_info: DataInfo); fn _id(&self) -> String; fn longitude(&self) -> f64; fn latitude(&self) -> f64; @@ -36,6 +45,16 @@ pub trait IsTimeseries { pub trait IsTimeseriesMeta { fn get_timeseries_meta(&self) -> bool; + /// Snapshot of the dataset's timestamp axis. Read once at startup and + /// cached on `DatasetSource::timeseries` so request handling doesn't + /// re-fetch it per request. + fn timeseries(&self) -> Vec; + /// Per-dataset `data_info` default. Stamped onto a data doc by + /// `transform_timeseries` only when that doc carries no `data_info` of + /// its own (the precedence rule: doc-level wins over meta-level). May + /// be the empty tuple — datasets that store `data_info` on every data + /// doc (e.g. BSOSE today) leave the meta-level value blank. + fn data_info(&self) -> DataInfo; } // bsose ////////////////////////////////////////////////////////////////////// @@ -56,7 +75,7 @@ pub struct BsoseSchema { pub(crate) data: Vec>, // Not present in the source collection — gets populated by transforms. pub(crate) timeseries: Option>, - pub(crate) data_info: (Vec, Vec, Vec>), + pub(crate) data_info: DataInfo, } impl IsTimeseries for BsoseSchema { @@ -80,11 +99,11 @@ impl IsTimeseries for BsoseSchema { self.timeseries = Some(timeseries); } - fn data_info(&mut self) -> (Vec, Vec, Vec>) { + fn data_info(&mut self) -> DataInfo { self.data_info.clone() } - fn set_data_info(&mut self, data_info: (Vec, Vec, Vec>)) { + fn set_data_info(&mut self, data_info: DataInfo) { self.data_info = data_info; } @@ -114,8 +133,8 @@ pub struct BsoseMeta { pub(crate) _id: String, pub(crate) data_type: String, pub(crate) date_updated_argovis: BsonDateTime, - // `timeseries` is read from main.rs at startup to populate the cached - // TIMESERIES global, so it stays fully `pub`. + // `timeseries` and `data_info` are read at startup to populate the + // per-dataset cache on `DatasetSource`, so both stay fully `pub`. pub timeseries: Vec, pub(crate) source: Vec, pub(crate) cell_area: f64, @@ -123,12 +142,27 @@ pub struct BsoseMeta { pub(crate) depth_r0_to_bottom: f64, pub(crate) interior_2d_mask: bool, pub(crate) depth_r0_to_ref_surface: f64, + // `data_info` may or may not be present on the BSOSE meta doc (today + // it lives only on the data docs themselves). `#[serde(default)]` + // makes deserialization tolerate either case: when absent the cache + // becomes the empty sentinel and per-doc values keep taking + // precedence, exactly the current behaviour. + #[serde(default)] + pub data_info: DataInfo, } impl IsTimeseriesMeta for BsoseMeta { fn get_timeseries_meta(&self) -> bool { return true; } + + fn timeseries(&self) -> Vec { + self.timeseries.clone() + } + + fn data_info(&self) -> DataInfo { + self.data_info.clone() + } } #[derive(Deserialize, Debug, Clone)] diff --git a/api/src/helpers/transforms.rs b/api/src/helpers/transforms.rs index 2449344..adbda37 100644 --- a/api/src/helpers/transforms.rs +++ b/api/src/helpers/transforms.rs @@ -5,9 +5,27 @@ use mongodb::bson::DateTime as BsonDateTime; /// Apply the user's `startDate` / `endDate` / `data` parameters to a single /// timeseries document. Returns `None` if the document was filtered out /// entirely (e.g. `data=somefield` produced no matching columns). +/// +/// `cached_data_info` is the *meta-level default* `data_info` for this +/// dataset — read once at startup from the meta doc and stashed on the +/// dataset's `DatasetSource`. Precedence rule for `data_info`: +/// +/// - If the data doc carries its own non-empty `data_info` (the BSOSE +/// case today), it wins; the cached default is ignored. +/// - If the data doc has no `data_info` (the OI SST case — single +/// variable, info kept on the meta doc), the cached default is +/// stamped onto the doc before `slice_data` runs. +/// - If both are empty, `slice_data` will return `None` for any +/// specific-variable request (no columns to match) and an +/// empty-data passthrough for `data=` / `data=all` — same as today. +/// +/// The cache passes through as `&DataInfo` rather than `Option`: an empty +/// tuple is its own "no default" sentinel, matching the convention +/// `slice_data` already uses for "no fields". pub fn transform_timeseries( params: &serde_json::Value, ts: &[BsonDateTime], + cached_data_info: &schema::DataInfo, mut doc: T, ) -> Option { let start_date = params.get("startDate") @@ -26,6 +44,15 @@ pub fn transform_timeseries( if start_date.is_some() || end_date.is_some() { slice_timerange(start_date, end_date, ts, &mut doc); } + + // Doc-level data_info takes precedence; only fall back to the cached + // meta-level default when the doc itself carries no variable names. + // `data_info.0` is the variable-names vector, so its emptiness is the + // canonical "no data_info" check. + if doc.data_info().0.is_empty() && !cached_data_info.0.is_empty() { + doc.set_data_info(cached_data_info.clone()); + } + slice_data(&data, doc) } @@ -293,10 +320,76 @@ mod tests { "data": "salinity", }); - let mut out = transform_timeseries(¶ms, ×eries, doc).unwrap(); + // Empty cached_data_info — this doc already carries its own + // data_info, so the precedence rule means the cache is never + // consulted regardless. + let empty_cache: schema::DataInfo = (vec![], vec![], vec![]); + let mut out = transform_timeseries(¶ms, ×eries, &empty_cache, doc).unwrap(); assert_eq!(*out.data(), vec![vec![20.0, 30.0]]); } + // ---- transform_timeseries: data_info precedence ------------------------- + + #[test] + fn transform_uses_cached_data_info_when_doc_has_none() { + // OI SST-style: the data doc carries no data_info; the meta-level + // cache supplies the variable names so slice_data can resolve + // `data=sst`. + let timeseries = ts(&[1, 2]); + let mut doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0]], + &["sst"], // gets cleared below to simulate a docless data_info + ); + // Clear the doc's data_info so the cache fallback kicks in. + doc.data_info = (vec![], vec![], vec![]); + + let cache: schema::DataInfo = ( + vec!["sst".to_string()], + vec!["units".to_string(), "long_name".to_string()], + vec![vec!["degC".to_string(), "SST".to_string()]], + ); + + let params = json!({"data": "sst"}); + let mut out = + transform_timeseries(¶ms, ×eries, &cache, doc).expect("should resolve"); + // sst column survives. + assert_eq!(*out.data(), vec![vec![1.0, 2.0]]); + // data_info has been stamped from the cache, then column-filtered + // by slice_data — should still list sst. + let info = out.data_info(); + assert_eq!(info.0, vec!["sst".to_string()]); + } + + #[test] + fn transform_doc_data_info_wins_over_cache() { + // BSOSE-style: the doc has data_info and the cache also has + // something (hypothetically). The doc's value should take + // precedence — the cache should not overwrite it. + let timeseries = ts(&[1, 2]); + let doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0], vec![3.0, 4.0]], + &["temp", "salinity"], + ); + + // Cache says "sst" — but doc has temp/salinity. Doc wins; the + // request for `data=temp` should resolve against the doc, not + // get clobbered by the cache. + let cache: schema::DataInfo = ( + vec!["sst".to_string()], + vec!["units".to_string()], + vec![vec!["degC".to_string()]], + ); + + let params = json!({"data": "temp"}); + let mut out = + transform_timeseries(¶ms, ×eries, &cache, doc).expect("should resolve"); + assert_eq!(*out.data(), vec![vec![1.0, 2.0]]); + let info = out.data_info(); + assert_eq!(info.0, vec!["temp".to_string()]); + } + // ---- timeseries_stub ----------------------------------------------------- #[test] diff --git a/api/src/main.rs b/api/src/main.rs index 0255f4d..e20f37e 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -28,25 +28,68 @@ use std::sync::Mutex; use futures::stream::StreamExt; use std::env; use std::convert::Infallible; +use serde::Serialize; use serde::de::DeserializeOwned; -use mongodb::bson::DateTime; use std::collections::HashSet; use async_stream::stream; use serde_json::{json, Value}; +use dataset_config::{DatasetConfig, DatasetSource}; + static CLIENT: Lazy>> = Lazy::new(|| Mutex::new(None)); -static TIMESERIES: Lazy>>> = Lazy::new(|| Mutex::new(None)); + +// ---- route handlers -------------------------------------------------------- +// +// Each dataset gets its own one-route handler that resolves the dataset +// generic and hands off to `serve_timeseries`. Keeps Actix's `#[get(...)]` +// attribute discoverable per-dataset and lets the routing table grow +// without touching the generic core. Adding a new dataset is: define its +// `*_CONFIG` + `*_SOURCE` in `dataset_config.rs`, define its schema + +// meta in `schema.rs`, then add a 4-line handler here. #[get("/timeseries/bsose")] -async fn search_data_schema( +async fn bsose_handler( req: HttpRequest, query_params: web::Query, ) -> impl Responder { - let params = query_params.into_inner(); - - // Dataset-specific request-size policy: tile size, level set, radius cap. - let config = &dataset_config::BSOSE_CONFIG; + serve_timeseries::( + req, + query_params.into_inner(), + &dataset_config::BSOSE_CONFIG, + &dataset_config::BSOSE_SOURCE, + ) + .await +} +// ---- generic timeseries handler -------------------------------------------- + +/// Generic body of the `/timeseries/{dataset}` endpoint. Parameterized by +/// `S`, the per-dataset data-doc schema (e.g. `BsoseSchema`). The dataset's +/// request-size policy comes in as `config` (tile size, level set, radius +/// cap, coverage); its Mongo identity and startup caches come in as +/// `source` (db/collection names, the cached `timeseries` axis and the +/// cached meta-level `data_info` default). +/// +/// Behaviour is the same as the previous BSOSE-specific handler: validate +/// the query, generate the tile sequence, then probe-forward through +/// tiles serving at most one non-empty tile per request. The two branches +/// (streaming docs vs. `batchmeta` metadata lookup) are unchanged from +/// the original implementation. +async fn serve_timeseries( + req: HttpRequest, + params: serde_json::Value, + config: &DatasetConfig, + source: &'static DatasetSource, +) -> HttpResponse +where + S: schema::IsTimeseries + + DeserializeOwned + + Serialize + + Send + + Sync + + Unpin + + 'static, +{ // The next_url we emit on success uses this request's own path, so the // generated URL stays correct even if the route is re-mounted later. let path = req.path().to_string(); @@ -67,10 +110,19 @@ async fn search_data_schema( // ---- tile sequence + cached startup data -------------------------- let tiles = tile_generator::generate_tiles(¶ms, config); - let timeseries = { - let ts = TIMESERIES.lock().unwrap(); - ts.clone().unwrap() - }; + // Both caches are populated at startup by `populate_dataset_cache` + // before the server begins accepting connections. A handler hit + // before startup finished is a server bug, so we surface it. + let timeseries = source + .timeseries + .get() + .expect("dataset timeseries cache should be populated at startup") + .clone(); + let cached_data_info = source + .data_info + .get() + .expect("dataset data_info cache should be populated at startup") + .clone(); let compression: Option = params .get("compression") @@ -100,8 +152,11 @@ async fn search_data_schema( let filter = filter_composer::compose_filter_with_tile(params.clone(), tile, config); let options = FindOptions::builder().build(); - let mut cursor = match generate_cursor::( - "argo", "bsose", filter, Some(options), + let mut cursor = match generate_cursor::( + source.db_name, + source.collection, + filter, + Some(options), ) .await { @@ -118,11 +173,14 @@ async fn search_data_schema( while let Some(result) = cursor.next().await { match result { Ok(doc) => { - if let Some(t) = - transforms::transform_timeseries(¶ms, ×eries, doc) - { - for m in t.metadata.iter() { - unique_metadata.insert(m.clone()); + if let Some(t) = transforms::transform_timeseries( + ¶ms, + ×eries, + &cached_data_info, + doc, + ) { + for m in t.metadata().into_iter() { + unique_metadata.insert(m); } } } @@ -141,7 +199,10 @@ async fn search_data_schema( "_id": { "$in": unique_metadata.into_iter().collect::>() } }; let meta_cursor = match generate_cursor::( - "argo", "timeseriesMeta", meta_filter, None, + source.db_name, + source.meta_collection, + meta_filter, + None, ) .await { @@ -165,13 +226,16 @@ async fn search_data_schema( // Peek-ahead until we find a doc that survives transformation. // If none, advance to the next tile. If found, keep the cursor — // we'll continue draining it from inside the response body. - let mut first_doc: Option = None; + let mut first_doc: Option = None; while let Some(result) = cursor.next().await { match result { Ok(doc) => { - if let Some(t) = - transforms::transform_timeseries(¶ms, ×eries, doc) - { + if let Some(t) = transforms::transform_timeseries( + ¶ms, + ×eries, + &cached_data_info, + doc, + ) { first_doc = Some(t); break; } @@ -190,11 +254,13 @@ async fn search_data_schema( let next_url = next_url_value(&path, ¶ms, tile_idx, tiles.len()); let page_message = format!("page {}", tile_idx); - // Each of these gets moved into the stream! generator. params and - // timeseries are needed to transform each subsequent doc; the rest - // are emitted at the end of the response. + // Each of these gets moved into the stream! generator. params, + // timeseries, and the data_info cache are needed to transform + // each subsequent doc; the rest are emitted at the end of the + // response. let params_for_stream = params.clone(); let ts_for_stream = timeseries.clone(); + let data_info_for_stream = cached_data_info.clone(); let body = stream! { yield Ok::<_, Infallible>(web::Bytes::from_static(b"{\"docs\":[")); @@ -205,7 +271,7 @@ async fn search_data_schema( serde_json::to_vec(&stub).expect("serializing one stub should not fail") } else { serde_json::to_vec(&first_doc) - .expect("serializing one bsose doc should not fail") + .expect("serializing a timeseries doc should not fail") }; yield Ok(web::Bytes::from(first_bytes)); @@ -213,7 +279,10 @@ async fn search_data_schema( match result { Ok(doc) => { if let Some(t) = transforms::transform_timeseries( - ¶ms_for_stream, &ts_for_stream, doc, + ¶ms_for_stream, + &ts_for_stream, + &data_info_for_stream, + doc, ) { let bytes = if is_minimal { let stub = transforms::timeseries_stub(&t); @@ -221,7 +290,7 @@ async fn search_data_schema( .expect("serializing one stub should not fail") } else { serde_json::to_vec(&t) - .expect("serializing one bsose doc should not fail") + .expect("serializing a timeseries doc should not fail") }; yield Ok(web::Bytes::from_static(b",")); yield Ok(web::Bytes::from(bytes)); @@ -283,34 +352,78 @@ fn next_url_value( } } +/// Populate the startup caches on a `DatasetSource` from its meta doc. +/// +/// Reads the one meta doc selected by `{data_type: source.meta_data_type}` +/// from `{db_name}.{meta_collection}`, then sets the source's +/// `timeseries` and `data_info` OnceCells from it. A dataset whose meta +/// doc has no `data_info` field will land an empty tuple in the cache +/// (via `#[serde(default)]` on the meta struct) — the precedence rule in +/// `transform_timeseries` makes that the right default-suppressing value +/// for datasets that carry `data_info` per data doc. +/// +/// Panics if the meta doc can't be found or the OnceCell is already set. +/// Both indicate startup misconfiguration that should fail loudly rather +/// than serve stale or partial data. +async fn populate_dataset_cache(source: &DatasetSource) -> Result<()> +where + M: schema::IsTimeseriesMeta + DeserializeOwned + Unpin + Send + Sync, +{ + let filter = mongodb::bson::doc! {"data_type": source.meta_data_type}; + let options = FindOptions::builder().limit(1).build(); + let mut cursor = generate_cursor::( + source.db_name, + source.meta_collection, + filter, + Some(options), + ) + .await?; + + let meta = match cursor.next().await { + Some(Ok(m)) => m, + Some(Err(e)) => { + panic!( + "Error reading meta doc for {}.{} (data_type={}): {}", + source.db_name, source.meta_collection, source.meta_data_type, e + ); + } + None => panic!( + "No meta doc found in {}.{} matching data_type={}", + source.db_name, source.meta_collection, source.meta_data_type + ), + }; + + source + .timeseries + .set(meta.timeseries()) + .unwrap_or_else(|_| panic!("timeseries cache for data_type={} already set", source.meta_data_type)); + source + .data_info + .set(meta.data_info()) + .unwrap_or_else(|_| panic!("data_info cache for data_type={} already set", source.meta_data_type)); + + Ok(()) +} + #[actix_web::main] async fn main() -> std::io::Result<()> { // Initialize the MongoDB client let client_options = mongodb::options::ClientOptions::parse(env::var("MONGODB_URI").unwrap()).await.unwrap(); - let client = mongodb::Client::with_options(client_options).unwrap(); + let client = mongodb::Client::with_options(client_options).unwrap(); *CLIENT.lock().unwrap() = Some(client); - // some generic data useful to have on hand - let mut filter = mongodb::bson::doc! {"data_type": "BSOSE-profile"}; - let mut options = FindOptions::builder().limit(1).build(); - let mut metacursor = generate_cursor::("argo", "timeseriesMeta", filter, Some(options)).await.unwrap(); - let mut metadata = Vec::new(); - while let Some(result) = metacursor.next().await { - match result { - Ok(document) => { - metadata.push(document); - }, - Err(e) => { - eprintln!("Error: {}", e); - } - } - } - *TIMESERIES.lock().unwrap() = Some(metadata[0].timeseries.clone()); + // Populate per-dataset startup caches. Each dataset's meta-doc-read + // happens here, before the server starts accepting connections. + // Adding a new dataset is a one-line addition: a call to + // populate_dataset_cache::(&NEW_SOURCE). + populate_dataset_cache::(&dataset_config::BSOSE_SOURCE) + .await + .expect("failed to populate BSOSE dataset cache at startup"); HttpServer::new(|| { App::new() - .service(search_data_schema) + .service(bsose_handler) }) .bind(("0.0.0.0", 8080))? .run() From 0e22089a03537876ba1b890b9df14b420a8c4a9b Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 19:12:26 -0400 Subject: [PATCH 02/13] k.i.s.s. --- api/src/helpers/dataset_config.rs | 63 +++++++------ api/src/main.rs | 141 +++++++++++++++++------------- 2 files changed, 110 insertions(+), 94 deletions(-) diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 702d95b..06fac18 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -11,13 +11,16 @@ //! //! `DatasetSource` is the sibling struct that names *where* each dataset //! lives in Mongo (db, collection, meta collection, meta discriminator) -//! and caches startup-loaded metadata (the timeseries axis, the meta -//! default `data_info`). One static `*_SOURCE` per dataset; the -//! generic handler in `main.rs` consumes `(&DatasetConfig, &DatasetSource)` -//! plus the dataset's schema generic to serve a request. +//! and carries the startup-loaded metadata (the timeseries axis, the +//! meta default `data_info`). It can't be `const`/`static` directly +//! because the metadata fields are loaded from Mongo at runtime; it's +//! built once in `main()` via `load_dataset_source` and stashed in a +//! top-level `Lazy>>` (the same pattern the +//! existing Mongo `CLIENT` static uses, so we don't drag in new +//! initialization vocabulary). Handlers clone it out of the lock at the +//! top of each request, then read its fields as plain owned data. use mongodb::bson::DateTime as BsonDateTime; -use once_cell::sync::OnceCell; use super::geometry::BoundingBox; use super::schema::DataInfo; @@ -52,47 +55,41 @@ pub struct DatasetConfig { pub coverage_bbox: Option, } -/// Per-dataset *identity*: where the dataset lives in Mongo, and a place -/// to stash the values we read once at startup so requests don't have to. +/// Per-dataset identity + startup-loaded metadata, built once in `main()`. /// -/// `db_name` / `collection` name the data collection the handler queries. -/// `meta_collection` is the per-dataset (or shared) metadata collection; -/// `meta_data_type` is the value of the `data_type` discriminator that -/// picks this dataset's meta doc out of the meta collection. +/// The first four fields are the dataset's Mongo identity: +/// - `db_name` / `collection`: where the data docs live. +/// - `meta_collection`: where the metadata doc lives. May or may not be +/// shared across datasets; today everything is in `timeseriesMeta`, +/// disambiguated by `meta_data_type`. +/// - `meta_data_type`: the `data_type` discriminator that selects this +/// dataset's meta doc out of `meta_collection`. /// -/// `timeseries` and `data_info` are populated once at server startup by -/// the generic loader in `main.rs` and read on every request. They're -/// behind `OnceCell` rather than `Mutex>` because they're -/// write-once: the loader fills them in `main()` and nothing else ever -/// writes again, so handlers can read without locking. +/// The remaining two fields are values we load *once* at startup from the +/// meta doc and read on every request. Plain owned types — no cells. +/// +/// `Clone` is derived so handlers can copy a `DatasetSource` out of the +/// `Lazy>>` static and use it locally without holding +/// the mutex across `.await` points. The clone is small (a few KB of +/// dates plus a few short strings). /// /// `data_info` is the *meta-level default* for the dataset. Per the /// precedence rule documented on `transforms::transform_timeseries`, a /// data doc that carries its own `data_info` wins over this default; if -/// the doc's `data_info` is empty, the cached default is stamped on -/// before column filtering runs. Datasets that put `data_info` on every -/// data doc (e.g. BSOSE today) leave this cache as the empty tuple — it -/// gets populated but never consulted. +/// the doc's `data_info` is empty, the meta-level value is stamped onto +/// the doc before column filtering runs. Datasets that put `data_info` +/// on every data doc (e.g. BSOSE today) leave this as the empty tuple — +/// it's still populated, just never consulted. +#[derive(Clone)] pub struct DatasetSource { pub db_name: &'static str, pub collection: &'static str, pub meta_collection: &'static str, pub meta_data_type: &'static str, - pub timeseries: OnceCell>, - pub data_info: OnceCell, + pub timeseries: Vec, + pub data_info: DataInfo, } -/// Mongo identity for the BSOSE timeseries dataset. The OnceCells are -/// filled at startup by `main.rs::populate_dataset_caches`. -pub static BSOSE_SOURCE: DatasetSource = DatasetSource { - db_name: "argo", - collection: "bsose", - meta_collection: "timeseriesMeta", - meta_data_type: "BSOSE-profile", - timeseries: OnceCell::new(), - data_info: OnceCell::new(), -}; - /// BSOSE's 52 vertical levels, in metres (positive-downward), shallowest /// first. From the dataset's published grid; should be updated if BSOSE /// re-releases with a different vertical discretisation. diff --git a/api/src/main.rs b/api/src/main.rs index e20f37e..8ba46c5 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -38,25 +38,48 @@ use dataset_config::{DatasetConfig, DatasetSource}; static CLIENT: Lazy>> = Lazy::new(|| Mutex::new(None)); +// Per-dataset source-of-truth: identity strings + values loaded from the +// dataset's meta doc at startup. Same pattern as CLIENT above — the +// `Lazy>>` is what lets us declare a static that's +// initialized once, after main() reads the value out of Mongo. Handlers +// clone the inner DatasetSource out of the lock at the top of each +// request and use it locally, so the mutex is never held across `.await`. +// +// One static per dataset. Adding a new dataset is one more static here +// plus one more load_dataset_source/ load-and-set block in main(). +static BSOSE_SOURCE: Lazy>> = Lazy::new(|| Mutex::new(None)); + // ---- route handlers -------------------------------------------------------- // // Each dataset gets its own one-route handler that resolves the dataset // generic and hands off to `serve_timeseries`. Keeps Actix's `#[get(...)]` // attribute discoverable per-dataset and lets the routing table grow // without touching the generic core. Adding a new dataset is: define its -// `*_CONFIG` + `*_SOURCE` in `dataset_config.rs`, define its schema + -// meta in `schema.rs`, then add a 4-line handler here. +// `*_CONFIG` in `dataset_config.rs`, define its schema + meta in +// `schema.rs`, add a `*_SOURCE` static + a `load_dataset_source` call +// in `main()`, and add a 4-line handler here. #[get("/timeseries/bsose")] async fn bsose_handler( req: HttpRequest, query_params: web::Query, ) -> impl Responder { + // Snapshot the source out of the lock before any `.await`. Holding + // the mutex guard across an await would let other tasks on the same + // worker deadlock on it, so the idiom is "lock, clone, unlock, then + // do async work with the local copy." + let source = BSOSE_SOURCE + .lock() + .unwrap() + .as_ref() + .expect("BSOSE_SOURCE not initialized at startup") + .clone(); + serve_timeseries::( req, query_params.into_inner(), &dataset_config::BSOSE_CONFIG, - &dataset_config::BSOSE_SOURCE, + &source, ) .await } @@ -79,7 +102,7 @@ async fn serve_timeseries( req: HttpRequest, params: serde_json::Value, config: &DatasetConfig, - source: &'static DatasetSource, + source: &DatasetSource, ) -> HttpResponse where S: schema::IsTimeseries @@ -107,22 +130,16 @@ where Err(e) => return HttpResponse::BadRequest().json(json!({"error": e})), }; - // ---- tile sequence + cached startup data -------------------------- + // ---- tile sequence + startup-loaded data -------------------------- let tiles = tile_generator::generate_tiles(¶ms, config); - // Both caches are populated at startup by `populate_dataset_cache` - // before the server begins accepting connections. A handler hit - // before startup finished is a server bug, so we surface it. - let timeseries = source - .timeseries - .get() - .expect("dataset timeseries cache should be populated at startup") - .clone(); - let cached_data_info = source - .data_info - .get() - .expect("dataset data_info cache should be populated at startup") - .clone(); + // Clone the cached values out of `source` locally so the streaming + // branch can move owned copies into its async generator without + // juggling lifetimes — both are small (timeseries is a few KB of + // dates; data_info is a few strings), so cloning per request is + // cheap relative to the actual query work. + let timeseries = source.timeseries.clone(); + let cached_data_info = source.data_info.clone(); let compression: Option = params .get("compression") @@ -352,57 +369,52 @@ fn next_url_value( } } -/// Populate the startup caches on a `DatasetSource` from its meta doc. +/// Build a `DatasetSource` by reading the dataset's meta doc out of Mongo. /// -/// Reads the one meta doc selected by `{data_type: source.meta_data_type}` -/// from `{db_name}.{meta_collection}`, then sets the source's -/// `timeseries` and `data_info` OnceCells from it. A dataset whose meta -/// doc has no `data_info` field will land an empty tuple in the cache -/// (via `#[serde(default)]` on the meta struct) — the precedence rule in -/// `transform_timeseries` makes that the right default-suppressing value -/// for datasets that carry `data_info` per data doc. +/// Called once per dataset at server startup. The four `'static str` +/// arguments are the dataset's Mongo identity; the resulting struct +/// bundles those identity strings with the values we read out of the +/// meta doc (`timeseries`, `data_info`). `main()` then stashes the +/// returned struct into the dataset's `*_SOURCE` static for handlers +/// to read on each request. /// -/// Panics if the meta doc can't be found or the OnceCell is already set. -/// Both indicate startup misconfiguration that should fail loudly rather +/// Panics if the meta doc can't be found or the cursor errors. Both +/// indicate startup misconfiguration that should fail loudly rather /// than serve stale or partial data. -async fn populate_dataset_cache(source: &DatasetSource) -> Result<()> +async fn load_dataset_source( + db_name: &'static str, + collection: &'static str, + meta_collection: &'static str, + meta_data_type: &'static str, +) -> Result where M: schema::IsTimeseriesMeta + DeserializeOwned + Unpin + Send + Sync, { - let filter = mongodb::bson::doc! {"data_type": source.meta_data_type}; + let filter = mongodb::bson::doc! {"data_type": meta_data_type}; let options = FindOptions::builder().limit(1).build(); - let mut cursor = generate_cursor::( - source.db_name, - source.meta_collection, - filter, - Some(options), - ) - .await?; + let mut cursor = + generate_cursor::(db_name, meta_collection, filter, Some(options)).await?; let meta = match cursor.next().await { Some(Ok(m)) => m, - Some(Err(e)) => { - panic!( - "Error reading meta doc for {}.{} (data_type={}): {}", - source.db_name, source.meta_collection, source.meta_data_type, e - ); - } + Some(Err(e)) => panic!( + "Error reading meta doc for {}.{} (data_type={}): {}", + db_name, meta_collection, meta_data_type, e + ), None => panic!( "No meta doc found in {}.{} matching data_type={}", - source.db_name, source.meta_collection, source.meta_data_type + db_name, meta_collection, meta_data_type ), }; - source - .timeseries - .set(meta.timeseries()) - .unwrap_or_else(|_| panic!("timeseries cache for data_type={} already set", source.meta_data_type)); - source - .data_info - .set(meta.data_info()) - .unwrap_or_else(|_| panic!("data_info cache for data_type={} already set", source.meta_data_type)); - - Ok(()) + Ok(DatasetSource { + db_name, + collection, + meta_collection, + meta_data_type, + timeseries: meta.timeseries(), + data_info: meta.data_info(), + }) } #[actix_web::main] @@ -413,13 +425,20 @@ async fn main() -> std::io::Result<()> { let client = mongodb::Client::with_options(client_options).unwrap(); *CLIENT.lock().unwrap() = Some(client); - // Populate per-dataset startup caches. Each dataset's meta-doc-read - // happens here, before the server starts accepting connections. - // Adding a new dataset is a one-line addition: a call to - // populate_dataset_cache::(&NEW_SOURCE). - populate_dataset_cache::(&dataset_config::BSOSE_SOURCE) - .await - .expect("failed to populate BSOSE dataset cache at startup"); + // Load each dataset's source-of-truth and stash it in the static for + // handlers to read. The Mongo identity strings live here at the call + // site — one place per dataset — and the returned struct bundles + // them with the values read from the meta doc. Adding a new dataset + // is one more load-and-set block here. + let bsose = load_dataset_source::( + "argo", + "bsose", + "timeseriesMeta", + "BSOSE-profile", + ) + .await + .expect("failed to load BSOSE dataset source at startup"); + *BSOSE_SOURCE.lock().unwrap() = Some(bsose); HttpServer::new(|| { App::new() From 8399037939b1cf2320a558f9b0309525081dd85a Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 19:25:39 -0400 Subject: [PATCH 03/13] first attempt at actual noaa oisst guts --- api/PAGINATION.md | 61 +++++++++++-- api/src/helpers/dataset_config.rs | 71 ++++++++++++++++ api/src/helpers/schema.rs | 137 ++++++++++++++++++++++++++++++ api/src/main.rs | 33 +++++++ 4 files changed, 294 insertions(+), 8 deletions(-) diff --git a/api/PAGINATION.md b/api/PAGINATION.md index 071a1a2..acebde8 100644 --- a/api/PAGINATION.md +++ b/api/PAGINATION.md @@ -131,11 +131,56 @@ antimeridian / north-pole docs aren't lost. ## Per-dataset configuration -`api/src/helpers/dataset_config.rs` defines a `DatasetConfig` struct -with the dataset's `tile_degrees`, `max_radius_meters`, the discrete -`levels` array, and an optional `coverage_bbox`. The BSOSE handler -binds `BSOSE_CONFIG` directly; adding a new dataset means defining its -config there and wiring its handler through the same `tile_generator` / -`filter_composer` machinery. `coverage_bbox: None` for a new dataset -gives global-walk semantics; setting it to a bounding rectangle tells -the tile generator to skip everything outside the rectangle. +Two per-dataset structs sit side by side in `api/src/helpers/dataset_config.rs`: + +- **`DatasetConfig`** — *request-size policy*. `tile_degrees` (spatial + page size), `max_radius_meters` (cap for `center + radius` queries), + `levels` (discrete vertical pages — single-element `&[0.0]` for + surface-only datasets like OI SST), and an optional `coverage_bbox` + (rectangle the dataset's data lives inside; `None` means walk the + whole globe). Declared as a `pub const` per dataset. + +- **`DatasetSource`** — *Mongo identity* (`db_name`, `collection`, + `meta_collection`, `meta_data_type`) plus the values read once at + startup from the meta doc (`timeseries` axis, `data_info` default). + Built at runtime by `main()` via `load_dataset_source::` + and stashed in a `Lazy>>` static (same + pattern as the Mongo `CLIENT` static). + +The generic handler `serve_timeseries::` in `main.rs` consumes +`(&DatasetConfig, &DatasetSource)` plus a schema generic `S` that +implements `IsTimeseries`. + +### Recipe for adding a new dataset + +1. Define the data-doc schema (`S`) and meta-doc schema (`M`) in + `api/src/helpers/schema.rs`. `S` implements `IsTimeseries`; `M` + implements `IsTimeseriesMeta`. +2. Define `_CONFIG: DatasetConfig` (and `_LEVELS` if depth + discretisation is non-trivial) in `dataset_config.rs`. +3. Add `_SOURCE: Lazy>>` in + `main.rs`, next to the existing dataset statics. +4. In `main()`, load the source: `let x = load_dataset_source::(...) + .await?; *_SOURCE.lock().unwrap() = Some(x);`. +5. Add a 4-line route handler annotated with `#[get("/timeseries/")]` + that clones the source out of the lock and forwards to + `serve_timeseries::`. +6. Register the handler with `.service(_handler)` on the `App`. + +### `data_info` precedence rule + +`data_info` (variable names, units, per-variable descriptors) may +appear on either the data doc, the meta doc, or both: + +- **Doc-level wins.** If a data doc carries its own non-empty + `data_info` (BSOSE today), that's what `slice_data` filters + against. The cached meta-level default is ignored. +- **Cache fallback.** If the data doc has no `data_info` (OI SST: the + field lives only on the meta doc), the per-dataset cached default — + loaded from the meta doc at startup — is stamped onto the doc + before column filtering runs. + +The cache for a dataset whose meta doc has no `data_info` is the empty +tuple, and `transform_timeseries` treats empty as "no default to +apply". So a dataset can store `data_info` per data doc, per meta doc, +or per both — the response carries the right thing in each case. diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 06fac18..0ec6cd0 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -113,6 +113,31 @@ pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig { }), }; +/// OI SST has a single vertical level (the sea surface). We model it as +/// a one-element levels array of 0.0 so the existing tile_generator / +/// filter_composer code path works unchanged: each spatial tile crosses +/// with `level_index = 0` to produce exactly one tile per spatial cell. +/// The filter composer emits `level: { $gte: 0.0 }` for the only-level +/// case (no upper bound — it's both the first and last level), which +/// matches any doc with `level >= 0`. OI SST docs all have `level = 0`. +pub const OISST_LEVELS: &[f64] = &[0.0]; + +/// Configuration for the NOAA OI SST v2 high-res timeseries dataset. +/// +/// 1/4° grid × one level. 5° tiles give ~400 cells per tile (no level +/// multiplier, unlike BSOSE), matching BSOSE's tile size for uniformity +/// even though OI SST has lighter per-tile load. `max_radius_meters` +/// starts at the same 100 km cap as BSOSE — OI SST per-doc payload is +/// smaller (one variable, weekly cadence), so this cap can be relaxed +/// once we have real usage to size it against. `coverage_bbox: None` +/// because OI SST spans the entire globe. +pub const OISST_CONFIG: DatasetConfig = DatasetConfig { + tile_degrees: 5.0, + max_radius_meters: 100_000.0, // 100 km — relax once usage informs us + levels: OISST_LEVELS, + coverage_bbox: None, +}; + #[cfg(test)] mod tests { use super::*; @@ -175,4 +200,50 @@ mod tests { assert!(d >= 0.0, "level depths should be non-negative; got {}", d); } } + + // ---- OI SST config invariants (mirror the BSOSE checks) ---------------- + + #[test] + fn oisst_tile_degrees_is_positive_and_divides_a_hemisphere() { + assert!(OISST_CONFIG.tile_degrees > 0.0); + assert!( + (180.0_f64 % OISST_CONFIG.tile_degrees).abs() < 1e-9, + "tile_degrees should evenly divide 180° for clean global coverage" + ); + assert!( + (360.0_f64 % OISST_CONFIG.tile_degrees).abs() < 1e-9, + "tile_degrees should evenly divide 360° for clean global coverage" + ); + } + + #[test] + fn oisst_max_radius_is_positive_and_subhemispheric() { + assert!(OISST_CONFIG.max_radius_meters > 0.0); + assert!(OISST_CONFIG.max_radius_meters < 1.0e7); + } + + #[test] + fn oisst_levels_is_non_empty() { + // Even a single-level dataset must have a one-element levels + // array so the tile generator emits one tile per spatial cell. + // An empty list would produce zero pages. + assert!(!OISST_CONFIG.levels.is_empty()); + } + + #[test] + fn oisst_has_exactly_one_level() { + // OI SST is a surface-only dataset. The single-element levels + // array is what makes the existing tile generator / filter + // composer code path work without special-casing "no vertical + // dimension". If this ever changes, we should pause and think + // about what multi-level OI SST would mean physically. + assert_eq!(OISST_CONFIG.levels.len(), 1); + assert!((OISST_CONFIG.levels[0] - 0.0).abs() < 1e-9); + } + + #[test] + fn oisst_has_global_coverage() { + // OI SST is global; no coverage_bbox skip available. + assert!(OISST_CONFIG.coverage_bbox.is_none()); + } } diff --git a/api/src/helpers/schema.rs b/api/src/helpers/schema.rs index a8cabfb..4883bb9 100644 --- a/api/src/helpers/schema.rs +++ b/api/src/helpers/schema.rs @@ -165,6 +165,143 @@ impl IsTimeseriesMeta for BsoseMeta { } } +// oi sst ///////////////////////////////////////////////////////////////////// + +/// `source` substructure for the OI SST metadata doc. Differs from +/// `SourceMeta` (used by BSOSE) — OI SST uses `url` where BSOSE uses +/// `iter`. Neither field is read by the handler today; modeled here so +/// deserialization of the meta doc succeeds. +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct OisstSourceMeta { + pub(crate) source: Vec, + pub(crate) url: String, +} + +/// Grid descriptor on the OI SST metadata doc. Captures the regular +/// lat/lon lattice the dataset is sampled on. Not consulted by the +/// handler today (the equivalent information lives in `OISST_CONFIG`), +/// but modeled here so deserialization succeeds. A future cleanup could +/// derive the dataset's `DatasetConfig.coverage_bbox` / `tile_degrees` +/// from this struct instead of duplicating the values in code. +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct Lattice { + pub center: [f64; 2], + pub spacing: [f64; 2], + pub min_lat: f64, + pub min_lon: f64, + pub max_lat: f64, + pub max_lon: f64, +} + +/// One spatial cell of the NOAA OI SST v2 high-res grid. Surface-only +/// (no vertical dimension; `level` is always `0.0`). `data` holds the +/// timeseries per variable — there's exactly one variable (SST), so the +/// outer Vec always has length one. +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct OisstSchema { + pub(crate) _id: String, + // Reachable from main.rs (batchmeta branch reads `metadata()`), so + // `pub` for symmetry with BsoseSchema. + pub metadata: Vec, + pub(crate) basin: f64, + pub(crate) geolocation: GeoJSONPoint, + pub(crate) level: f64, + pub(crate) data: Vec>, + // OI SST data docs don't carry `timeseries` or `data_info` of their + // own — both are populated at request time. `timeseries` is filled + // by `slice_timerange`; `data_info` is stamped from the per-dataset + // cached default by `transform_timeseries` (the meta doc holds the + // dataset-wide variable info, single-variable for OI SST). Both + // default at deserialization so an absent field in the source doc + // produces an empty/None value rather than a parse error. + #[serde(default)] + pub(crate) timeseries: Option>, + #[serde(default)] + pub(crate) data_info: DataInfo, +} + +impl IsTimeseries for OisstSchema { + fn get_timeseries(&self) -> bool { + return true; + } + + fn data(&mut self) -> &mut Vec> { + &mut self.data + } + + fn set_data(&mut self, data: Vec>) { + self.data = data; + } + + fn timeseries(&mut self) -> Option<&mut Vec> { + self.timeseries.as_mut() + } + + fn set_timeseries(&mut self, timeseries: Vec) { + self.timeseries = Some(timeseries); + } + + fn data_info(&mut self) -> DataInfo { + self.data_info.clone() + } + + fn set_data_info(&mut self, data_info: DataInfo) { + self.data_info = data_info; + } + + fn _id(&self) -> String { + self._id.clone() + } + + fn longitude(&self) -> f64 { + self.geolocation.coordinates[0] + } + + fn latitude(&self) -> f64 { + self.geolocation.coordinates[1] + } + + fn level(&self) -> f64 { + self.level + } + + fn metadata(&self) -> Vec { + self.metadata.clone() + } +} + +/// Metadata doc for the OI SST dataset. Crucially, `data_info` lives +/// here (per-dataset default) rather than on every data doc — the +/// generic transform layer reads it from the cache and stamps it onto +/// each data doc before column filtering. +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct OisstMeta { + pub(crate) _id: String, + pub(crate) data_type: String, + pub data_info: DataInfo, + pub(crate) date_updated_argovis: BsonDateTime, + pub timeseries: Vec, + pub(crate) source: Vec, + pub(crate) lattice: Lattice, +} + +impl IsTimeseriesMeta for OisstMeta { + fn get_timeseries_meta(&self) -> bool { + return true; + } + + fn timeseries(&self) -> Vec { + self.timeseries.clone() + } + + fn data_info(&self) -> DataInfo { + self.data_info.clone() + } +} + +// /////////////////////////////////////////////////////////////////////////// + #[derive(Deserialize, Debug, Clone)] pub struct TimeseriesStub { pub _id: String, diff --git a/api/src/main.rs b/api/src/main.rs index 8ba46c5..9857e17 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -48,6 +48,7 @@ static CLIENT: Lazy>> = Lazy::new(|| Mutex::new(No // One static per dataset. Adding a new dataset is one more static here // plus one more load_dataset_source/ load-and-set block in main(). static BSOSE_SOURCE: Lazy>> = Lazy::new(|| Mutex::new(None)); +static OISST_SOURCE: Lazy>> = Lazy::new(|| Mutex::new(None)); // ---- route handlers -------------------------------------------------------- // @@ -84,6 +85,27 @@ async fn bsose_handler( .await } +#[get("/timeseries/noaaoisst")] +async fn oisst_handler( + req: HttpRequest, + query_params: web::Query, +) -> impl Responder { + let source = OISST_SOURCE + .lock() + .unwrap() + .as_ref() + .expect("OISST_SOURCE not initialized at startup") + .clone(); + + serve_timeseries::( + req, + query_params.into_inner(), + &dataset_config::OISST_CONFIG, + &source, + ) + .await +} + // ---- generic timeseries handler -------------------------------------------- /// Generic body of the `/timeseries/{dataset}` endpoint. Parameterized by @@ -440,9 +462,20 @@ async fn main() -> std::io::Result<()> { .expect("failed to load BSOSE dataset source at startup"); *BSOSE_SOURCE.lock().unwrap() = Some(bsose); + let oisst = load_dataset_source::( + "argo", + "noaaOIsst", + "timeseriesMeta", + "noaa-oi-sst-v2-high-res", + ) + .await + .expect("failed to load OI SST dataset source at startup"); + *OISST_SOURCE.lock().unwrap() = Some(oisst); + HttpServer::new(|| { App::new() .service(bsose_handler) + .service(oisst_handler) }) .bind(("0.0.0.0", 8080))? .run() From 2f99ece96b875be5d9e0ceda68dd878734db0ee3 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 19:31:40 -0400 Subject: [PATCH 04/13] test fixtures for noaa oisst --- api/fixtures/timeseriesMeta.json | 30 ++++++++++++++++++++++++++ api/src/bin/seed_test_db.rs | 36 +++++++++++++++++++------------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/api/fixtures/timeseriesMeta.json b/api/fixtures/timeseriesMeta.json index e937a71..75ab60e 100644 --- a/api/fixtures/timeseriesMeta.json +++ b/api/fixtures/timeseriesMeta.json @@ -17,5 +17,35 @@ "depth_r0_to_bottom": 4000.0, "interior_2d_mask": true, "depth_r0_to_ref_surface": 100.0 + }, + { + "_id": "noaa-oi-sst-v2-high-res", + "data_type": "noaa-oi-sst-v2-high-res", + "data_info": [ + ["sst"], + ["units", "long_name"], + [["degC", "Weekly Mean of Sea Surface Temperature"]] + ], + "date_updated_argovis": "2026-05-26T20:48:56Z", + "timeseries": [ + "2020-01-15T00:00:00Z", + "2020-04-15T00:00:00Z", + "2020-07-15T00:00:00Z", + "2020-10-15T00:00:00Z" + ], + "source": [ + { + "source": ["NOAA Optimum Interpolation SST V2 High Resolution"], + "url": "https://psl.noaa.gov/data/gridded/data.noaa.oisst.v2.highres.html" + } + ], + "lattice": { + "center": [0.125, 0.125], + "spacing": [0.25, 0.25], + "minLat": -89.875, + "minLon": -179.875, + "maxLat": 89.875, + "maxLon": 179.875 + } } ] diff --git a/api/src/bin/seed_test_db.rs b/api/src/bin/seed_test_db.rs index 9e94f68..71b99d5 100644 --- a/api/src/bin/seed_test_db.rs +++ b/api/src/bin/seed_test_db.rs @@ -1,16 +1,17 @@ // Seeds a MongoDB instance with the test fixtures used by the integration tests. // // Run before starting the API container so the API picks up the right -// `timeseriesMeta` document at startup: +// metadata documents at startup: // // MONGODB_URI=mongodb://localhost:27017 cargo run --bin seed_test_db // // What it does: -// * drops the `argo.bsose` and `argo.timeseriesMeta` collections +// * drops the `argo.bsose`, `argo.noaaOIsst`, and `argo.timeseriesMeta` +// collections // * loads the JSON fixtures embedded at compile time // * converts ISO-8601 strings in known date fields to BSON DateTimes // * inserts the resulting documents -// * creates a 2dsphere index on `geolocation` for the bsose collection +// * creates a 2dsphere index on `geolocation` for each data collection // // Date fields in the fixtures are written as ISO-8601 strings to keep the // JSON readable; the seeder converts them to BSON DateTimes here, since @@ -26,6 +27,7 @@ use std::env; const TIMESERIES_META_FIXTURE: &str = include_str!("../../fixtures/timeseriesMeta.json"); const BSOSE_FIXTURE: &str = include_str!("../../fixtures/bsose.json"); +const NOAA_OISST_FIXTURE: &str = include_str!("../../fixtures/noaaOIsst.json"); const DB_NAME: &str = "argo"; @@ -37,7 +39,8 @@ async fn main() -> Result<(), Box> { let client = Client::with_options(opts)?; let db = client.database(DB_NAME); - // timeseriesMeta has BSON dates in two fields + // timeseriesMeta carries both BSOSE and OI SST meta docs; + // `date_updated_argovis` and `timeseries` are BSON-date fields on both. seed_collection( &db, "timeseriesMeta", @@ -46,17 +49,20 @@ async fn main() -> Result<(), Box> { ) .await?; - // bsose has no top-level date fields - seed_collection(&db, "bsose", BSOSE_FIXTURE, &[]).await?; - - // Geospatial queries (`$geoWithin`, `$near`) require a 2dsphere index on - // the GeoJSON field. MongoDB picks a default index name from the keys. - let geo_index = IndexModel::builder() - .keys(bson::doc! { "geolocation": "2dsphere" }) - .build(); - db.collection::("bsose") - .create_index(geo_index, None) - .await?; + // Data collections — no top-level date fields. Each gets its own + // 2dsphere index on `geolocation` for `$geoWithin` / `$near` queries. + for (name, fixture) in [ + ("bsose", BSOSE_FIXTURE), + ("noaaOIsst", NOAA_OISST_FIXTURE), + ] { + seed_collection(&db, name, fixture, &[]).await?; + let geo_index = IndexModel::builder() + .keys(bson::doc! { "geolocation": "2dsphere" }) + .build(); + db.collection::(name) + .create_index(geo_index, None) + .await?; + } println!("Seed complete: {} populated.", DB_NAME); Ok(()) From dbfdbcc2d58666cf3009c4a9d1d2619094eb08c3 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 19:33:53 -0400 Subject: [PATCH 05/13] test fixtures for noaa oisst --- api/fixtures/noaaOIsst.json | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 api/fixtures/noaaOIsst.json diff --git a/api/fixtures/noaaOIsst.json b/api/fixtures/noaaOIsst.json new file mode 100644 index 0000000..1582668 --- /dev/null +++ b/api/fixtures/noaaOIsst.json @@ -0,0 +1,42 @@ +[ + { + "_id": "noaaoisst_doc_001", + "metadata": ["noaa-oi-sst-v2-high-res"], + "basin": 1.0, + "geolocation": { "type": "Point", "coordinates": [20.0, -50.0] }, + "level": 0.0, + "data": [ + [10.1, 11.2, 12.3, 13.4] + ] + }, + { + "_id": "noaaoisst_doc_002", + "metadata": ["noaa-oi-sst-v2-high-res"], + "basin": 1.0, + "geolocation": { "type": "Point", "coordinates": [40.0, -40.0] }, + "level": 0.0, + "data": [ + [15.0, 16.0, 17.0, 18.0] + ] + }, + { + "_id": "noaaoisst_doc_003", + "metadata": ["noaa-oi-sst-v2-high-res"], + "basin": 2.0, + "geolocation": { "type": "Point", "coordinates": [-170.0, -55.0] }, + "level": 0.0, + "data": [ + [-1.0, 0.0, 1.0, 2.0] + ] + }, + { + "_id": "noaaoisst_doc_004", + "metadata": ["noaa-oi-sst-v2-high-res"], + "basin": 1.0, + "geolocation": { "type": "Point", "coordinates": [0.0, 45.0] }, + "level": 0.0, + "data": [ + [12.5, 13.5, 14.5, 15.5] + ] + } +] From 1d42f365ab09de850821582aeb23f58ea938c3e2 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 21:50:58 -0400 Subject: [PATCH 06/13] turn on datasets with runtime env --- .github/workflows/test.yml | 6 + api/PAGINATION.md | 36 +++++- api/src/helpers/dataset_config.rs | 40 ++++--- api/src/main.rs | 185 ++++++++++++++++++++---------- api/tests/integration.rs | 30 +++-- 5 files changed, 209 insertions(+), 88 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e51268d..830efa2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,7 +24,13 @@ jobs: --health-retries 12 env: + # MONGODB_URI is used by the seeder binary. The API itself reads a + # per-dataset URI per dataset it serves — URI presence is the enable + # signal. Both point at the same Mongo container in CI; in production + # each dataset is typically deployed against its own Mongo. MONGODB_URI: mongodb://localhost:27017 + MONGODB_URI_BSOSE: mongodb://localhost:27017 + MONGODB_URI_NOAAOISST: mongodb://localhost:27017 API_URL: http://localhost:8080 CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 diff --git a/api/PAGINATION.md b/api/PAGINATION.md index acebde8..a54c7f4 100644 --- a/api/PAGINATION.md +++ b/api/PAGINATION.md @@ -160,12 +160,42 @@ implements `IsTimeseries`. discretisation is non-trivial) in `dataset_config.rs`. 3. Add `_SOURCE: Lazy>>` in `main.rs`, next to the existing dataset statics. -4. In `main()`, load the source: `let x = load_dataset_source::(...) - .await?; *_SOURCE.lock().unwrap() = Some(x);`. +4. In `main()`, gate on the dataset's URI env var and load conditionally: + ``` + let mut enabled_ = false; + if let Some(client) = dataset_client("MONGODB_URI_").await { + let x = load_dataset_source::(client, ...).await?; + *_SOURCE.lock().unwrap() = Some(x); + enabled_ = true; + } + ``` 5. Add a 4-line route handler annotated with `#[get("/timeseries/")]` that clones the source out of the lock and forwards to `serve_timeseries::`. -6. Register the handler with `.service(_handler)` on the `App`. +6. Register the handler inside the `App::configure` callback, gated on + `enabled_`. A dataset whose URI env var is unset stays + unregistered: no route, no startup work, no panic. + +### Per-deployment configuration + +Each dataset is enabled iff its `MONGODB_URI_` env var is set +when `main()` runs. URI presence is the enable signal — there's no +separate `DATASETS=` list to keep in sync. A deployment serving only +one dataset just sets one env var: + +``` +MONGODB_URI_BSOSE=mongodb://bsose-mongo/ cargo run # BSOSE-only +MONGODB_URI_NOAAOISST=mongodb://noaa-mongo/ cargo run # OI SST-only +``` + +For local dev / tests where one Mongo serves both, point both env vars +at the same URI: + +``` +MONGODB_URI_BSOSE=mongodb://localhost:27017 \ +MONGODB_URI_NOAAOISST=mongodb://localhost:27017 \ + cargo run +``` ### `data_info` precedence rule diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 0ec6cd0..5ab4084 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -10,15 +10,17 @@ //! actually lives inside, so we skip probing tiles outside it. //! //! `DatasetSource` is the sibling struct that names *where* each dataset -//! lives in Mongo (db, collection, meta collection, meta discriminator) -//! and carries the startup-loaded metadata (the timeseries axis, the -//! meta default `data_info`). It can't be `const`/`static` directly -//! because the metadata fields are loaded from Mongo at runtime; it's -//! built once in `main()` via `load_dataset_source` and stashed in a -//! top-level `Lazy>>` (the same pattern the -//! existing Mongo `CLIENT` static uses, so we don't drag in new -//! initialization vocabulary). Handlers clone it out of the lock at the -//! top of each request, then read its fields as plain owned data. +//! lives in Mongo (db, collection, meta collection, meta discriminator), +//! carries the per-dataset `mongodb::Client`, and carries the +//! startup-loaded metadata (the timeseries axis, the meta default +//! `data_info`). One `mongodb::Client` per dataset because the deployment +//! topology may put each dataset in a different Mongo instance (env vars +//! `MONGODB_URI_` per dataset). It can't be `const`/`static` +//! directly because the metadata fields are loaded from Mongo at +//! runtime; it's built once in `main()` via `load_dataset_source` and +//! stashed in a top-level `Lazy>>`. Handlers +//! clone it out of the lock at the top of each request, then read its +//! fields as plain owned data. use mongodb::bson::DateTime as BsonDateTime; @@ -55,9 +57,17 @@ pub struct DatasetConfig { pub coverage_bbox: Option, } -/// Per-dataset identity + startup-loaded metadata, built once in `main()`. +/// Per-dataset identity + Mongo client + startup-loaded metadata, built +/// once in `main()`. /// -/// The first four fields are the dataset's Mongo identity: +/// `client` is this dataset's dedicated `mongodb::Client`. Each dataset +/// gets its own URI (env `MONGODB_URI_`) and its own client — +/// the deployment topology we expect is "BSOSE deployed against one +/// Mongo, OI SST against another," so a single shared client doesn't +/// fit. `mongodb::Client` is internally Arc-backed, so cloning it (and +/// therefore cloning the whole `DatasetSource`) is cheap. +/// +/// The next four fields are the dataset's Mongo identity: /// - `db_name` / `collection`: where the data docs live. /// - `meta_collection`: where the metadata doc lives. May or may not be /// shared across datasets; today everything is in `timeseriesMeta`, @@ -65,13 +75,14 @@ pub struct DatasetConfig { /// - `meta_data_type`: the `data_type` discriminator that selects this /// dataset's meta doc out of `meta_collection`. /// -/// The remaining two fields are values we load *once* at startup from the +/// The last two fields are values we load *once* at startup from the /// meta doc and read on every request. Plain owned types — no cells. /// /// `Clone` is derived so handlers can copy a `DatasetSource` out of the /// `Lazy>>` static and use it locally without holding -/// the mutex across `.await` points. The clone is small (a few KB of -/// dates plus a few short strings). +/// the mutex across `.await` points. The clone is cheap: the client is +/// Arc-backed, identity strings are `&'static str`, and the metadata +/// fields are a few KB of dates plus a few short strings. /// /// `data_info` is the *meta-level default* for the dataset. Per the /// precedence rule documented on `transforms::transform_timeseries`, a @@ -82,6 +93,7 @@ pub struct DatasetConfig { /// it's still populated, just never consulted. #[derive(Clone)] pub struct DatasetSource { + pub client: mongodb::Client, pub db_name: &'static str, pub collection: &'static str, pub meta_collection: &'static str, diff --git a/api/src/main.rs b/api/src/main.rs index 9857e17..5fca235 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -36,17 +36,22 @@ use serde_json::{json, Value}; use dataset_config::{DatasetConfig, DatasetSource}; -static CLIENT: Lazy>> = Lazy::new(|| Mutex::new(None)); - -// Per-dataset source-of-truth: identity strings + values loaded from the -// dataset's meta doc at startup. Same pattern as CLIENT above — the -// `Lazy>>` is what lets us declare a static that's -// initialized once, after main() reads the value out of Mongo. Handlers -// clone the inner DatasetSource out of the lock at the top of each -// request and use it locally, so the mutex is never held across `.await`. +// Per-dataset source-of-truth: identity strings + the dataset's Mongo +// client + values loaded from the dataset's meta doc at startup. The +// `Lazy>>` shape is what lets us declare a static that's +// initialized once, after main() connects to Mongo and reads the meta +// doc. Handlers clone the inner DatasetSource out of the lock at the top +// of each request and use it locally, so the mutex is never held across +// `.await`. // -// One static per dataset. Adding a new dataset is one more static here -// plus one more load_dataset_source/ load-and-set block in main(). +// One static per dataset. A dataset is "enabled" iff its +// `MONGODB_URI_` env var is set; if so, main() loads it and +// fills this static, and the route handler is registered on the App. +// If the env var is unset, the static stays `None` and the handler is +// never registered (no route, no surprises). +// +// Adding a new dataset is one more static here plus one more +// load-and-register block in main() and a route handler above. static BSOSE_SOURCE: Lazy>> = Lazy::new(|| Mutex::new(None)); static OISST_SOURCE: Lazy>> = Lazy::new(|| Mutex::new(None)); @@ -192,6 +197,7 @@ where let options = FindOptions::builder().build(); let mut cursor = match generate_cursor::( + &source.client, source.db_name, source.collection, filter, @@ -238,6 +244,7 @@ where "_id": { "$in": unique_metadata.into_iter().collect::>() } }; let meta_cursor = match generate_cursor::( + &source.client, source.db_name, source.meta_collection, meta_filter, @@ -393,17 +400,19 @@ fn next_url_value( /// Build a `DatasetSource` by reading the dataset's meta doc out of Mongo. /// -/// Called once per dataset at server startup. The four `'static str` -/// arguments are the dataset's Mongo identity; the resulting struct -/// bundles those identity strings with the values we read out of the -/// meta doc (`timeseries`, `data_info`). `main()` then stashes the -/// returned struct into the dataset's `*_SOURCE` static for handlers -/// to read on each request. +/// Called once per dataset at server startup. `client` is the dataset's +/// dedicated Mongo client (constructed from its `MONGODB_URI_` +/// env var); the four `'static str` arguments are the dataset's Mongo +/// identity. The resulting struct bundles the client + identity strings +/// with the values we read out of the meta doc (`timeseries`, +/// `data_info`). `main()` then stashes the returned struct into the +/// dataset's `*_SOURCE` static for handlers to read on each request. /// /// Panics if the meta doc can't be found or the cursor errors. Both /// indicate startup misconfiguration that should fail loudly rather /// than serve stale or partial data. async fn load_dataset_source( + client: mongodb::Client, db_name: &'static str, collection: &'static str, meta_collection: &'static str, @@ -415,7 +424,7 @@ where let filter = mongodb::bson::doc! {"data_type": meta_data_type}; let options = FindOptions::builder().limit(1).build(); let mut cursor = - generate_cursor::(db_name, meta_collection, filter, Some(options)).await?; + generate_cursor::(&client, db_name, meta_collection, filter, Some(options)).await?; let meta = match cursor.next().await { Some(Ok(m)) => m, @@ -430,6 +439,7 @@ where }; Ok(DatasetSource { + client, db_name, collection, meta_collection, @@ -439,59 +449,110 @@ where }) } +/// Read a per-dataset Mongo URI env var (e.g. `MONGODB_URI_BSOSE`) and +/// build a `mongodb::Client` from it. Returns `Some(client)` if the var +/// is set and parses cleanly, `None` if the var is unset or empty (the +/// dataset is simply disabled for this deployment), or panics on a +/// malformed URI / unreachable Mongo (treated as deployment-config +/// bugs that should fail loudly at startup). +async fn dataset_client(env_var: &str) -> Option { + let uri = match env::var(env_var) { + Ok(u) if !u.is_empty() => u, + _ => return None, + }; + let opts = mongodb::options::ClientOptions::parse(&uri) + .await + .unwrap_or_else(|e| panic!("invalid {}: {}", env_var, e)); + let client = mongodb::Client::with_options(opts) + .unwrap_or_else(|e| panic!("could not build Mongo client for {}: {}", env_var, e)); + Some(client) +} + #[actix_web::main] async fn main() -> std::io::Result<()> { - // Initialize the MongoDB client - let client_options = mongodb::options::ClientOptions::parse(env::var("MONGODB_URI").unwrap()).await.unwrap(); - let client = mongodb::Client::with_options(client_options).unwrap(); - *CLIENT.lock().unwrap() = Some(client); - - // Load each dataset's source-of-truth and stash it in the static for - // handlers to read. The Mongo identity strings live here at the call - // site — one place per dataset — and the returned struct bundles - // them with the values read from the meta doc. Adding a new dataset - // is one more load-and-set block here. - let bsose = load_dataset_source::( - "argo", - "bsose", - "timeseriesMeta", - "BSOSE-profile", - ) - .await - .expect("failed to load BSOSE dataset source at startup"); - *BSOSE_SOURCE.lock().unwrap() = Some(bsose); - - let oisst = load_dataset_source::( - "argo", - "noaaOIsst", - "timeseriesMeta", - "noaa-oi-sst-v2-high-res", - ) - .await - .expect("failed to load OI SST dataset source at startup"); - *OISST_SOURCE.lock().unwrap() = Some(oisst); + // Each dataset is enabled iff its `MONGODB_URI_` env var is + // set. We build a Mongo client per enabled dataset, load its meta + // doc into the corresponding `*_SOURCE` static, and record a bool + // so the HttpServer factory below knows whether to register the + // dataset's route. An unset env var means "this deployment doesn't + // serve that dataset" — no load, no route, no surprises. + let mut enabled_bsose = false; + if let Some(client) = dataset_client("MONGODB_URI_BSOSE").await { + let bsose = load_dataset_source::( + client, + "argo", + "bsose", + "timeseriesMeta", + "BSOSE-profile", + ) + .await + .expect("failed to load BSOSE dataset source at startup"); + *BSOSE_SOURCE.lock().unwrap() = Some(bsose); + enabled_bsose = true; + } - HttpServer::new(|| { - App::new() - .service(bsose_handler) - .service(oisst_handler) + let mut enabled_oisst = false; + if let Some(client) = dataset_client("MONGODB_URI_NOAAOISST").await { + let oisst = load_dataset_source::( + client, + "argo", + "noaaOIsst", + "timeseriesMeta", + "noaa-oi-sst-v2-high-res", + ) + .await + .expect("failed to load OI SST dataset source at startup"); + *OISST_SOURCE.lock().unwrap() = Some(oisst); + enabled_oisst = true; + } + + if !enabled_bsose && !enabled_oisst { + eprintln!( + "warning: no datasets enabled. Set at least one of \ + MONGODB_URI_BSOSE / MONGODB_URI_NOAAOISST." + ); + } else { + println!( + "Datasets enabled:{}{}", + if enabled_bsose { " bsose" } else { "" }, + if enabled_oisst { " noaaoisst" } else { "" }, + ); + } + + // `App::configure` lets us register routes conditionally without + // running into the "each `.service(...)` call returns a new App + // type" problem. + HttpServer::new(move || { + App::new().configure(move |cfg| { + if enabled_bsose { + cfg.service(bsose_handler); + } + if enabled_oisst { + cfg.service(oisst_handler); + } + }) }) .bind(("0.0.0.0", 8080))? .run() .await } -async fn generate_cursor(db_name: &str, collection_name: &str, filter: Document, options: Option) -> Result> { - let client = { - let guard = match CLIENT.lock() { - Ok(guard) => guard, - Err(poisoned) => poisoned.into_inner(), - }; - match guard.as_ref() { - Some(client) => client.clone(), - None => return Err(mongodb::error::Error::from(std::io::Error::new(std::io::ErrorKind::Other, "Client is None"))), - } - }; - client.database(db_name).collection::(collection_name).find(filter, options).await +/// Thin wrapper around `client.database(...).collection(...).find(...)`. +/// Takes the Mongo client by reference so callers can keep one client +/// per dataset (see `DatasetSource::client`) instead of reaching into a +/// global. The wrapper exists mostly so the call sites read uniformly +/// across the codebase. +async fn generate_cursor( + client: &mongodb::Client, + db_name: &str, + collection_name: &str, + filter: Document, + options: Option, +) -> Result> { + client + .database(db_name) + .collection::(collection_name) + .find(filter, options) + .await } \ No newline at end of file diff --git a/api/tests/integration.rs b/api/tests/integration.rs index d8e8f3d..3be701e 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -1,19 +1,31 @@ // Integration tests against a live API + MongoDB. // // Preconditions: -// * `cargo run --bin seed_test_db` has been run against the same MongoDB -// the API is connected to. -// * The API has been (re)started AFTER the seed, so it caches the right -// `timeseriesMeta.timeseries` vector at startup. -// * API_URL points at the running API (default: http://localhost:8080). -// * MONGODB_URI is reachable (default: mongodb://localhost:27017). It is -// not used directly by these tests but is read so that misconfigured -// environments fail loudly. +// * `cargo run --bin seed_test_db` has been run against the MongoDB the +// API is connected to. The seeder uses its own `MONGODB_URI` env var +// (distinct from the per-dataset URIs the API uses) to know where +// to write fixtures. +// * The API has been (re)started AFTER the seed, so it caches the +// right `timeseriesMeta` documents at startup. +// * The API process has per-dataset Mongo URIs set: at minimum +// `MONGODB_URI_BSOSE`, and `MONGODB_URI_NOAAOISST` if OI SST tests +// are enabled. A dataset whose env var is unset is simply not +// served by that deployment. +// * `API_URL` points at the running API (default +// `http://localhost:8080`). `MONGODB_URI` is read by these tests +// only as a sanity-check that the environment is configured; +// they don't connect to Mongo themselves. // // Run with: -// API_URL=http://localhost:8080 MONGODB_URI=mongodb://localhost:27017 \ +// API_URL=http://localhost:8080 \ +// MONGODB_URI=mongodb://localhost:27017 \ // cargo test --test integration -- --test-threads=1 // +// And start the API with, e.g. (same Mongo for both datasets locally): +// MONGODB_URI_BSOSE=mongodb://localhost:27017 \ +// MONGODB_URI_NOAAOISST=mongodb://localhost:27017 \ +// cargo run +// // Every response is the paginated envelope: // { "docs": [...], "next_url": "" | null, "message": "..." } // Multi-tile queries (anything spanning multiple grid cells or vertical From 888d2294ebb9b748504639f559fa54e9a939c521 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Tue, 26 May 2026 23:47:48 -0400 Subject: [PATCH 07/13] dont append data_info and timeseries if we dont need them --- api/PAGINATION.md | 19 ++- api/src/helpers/schema.rs | 49 ++++--- api/src/helpers/transforms.rs | 243 ++++++++++++++++++++++++---------- 3 files changed, 223 insertions(+), 88 deletions(-) diff --git a/api/PAGINATION.md b/api/PAGINATION.md index a54c7f4..a89b25b 100644 --- a/api/PAGINATION.md +++ b/api/PAGINATION.md @@ -197,7 +197,24 @@ MONGODB_URI_NOAAOISST=mongodb://localhost:27017 \ cargo run ``` -### `data_info` precedence rule +### Response-shape rule for `data_info` and `timeseries` + +The fields `data_info` and `timeseries` on a response doc appear *only +when the user's query has materially altered the dataset-wide defaults*: + +- `data_info` appears iff the user supplied `data=`. Without `data=`, + the column layout matches the dataset default and the field is + omitted (clients fall back to the meta endpoint). +- `timeseries` appears iff the user supplied `startDate` or `endDate`. + Without either, the time axis matches the dataset default and the + field is omitted. + +The intent is response slimness: the data response carries only what +*differs* from the meta endpoint's dataset-wide values. With both +qsps unset, response docs are short (just `_id`, geolocation, level, +metadata, and an empty `data` array). + +### `data_info` precedence rule (when `data=` is set) `data_info` (variable names, units, per-variable descriptors) may appear on either the data doc, the meta doc, or both: diff --git a/api/src/helpers/schema.rs b/api/src/helpers/schema.rs index 4883bb9..abfb6af 100644 --- a/api/src/helpers/schema.rs +++ b/api/src/helpers/schema.rs @@ -34,8 +34,14 @@ pub trait IsTimeseries { fn set_data(&mut self, data: Vec>); fn timeseries(&mut self) -> Option<&mut Vec>; fn set_timeseries(&mut self, timeseries: Vec); - fn data_info(&mut self) -> DataInfo; - fn set_data_info(&mut self, data_info: DataInfo); + /// `data_info` is `Option` because the response carries it only when + /// the user's query has *materially altered* the data layout — + /// concretely, when the `data=` qsp triggers column filtering. In + /// the no-`data=` case `transform_timeseries` writes `None` here so + /// the response omits the field, and clients fall back to the + /// dataset-wide `data_info` on the meta endpoint. + fn data_info(&mut self) -> Option; + fn set_data_info(&mut self, data_info: Option); fn _id(&self) -> String; fn longitude(&self) -> f64; fn latitude(&self) -> f64; @@ -73,9 +79,18 @@ pub struct BsoseSchema { pub(crate) cell_z_size: f64, pub(crate) reference_density_profile: f64, pub(crate) data: Vec>, - // Not present in the source collection — gets populated by transforms. + // Both of the next two are response-shape-driven: present iff the + // user's query made the dataset-wide default insufficient. Per the + // response-shape rule, `data_info` appears only when `data=` qsp + // triggered column filtering; `timeseries` appears only when the + // user cut the time axis with `startDate` / `endDate`. Both + // serialize-absent when None and default to None / empty on + // deserialization, so a BSOSE source doc that carries `data_info` + // (the per-cell default for BSOSE today) still reads cleanly. + #[serde(default, skip_serializing_if = "Option::is_none")] pub(crate) timeseries: Option>, - pub(crate) data_info: DataInfo, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub(crate) data_info: Option, } impl IsTimeseries for BsoseSchema { @@ -99,11 +114,11 @@ impl IsTimeseries for BsoseSchema { self.timeseries = Some(timeseries); } - fn data_info(&mut self) -> DataInfo { + fn data_info(&mut self) -> Option { self.data_info.clone() } - fn set_data_info(&mut self, data_info: DataInfo) { + fn set_data_info(&mut self, data_info: Option) { self.data_info = data_info; } @@ -209,16 +224,16 @@ pub struct OisstSchema { pub(crate) level: f64, pub(crate) data: Vec>, // OI SST data docs don't carry `timeseries` or `data_info` of their - // own — both are populated at request time. `timeseries` is filled - // by `slice_timerange`; `data_info` is stamped from the per-dataset - // cached default by `transform_timeseries` (the meta doc holds the - // dataset-wide variable info, single-variable for OI SST). Both - // default at deserialization so an absent field in the source doc - // produces an empty/None value rather than a parse error. - #[serde(default)] + // own — both are populated at request time per the response-shape + // rule. `timeseries` is filled by `slice_timerange` iff the user + // set `startDate` / `endDate`; `data_info` is stamped (from the + // per-dataset cached default) by `transform_timeseries` iff the + // user set `data=`. Both serialize-absent when None and default to + // None on deserialization. + #[serde(default, skip_serializing_if = "Option::is_none")] pub(crate) timeseries: Option>, - #[serde(default)] - pub(crate) data_info: DataInfo, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub(crate) data_info: Option, } impl IsTimeseries for OisstSchema { @@ -242,11 +257,11 @@ impl IsTimeseries for OisstSchema { self.timeseries = Some(timeseries); } - fn data_info(&mut self) -> DataInfo { + fn data_info(&mut self) -> Option { self.data_info.clone() } - fn set_data_info(&mut self, data_info: DataInfo) { + fn set_data_info(&mut self, data_info: Option) { self.data_info = data_info; } diff --git a/api/src/helpers/transforms.rs b/api/src/helpers/transforms.rs index adbda37..c33a71a 100644 --- a/api/src/helpers/transforms.rs +++ b/api/src/helpers/transforms.rs @@ -6,22 +6,33 @@ use mongodb::bson::DateTime as BsonDateTime; /// timeseries document. Returns `None` if the document was filtered out /// entirely (e.g. `data=somefield` produced no matching columns). /// -/// `cached_data_info` is the *meta-level default* `data_info` for this -/// dataset — read once at startup from the meta doc and stashed on the -/// dataset's `DatasetSource`. Precedence rule for `data_info`: +/// Response-shape rules enforced here: +/// +/// - `timeseries` appears on the returned doc *iff* the user supplied +/// `startDate` or `endDate`. Otherwise the field stays `None`, and +/// clients fall back to the dataset-wide timeseries on the meta +/// endpoint. (Mechanically: `slice_timerange` populates the field +/// only when invoked, which only happens for date-bounded queries.) +/// +/// - `data_info` appears on the returned doc *iff* the user supplied +/// `data=`. With `data=` set we materialise the working `data_info` +/// (see precedence rule below) onto the doc so `slice_data` can +/// filter, and the resulting filtered `data_info` rides along in +/// the response. Without `data=`, we scrub `data_info` to `None` +/// (even if the source doc carried one — the BSOSE case) so the +/// response stays slim and clients defer to the meta endpoint. +/// +/// Precedence rule for the working `data_info` when `data=` is set: /// /// - If the data doc carries its own non-empty `data_info` (the BSOSE -/// case today), it wins; the cached default is ignored. +/// case today), it wins; the cached meta-level default is ignored. /// - If the data doc has no `data_info` (the OI SST case — single /// variable, info kept on the meta doc), the cached default is /// stamped onto the doc before `slice_data` runs. -/// - If both are empty, `slice_data` will return `None` for any -/// specific-variable request (no columns to match) and an -/// empty-data passthrough for `data=` / `data=all` — same as today. /// -/// The cache passes through as `&DataInfo` rather than `Option`: an empty -/// tuple is its own "no default" sentinel, matching the convention -/// `slice_data` already uses for "no fields". +/// The cache parameter is `&DataInfo` (not `Option`): an empty tuple +/// is its own "no default" sentinel, so a dataset whose meta doc has +/// no `data_info` field just lands an empty tuple in the cache. pub fn transform_timeseries( params: &serde_json::Value, ts: &[BsonDateTime], @@ -45,15 +56,25 @@ pub fn transform_timeseries( slice_timerange(start_date, end_date, ts, &mut doc); } - // Doc-level data_info takes precedence; only fall back to the cached - // meta-level default when the doc itself carries no variable names. - // `data_info.0` is the variable-names vector, so its emptiness is the - // canonical "no data_info" check. - if doc.data_info().0.is_empty() && !cached_data_info.0.is_empty() { - doc.set_data_info(cached_data_info.clone()); + if data.is_empty() { + // No `data=` qsp. Clear `data` (the historical behaviour for + // this code path) and scrub `data_info` so the response omits + // it. Clients that need variable info read the meta endpoint. + doc.set_data(Vec::new()); + doc.set_data_info(None); + return Some(doc); } - slice_data(&data, doc) + // `data=` qsp set. Materialise the working `data_info`: doc-level + // wins over cache, and an empty (or absent) doc-level value falls + // back to the meta-level cached default. + let working_info: schema::DataInfo = doc + .data_info() + .filter(|di| !di.0.is_empty()) + .unwrap_or_else(|| cached_data_info.clone()); + doc.set_data_info(Some(working_info.clone())); + + slice_data(&data, &working_info, doc) } /// Slice the document's data columns and timeseries field to the time window @@ -91,30 +112,32 @@ pub fn slice_timerange( } } -/// Apply the `data=` query parameter to a single document. Behaviour mirrors -/// the previous Vec-based implementation: +/// Apply the `data=` query parameter to a single document. Called only +/// when `data=` was actually supplied — the empty-`data` case is handled +/// by `transform_timeseries` before getting here. +/// +/// - `data` contains "all": leaves data/data_info untouched. +/// - otherwise: filters the data columns down to the named variables +/// and writes the filtered `data_info` back onto the doc; returns +/// `None` if no requested variables match (caller drops the row). +/// - if `data` also contains "except_data_values", clears the `data` +/// field after column-filtering, but keeps the row (the filtered +/// `data_info` is what the caller wanted to see). /// -/// - empty `data`: clears the document's `data` field but keeps the row. -/// - `data` contains "all": leaves everything untouched. -/// - otherwise: filters the data columns down to the named variables; -/// returns `None` if no requested variables match (caller drops the row). -/// - if `data` also contains "except_data_values", clears the data field -/// after column-filtering, but keeps the row. +/// `data_info` is passed in as an explicit `&DataInfo` rather than read +/// off the doc — the caller (`transform_timeseries`) is responsible +/// for deciding which `data_info` is in effect (doc-level vs cached +/// meta-level default) and putting it on the doc before calling here. pub fn slice_data( data: &[String], + data_info: &schema::DataInfo, mut doc: T, ) -> Option { - if data.is_empty() { - doc.set_data(Vec::new()); - return Some(doc); - } - if data.iter().any(|s| s == "all") { return Some(doc); } // Specific fields requested — filter columns. - let data_info = doc.data_info(); let indexes: Vec = data .iter() .filter_map(|item| data_info.0.iter().position(|x| x == item)) @@ -126,12 +149,12 @@ pub fn slice_data( .collect(); doc.set_data(filtered_data); - let filtered_data_info: (Vec, Vec, Vec>) = ( + let filtered_data_info: schema::DataInfo = ( indexes.iter().filter_map(|&i| data_info.0.get(i).cloned()).collect(), data_info.1.clone(), indexes.iter().filter_map(|&i| data_info.2.get(i).cloned()).collect(), ); - doc.set_data_info(filtered_data_info); + doc.set_data_info(Some(filtered_data_info)); // No matching columns -> caller drops the row. if doc.data().is_empty() { @@ -160,21 +183,33 @@ pub fn timeseries_stub(result: &T) -> schema::Timeserie #[cfg(test)] mod tests { use super::*; - use crate::helpers::schema::{BsoseSchema, GeoJSONPoint, IsTimeseries}; + use crate::helpers::schema::{BsoseSchema, DataInfo, GeoJSONPoint, IsTimeseries}; use serde_json::json; - // Helper: construct a BsoseSchema directly. Field-level visibility is - // `pub(crate)` so this struct literal works from any test in the crate - // and gets compile-time field checking — if a schema field is renamed, - // the test stops compiling. - fn make_bsose(id: &str, data: Vec>, var_names: &[&str]) -> BsoseSchema { + // Helper: construct a DataInfo from variable names. Used both inside + // `make_bsose` and at slice_data call sites that need to pass an + // explicit DataInfo. + fn make_data_info(var_names: &[&str]) -> DataInfo { let names: Vec = var_names.iter().map(|s| s.to_string()).collect(); let units = vec!["units".to_string(), "long_name".to_string()]; let per_var_info: Vec> = names .iter() .map(|n| vec!["u".to_string(), n.clone()]) .collect(); + (names, units, per_var_info) + } + + /// Empty DataInfo sentinel — used as the "no meta-level default" cache + /// value in tests that exercise the BSOSE-style doc-level precedence. + fn empty_data_info() -> DataInfo { + (vec![], vec![], vec![]) + } + // Helper: construct a BsoseSchema directly. Field-level visibility is + // `pub(crate)` so this struct literal works from any test in the crate + // and gets compile-time field checking — if a schema field is renamed, + // the test stops compiling. + fn make_bsose(id: &str, data: Vec>, var_names: &[&str]) -> BsoseSchema { BsoseSchema { _id: id.to_string(), metadata: vec!["meta1".to_string()], @@ -190,7 +225,7 @@ mod tests { reference_density_profile: 1.0, data, timeseries: None, - data_info: (names, units, per_var_info), + data_info: Some(make_data_info(var_names)), } } @@ -243,60 +278,51 @@ mod tests { } // ---- slice_data ---------------------------------------------------------- - - #[test] - fn slice_data_empty_request_drops_data() { - let doc = make_bsose( - "doc1", - vec![vec![1.0, 2.0], vec![3.0, 4.0]], - &["temp", "salinity"], - ); - let mut out = slice_data(&[], doc).expect("empty data param keeps the row"); - assert!(out.data().is_empty()); - } + // + // `slice_data` is only called from `transform_timeseries` in the + // `data=` qsp case, so these tests always supply a non-empty `data` + // vector. The empty-data case is exercised through the + // `transform_timeseries` tests below. #[test] fn slice_data_all_keeps_everything() { + let info = make_data_info(&["temp", "salinity"]); let doc = make_bsose( "doc1", vec![vec![1.0, 2.0], vec![3.0, 4.0]], &["temp", "salinity"], ); - let mut out = slice_data(&["all".to_string()], doc).unwrap(); + let mut out = slice_data(&["all".to_string()], &info, doc).unwrap(); assert_eq!(*out.data(), vec![vec![1.0, 2.0], vec![3.0, 4.0]]); } #[test] fn slice_data_specific_field_filters_columns() { + let info = make_data_info(&["temp", "salinity"]); let doc = make_bsose( "doc1", vec![vec![1.0, 2.0], vec![3.0, 4.0]], &["temp", "salinity"], ); - let mut out = slice_data(&["salinity".to_string()], doc).unwrap(); + let mut out = slice_data(&["salinity".to_string()], &info, doc).unwrap(); assert_eq!(*out.data(), vec![vec![3.0, 4.0]]); } #[test] fn slice_data_unknown_field_drops_result() { - let doc = make_bsose( - "doc1", - vec![vec![1.0, 2.0]], - &["temp"], - ); - let out = slice_data(&["nonexistent".to_string()], doc); + let info = make_data_info(&["temp"]); + let doc = make_bsose("doc1", vec![vec![1.0, 2.0]], &["temp"]); + let out = slice_data(&["nonexistent".to_string()], &info, doc); assert!(out.is_none(), "no matching columns -> row is dropped"); } #[test] fn slice_data_except_data_values_clears_after_filtering() { - let doc = make_bsose( - "doc1", - vec![vec![1.0, 2.0]], - &["temp"], - ); + let info = make_data_info(&["temp"]); + let doc = make_bsose("doc1", vec![vec![1.0, 2.0]], &["temp"]); let mut out = slice_data( &["temp".to_string(), "except_data_values".to_string()], + &info, doc, ) .unwrap(); @@ -323,8 +349,7 @@ mod tests { // Empty cached_data_info — this doc already carries its own // data_info, so the precedence rule means the cache is never // consulted regardless. - let empty_cache: schema::DataInfo = (vec![], vec![], vec![]); - let mut out = transform_timeseries(¶ms, ×eries, &empty_cache, doc).unwrap(); + let mut out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc).unwrap(); assert_eq!(*out.data(), vec![vec![20.0, 30.0]]); } @@ -339,10 +364,10 @@ mod tests { let mut doc = make_bsose( "doc1", vec![vec![1.0, 2.0]], - &["sst"], // gets cleared below to simulate a docless data_info + &["sst"], // overridden to None below to simulate an OI SST doc ); // Clear the doc's data_info so the cache fallback kicks in. - doc.data_info = (vec![], vec![], vec![]); + doc.data_info = None; let cache: schema::DataInfo = ( vec!["sst".to_string()], @@ -357,7 +382,7 @@ mod tests { assert_eq!(*out.data(), vec![vec![1.0, 2.0]]); // data_info has been stamped from the cache, then column-filtered // by slice_data — should still list sst. - let info = out.data_info(); + let info = out.data_info().expect("data_info present when data= is set"); assert_eq!(info.0, vec!["sst".to_string()]); } @@ -386,8 +411,86 @@ mod tests { let mut out = transform_timeseries(¶ms, ×eries, &cache, doc).expect("should resolve"); assert_eq!(*out.data(), vec![vec![1.0, 2.0]]); - let info = out.data_info(); + let info = out.data_info().expect("data_info present when data= is set"); + assert_eq!(info.0, vec!["temp".to_string()]); + } + + // ---- response-shape rule: data_info & timeseries omitted unless munged -- + + #[test] + fn transform_omits_data_info_when_no_data_qsp() { + // No `data=` in params: the response doc should carry no + // data_info — clients fall back to the meta endpoint. + let timeseries = ts(&[1, 2]); + let doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0], vec![3.0, 4.0]], + &["temp", "salinity"], + ); + let params = json!({}); // no data=, no dates + let mut out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc) + .expect("doc should survive"); + assert!( + out.data_info().is_none(), + "data_info should be absent when data= qsp is unset, got {:?}", + out.data_info() + ); + // data field is cleared in the no-data= branch (historical behaviour). + assert!(out.data().is_empty()); + } + + #[test] + fn transform_omits_timeseries_when_no_date_qsp() { + // No `startDate` / `endDate`: the response doc should carry no + // timeseries field — clients fall back to the meta endpoint's + // dataset-wide timeseries. + let timeseries = ts(&[1, 2]); + let doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0], vec![3.0, 4.0]], + &["temp", "salinity"], + ); + let params = json!({"data": "all"}); + let mut out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc) + .expect("doc should survive"); + assert!( + out.timeseries().is_none(), + "timeseries should be absent when neither startDate nor endDate is set" + ); + } + + #[test] + fn transform_includes_timeseries_when_start_date_set() { + // startDate alone is enough to trigger timeseries on the response. + let timeseries = ts(&[1, 2, 3]); + let doc = make_bsose("doc1", vec![vec![1.0, 2.0, 3.0]], &["temp"]); + let params = json!({"startDate": "2020-02-01T00:00:00Z"}); + let mut out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc) + .expect("doc should survive"); + let ts_field = out.timeseries().expect("timeseries present when startDate set"); + assert_eq!(ts_field.len(), 2); + assert!(ts_field[0].starts_with("2020-02-01")); + // Without `data=`, data_info still absent. + assert!(out.data_info().is_none()); + } + + #[test] + fn transform_includes_data_info_when_data_qsp_set() { + // `data=` alone is enough to trigger data_info on the response, + // even without date params. + let timeseries = ts(&[1, 2]); + let doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0], vec![3.0, 4.0]], + &["temp", "salinity"], + ); + let params = json!({"data": "temp"}); + let mut out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc) + .expect("doc should survive"); + let info = out.data_info().expect("data_info present when data= is set"); assert_eq!(info.0, vec!["temp".to_string()]); + // Without dates, timeseries still absent. + assert!(out.timeseries().is_none()); } // ---- timeseries_stub ----------------------------------------------------- From 0b6413b2650da17d42bd47246cc840876845b100 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Wed, 27 May 2026 12:43:06 -0400 Subject: [PATCH 08/13] manage presence / nonpresence of data prop --- api/PAGINATION.md | 28 +++++++--- api/src/helpers/schema.rs | 9 +++ api/src/helpers/transforms.rs | 101 +++++++++++++++++++++++++++++++++- 3 files changed, 129 insertions(+), 9 deletions(-) diff --git a/api/PAGINATION.md b/api/PAGINATION.md index a89b25b..242dc01 100644 --- a/api/PAGINATION.md +++ b/api/PAGINATION.md @@ -95,7 +95,7 @@ antimeridian / north-pole docs aren't lost. | `center` + `radius` | JSON `[lon, lat]` + meters | Disk query. Radius capped at the dataset's `max_radius_meters`. | | `verticalRange` | JSON `[lo, hi]` | Half-open depth range applied on top of tile-level pagination. | | `startDate` / `endDate` | RFC-3339 string | Slices each doc's timeseries to this window. | -| `data` | comma-separated | Variables to include. `all` keeps everything. `except_data_values` keeps the schema but clears values. | +| `data` | comma-separated | Variables to include. `all` keeps everything; a specific list filters columns. If the doc has no data to return after filtering (no matching columns, or a time window that collapsed to zero points), the whole doc is dropped from the response. Omitting `data=` entirely also omits the `data` field from each response doc — use that for slim listings. `except_data_values` in the list keeps the row but clears the values (schema-only mode) — that's the one case where an empty `data` array doesn't drop the doc. | | `compression` | `minimal` | See mode flags. | | `batchmeta` | any | See mode flags. | | `tile_index` | non-negative integer | Pagination cursor. Default `0`. Almost always supplied by the previous response's `next_url`. | @@ -197,11 +197,15 @@ MONGODB_URI_NOAAOISST=mongodb://localhost:27017 \ cargo run ``` -### Response-shape rule for `data_info` and `timeseries` +### Response-shape rules -The fields `data_info` and `timeseries` on a response doc appear *only -when the user's query has materially altered the dataset-wide defaults*: +Three fields on a response doc — `data`, `data_info`, `timeseries` — +appear *only when the user's query has materially asked for or altered +them*: +- `data` appears iff the user supplied `data=`. Without `data=`, + the field is omitted entirely and clients are signalled that they + asked for no per-cell values. Use this for slim listings. - `data_info` appears iff the user supplied `data=`. Without `data=`, the column layout matches the dataset default and the field is omitted (clients fall back to the meta endpoint). @@ -209,10 +213,18 @@ when the user's query has materially altered the dataset-wide defaults*: Without either, the time axis matches the dataset default and the field is omitted. -The intent is response slimness: the data response carries only what -*differs* from the meta endpoint's dataset-wide values. With both -qsps unset, response docs are short (just `_id`, geolocation, level, -metadata, and an empty `data` array). +A consequence of the `data` rule: if `data=` *is* supplied but the +resulting data is empty (no columns survived filtering, or every +column's time window collapsed to zero points), the whole doc is +dropped from the response rather than serialized with an empty +array. This keeps responses honest — a doc in the response always +carries something the user asked for. The one exception is +`except_data_values`: when present in the `data=` list, the user has +explicitly opted into a schema-only response, so an empty `data` +array is what they asked for and the doc stays. + +With all three qsps unset, response docs are short: `_id`, +geolocation, level, metadata. ### `data_info` precedence rule (when `data=` is set) diff --git a/api/src/helpers/schema.rs b/api/src/helpers/schema.rs index abfb6af..8a7114b 100644 --- a/api/src/helpers/schema.rs +++ b/api/src/helpers/schema.rs @@ -78,6 +78,12 @@ pub struct BsoseSchema { pub(crate) sea_binary_mask_at_t_locaiton: bool, pub(crate) cell_z_size: f64, pub(crate) reference_density_profile: f64, + // `data` is the per-variable per-timestep array. Omitted from the + // response when empty so the no-`data=` qsp response stays slim + // (transform_timeseries clears it in that branch). When `data=` is + // set and `data` ends up empty after filtering, the whole doc gets + // dropped before serialization, so an empty array never ships. + #[serde(skip_serializing_if = "Vec::is_empty")] pub(crate) data: Vec>, // Both of the next two are response-shape-driven: present iff the // user's query made the dataset-wide default insufficient. Per the @@ -222,6 +228,9 @@ pub struct OisstSchema { pub(crate) basin: f64, pub(crate) geolocation: GeoJSONPoint, pub(crate) level: f64, + // Omitted from the response when empty (no `data=` qsp); see the + // matching annotation on `BsoseSchema.data` for the full reasoning. + #[serde(skip_serializing_if = "Vec::is_empty")] pub(crate) data: Vec>, // OI SST data docs don't carry `timeseries` or `data_info` of their // own — both are populated at request time per the response-shape diff --git a/api/src/helpers/transforms.rs b/api/src/helpers/transforms.rs index c33a71a..f17ed9e 100644 --- a/api/src/helpers/transforms.rs +++ b/api/src/helpers/transforms.rs @@ -74,7 +74,27 @@ pub fn transform_timeseries( .unwrap_or_else(|| cached_data_info.clone()); doc.set_data_info(Some(working_info.clone())); - slice_data(&data, &working_info, doc) + let mut sliced = slice_data(&data, &working_info, doc)?; + + // Drop-on-empty: when the user asked for data and the surviving + // data is empty — whether the outer Vec is empty (no matching + // columns survived) OR the outer is non-empty but every inner is + // empty (e.g. a time window that collapsed every column to zero + // points) — the doc has nothing useful to convey, so drop it. + // `iter().all(|inner| inner.is_empty())` returns true for both + // shapes (vacuously true on an empty outer), so a single check + // covers both. + // + // Exception: `except_data_values` in the data list is an explicit + // "I want the (filtered) data_info but not the values" signal, so + // an empty data array is what the user asked for, not a sign that + // we should drop. Skip the drop check in that case. + let user_wants_empty_data = data.iter().any(|s| s == "except_data_values"); + if !user_wants_empty_data && sliced.data().iter().all(|inner| inner.is_empty()) { + return None; + } + + Some(sliced) } /// Slice the document's data columns and timeseries field to the time window @@ -493,6 +513,85 @@ mod tests { assert!(out.timeseries().is_none()); } + // ---- drop-on-empty rule (data= set but data ends up empty) -------------- + + #[test] + fn transform_drops_doc_when_no_matching_columns() { + // User asked for a variable that doesn't exist on this doc. + // slice_data filters to an empty outer Vec → doc is dropped. + let timeseries = ts(&[1, 2]); + let doc = make_bsose("doc1", vec![vec![1.0, 2.0]], &["temp"]); + let params = json!({"data": "nonexistent"}); + let out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc); + assert!(out.is_none(), "doc with no matching columns should be dropped"); + } + + #[test] + fn transform_drops_doc_when_time_window_collapses_to_empty() { + // User asked for a time range past the dataset's last timestamp: + // each column survives column-filtering but ends up with zero + // time points. The doc has nothing useful to report — drop. + let timeseries = ts(&[1, 2, 3]); // Jan, Feb, Mar 2020 + let doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0, 3.0], vec![10.0, 20.0, 30.0]], + &["temp", "salinity"], + ); + let params = json!({ + "data": "all", + "startDate": "2021-01-01T00:00:00Z", // well past the timeseries + }); + let out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc); + assert!( + out.is_none(), + "doc whose time window collapsed to zero points should be dropped" + ); + } + + #[test] + fn transform_keeps_doc_with_except_data_values_despite_empty_data() { + // `except_data_values` is the user explicitly asking for + // "schema only, no values". slice_data clears the data after + // column-filtering; the drop-on-empty rule should skip this + // case rather than dropping the doc — the empty data was + // requested. + let timeseries = ts(&[1, 2]); + let doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0], vec![3.0, 4.0]], + &["temp", "salinity"], + ); + let params = json!({"data": "temp,except_data_values"}); + let mut out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc) + .expect("except_data_values should not trigger drop-on-empty"); + // Data was deliberately cleared. + assert!(out.data().is_empty()); + // But the filtered data_info still rides along — that's the + // whole point of except_data_values. + let info = out.data_info().expect("data_info still present"); + assert_eq!(info.0, vec!["temp".to_string()]); + } + + #[test] + fn transform_keeps_doc_when_at_least_some_data_remains() { + // User asked for a real column with a non-degenerate time + // window. The doc should survive. + let timeseries = ts(&[1, 2, 3, 4]); + let doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0, 3.0, 4.0], vec![10.0, 20.0, 30.0, 40.0]], + &["temp", "salinity"], + ); + let params = json!({ + "data": "temp", + "startDate": "2020-02-01T00:00:00Z", + "endDate": "2020-04-01T00:00:00Z", + }); + let mut out = transform_timeseries(¶ms, ×eries, &empty_data_info(), doc) + .expect("non-empty doc should survive"); + assert_eq!(*out.data(), vec![vec![2.0, 3.0]]); + } + // ---- timeseries_stub ----------------------------------------------------- #[test] From ec3eec990ad9b429c92db2dadebcc1883104e47d Mon Sep 17 00:00:00 2001 From: katieannemills Date: Wed, 27 May 2026 12:48:20 -0400 Subject: [PATCH 09/13] bugfix --- api/src/helpers/transforms.rs | 86 ++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 7 deletions(-) diff --git a/api/src/helpers/transforms.rs b/api/src/helpers/transforms.rs index f17ed9e..4f226dd 100644 --- a/api/src/helpers/transforms.rs +++ b/api/src/helpers/transforms.rs @@ -107,13 +107,33 @@ pub fn slice_timerange( ts: &[BsonDateTime], doc: &mut T, ) { - let start_index = start_date - .and_then(|sd| ts.iter().position(|&t| t >= sd)) - .unwrap_or(0); - - let end_index = end_date - .and_then(|ed| ts.iter().rposition(|&t| t < ed).map(|i| i + 1)) - .unwrap_or(ts.len()); + // Match on the Option directly so "no filter" and "filter present but + // matches nothing" land at different ends of the axis: + // - `start_date = None` → start at 0 (no lower bound). + // - `start_date = Some(sd)` and no timestamp is >= sd + // (the filter is past the end of the data) → start at ts.len() + // so the slice collapses to empty rather than degrading to + // "whole range," which a plain `.unwrap_or(0)` would do. + // Symmetric reasoning for `end_date`. + let start_index = match start_date { + None => 0, + Some(sd) => ts.iter().position(|&t| t >= sd).unwrap_or(ts.len()), + }; + let end_index = match end_date { + None => ts.len(), + Some(ed) => ts + .iter() + .rposition(|&t| t < ed) + .map(|i| i + 1) + .unwrap_or(0), + }; + + // If the user passed mutually unsatisfiable dates (or startDate is + // past everything *and* endDate is before everything), `start_index` + // could exceed `end_index` — slicing `[a..b]` with `a > b` panics. + // Clamp `end` up to `start` so the slice is always empty rather than + // a panic. + let end_index = end_index.max(start_index); let time_window: Vec = ts[start_index..end_index] .iter() @@ -297,6 +317,58 @@ mod tests { assert_eq!(*doc.data(), vec![vec![1.0, 2.0, 3.0]]); } + #[test] + fn slice_timerange_startdate_past_everything_yields_empty_range() { + // startDate is past the last timestamp — the resulting window + // should be empty, not the whole range (which the old + // `.unwrap_or(0)` shape gave by accident). + let timeseries = ts(&[1, 2, 3]); // Jan, Feb, Mar 2020 + let mut doc = make_bsose( + "doc1", + vec![vec![1.0, 2.0, 3.0]], + &["temp"], + ); + let start = helpers::string2bsondate("2021-01-01T00:00:00Z"); + slice_timerange(start, None, ×eries, &mut doc); + assert!(doc.data()[0].is_empty()); + assert!(doc.timeseries().unwrap().is_empty()); + } + + #[test] + fn slice_timerange_enddate_before_everything_yields_empty_range() { + // endDate is before the first timestamp — the resulting window + // should be empty, not the whole range. + let timeseries = ts(&[6, 7, 8]); // Jun, Jul, Aug 2020 + let mut doc = make_bsose( + "doc1", + vec![vec![6.0, 7.0, 8.0]], + &["temp"], + ); + let end = helpers::string2bsondate("2020-01-01T00:00:00Z"); + slice_timerange(None, end, ×eries, &mut doc); + assert!(doc.data()[0].is_empty()); + assert!(doc.timeseries().unwrap().is_empty()); + } + + #[test] + fn slice_timerange_mutually_unsatisfiable_dates_collapse_to_empty() { + // startDate past the data AND endDate before the data: the + // raw indices would be start_index = ts.len(), end_index = 0, + // which would panic on the slice. The `end_index.max(start_index)` + // clamp keeps this safe (and empty). + let timeseries = ts(&[6, 7, 8]); + let mut doc = make_bsose( + "doc1", + vec![vec![6.0, 7.0, 8.0]], + &["temp"], + ); + let start = helpers::string2bsondate("2021-01-01T00:00:00Z"); + let end = helpers::string2bsondate("2019-01-01T00:00:00Z"); + slice_timerange(start, end, ×eries, &mut doc); + assert!(doc.data()[0].is_empty()); + assert!(doc.timeseries().unwrap().is_empty()); + } + // ---- slice_data ---------------------------------------------------------- // // `slice_data` is only called from `transform_timeseries` in the From 4c2b932c8087c7425fb98db23dd035e4f9f45ec5 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Wed, 27 May 2026 12:54:56 -0400 Subject: [PATCH 10/13] bugfix --- api/tests/integration.rs | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 3be701e..8f3cd39 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -131,13 +131,26 @@ async fn no_filters_returns_all_seeded_documents_across_pages() { let docs = get_paged("/timeseries/bsose", &[]).await; assert_eq!(docs.len(), 4, "expected all 4 seeded docs across pages"); - // Without `data` set, slice_data clears the data field but keeps rows. + // Per the response-shape rule, a request with neither `data=` nor + // `startDate`/`endDate` should return slim docs: `data`, `data_info`, + // and `timeseries` are all omitted because the user hasn't asked to + // see or alter them. Clients fall back to the meta endpoint for the + // dataset-wide variable info and time axis. for row in &docs { - let data = row.get("data").expect("each row should have a data field"); - let outer = data.as_array().expect("data should be an array"); assert!( - outer.is_empty(), - "data should be cleared when `data` query param is absent" + row.get("data").is_none(), + "`data` field should be absent when no `data` qsp is supplied; got: {:?}", + row.get("data") + ); + assert!( + row.get("data_info").is_none(), + "`data_info` field should be absent when no `data` qsp is supplied; got: {:?}", + row.get("data_info") + ); + assert!( + row.get("timeseries").is_none(), + "`timeseries` field should be absent when no date qsp is supplied; got: {:?}", + row.get("timeseries") ); } } From 8b4581c224fd8d19ce0f12672e7b80c83c5f8c80 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Wed, 27 May 2026 13:07:04 -0400 Subject: [PATCH 11/13] validate QSPs --- api/PAGINATION.md | 5 ++ api/src/helpers/helpers.rs | 158 +++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) diff --git a/api/PAGINATION.md b/api/PAGINATION.md index 242dc01..913e5ef 100644 --- a/api/PAGINATION.md +++ b/api/PAGINATION.md @@ -102,6 +102,11 @@ antimeridian / north-pole docs aren't lost. ## Validation errors (HTTP 400) +- Any query parameter not on the whitelist above. Unknown names are + rejected so typos (e.g. `start_Date` instead of `startDate`) fail + loudly instead of being silently ignored. When the typo is close to + a real param name, the error message includes a "did you mean ..." + suggestion. - More than one of `polygon` / `box` / `center` set. - `center` set without `radius`, or vice versa. - `radius` non-numeric, negative, non-finite, or above the dataset's cap. diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs index 862d65f..f3b7c81 100644 --- a/api/src/helpers/helpers.rs +++ b/api/src/helpers/helpers.rs @@ -40,8 +40,92 @@ pub fn create_response(results: Vec) -> HttpResponse { } } +/// Whitelist of query-string parameter names recognised by the +/// `/timeseries/*` endpoints. Any qsp not in this list is rejected with +/// a 400 so typos (e.g. `start_Date` instead of `startDate`) fail loudly +/// with a useful suggestion rather than getting silently ignored +/// downstream. Update this list when adding a new qsp; keep it in sync +/// with the table in `api/PAGINATION.md`. +const ALLOWED_QSP: &[&str] = &[ + "id", + "box", + "polygon", + "center", + "radius", + "verticalRange", + "startDate", + "endDate", + "data", + "compression", + "batchmeta", + "tile_index", +]; + +/// Levenshtein edit distance — used to suggest a "did you mean" target +/// when a qsp doesn't match the whitelist. Iterative table with O(m*n) +/// space; m and n are at most ~15 chars in practice (the longest +/// whitelist entry is `verticalRange`), so this is trivially fast and +/// not worth optimising into the row-rolling form. +fn edit_distance(a: &str, b: &str) -> usize { + let a: Vec = a.chars().collect(); + let b: Vec = b.chars().collect(); + let m = a.len(); + let n = b.len(); + let mut dp = vec![vec![0usize; n + 1]; m + 1]; + for i in 0..=m { + dp[i][0] = i; + } + for j in 0..=n { + dp[0][j] = j; + } + for i in 1..=m { + for j in 1..=n { + let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 }; + dp[i][j] = (dp[i - 1][j] + 1) + .min(dp[i][j - 1] + 1) + .min(dp[i - 1][j - 1] + cost); + } + } + dp[m][n] +} + +/// Return the whitelist entry closest to `unknown` if one is within +/// edit distance 2 — covers common typo flavours (single-char +/// substitution / deletion / insertion, capitalisation differences) +/// without firing on totally unrelated names. `None` when nothing is +/// within range. +fn suggest_qsp(unknown: &str) -> Option<&'static str> { + ALLOWED_QSP + .iter() + .map(|allowed| (*allowed, edit_distance(unknown, allowed))) + .filter(|(_, dist)| *dist <= 2) + .min_by_key(|&(_, dist)| dist) + .map(|(allowed, _)| allowed) +} + pub fn validate_query_params(params: &serde_json::Value) -> Result<(), HttpResponse> { + // Reject unknown qsp names before any further validation so a typo + // (e.g. `start_Date` instead of `startDate`) fails loudly with a + // useful suggestion rather than getting silently ignored downstream. + if let Some(obj) = params.as_object() { + for key in obj.keys() { + if !ALLOWED_QSP.contains(&key.as_str()) { + let msg = match suggest_qsp(key) { + Some(closest) => format!( + "Unknown query parameter '{}'. Did you mean '{}'? Allowed parameters: {}", + key, closest, ALLOWED_QSP.join(", ") + ), + None => format!( + "Unknown query parameter '{}'. Allowed parameters: {}", + key, ALLOWED_QSP.join(", ") + ), + }; + return Err(HttpResponse::BadRequest().json(json!({"error": msg}))); + } + } + } + // should have at most one of polygon, box and center. let mut count = 0; if params.get("polygon").is_some() { @@ -339,6 +423,80 @@ mod tests { assert!(validate_query_params(¶ms).is_ok()); } + // ---- qsp whitelist -------------------------------------------------------- + + #[test] + fn validate_rejects_unknown_qsp() { + let params = json!({"start_Date": "2020-01-01T00:00:00Z"}); + let err = validate_query_params(¶ms).unwrap_err(); + assert_eq!(err.status(), 400); + } + + #[test] + fn validate_rejects_unknown_qsp_even_when_other_params_valid() { + // A request that's well-formed apart from one stray unknown + // param should still be rejected — the whitelist check fires + // before any other validation. + let params = json!({ + "box": "[[0,0],[10,10]]", + "frobnicate": "yes", + }); + let err = validate_query_params(¶ms).unwrap_err(); + assert_eq!(err.status(), 400); + } + + #[test] + fn validate_known_qsps_pass_through_whitelist() { + // Sanity check: every entry in the whitelist that doesn't have + // its own shape constraints should be accepted on its own. + // (Constrained params like `polygon` have their own dedicated + // tests above; this is purely the whitelist check.) + for (key, value) in [ + ("id", "doc1"), + ("data", "all"), + ("compression", "minimal"), + ("batchmeta", "true"), + ("tile_index", "5"), + ("startDate", "2020-01-01T00:00:00Z"), + ("endDate", "2020-12-31T23:59:59Z"), + ] { + let params = json!({ key: value }); + assert!( + validate_query_params(¶ms).is_ok(), + "whitelist should accept '{}' but rejected it", + key + ); + } + } + + #[test] + fn edit_distance_handles_common_typos() { + assert_eq!(edit_distance("startdate", "startDate"), 1); // case-only sub + assert_eq!(edit_distance("start_Date", "startDate"), 1); // single extra char + assert_eq!(edit_distance("verticalrange", "verticalRange"), 1); // case + assert_eq!(edit_distance("Box", "box"), 1); + assert_eq!(edit_distance("", "box"), 3); // pure insertion cost + assert_eq!(edit_distance("box", "box"), 0); + assert!(edit_distance("zzzzzzzzz", "startDate") > 2); + } + + #[test] + fn suggest_qsp_finds_close_match() { + assert_eq!(suggest_qsp("start_Date"), Some("startDate")); + assert_eq!(suggest_qsp("startdate"), Some("startDate")); + assert_eq!(suggest_qsp("verticalrange"), Some("verticalRange")); + assert_eq!(suggest_qsp("Box"), Some("box")); + } + + #[test] + fn suggest_qsp_returns_none_for_unrelated_input() { + // A name with no plausibly close match in the whitelist + // should return None rather than reaching for the nearest + // entry no matter how far. + assert_eq!(suggest_qsp("frobnicate"), None); + assert_eq!(suggest_qsp("xyzzy"), None); + } + // ---- validate_radius_cap ------------------------------------------------- /// Minimal config for radius-cap testing. tile_degrees / levels are From 697f15373bcef3d79811025b074afa88a435e2d1 Mon Sep 17 00:00:00 2001 From: katieannemills Date: Wed, 27 May 2026 13:19:46 -0400 Subject: [PATCH 12/13] validate data string --- api/PAGINATION.md | 6 +- api/fixtures/bsose.json | 16 +-- api/src/helpers/dataset_config.rs | 10 ++ api/src/helpers/filter_composer.rs | 1 + api/src/helpers/helpers.rs | 193 +++++++++++++++++++++++++++++ api/src/helpers/tile_generator.rs | 3 + api/src/main.rs | 3 + api/tests/integration.rs | 34 ++++- 8 files changed, 255 insertions(+), 11 deletions(-) diff --git a/api/PAGINATION.md b/api/PAGINATION.md index 913e5ef..1414dbb 100644 --- a/api/PAGINATION.md +++ b/api/PAGINATION.md @@ -95,7 +95,7 @@ antimeridian / north-pole docs aren't lost. | `center` + `radius` | JSON `[lon, lat]` + meters | Disk query. Radius capped at the dataset's `max_radius_meters`. | | `verticalRange` | JSON `[lo, hi]` | Half-open depth range applied on top of tile-level pagination. | | `startDate` / `endDate` | RFC-3339 string | Slices each doc's timeseries to this window. | -| `data` | comma-separated | Variables to include. `all` keeps everything; a specific list filters columns. If the doc has no data to return after filtering (no matching columns, or a time window that collapsed to zero points), the whole doc is dropped from the response. Omitting `data=` entirely also omits the `data` field from each response doc — use that for slim listings. `except_data_values` in the list keeps the row but clears the values (schema-only mode) — that's the one case where an empty `data` array doesn't drop the doc. | +| `data` | comma-separated | Variables to include. Each token must be: a dataset-specific variable name (BSOSE: `THETA`, `SALT`; OI SST: `sst`), the universal `all` (keep everything) or `except_data_values` (keep schema, clear values), or an integer (QC filter). Unknown tokens are rejected with a 400 + a "did you mean" suggestion. If the doc has no data to return after filtering (no matching columns, or a time window that collapsed to zero points), the whole doc is dropped from the response. Omitting `data=` entirely also omits the `data` field from each response doc — use that for slim listings. `except_data_values` in the list keeps the row but clears the values (schema-only mode) — that's the one case where an empty `data` array doesn't drop the doc. | | `compression` | `minimal` | See mode flags. | | `batchmeta` | any | See mode flags. | | `tile_index` | non-negative integer | Pagination cursor. Default `0`. Almost always supplied by the previous response's `next_url`. | @@ -107,6 +107,10 @@ antimeridian / north-pole docs aren't lost. loudly instead of being silently ignored. When the typo is close to a real param name, the error message includes a "did you mean ..." suggestion. +- Any token inside the `data=` list that isn't one of: the dataset's + declared variable names, the universal `all` / `except_data_values`, + or an integer (QC filter). Suggestions follow the same shape as the + qsp-name check. - More than one of `polygon` / `box` / `center` set. - `center` set without `radius`, or vice versa. - `radius` non-numeric, negative, non-finite, or above the dataset's cap. diff --git a/api/fixtures/bsose.json b/api/fixtures/bsose.json index 77dd0f6..adae199 100644 --- a/api/fixtures/bsose.json +++ b/api/fixtures/bsose.json @@ -14,9 +14,9 @@ [34.50, 34.60, 34.70, 34.80] ], "data_info": [ - ["temp", "salinity"], + ["THETA", "SALT"], ["units", "long_name"], - [["degC", "Temperature"], ["psu", "Salinity"]] + [["degC", "Potential Temperature"], ["psu", "Practical Salinity"]] ] }, { @@ -34,9 +34,9 @@ [35.00, 35.10, 35.20, 35.30] ], "data_info": [ - ["temp", "salinity"], + ["THETA", "SALT"], ["units", "long_name"], - [["degC", "Temperature"], ["psu", "Salinity"]] + [["degC", "Potential Temperature"], ["psu", "Practical Salinity"]] ] }, { @@ -54,9 +54,9 @@ [33.10, 33.20, 33.30, 33.40] ], "data_info": [ - ["temp", "salinity"], + ["THETA", "SALT"], ["units", "long_name"], - [["degC", "Temperature"], ["psu", "Salinity"]] + [["degC", "Potential Temperature"], ["psu", "Practical Salinity"]] ] }, { @@ -74,9 +74,9 @@ [34.10, 34.20, 34.30, 34.40] ], "data_info": [ - ["temp", "salinity"], + ["THETA", "SALT"], ["units", "long_name"], - [["degC", "Temperature"], ["psu", "Salinity"]] + [["degC", "Potential Temperature"], ["psu", "Practical Salinity"]] ] } ] diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs index 5ab4084..3962100 100644 --- a/api/src/helpers/dataset_config.rs +++ b/api/src/helpers/dataset_config.rs @@ -50,11 +50,19 @@ use super::schema::DataInfo; /// — tile generation falls back to walking the whole globe. The /// rectangle is treated as inclusive on its edges; a doc lying exactly /// on the coverage boundary is preserved. +/// +/// `allowed_data_vars`: the per-dataset variable names accepted in the +/// `data=` qsp. Used by `validate_data_param` to reject typos (and to +/// power the "did you mean" suggestion). Does not include the universal +/// tokens (`all`, `except_data_values`) or integer QC filters — +/// those are accepted regardless of dataset and handled by the +/// validator directly. pub struct DatasetConfig { pub tile_degrees: f64, pub max_radius_meters: f64, pub levels: &'static [f64], pub coverage_bbox: Option, + pub allowed_data_vars: &'static [&'static str], } /// Per-dataset identity + Mongo client + startup-loaded metadata, built @@ -123,6 +131,7 @@ pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig { sw: [-180.0, -90.0], ne: [180.0, -30.0], }), + allowed_data_vars: &["THETA", "SALT"], }; /// OI SST has a single vertical level (the sea surface). We model it as @@ -148,6 +157,7 @@ pub const OISST_CONFIG: DatasetConfig = DatasetConfig { max_radius_meters: 100_000.0, // 100 km — relax once usage informs us levels: OISST_LEVELS, coverage_bbox: None, + allowed_data_vars: &["sst"], }; #[cfg(test)] diff --git a/api/src/helpers/filter_composer.rs b/api/src/helpers/filter_composer.rs index 834d7aa..2b79d29 100644 --- a/api/src/helpers/filter_composer.rs +++ b/api/src/helpers/filter_composer.rs @@ -170,6 +170,7 @@ mod tests { max_radius_meters: 1.0e6, levels: &[100.0, 500.0], coverage_bbox: None, + allowed_data_vars: &[], }; fn null_tile() -> TileSpec { diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs index f3b7c81..841c030 100644 --- a/api/src/helpers/helpers.rs +++ b/api/src/helpers/helpers.rs @@ -194,6 +194,86 @@ pub fn validate_query_params(params: &serde_json::Value) -> Result<(), HttpRespo Ok(()) } +/// Universal tokens accepted in the `data=` qsp regardless of which +/// dataset is being queried. `all` and `except_data_values` control +/// response shape rather than naming a variable; both are documented +/// in `api/PAGINATION.md`. +const UNIVERSAL_DATA_TOKENS: &[&str] = &["all", "except_data_values"]; + +/// Validate the contents of the `data=` qsp against the dataset's +/// `allowed_data_vars`. Each comma-separated token must be one of: +/// +/// - a universal token (`all` or `except_data_values`), +/// - a variable name in `config.allowed_data_vars`, or +/// - an integer (accepted as a QC filter; the QC system is dataset- +/// agnostic so any non-negative integer is allowed at this layer). +/// +/// Anything else returns 400 with a Levenshtein-suggested correction +/// when one of the named candidates is within edit distance 2. +/// +/// No-op when the `data=` qsp is absent. +pub fn validate_data_param( + params: &serde_json::Value, + config: &DatasetConfig, +) -> Result<(), HttpResponse> { + let raw = match params.get("data").and_then(|v| v.as_str()) { + Some(s) => s, + None => return Ok(()), + }; + + for token in raw.split(',') { + if token.is_empty() { + return Err(HttpResponse::BadRequest().json(json!({ + "error": "'data' qsp contains an empty token (likely a leading, trailing, or doubled comma)" + }))); + } + if UNIVERSAL_DATA_TOKENS.contains(&token) { + continue; + } + if config.allowed_data_vars.contains(&token) { + continue; + } + // Integer QC filter (any signed integer; the QC layer + // interprets the value, this layer just gate-keeps the shape). + if token.parse::().is_ok() { + continue; + } + let msg = unknown_data_token_message(token, config); + return Err(HttpResponse::BadRequest().json(json!({"error": msg}))); + } + Ok(()) +} + +fn unknown_data_token_message(token: &str, config: &DatasetConfig) -> String { + // Suggest only against named candidates (universal tokens + per- + // dataset variable names). Integer QC filters aren't suggestible + // — the user either types one correctly or they don't. + let candidates: Vec<&str> = UNIVERSAL_DATA_TOKENS + .iter() + .copied() + .chain(config.allowed_data_vars.iter().copied()) + .collect(); + + let suggestion = candidates + .iter() + .map(|c| (*c, edit_distance(token, c))) + .filter(|(_, d)| *d <= 2) + .min_by_key(|&(_, d)| d) + .map(|(c, _)| c); + + let allowed_list = candidates.join(", "); + match suggestion { + Some(s) => format!( + "Unknown 'data' value '{}'. Did you mean '{}'? Allowed values for this dataset: {} (plus any integer for QC filtering).", + token, s, allowed_list + ), + None => format!( + "Unknown 'data' value '{}'. Allowed values for this dataset: {} (plus any integer for QC filtering).", + token, allowed_list + ), + } +} + /// Enforce the dataset's `max_radius_meters` for `center + radius` queries. /// /// `center + radius` is the one geo mode that *isn't* spatially tiled @@ -506,8 +586,121 @@ mod tests { max_radius_meters: 1_000_000.0, // 1000 km levels: &[0.0], coverage_bbox: None, + allowed_data_vars: &[], }; + /// Test config with a couple of allowed variable names, used by + /// the `validate_data_param` tests below. + const DATA_TEST_CONFIG: DatasetConfig = DatasetConfig { + tile_degrees: 10.0, + max_radius_meters: 1_000_000.0, + levels: &[0.0], + coverage_bbox: None, + allowed_data_vars: &["THETA", "SALT"], + }; + + // ---- validate_data_param -------------------------------------------------- + + #[test] + fn validate_data_param_noop_when_data_absent() { + let params = json!({}); + assert!(validate_data_param(¶ms, &DATA_TEST_CONFIG).is_ok()); + } + + #[test] + fn validate_data_param_accepts_universal_tokens() { + for token in ["all", "except_data_values"] { + let params = json!({ "data": token }); + assert!( + validate_data_param(¶ms, &DATA_TEST_CONFIG).is_ok(), + "universal token '{}' should be accepted", + token + ); + } + } + + #[test] + fn validate_data_param_accepts_per_dataset_var_names() { + for token in ["THETA", "SALT"] { + let params = json!({ "data": token }); + assert!( + validate_data_param(¶ms, &DATA_TEST_CONFIG).is_ok(), + "allowed var '{}' should be accepted", + token + ); + } + } + + #[test] + fn validate_data_param_accepts_integer_qc_filters() { + // Any signed integer should pass; the QC layer (downstream) + // interprets the actual value. + for token in ["0", "1", "42", "-1", "9999"] { + let params = json!({ "data": token }); + assert!( + validate_data_param(¶ms, &DATA_TEST_CONFIG).is_ok(), + "integer token '{}' should be accepted", + token + ); + } + } + + #[test] + fn validate_data_param_accepts_mixed_list() { + let params = json!({ "data": "all,THETA,1,SALT" }); + assert!(validate_data_param(¶ms, &DATA_TEST_CONFIG).is_ok()); + } + + #[test] + fn validate_data_param_rejects_unknown_var() { + let params = json!({ "data": "salinity" }); + let err = validate_data_param(¶ms, &DATA_TEST_CONFIG).unwrap_err(); + assert_eq!(err.status(), 400); + } + + #[test] + fn validate_data_param_rejects_unknown_var_in_mixed_list() { + // First token is fine, second is the bad one — should still + // reject (loops over every token). + let params = json!({ "data": "THETA,bogus" }); + let err = validate_data_param(¶ms, &DATA_TEST_CONFIG).unwrap_err(); + assert_eq!(err.status(), 400); + } + + #[test] + fn validate_data_param_rejects_empty_token() { + // Trailing or doubled commas yield an empty token in the + // split — surface that as its own clear error rather than + // bouncing through the suggestion path. + for raw in ["THETA,", ",THETA", "THETA,,SALT"] { + let params = json!({ "data": raw }); + let err = validate_data_param(¶ms, &DATA_TEST_CONFIG).unwrap_err(); + assert_eq!(err.status(), 400); + } + } + + #[test] + fn unknown_data_token_message_includes_suggestion_when_close() { + // The single typo here is in the case of the last char; edit + // distance 1 → suggest THETA. + let msg = unknown_data_token_message("Theta", &DATA_TEST_CONFIG); + assert!( + msg.contains("Did you mean 'THETA'"), + "suggestion missing in: {}", + msg + ); + } + + #[test] + fn unknown_data_token_message_omits_suggestion_when_far() { + let msg = unknown_data_token_message("zzzzzzzzzzz", &DATA_TEST_CONFIG); + assert!( + !msg.contains("Did you mean"), + "should not suggest a far-away name: {}", + msg + ); + } + #[test] fn radius_cap_skipped_when_no_center() { // No center → mode doesn't apply, cap is moot. diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs index 7997130..6061e8a 100644 --- a/api/src/helpers/tile_generator.rs +++ b/api/src/helpers/tile_generator.rs @@ -368,6 +368,7 @@ mod tests { max_radius_meters: 1.0e6, levels: &[0.0, 100.0], coverage_bbox: None, + allowed_data_vars: &[], }; // ---- top-level dispatch -------------------------------------------------- @@ -658,6 +659,7 @@ mod tests { sw: [-180.0, -90.0], ne: [180.0, -30.0], }), + allowed_data_vars: &[], }; #[test] @@ -881,6 +883,7 @@ mod tests { max_radius_meters: 1.0e6, levels: &[], coverage_bbox: None, + allowed_data_vars: &[], }; let tiles = generate_tiles( &json!({"box": "[[0,0],[10,10]]"}), diff --git a/api/src/main.rs b/api/src/main.rs index 5fca235..10bd141 100644 --- a/api/src/main.rs +++ b/api/src/main.rs @@ -151,6 +151,9 @@ where if let Err(response) = helpers::validate_radius_cap(¶ms, config) { return response; } + if let Err(response) = helpers::validate_data_param(¶ms, config) { + return response; + } let start_idx = match pagination::parse_tile_index(¶ms) { Ok(i) => i, diff --git a/api/tests/integration.rs b/api/tests/integration.rs index 8f3cd39..9f2599f 100644 --- a/api/tests/integration.rs +++ b/api/tests/integration.rs @@ -175,18 +175,48 @@ async fn data_all_returns_full_timeseries_across_pages() { #[tokio::test] async fn data_specific_field_filters_columns_across_pages() { - let docs = get_paged("/timeseries/bsose", &[("data", "salinity")]).await; + let docs = get_paged("/timeseries/bsose", &[("data", "SALT")]).await; assert!(!docs.is_empty()); for row in &docs { let names = &row["data_info"][0]; assert_eq!( names.as_array().unwrap(), - &vec![Value::String("salinity".to_string())] + &vec![Value::String("SALT".to_string())] ); assert_eq!(row["data"].as_array().unwrap().len(), 1); } } +#[tokio::test] +async fn unknown_data_value_returns_400() { + // `salinity` was the old BSOSE variable name; the production names + // are THETA/SALT. The whitelist should reject the typo with a + // suggestion that includes the right name. + let resp = get("/timeseries/bsose", &[("data", "salinity")]).await; + assert_eq!(resp.status(), 400); +} + +#[tokio::test] +async fn integer_data_value_accepted_as_qc_filter() { + // Integers in the data= list are accepted as QC filters at the + // validation layer, regardless of whether the dataset has any + // matching column. `1,SALT` should not 400. + let body = get_envelope( + "/timeseries/bsose", + &[("id", "bsose_doc_001"), ("data", "1,SALT")], + ) + .await; + let docs = body["docs"].as_array().unwrap(); + assert_eq!(docs.len(), 1); + // SALT survives column-filtering; the `1` is silently dropped by + // slice_data (no matching variable name) but accepted by validation. + let names = &docs[0]["data_info"][0]; + assert_eq!( + names.as_array().unwrap(), + &vec![Value::String("SALT".to_string())] + ); +} + // --------------------------------------------------------------------------- // id filter // --------------------------------------------------------------------------- From 5b0c7150b1bea92c911ddc8200c9c49e358d099d Mon Sep 17 00:00:00 2001 From: katieannemills Date: Wed, 27 May 2026 13:25:20 -0400 Subject: [PATCH 13/13] bugfix --- api/src/helpers/helpers.rs | 51 ++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs index 841c030..a89a8a7 100644 --- a/api/src/helpers/helpers.rs +++ b/api/src/helpers/helpers.rs @@ -91,13 +91,19 @@ fn edit_distance(a: &str, b: &str) -> usize { /// Return the whitelist entry closest to `unknown` if one is within /// edit distance 2 — covers common typo flavours (single-char -/// substitution / deletion / insertion, capitalisation differences) -/// without firing on totally unrelated names. `None` when nothing is -/// within range. +/// substitution / deletion / insertion) without firing on totally +/// unrelated names. `None` when nothing is within range. +/// +/// Distance is computed on the *lowercased* forms of both sides, so a +/// fully-uppercased typo (`STARTDATE`) or any mixed-case shout still +/// gets a useful suggestion — case differences between every letter +/// would otherwise push the distance well past the threshold even +/// though the user clearly meant the same name. fn suggest_qsp(unknown: &str) -> Option<&'static str> { + let unknown_lower = unknown.to_lowercase(); ALLOWED_QSP .iter() - .map(|allowed| (*allowed, edit_distance(unknown, allowed))) + .map(|allowed| (*allowed, edit_distance(&unknown_lower, &allowed.to_lowercase()))) .filter(|(_, dist)| *dist <= 2) .min_by_key(|&(_, dist)| dist) .map(|(allowed, _)| allowed) @@ -248,15 +254,21 @@ fn unknown_data_token_message(token: &str, config: &DatasetConfig) -> String { // Suggest only against named candidates (universal tokens + per- // dataset variable names). Integer QC filters aren't suggestible // — the user either types one correctly or they don't. + // + // Distance is computed case-insensitively so a user typing `theta` + // or `Theta` still gets pointed at `THETA`. Validation itself is + // case-sensitive (the user is asked to fix their request), but the + // suggestion should be forgiving. let candidates: Vec<&str> = UNIVERSAL_DATA_TOKENS .iter() .copied() .chain(config.allowed_data_vars.iter().copied()) .collect(); + let token_lower = token.to_lowercase(); let suggestion = candidates .iter() - .map(|c| (*c, edit_distance(token, c))) + .map(|c| (*c, edit_distance(&token_lower, &c.to_lowercase()))) .filter(|(_, d)| *d <= 2) .min_by_key(|&(_, d)| d) .map(|(c, _)| c); @@ -568,6 +580,17 @@ mod tests { assert_eq!(suggest_qsp("Box"), Some("box")); } + #[test] + fn suggest_qsp_is_case_insensitive() { + // Fully-uppercased or arbitrarily-cased typos should still get + // a useful suggestion — without case-folding, every letter + // would count as a substitution and the distance would blow + // past the threshold. + assert_eq!(suggest_qsp("STARTDATE"), Some("startDate")); + assert_eq!(suggest_qsp("StArTdAtE"), Some("startDate")); + assert_eq!(suggest_qsp("BOX"), Some("box")); + } + #[test] fn suggest_qsp_returns_none_for_unrelated_input() { // A name with no plausibly close match in the whitelist @@ -701,6 +724,24 @@ mod tests { ); } + #[test] + fn unknown_data_token_message_suggestion_is_case_insensitive() { + // Fully-lowercased or fully-uppercased typos of a config'd var + // should still produce a suggestion pointing at the correctly- + // cased original. Without case-folding, every letter mismatch + // would be a substitution and the distance would exceed the + // threshold for variables like `THETA`. + for typo in ["theta", "Theta", "ThEtA"] { + let msg = unknown_data_token_message(typo, &DATA_TEST_CONFIG); + assert!( + msg.contains("Did you mean 'THETA'"), + "expected THETA suggestion for '{}', got: {}", + typo, + msg + ); + } + } + #[test] fn radius_cap_skipped_when_no_center() { // No center → mode doesn't apply, cap is moot.