From c81c28da5a1e481c146fd6fbcec462c7d1daddb4 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Tue, 26 May 2026 18:34:49 -0400
Subject: [PATCH 01/13] refactoring to accommodate arbitrary timeseries grids

---
 api/src/helpers/dataset_config.rs |  52 ++++++++
 api/src/helpers/schema.rs         |  48 ++++++-
 api/src/helpers/transforms.rs     |  95 +++++++++++++-
 api/src/main.rs                   | 207 +++++++++++++++++++++++-------
 4 files changed, 347 insertions(+), 55 deletions(-)
diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs
index 909ce1b..702d95b 100644
--- a/api/src/helpers/dataset_config.rs
+++ b/api/src/helpers/dataset_config.rs
@@ -8,8 +8,19 @@
 //! preventing a runaway disk-of-most-of-the-globe); `coverage_bbox`
 //! tells the tile generator the lat/lon rectangle the dataset's data
 //! actually lives inside, so we skip probing tiles outside it.
+//!
+//! `DatasetSource` is the sibling struct that names *where* each dataset
+//! lives in Mongo (db, collection, meta collection, meta discriminator)
+//! and caches startup-loaded metadata (the timeseries axis, the meta
+//! default `data_info`). One static `*_SOURCE` per dataset; the
+//! generic handler in `main.rs` consumes `(&DatasetConfig, &DatasetSource)`
+//! plus the dataset's schema generic to serve a request.
+
+use mongodb::bson::DateTime as BsonDateTime;
+use once_cell::sync::OnceCell;
 
 use super::geometry::BoundingBox;
+use super::schema::DataInfo;
 
 /// Per-dataset request-size policy.
 ///
@@ -41,6 +52,47 @@ pub struct DatasetConfig {
     pub coverage_bbox: Option<BoundingBox>,
 }
 
+/// Per-dataset *identity*: where the dataset lives in Mongo, and a place
+/// to stash the values we read once at startup so requests don't have to.
+///
+/// `db_name` / `collection` name the data collection the handler queries.
+/// `meta_collection` is the per-dataset (or shared) metadata collection;
+/// `meta_data_type` is the value of the `data_type` discriminator that
+/// picks this dataset's meta doc out of the meta collection.
+///
+/// `timeseries` and `data_info` are populated once at server startup by
+/// the generic loader in `main.rs` and read on every request. They're
+/// behind `OnceCell` rather than `Mutex<Option<_>>` because they're
+/// write-once: the loader fills them in `main()` and nothing else ever
+/// writes again, so handlers can read without locking.
+///
+/// `data_info` is the *meta-level default* for the dataset. Per the
+/// precedence rule documented on `transforms::transform_timeseries`, a
+/// data doc that carries its own `data_info` wins over this default; if
+/// the doc's `data_info` is empty, the cached default is stamped on
+/// before column filtering runs. Datasets that put `data_info` on every
+/// data doc (e.g. BSOSE today) leave this cache as the empty tuple — it
+/// gets populated but never consulted.
+pub struct DatasetSource {
+    pub db_name: &'static str,
+    pub collection: &'static str,
+    pub meta_collection: &'static str,
+    pub meta_data_type: &'static str,
+    pub timeseries: OnceCell<Vec<BsonDateTime>>,
+    pub data_info: OnceCell<DataInfo>,
+}
+
+/// Mongo identity for the BSOSE timeseries dataset. The OnceCells are
+/// filled at startup by `main.rs::populate_dataset_caches`.
+pub static BSOSE_SOURCE: DatasetSource = DatasetSource {
+    db_name: "argo",
+    collection: "bsose",
+    meta_collection: "timeseriesMeta",
+    meta_data_type: "BSOSE-profile",
+    timeseries: OnceCell::new(),
+    data_info: OnceCell::new(),
+};
+
 /// BSOSE's 52 vertical levels, in metres (positive-downward), shallowest
 /// first. From the dataset's published grid; should be updated if BSOSE
 /// re-releases with a different vertical discretisation.
diff --git a/api/src/helpers/schema.rs b/api/src/helpers/schema.rs
index a37ce5b..a8cabfb 100644
--- a/api/src/helpers/schema.rs
+++ b/api/src/helpers/schema.rs
@@ -17,6 +17,15 @@ pub struct SourceMeta {
     pub(crate) iter: String,
 }
 
+// type aliases ///////////////////////////////////////////////////////////////
+
+/// Per-variable descriptor carried alongside a timeseries doc:
+/// `(variable_names, info_fields, per_variable_info)`. The shape is
+/// preserved as a tuple for backward-compatible JSON serialization (the
+/// public response format encodes it as a 3-tuple), but giving it a name
+/// makes function signatures and the per-dataset cache easier to read.
+pub type DataInfo = (Vec<String>, Vec<String>, Vec<Vec<String>>);
+
 // categroical traits /////////////////////////////////////////////////////////
 
 pub trait IsTimeseries {
@@ -25,8 +34,8 @@ pub trait IsTimeseries {
     fn set_data(&mut self, data: Vec<Vec<f64>>);
     fn timeseries(&mut self) -> Option<&mut Vec<String>>;
     fn set_timeseries(&mut self, timeseries: Vec<String>);
-    fn data_info(&mut self) -> (Vec<String>, Vec<String>, Vec<Vec<String>>);
-    fn set_data_info(&mut self, data_info: (Vec<String>, Vec<String>, Vec<Vec<String>>));
+    fn data_info(&mut self) -> DataInfo;
+    fn set_data_info(&mut self, data_info: DataInfo);
     fn _id(&self) -> String;
     fn longitude(&self) -> f64;
     fn latitude(&self) -> f64;
@@ -36,6 +45,16 @@ pub trait IsTimeseries {
 
 pub trait IsTimeseriesMeta {
     fn get_timeseries_meta(&self) -> bool;
+    /// Snapshot of the dataset's timestamp axis. Read once at startup and
+    /// cached on `DatasetSource::timeseries` so request handling doesn't
+    /// re-fetch it per request.
+    fn timeseries(&self) -> Vec<BsonDateTime>;
+    /// Per-dataset `data_info` default. Stamped onto a data doc by
+    /// `transform_timeseries` only when that doc carries no `data_info` of
+    /// its own (the precedence rule: doc-level wins over meta-level). May
+    /// be the empty tuple — datasets that store `data_info` on every data
+    /// doc (e.g. BSOSE today) leave the meta-level value blank.
+    fn data_info(&self) -> DataInfo;
 }
 
 // bsose //////////////////////////////////////////////////////////////////////
@@ -56,7 +75,7 @@ pub struct BsoseSchema {
     pub(crate) data: Vec<Vec<f64>>,
     // Not present in the source collection — gets populated by transforms.
     pub(crate) timeseries: Option<Vec<String>>,
-    pub(crate) data_info: (Vec<String>, Vec<String>, Vec<Vec<String>>),
+    pub(crate) data_info: DataInfo,
 }
 
 impl IsTimeseries for BsoseSchema {
@@ -80,11 +99,11 @@ impl IsTimeseries for BsoseSchema {
         self.timeseries = Some(timeseries);
     }
 
-    fn data_info(&mut self) -> (Vec<String>, Vec<String>, Vec<Vec<String>>) {
+    fn data_info(&mut self) -> DataInfo {
         self.data_info.clone()
     }
 
-    fn set_data_info(&mut self, data_info: (Vec<String>, Vec<String>, Vec<Vec<String>>)) {
+    fn set_data_info(&mut self, data_info: DataInfo) {
         self.data_info = data_info;
     }
 
@@ -114,8 +133,8 @@ pub struct BsoseMeta {
     pub(crate) _id: String,
     pub(crate) data_type: String,
     pub(crate) date_updated_argovis: BsonDateTime,
-    // `timeseries` is read from main.rs at startup to populate the cached
-    // TIMESERIES global, so it stays fully `pub`.
+    // `timeseries` and `data_info` are read at startup to populate the
+    // per-dataset cache on `DatasetSource`, so both stay fully `pub`.
     pub timeseries: Vec<BsonDateTime>,
     pub(crate) source: Vec<SourceMeta>,
     pub(crate) cell_area: f64,
@@ -123,12 +142,27 @@ pub struct BsoseMeta {
     pub(crate) depth_r0_to_bottom: f64,
     pub(crate) interior_2d_mask: bool,
     pub(crate) depth_r0_to_ref_surface: f64,
+    // `data_info` may or may not be present on the BSOSE meta doc (today
+    // it lives only on the data docs themselves). `#[serde(default)]`
+    // makes deserialization tolerate either case: when absent the cache
+    // becomes the empty sentinel and per-doc values keep taking
+    // precedence, exactly the current behaviour.
+    #[serde(default)]
+    pub data_info: DataInfo,
 }
 
 impl IsTimeseriesMeta for BsoseMeta {
     fn get_timeseries_meta(&self) -> bool {
         return true;
     }
+
+    fn timeseries(&self) -> Vec<BsonDateTime> {
+        self.timeseries.clone()
+    }
+
+    fn data_info(&self) -> DataInfo {
+        self.data_info.clone()
+    }
 }
 
 #[derive(Deserialize, Debug, Clone)]
diff --git a/api/src/helpers/transforms.rs b/api/src/helpers/transforms.rs
index 2449344..adbda37 100644
--- a/api/src/helpers/transforms.rs
+++ b/api/src/helpers/transforms.rs
@@ -5,9 +5,27 @@ use mongodb::bson::DateTime as BsonDateTime;
 /// Apply the user's `startDate` / `endDate` / `data` parameters to a single
 /// timeseries document. Returns `None` if the document was filtered out
 /// entirely (e.g. `data=somefield` produced no matching columns).
+///
+/// `cached_data_info` is the *meta-level default* `data_info` for this
+/// dataset — read once at startup from the meta doc and stashed on the
+/// dataset's `DatasetSource`. Precedence rule for `data_info`:
+///
+///   - If the data doc carries its own non-empty `data_info` (the BSOSE
+///     case today), it wins; the cached default is ignored.
+///   - If the data doc has no `data_info` (the OI SST case — single
+///     variable, info kept on the meta doc), the cached default is
+///     stamped onto the doc before `slice_data` runs.
+///   - If both are empty, `slice_data` will return `None` for any
+///     specific-variable request (no columns to match) and an
+///     empty-data passthrough for `data=` / `data=all` — same as today.
+///
+/// The cache passes through as `&DataInfo` rather than `Option`: an empty
+/// tuple is its own "no default" sentinel, matching the convention
+/// `slice_data` already uses for "no fields".
 pub fn transform_timeseries<T: schema::IsTimeseries>(
     params: &serde_json::Value,
     ts: &[BsonDateTime],
+    cached_data_info: &schema::DataInfo,
     mut doc: T,
 ) -> Option<T> {
     let start_date = params.get("startDate")
@@ -26,6 +44,15 @@ pub fn transform_timeseries<T: schema::IsTimeseries>(
     if start_date.is_some() || end_date.is_some() {
         slice_timerange(start_date, end_date, ts, &mut doc);
     }
+
+    // Doc-level data_info takes precedence; only fall back to the cached
+    // meta-level default when the doc itself carries no variable names.
+    // `data_info.0` is the variable-names vector, so its emptiness is the
+    // canonical "no data_info" check.
+    if doc.data_info().0.is_empty() && !cached_data_info.0.is_empty() {
+        doc.set_data_info(cached_data_info.clone());
+    }
+
     slice_data(&data, doc)
 }
 
@@ -293,10 +320,76 @@ mod tests {
             "data":      "salinity",
         });
 
-        let mut out = transform_timeseries(&params, &timeseries, doc).unwrap();
+        // Empty cached_data_info — this doc already carries its own
+        // data_info, so the precedence rule means the cache is never
+        // consulted regardless.
+        let empty_cache: schema::DataInfo = (vec![], vec![], vec![]);
+        let mut out = transform_timeseries(&params, &timeseries, &empty_cache, doc).unwrap();
         assert_eq!(*out.data(), vec![vec![20.0, 30.0]]);
     }
 
+    // ---- transform_timeseries: data_info precedence -------------------------
+
+    #[test]
+    fn transform_uses_cached_data_info_when_doc_has_none() {
+        // OI SST-style: the data doc carries no data_info; the meta-level
+        // cache supplies the variable names so slice_data can resolve
+        // `data=sst`.
+        let timeseries = ts(&[1, 2]);
+        let mut doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0]],
+            &["sst"], // gets cleared below to simulate a docless data_info
+        );
+        // Clear the doc's data_info so the cache fallback kicks in.
+        doc.data_info = (vec![], vec![], vec![]);
+
+        let cache: schema::DataInfo = (
+            vec!["sst".to_string()],
+            vec!["units".to_string(), "long_name".to_string()],
+            vec![vec!["degC".to_string(), "SST".to_string()]],
+        );
+
+        let params = json!({"data": "sst"});
+        let mut out =
+            transform_timeseries(&params, &timeseries, &cache, doc).expect("should resolve");
+        // sst column survives.
+        assert_eq!(*out.data(), vec![vec![1.0, 2.0]]);
+        // data_info has been stamped from the cache, then column-filtered
+        // by slice_data — should still list sst.
+        let info = out.data_info();
+        assert_eq!(info.0, vec!["sst".to_string()]);
+    }
+
+    #[test]
+    fn transform_doc_data_info_wins_over_cache() {
+        // BSOSE-style: the doc has data_info and the cache also has
+        // something (hypothetically). The doc's value should take
+        // precedence — the cache should not overwrite it.
+        let timeseries = ts(&[1, 2]);
+        let doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0], vec![3.0, 4.0]],
+            &["temp", "salinity"],
+        );
+
+        // Cache says "sst" — but doc has temp/salinity. Doc wins; the
+        // request for `data=temp` should resolve against the doc, not
+        // get clobbered by the cache.
+        let cache: schema::DataInfo = (
+            vec!["sst".to_string()],
+            vec!["units".to_string()],
+            vec![vec!["degC".to_string()]],
+        );
+
+        let params = json!({"data": "temp"});
+        let mut out =
+            transform_timeseries(&params, &timeseries, &cache, doc).expect("should resolve");
+        assert_eq!(*out.data(), vec![vec![1.0, 2.0]]);
+        let info = out.data_info();
+        assert_eq!(info.0, vec!["temp".to_string()]);
+    }
+
     // ---- timeseries_stub -----------------------------------------------------
 
     #[test]
diff --git a/api/src/main.rs b/api/src/main.rs
index 0255f4d..e20f37e 100644
--- a/api/src/main.rs
+++ b/api/src/main.rs
@@ -28,25 +28,68 @@ use std::sync::Mutex;
 use futures::stream::StreamExt;
 use std::env;
 use std::convert::Infallible;
+use serde::Serialize;
 use serde::de::DeserializeOwned;
-use mongodb::bson::DateTime;
 use std::collections::HashSet;
 use async_stream::stream;
 use serde_json::{json, Value};
 
+use dataset_config::{DatasetConfig, DatasetSource};
+
 static CLIENT: Lazy<Mutex<Option<mongodb::Client>>> = Lazy::new(|| Mutex::new(None));
-static TIMESERIES: Lazy<Mutex<Option<Vec<DateTime>>>> = Lazy::new(|| Mutex::new(None));
+
+// ---- route handlers --------------------------------------------------------
+//
+// Each dataset gets its own one-route handler that resolves the dataset
+// generic and hands off to `serve_timeseries`. Keeps Actix's `#[get(...)]`
+// attribute discoverable per-dataset and lets the routing table grow
+// without touching the generic core. Adding a new dataset is: define its
+// `*_CONFIG` + `*_SOURCE` in `dataset_config.rs`, define its schema +
+// meta in `schema.rs`, then add a 4-line handler here.
 
 #[get("/timeseries/bsose")]
-async fn search_data_schema(
+async fn bsose_handler(
     req: HttpRequest,
     query_params: web::Query<serde_json::Value>,
 ) -> impl Responder {
-    let params = query_params.into_inner();
-
-    // Dataset-specific request-size policy: tile size, level set, radius cap.
-    let config = &dataset_config::BSOSE_CONFIG;
+    serve_timeseries::<schema::BsoseSchema>(
+        req,
+        query_params.into_inner(),
+        &dataset_config::BSOSE_CONFIG,
+        &dataset_config::BSOSE_SOURCE,
+    )
+    .await
+}
 
+// ---- generic timeseries handler --------------------------------------------
+
+/// Generic body of the `/timeseries/{dataset}` endpoint. Parameterized by
+/// `S`, the per-dataset data-doc schema (e.g. `BsoseSchema`). The dataset's
+/// request-size policy comes in as `config` (tile size, level set, radius
+/// cap, coverage); its Mongo identity and startup caches come in as
+/// `source` (db/collection names, the cached `timeseries` axis and the
+/// cached meta-level `data_info` default).
+///
+/// Behaviour is the same as the previous BSOSE-specific handler: validate
+/// the query, generate the tile sequence, then probe-forward through
+/// tiles serving at most one non-empty tile per request. The two branches
+/// (streaming docs vs. `batchmeta` metadata lookup) are unchanged from
+/// the original implementation.
+async fn serve_timeseries<S>(
+    req: HttpRequest,
+    params: serde_json::Value,
+    config: &DatasetConfig,
+    source: &'static DatasetSource,
+) -> HttpResponse
+where
+    S: schema::IsTimeseries
+        + DeserializeOwned
+        + Serialize
+        + Send
+        + Sync
+        + Unpin
+        + 'static,
+{
     // The next_url we emit on success uses this request's own path, so the
     // generated URL stays correct even if the route is re-mounted later.
     let path = req.path().to_string();
@@ -67,10 +110,19 @@ async fn search_data_schema(
     // ---- tile sequence + cached startup data --------------------------
     let tiles = tile_generator::generate_tiles(&params, config);
 
-    let timeseries = {
-        let ts = TIMESERIES.lock().unwrap();
-        ts.clone().unwrap()
-    };
+    // Both caches are populated at startup by `populate_dataset_cache`
+    // before the server begins accepting connections. A handler hit
+    // before startup finished is a server bug, so we surface it.
+    let timeseries = source
+        .timeseries
+        .get()
+        .expect("dataset timeseries cache should be populated at startup")
+        .clone();
+    let cached_data_info = source
+        .data_info
+        .get()
+        .expect("dataset data_info cache should be populated at startup")
+        .clone();
 
     let compression: Option<String> = params
         .get("compression")
@@ -100,8 +152,11 @@ async fn search_data_schema(
         let filter = filter_composer::compose_filter_with_tile(params.clone(), tile, config);
         let options = FindOptions::builder().build();
 
-        let mut cursor = match generate_cursor::<schema::BsoseSchema>(
-            "argo", "bsose", filter, Some(options),
+        let mut cursor = match generate_cursor::<S>(
+            source.db_name,
+            source.collection,
+            filter,
+            Some(options),
         )
         .await
         {
@@ -118,11 +173,14 @@ async fn search_data_schema(
             while let Some(result) = cursor.next().await {
                 match result {
                     Ok(doc) => {
-                        if let Some(t) =
-                            transforms::transform_timeseries(&params, &timeseries, doc)
-                        {
-                            for m in t.metadata.iter() {
-                                unique_metadata.insert(m.clone());
+                        if let Some(t) = transforms::transform_timeseries(
+                            &params,
+                            &timeseries,
+                            &cached_data_info,
+                            doc,
+                        ) {
+                            for m in t.metadata().into_iter() {
+                                unique_metadata.insert(m);
                             }
                         }
                     }
@@ -141,7 +199,10 @@ async fn search_data_schema(
                 "_id": { "$in": unique_metadata.into_iter().collect::<Vec<_>>() }
             };
             let meta_cursor = match generate_cursor::<Document>(
-                "argo", "timeseriesMeta", meta_filter, None,
+                source.db_name,
+                source.meta_collection,
+                meta_filter,
+                None,
             )
             .await
             {
@@ -165,13 +226,16 @@ async fn search_data_schema(
         // Peek-ahead until we find a doc that survives transformation.
         // If none, advance to the next tile. If found, keep the cursor —
         // we'll continue draining it from inside the response body.
-        let mut first_doc: Option<schema::BsoseSchema> = None;
+        let mut first_doc: Option<S> = None;
         while let Some(result) = cursor.next().await {
             match result {
                 Ok(doc) => {
-                    if let Some(t) =
-                        transforms::transform_timeseries(&params, &timeseries, doc)
-                    {
+                    if let Some(t) = transforms::transform_timeseries(
+                        &params,
+                        &timeseries,
+                        &cached_data_info,
+                        doc,
+                    ) {
                         first_doc = Some(t);
                         break;
                     }
@@ -190,11 +254,13 @@ async fn search_data_schema(
 
         let next_url = next_url_value(&path, &params, tile_idx, tiles.len());
         let page_message = format!("page {}", tile_idx);
-        // Each of these gets moved into the stream! generator. params and
-        // timeseries are needed to transform each subsequent doc; the rest
-        // are emitted at the end of the response.
+        // Each of these gets moved into the stream! generator. params,
+        // timeseries, and the data_info cache are needed to transform
+        // each subsequent doc; the rest are emitted at the end of the
+        // response.
         let params_for_stream = params.clone();
         let ts_for_stream = timeseries.clone();
+        let data_info_for_stream = cached_data_info.clone();
 
         let body = stream! {
             yield Ok::<_, Infallible>(web::Bytes::from_static(b"{\"docs\":["));
@@ -205,7 +271,7 @@ async fn search_data_schema(
                 serde_json::to_vec(&stub).expect("serializing one stub should not fail")
             } else {
                 serde_json::to_vec(&first_doc)
-                    .expect("serializing one bsose doc should not fail")
+                    .expect("serializing a timeseries doc should not fail")
             };
             yield Ok(web::Bytes::from(first_bytes));
 
@@ -213,7 +279,10 @@ async fn search_data_schema(
                 match result {
                     Ok(doc) => {
                         if let Some(t) = transforms::transform_timeseries(
-                            &params_for_stream, &ts_for_stream, doc,
+                            &params_for_stream,
+                            &ts_for_stream,
+                            &data_info_for_stream,
+                            doc,
                         ) {
                             let bytes = if is_minimal {
                                 let stub = transforms::timeseries_stub(&t);
@@ -221,7 +290,7 @@ async fn search_data_schema(
                                     .expect("serializing one stub should not fail")
                             } else {
                                 serde_json::to_vec(&t)
-                                    .expect("serializing one bsose doc should not fail")
+                                    .expect("serializing a timeseries doc should not fail")
                             };
                             yield Ok(web::Bytes::from_static(b","));
                             yield Ok(web::Bytes::from(bytes));
@@ -283,34 +352,78 @@ fn next_url_value(
     }
 }
 
+/// Populate the startup caches on a `DatasetSource` from its meta doc.
+///
+/// Reads the one meta doc selected by `{data_type: source.meta_data_type}`
+/// from `{db_name}.{meta_collection}`, then sets the source's
+/// `timeseries` and `data_info` OnceCells from it. A dataset whose meta
+/// doc has no `data_info` field will land an empty tuple in the cache
+/// (via `#[serde(default)]` on the meta struct) — the precedence rule in
+/// `transform_timeseries` makes that the right default-suppressing value
+/// for datasets that carry `data_info` per data doc.
+///
+/// Panics if the meta doc can't be found or the OnceCell is already set.
+/// Both indicate startup misconfiguration that should fail loudly rather
+/// than serve stale or partial data.
+async fn populate_dataset_cache<M>(source: &DatasetSource) -> Result<()>
+where
+    M: schema::IsTimeseriesMeta + DeserializeOwned + Unpin + Send + Sync,
+{
+    let filter = mongodb::bson::doc! {"data_type": source.meta_data_type};
+    let options = FindOptions::builder().limit(1).build();
+    let mut cursor = generate_cursor::<M>(
+        source.db_name,
+        source.meta_collection,
+        filter,
+        Some(options),
+    )
+    .await?;
+
+    let meta = match cursor.next().await {
+        Some(Ok(m)) => m,
+        Some(Err(e)) => {
+            panic!(
+                "Error reading meta doc for {}.{} (data_type={}): {}",
+                source.db_name, source.meta_collection, source.meta_data_type, e
+            );
+        }
+        None => panic!(
+            "No meta doc found in {}.{} matching data_type={}",
+            source.db_name, source.meta_collection, source.meta_data_type
+        ),
+    };
+
+    source
+        .timeseries
+        .set(meta.timeseries())
+        .unwrap_or_else(|_| panic!("timeseries cache for data_type={} already set", source.meta_data_type));
+    source
+        .data_info
+        .set(meta.data_info())
+        .unwrap_or_else(|_| panic!("data_info cache for data_type={} already set", source.meta_data_type));
+
+    Ok(())
+}
+
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
 
     // Initialize the MongoDB client
     let client_options = mongodb::options::ClientOptions::parse(env::var("MONGODB_URI").unwrap()).await.unwrap();
-    let client = mongodb::Client::with_options(client_options).unwrap(); 
+    let client = mongodb::Client::with_options(client_options).unwrap();
     *CLIENT.lock().unwrap() = Some(client);
 
-    // some generic data useful to have on hand
-    let mut filter = mongodb::bson::doc! {"data_type": "BSOSE-profile"};
-    let mut options = FindOptions::builder().limit(1).build();
-    let mut metacursor = generate_cursor::<schema::BsoseMeta>("argo", "timeseriesMeta", filter, Some(options)).await.unwrap();
-    let mut metadata = Vec::new();
-    while let Some(result) = metacursor.next().await {
-        match result {
-            Ok(document) => {
-                metadata.push(document);
-            },  
-            Err(e) => {
-                eprintln!("Error: {}", e);
-            }
-        }
-    }
-    *TIMESERIES.lock().unwrap() = Some(metadata[0].timeseries.clone());
+    // Populate per-dataset startup caches. Each dataset's meta-doc-read
+    // happens here, before the server starts accepting connections.
+    // Adding a new dataset is a one-line addition: a call to
+    // populate_dataset_cache::<NewMeta>(&NEW_SOURCE).
+    populate_dataset_cache::<schema::BsoseMeta>(&dataset_config::BSOSE_SOURCE)
+        .await
+        .expect("failed to populate BSOSE dataset cache at startup");
 
     HttpServer::new(|| {
         App::new()
-            .service(search_data_schema)
+            .service(bsose_handler)
     })
     .bind(("0.0.0.0", 8080))?
     .run()

From 0e22089a03537876ba1b890b9df14b420a8c4a9b Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Tue, 26 May 2026 19:12:26 -0400
Subject: [PATCH 02/13] k.i.s.s.

---
 api/src/helpers/dataset_config.rs |  63 +++++++------
 api/src/main.rs                   | 141 +++++++++++++++++-------------
 2 files changed, 110 insertions(+), 94 deletions(-)

diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs
index 702d95b..06fac18 100644
--- a/api/src/helpers/dataset_config.rs
+++ b/api/src/helpers/dataset_config.rs
@@ -11,13 +11,16 @@
 //!
 //! `DatasetSource` is the sibling struct that names *where* each dataset
 //! lives in Mongo (db, collection, meta collection, meta discriminator)
-//! and caches startup-loaded metadata (the timeseries axis, the meta
-//! default `data_info`). One static `*_SOURCE` per dataset; the
-//! generic handler in `main.rs` consumes `(&DatasetConfig, &DatasetSource)`
-//! plus the dataset's schema generic to serve a request.
+//! and carries the startup-loaded metadata (the timeseries axis, the
+//! meta default `data_info`). It can't be `const`/`static` directly
+//! because the metadata fields are loaded from Mongo at runtime; it's
+//! built once in `main()` via `load_dataset_source` and stashed in a
+//! top-level `Lazy<Mutex<Option<DatasetSource>>>` (the same pattern the
+//! existing Mongo `CLIENT` static uses, so we don't drag in new
+//! initialization vocabulary). Handlers clone it out of the lock at the
+//! top of each request, then read its fields as plain owned data.
 
 use mongodb::bson::DateTime as BsonDateTime;
-use once_cell::sync::OnceCell;
 
 use super::geometry::BoundingBox;
 use super::schema::DataInfo;
@@ -52,47 +55,41 @@ pub struct DatasetConfig {
     pub coverage_bbox: Option<BoundingBox>,
 }
 
-/// Per-dataset *identity*: where the dataset lives in Mongo, and a place
-/// to stash the values we read once at startup so requests don't have to.
+/// Per-dataset identity + startup-loaded metadata, built once in `main()`.
 ///
-/// `db_name` / `collection` name the data collection the handler queries.
-/// `meta_collection` is the per-dataset (or shared) metadata collection;
-/// `meta_data_type` is the value of the `data_type` discriminator that
-/// picks this dataset's meta doc out of the meta collection.
+/// The first four fields are the dataset's Mongo identity:
+///   - `db_name` / `collection`: where the data docs live.
+///   - `meta_collection`: where the metadata doc lives. May or may not be
+///     shared across datasets; today everything is in `timeseriesMeta`,
+///     disambiguated by `meta_data_type`.
+///   - `meta_data_type`: the `data_type` discriminator that selects this
+///     dataset's meta doc out of `meta_collection`.
 ///
-/// `timeseries` and `data_info` are populated once at server startup by
-/// the generic loader in `main.rs` and read on every request. They're
-/// behind `OnceCell` rather than `Mutex<Option<_>>` because they're
-/// write-once: the loader fills them in `main()` and nothing else ever
-/// writes again, so handlers can read without locking.
+/// The remaining two fields are values we load *once* at startup from the
+/// meta doc and read on every request. Plain owned types — no cells.
+///
+/// `Clone` is derived so handlers can copy a `DatasetSource` out of the
+/// `Lazy<Mutex<Option<_>>>` static and use it locally without holding
+/// the mutex across `.await` points. The clone is small (a few KB of
+/// dates plus a few short strings).
 ///
 /// `data_info` is the *meta-level default* for the dataset. Per the
 /// precedence rule documented on `transforms::transform_timeseries`, a
 /// data doc that carries its own `data_info` wins over this default; if
-/// the doc's `data_info` is empty, the cached default is stamped on
-/// before column filtering runs. Datasets that put `data_info` on every
-/// data doc (e.g. BSOSE today) leave this cache as the empty tuple — it
-/// gets populated but never consulted.
+/// the doc's `data_info` is empty, the meta-level value is stamped onto
+/// the doc before column filtering runs. Datasets that put `data_info`
+/// on every data doc (e.g. BSOSE today) leave this as the empty tuple —
+/// it's still populated, just never consulted.
+#[derive(Clone)]
 pub struct DatasetSource {
     pub db_name: &'static str,
     pub collection: &'static str,
     pub meta_collection: &'static str,
     pub meta_data_type: &'static str,
-    pub timeseries: OnceCell<Vec<BsonDateTime>>,
-    pub data_info: OnceCell<DataInfo>,
+    pub timeseries: Vec<BsonDateTime>,
+    pub data_info: DataInfo,
 }
 
-/// Mongo identity for the BSOSE timeseries dataset. The OnceCells are
-/// filled at startup by `main.rs::populate_dataset_caches`.
-pub static BSOSE_SOURCE: DatasetSource = DatasetSource {
-    db_name: "argo",
-    collection: "bsose",
-    meta_collection: "timeseriesMeta",
-    meta_data_type: "BSOSE-profile",
-    timeseries: OnceCell::new(),
-    data_info: OnceCell::new(),
-};
-
 /// BSOSE's 52 vertical levels, in metres (positive-downward), shallowest
 /// first. From the dataset's published grid; should be updated if BSOSE
 /// re-releases with a different vertical discretisation.
diff --git a/api/src/main.rs b/api/src/main.rs
index e20f37e..8ba46c5 100644
--- a/api/src/main.rs
+++ b/api/src/main.rs
@@ -38,25 +38,48 @@ use dataset_config::{DatasetConfig, DatasetSource};
 
 static CLIENT: Lazy<Mutex<Option<mongodb::Client>>> = Lazy::new(|| Mutex::new(None));
 
+// Per-dataset source-of-truth: identity strings + values loaded from the
+// dataset's meta doc at startup. Same pattern as CLIENT above — the
+// `Lazy<Mutex<Option<T>>>` is what lets us declare a static that's
+// initialized once, after main() reads the value out of Mongo. Handlers
+// clone the inner DatasetSource out of the lock at the top of each
+// request and use it locally, so the mutex is never held across `.await`.
+//
+// One static per dataset. Adding a new dataset is one more static here
+// plus one more load_dataset_source/ load-and-set block in main().
+static BSOSE_SOURCE: Lazy<Mutex<Option<DatasetSource>>> = Lazy::new(|| Mutex::new(None));
+
 // ---- route handlers --------------------------------------------------------
 //
 // Each dataset gets its own one-route handler that resolves the dataset
 // generic and hands off to `serve_timeseries`. Keeps Actix's `#[get(...)]`
 // attribute discoverable per-dataset and lets the routing table grow
 // without touching the generic core. Adding a new dataset is: define its
-// `*_CONFIG` + `*_SOURCE` in `dataset_config.rs`, define its schema +
-// meta in `schema.rs`, then add a 4-line handler here.
+// `*_CONFIG` in `dataset_config.rs`, define its schema + meta in
+// `schema.rs`, add a `*_SOURCE` static + a `load_dataset_source` call
+// in `main()`, and add a 4-line handler here.
 
 #[get("/timeseries/bsose")]
 async fn bsose_handler(
     req: HttpRequest,
     query_params: web::Query<serde_json::Value>,
 ) -> impl Responder {
+    // Snapshot the source out of the lock before any `.await`. Holding
+    // the mutex guard across an await would let other tasks on the same
+    // worker deadlock on it, so the idiom is "lock, clone, unlock, then
+    // do async work with the local copy."
+    let source = BSOSE_SOURCE
+        .lock()
+        .unwrap()
+        .as_ref()
+        .expect("BSOSE_SOURCE not initialized at startup")
+        .clone();
+
     serve_timeseries::<schema::BsoseSchema>(
         req,
         query_params.into_inner(),
         &dataset_config::BSOSE_CONFIG,
-        &dataset_config::BSOSE_SOURCE,
+        &source,
     )
     .await
 }
@@ -79,7 +102,7 @@ async fn serve_timeseries<S>(
     req: HttpRequest,
     params: serde_json::Value,
     config: &DatasetConfig,
-    source: &'static DatasetSource,
+    source: &DatasetSource,
 ) -> HttpResponse
 where
     S: schema::IsTimeseries
@@ -107,22 +130,16 @@ where
         Err(e) => return HttpResponse::BadRequest().json(json!({"error": e})),
     };
 
-    // ---- tile sequence + cached startup data --------------------------
+    // ---- tile sequence + startup-loaded data --------------------------
     let tiles = tile_generator::generate_tiles(&params, config);
 
-    // Both caches are populated at startup by `populate_dataset_cache`
-    // before the server begins accepting connections. A handler hit
-    // before startup finished is a server bug, so we surface it.
-    let timeseries = source
-        .timeseries
-        .get()
-        .expect("dataset timeseries cache should be populated at startup")
-        .clone();
-    let cached_data_info = source
-        .data_info
-        .get()
-        .expect("dataset data_info cache should be populated at startup")
-        .clone();
+    // Clone the cached values out of `source` locally so the streaming
+    // branch can move owned copies into its async generator without
+    // juggling lifetimes — both are small (timeseries is a few KB of
+    // dates; data_info is a few strings), so cloning per request is
+    // cheap relative to the actual query work.
+    let timeseries = source.timeseries.clone();
+    let cached_data_info = source.data_info.clone();
 
     let compression: Option<String> = params
         .get("compression")
@@ -352,57 +369,52 @@ fn next_url_value(
     }
 }
 
-/// Populate the startup caches on a `DatasetSource` from its meta doc.
+/// Build a `DatasetSource` by reading the dataset's meta doc out of Mongo.
 ///
-/// Reads the one meta doc selected by `{data_type: source.meta_data_type}`
-/// from `{db_name}.{meta_collection}`, then sets the source's
-/// `timeseries` and `data_info` OnceCells from it. A dataset whose meta
-/// doc has no `data_info` field will land an empty tuple in the cache
-/// (via `#[serde(default)]` on the meta struct) — the precedence rule in
-/// `transform_timeseries` makes that the right default-suppressing value
-/// for datasets that carry `data_info` per data doc.
+/// Called once per dataset at server startup. The four `'static str`
+/// arguments are the dataset's Mongo identity; the resulting struct
+/// bundles those identity strings with the values we read out of the
+/// meta doc (`timeseries`, `data_info`). `main()` then stashes the
+/// returned struct into the dataset's `*_SOURCE` static for handlers
+/// to read on each request.
 ///
-/// Panics if the meta doc can't be found or the OnceCell is already set.
-/// Both indicate startup misconfiguration that should fail loudly rather
+/// Panics if the meta doc can't be found or the cursor errors. Both
+/// indicate startup misconfiguration that should fail loudly rather
 /// than serve stale or partial data.
-async fn populate_dataset_cache<M>(source: &DatasetSource) -> Result<()>
+async fn load_dataset_source<M>(
+    db_name: &'static str,
+    collection: &'static str,
+    meta_collection: &'static str,
+    meta_data_type: &'static str,
+) -> Result<DatasetSource>
 where
     M: schema::IsTimeseriesMeta + DeserializeOwned + Unpin + Send + Sync,
 {
-    let filter = mongodb::bson::doc! {"data_type": source.meta_data_type};
+    let filter = mongodb::bson::doc! {"data_type": meta_data_type};
     let options = FindOptions::builder().limit(1).build();
-    let mut cursor = generate_cursor::<M>(
-        source.db_name,
-        source.meta_collection,
-        filter,
-        Some(options),
-    )
-    .await?;
+    let mut cursor =
+        generate_cursor::<M>(db_name, meta_collection, filter, Some(options)).await?;
 
     let meta = match cursor.next().await {
         Some(Ok(m)) => m,
-        Some(Err(e)) => {
-            panic!(
-                "Error reading meta doc for {}.{} (data_type={}): {}",
-                source.db_name, source.meta_collection, source.meta_data_type, e
-            );
-        }
+        Some(Err(e)) => panic!(
+            "Error reading meta doc for {}.{} (data_type={}): {}",
+            db_name, meta_collection, meta_data_type, e
+        ),
         None => panic!(
             "No meta doc found in {}.{} matching data_type={}",
-            source.db_name, source.meta_collection, source.meta_data_type
+            db_name, meta_collection, meta_data_type
         ),
     };
 
-    source
-        .timeseries
-        .set(meta.timeseries())
-        .unwrap_or_else(|_| panic!("timeseries cache for data_type={} already set", source.meta_data_type));
-    source
-        .data_info
-        .set(meta.data_info())
-        .unwrap_or_else(|_| panic!("data_info cache for data_type={} already set", source.meta_data_type));
-
-    Ok(())
+    Ok(DatasetSource {
+        db_name,
+        collection,
+        meta_collection,
+        meta_data_type,
+        timeseries: meta.timeseries(),
+        data_info: meta.data_info(),
+    })
 }
 
 #[actix_web::main]
@@ -413,13 +425,20 @@ async fn main() -> std::io::Result<()> {
     let client = mongodb::Client::with_options(client_options).unwrap();
     *CLIENT.lock().unwrap() = Some(client);
 
-    // Populate per-dataset startup caches. Each dataset's meta-doc-read
-    // happens here, before the server starts accepting connections.
-    // Adding a new dataset is a one-line addition: a call to
-    // populate_dataset_cache::<NewMeta>(&NEW_SOURCE).
-    populate_dataset_cache::<schema::BsoseMeta>(&dataset_config::BSOSE_SOURCE)
-        .await
-        .expect("failed to populate BSOSE dataset cache at startup");
+    // Load each dataset's source-of-truth and stash it in the static for
+    // handlers to read. The Mongo identity strings live here at the call
+    // site — one place per dataset — and the returned struct bundles
+    // them with the values read from the meta doc. Adding a new dataset
+    // is one more load-and-set block here.
+    let bsose = load_dataset_source::<schema::BsoseMeta>(
+        "argo",
+        "bsose",
+        "timeseriesMeta",
+        "BSOSE-profile",
+    )
+    .await
+    .expect("failed to load BSOSE dataset source at startup");
+    *BSOSE_SOURCE.lock().unwrap() = Some(bsose);
 
     HttpServer::new(|| {
         App::new()

From 8399037939b1cf2320a558f9b0309525081dd85a Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Tue, 26 May 2026 19:25:39 -0400
Subject: [PATCH 03/13] first attempt at actual noaa oisst guts

---
 api/PAGINATION.md                 |  61 +++++++++++--
 api/src/helpers/dataset_config.rs |  71 ++++++++++++++++
 api/src/helpers/schema.rs         | 137 ++++++++++++++++++++++++++++++
 api/src/main.rs                   |  33 +++++++
 4 files changed, 294 insertions(+), 8 deletions(-)

diff --git a/api/PAGINATION.md b/api/PAGINATION.md
index 071a1a2..acebde8 100644
--- a/api/PAGINATION.md
+++ b/api/PAGINATION.md
@@ -131,11 +131,56 @@ antimeridian / north-pole docs aren't lost.
 
 ## Per-dataset configuration
 
-`api/src/helpers/dataset_config.rs` defines a `DatasetConfig` struct
-with the dataset's `tile_degrees`, `max_radius_meters`, the discrete
-`levels` array, and an optional `coverage_bbox`. The BSOSE handler
-binds `BSOSE_CONFIG` directly; adding a new dataset means defining its
-config there and wiring its handler through the same `tile_generator` /
-`filter_composer` machinery. `coverage_bbox: None` for a new dataset
-gives global-walk semantics; setting it to a bounding rectangle tells
-the tile generator to skip everything outside the rectangle.
+Two per-dataset structs sit side by side in `api/src/helpers/dataset_config.rs`:
+
+- **`DatasetConfig`** — *request-size policy*. `tile_degrees` (spatial
+  page size), `max_radius_meters` (cap for `center + radius` queries),
+  `levels` (discrete vertical pages — single-element `&[0.0]` for
+  surface-only datasets like OI SST), and an optional `coverage_bbox`
+  (rectangle the dataset's data lives inside; `None` means walk the
+  whole globe). Declared as a `pub const` per dataset.
+
+- **`DatasetSource`** — *Mongo identity* (`db_name`, `collection`,
+  `meta_collection`, `meta_data_type`) plus the values read once at
+  startup from the meta doc (`timeseries` axis, `data_info` default).
+  Built at runtime by `main()` via `load_dataset_source::<MetaSchema>`
+  and stashed in a `Lazy<Mutex<Option<DatasetSource>>>` static (same
+  pattern as the Mongo `CLIENT` static).
+
+The generic handler `serve_timeseries::<S>` in `main.rs` consumes
+`(&DatasetConfig, &DatasetSource)` plus a schema generic `S` that
+implements `IsTimeseries`.
+
+### Recipe for adding a new dataset
+
+1. Define the data-doc schema (`S`) and meta-doc schema (`M`) in
+   `api/src/helpers/schema.rs`. `S` implements `IsTimeseries`; `M`
+   implements `IsTimeseriesMeta`.
+2. Define `<NAME>_CONFIG: DatasetConfig` (and `<NAME>_LEVELS` if depth
+   discretisation is non-trivial) in `dataset_config.rs`.
+3. Add `<NAME>_SOURCE: Lazy<Mutex<Option<DatasetSource>>>` in
+   `main.rs`, next to the existing dataset statics.
+4. In `main()`, load the source: `let x = load_dataset_source::<M>(...)
+   .await?; *<NAME>_SOURCE.lock().unwrap() = Some(x);`.
+5. Add a 4-line route handler annotated with `#[get("/timeseries/<name>")]`
+   that clones the source out of the lock and forwards to
+   `serve_timeseries::<S>`.
+6. Register the handler with `.service(<name>_handler)` on the `App`.
+
+### `data_info` precedence rule
+
+`data_info` (variable names, units, per-variable descriptors) may
+appear on either the data doc, the meta doc, or both:
+
+- **Doc-level wins.** If a data doc carries its own non-empty
+  `data_info` (BSOSE today), that's what `slice_data` filters
+  against. The cached meta-level default is ignored.
+- **Cache fallback.** If the data doc has no `data_info` (OI SST: the
+  field lives only on the meta doc), the per-dataset cached default —
+  loaded from the meta doc at startup — is stamped onto the doc
+  before column filtering runs.
+
+The cache for a dataset whose meta doc has no `data_info` is the empty
+tuple, and `transform_timeseries` treats empty as "no default to
+apply". So a dataset can store `data_info` per data doc, per meta doc,
+or per both — the response carries the right thing in each case.
diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs
index 06fac18..0ec6cd0 100644
--- a/api/src/helpers/dataset_config.rs
+++ b/api/src/helpers/dataset_config.rs
@@ -113,6 +113,31 @@ pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig {
     }),
 };
 
+/// OI SST has a single vertical level (the sea surface). We model it as
+/// a one-element levels array of 0.0 so the existing tile_generator /
+/// filter_composer code path works unchanged: each spatial tile crosses
+/// with `level_index = 0` to produce exactly one tile per spatial cell.
+/// The filter composer emits `level: { $gte: 0.0 }` for the only-level
+/// case (no upper bound — it's both the first and last level), which
+/// matches any doc with `level >= 0`. OI SST docs all have `level = 0`.
+pub const OISST_LEVELS: &[f64] = &[0.0];
+
+/// Configuration for the NOAA OI SST v2 high-res timeseries dataset.
+///
+/// 1/4° grid × one level. 5° tiles give ~400 cells per tile (no level
+/// multiplier, unlike BSOSE), matching BSOSE's tile size for uniformity
+/// even though OI SST has lighter per-tile load. `max_radius_meters`
+/// starts at the same 100 km cap as BSOSE — OI SST per-doc payload is
+/// smaller (one variable, weekly cadence), so this cap can be relaxed
+/// once we have real usage to size it against. `coverage_bbox: None`
+/// because OI SST spans the entire globe.
+pub const OISST_CONFIG: DatasetConfig = DatasetConfig {
+    tile_degrees: 5.0,
+    max_radius_meters: 100_000.0, // 100 km — relax once usage informs us
+    levels: OISST_LEVELS,
+    coverage_bbox: None,
+};
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -175,4 +200,50 @@ mod tests {
             assert!(d >= 0.0, "level depths should be non-negative; got {}", d);
         }
     }
+
+    // ---- OI SST config invariants (mirror the BSOSE checks) ----------------
+
+    #[test]
+    fn oisst_tile_degrees_is_positive_and_divides_a_hemisphere() {
+        assert!(OISST_CONFIG.tile_degrees > 0.0);
+        assert!(
+            (180.0_f64 % OISST_CONFIG.tile_degrees).abs() < 1e-9,
+            "tile_degrees should evenly divide 180° for clean global coverage"
+        );
+        assert!(
+            (360.0_f64 % OISST_CONFIG.tile_degrees).abs() < 1e-9,
+            "tile_degrees should evenly divide 360° for clean global coverage"
+        );
+    }
+
+    #[test]
+    fn oisst_max_radius_is_positive_and_subhemispheric() {
+        assert!(OISST_CONFIG.max_radius_meters > 0.0);
+        assert!(OISST_CONFIG.max_radius_meters < 1.0e7);
+    }
+
+    #[test]
+    fn oisst_levels_is_non_empty() {
+        // Even a single-level dataset must have a one-element levels
+        // array so the tile generator emits one tile per spatial cell.
+        // An empty list would produce zero pages.
+        assert!(!OISST_CONFIG.levels.is_empty());
+    }
+
+    #[test]
+    fn oisst_has_exactly_one_level() {
+        // OI SST is a surface-only dataset. The single-element levels
+        // array is what makes the existing tile generator / filter
+        // composer code path work without special-casing "no vertical
+        // dimension". If this ever changes, we should pause and think
+        // about what multi-level OI SST would mean physically.
+        assert_eq!(OISST_CONFIG.levels.len(), 1);
+        assert!((OISST_CONFIG.levels[0] - 0.0).abs() < 1e-9);
+    }
+
+    #[test]
+    fn oisst_has_global_coverage() {
+        // OI SST is global; no coverage_bbox skip available.
+        assert!(OISST_CONFIG.coverage_bbox.is_none());
+    }
 }
diff --git a/api/src/helpers/schema.rs b/api/src/helpers/schema.rs
index a8cabfb..4883bb9 100644
--- a/api/src/helpers/schema.rs
+++ b/api/src/helpers/schema.rs
@@ -165,6 +165,143 @@ impl IsTimeseriesMeta for BsoseMeta {
     }
 }
 
+// oi sst /////////////////////////////////////////////////////////////////////
+
+/// `source` substructure for the OI SST metadata doc. Differs from
+/// `SourceMeta` (used by BSOSE) — OI SST uses `url` where BSOSE uses
+/// `iter`. Neither field is read by the handler today; modeled here so
+/// deserialization of the meta doc succeeds.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct OisstSourceMeta {
+    pub(crate) source: Vec<String>,
+    pub(crate) url: String,
+}
+
+/// Grid descriptor on the OI SST metadata doc. Captures the regular
+/// lat/lon lattice the dataset is sampled on. Not consulted by the
+/// handler today (the equivalent information lives in `OISST_CONFIG`),
+/// but modeled here so deserialization succeeds. A future cleanup could
+/// derive the dataset's `DatasetConfig.coverage_bbox` / `tile_degrees`
+/// from this struct instead of duplicating the values in code.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+#[serde(rename_all = "camelCase")]
+pub struct Lattice {
+    pub center: [f64; 2],
+    pub spacing: [f64; 2],
+    pub min_lat: f64,
+    pub min_lon: f64,
+    pub max_lat: f64,
+    pub max_lon: f64,
+}
+
+/// One spatial cell of the NOAA OI SST v2 high-res grid. Surface-only
+/// (no vertical dimension; `level` is always `0.0`). `data` holds the
+/// timeseries per variable — there's exactly one variable (SST), so the
+/// outer Vec always has length one.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct OisstSchema {
+    pub(crate) _id: String,
+    // Reachable from main.rs (batchmeta branch reads `metadata()`), so
+    // `pub` for symmetry with BsoseSchema.
+    pub metadata: Vec<String>,
+    pub(crate) basin: f64,
+    pub(crate) geolocation: GeoJSONPoint,
+    pub(crate) level: f64,
+    pub(crate) data: Vec<Vec<f64>>,
+    // OI SST data docs don't carry `timeseries` or `data_info` of their
+    // own — both are populated at request time. `timeseries` is filled
+    // by `slice_timerange`; `data_info` is stamped from the per-dataset
+    // cached default by `transform_timeseries` (the meta doc holds the
+    // dataset-wide variable info, single-variable for OI SST). Both
+    // default at deserialization so an absent field in the source doc
+    // produces an empty/None value rather than a parse error.
+    #[serde(default)]
+    pub(crate) timeseries: Option<Vec<String>>,
+    #[serde(default)]
+    pub(crate) data_info: DataInfo,
+}
+
+impl IsTimeseries for OisstSchema {
+    fn get_timeseries(&self) -> bool {
+        return true;
+    }
+
+    fn data(&mut self) -> &mut Vec<Vec<f64>> {
+        &mut self.data
+    }
+
+    fn set_data(&mut self, data: Vec<Vec<f64>>) {
+        self.data = data;
+    }
+
+    fn timeseries(&mut self) -> Option<&mut Vec<String>> {
+        self.timeseries.as_mut()
+    }
+
+    fn set_timeseries(&mut self, timeseries: Vec<String>) {
+        self.timeseries = Some(timeseries);
+    }
+
+    fn data_info(&mut self) -> DataInfo {
+        self.data_info.clone()
+    }
+
+    fn set_data_info(&mut self, data_info: DataInfo) {
+        self.data_info = data_info;
+    }
+
+    fn _id(&self) -> String {
+        self._id.clone()
+    }
+
+    fn longitude(&self) -> f64 {
+        self.geolocation.coordinates[0]
+    }
+
+    fn latitude(&self) -> f64 {
+        self.geolocation.coordinates[1]
+    }
+
+    fn level(&self) -> f64 {
+        self.level
+    }
+
+    fn metadata(&self) -> Vec<String> {
+        self.metadata.clone()
+    }
+}
+
+/// Metadata doc for the OI SST dataset. Crucially, `data_info` lives
+/// here (per-dataset default) rather than on every data doc — the
+/// generic transform layer reads it from the cache and stamps it onto
+/// each data doc before column filtering.
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct OisstMeta {
+    pub(crate) _id: String,
+    pub(crate) data_type: String,
+    pub data_info: DataInfo,
+    pub(crate) date_updated_argovis: BsonDateTime,
+    pub timeseries: Vec<BsonDateTime>,
+    pub(crate) source: Vec<OisstSourceMeta>,
+    pub(crate) lattice: Lattice,
+}
+
+impl IsTimeseriesMeta for OisstMeta {
+    fn get_timeseries_meta(&self) -> bool {
+        return true;
+    }
+
+    fn timeseries(&self) -> Vec<BsonDateTime> {
+        self.timeseries.clone()
+    }
+
+    fn data_info(&self) -> DataInfo {
+        self.data_info.clone()
+    }
+}
+
+// ///////////////////////////////////////////////////////////////////////////
+
 #[derive(Deserialize, Debug, Clone)]
 pub struct TimeseriesStub {
     pub _id: String,
diff --git a/api/src/main.rs b/api/src/main.rs
index 8ba46c5..9857e17 100644
--- a/api/src/main.rs
+++ b/api/src/main.rs
@@ -48,6 +48,7 @@ static CLIENT: Lazy<Mutex<Option<mongodb::Client>>> = Lazy::new(|| Mutex::new(No
 // One static per dataset. Adding a new dataset is one more static here
 // plus one more load_dataset_source/ load-and-set block in main().
 static BSOSE_SOURCE: Lazy<Mutex<Option<DatasetSource>>> = Lazy::new(|| Mutex::new(None));
+static OISST_SOURCE: Lazy<Mutex<Option<DatasetSource>>> = Lazy::new(|| Mutex::new(None));
 
 // ---- route handlers --------------------------------------------------------
 //
@@ -84,6 +85,27 @@ async fn bsose_handler(
     .await
 }
 
+#[get("/timeseries/noaaoisst")]
+async fn oisst_handler(
+    req: HttpRequest,
+    query_params: web::Query<serde_json::Value>,
+) -> impl Responder {
+    let source = OISST_SOURCE
+        .lock()
+        .unwrap()
+        .as_ref()
+        .expect("OISST_SOURCE not initialized at startup")
+        .clone();
+
+    serve_timeseries::<schema::OisstSchema>(
+        req,
+        query_params.into_inner(),
+        &dataset_config::OISST_CONFIG,
+        &source,
+    )
+    .await
+}
+
 // ---- generic timeseries handler --------------------------------------------
 
 /// Generic body of the `/timeseries/{dataset}` endpoint. Parameterized by
@@ -440,9 +462,20 @@ async fn main() -> std::io::Result<()> {
     .expect("failed to load BSOSE dataset source at startup");
     *BSOSE_SOURCE.lock().unwrap() = Some(bsose);
 
+    let oisst = load_dataset_source::<schema::OisstMeta>(
+        "argo",
+        "noaaOIsst",
+        "timeseriesMeta",
+        "noaa-oi-sst-v2-high-res",
+    )
+    .await
+    .expect("failed to load OI SST dataset source at startup");
+    *OISST_SOURCE.lock().unwrap() = Some(oisst);
+
     HttpServer::new(|| {
         App::new()
             .service(bsose_handler)
+            .service(oisst_handler)
     })
     .bind(("0.0.0.0", 8080))?
     .run()

From 2f99ece96b875be5d9e0ceda68dd878734db0ee3 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Tue, 26 May 2026 19:31:40 -0400
Subject: [PATCH 04/13] test fixtures for noaa oisst

---
 api/fixtures/timeseriesMeta.json | 30 ++++++++++++++++++++++++++
 api/src/bin/seed_test_db.rs      | 36 +++++++++++++++++++-------------
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/api/fixtures/timeseriesMeta.json b/api/fixtures/timeseriesMeta.json
index e937a71..75ab60e 100644
--- a/api/fixtures/timeseriesMeta.json
+++ b/api/fixtures/timeseriesMeta.json
@@ -17,5 +17,35 @@
     "depth_r0_to_bottom": 4000.0,
     "interior_2d_mask": true,
     "depth_r0_to_ref_surface": 100.0
+  },
+  {
+    "_id": "noaa-oi-sst-v2-high-res",
+    "data_type": "noaa-oi-sst-v2-high-res",
+    "data_info": [
+      ["sst"],
+      ["units", "long_name"],
+      [["degC", "Weekly Mean of Sea Surface Temperature"]]
+    ],
+    "date_updated_argovis": "2026-05-26T20:48:56Z",
+    "timeseries": [
+      "2020-01-15T00:00:00Z",
+      "2020-04-15T00:00:00Z",
+      "2020-07-15T00:00:00Z",
+      "2020-10-15T00:00:00Z"
+    ],
+    "source": [
+      {
+        "source": ["NOAA Optimum Interpolation SST V2 High Resolution"],
+        "url": "https://psl.noaa.gov/data/gridded/data.noaa.oisst.v2.highres.html"
+      }
+    ],
+    "lattice": {
+      "center": [0.125, 0.125],
+      "spacing": [0.25, 0.25],
+      "minLat": -89.875,
+      "minLon": -179.875,
+      "maxLat": 89.875,
+      "maxLon": 179.875
+    }
   }
 ]
diff --git a/api/src/bin/seed_test_db.rs b/api/src/bin/seed_test_db.rs
index 9e94f68..71b99d5 100644
--- a/api/src/bin/seed_test_db.rs
+++ b/api/src/bin/seed_test_db.rs
@@ -1,16 +1,17 @@
 // Seeds a MongoDB instance with the test fixtures used by the integration tests.
 //
 // Run before starting the API container so the API picks up the right
-// `timeseriesMeta` document at startup:
+// metadata documents at startup:
 //
 //     MONGODB_URI=mongodb://localhost:27017 cargo run --bin seed_test_db
 //
 // What it does:
-//   * drops the `argo.bsose` and `argo.timeseriesMeta` collections
+//   * drops the `argo.bsose`, `argo.noaaOIsst`, and `argo.timeseriesMeta`
+//     collections
 //   * loads the JSON fixtures embedded at compile time
 //   * converts ISO-8601 strings in known date fields to BSON DateTimes
 //   * inserts the resulting documents
-//   * creates a 2dsphere index on `geolocation` for the bsose collection
+//   * creates a 2dsphere index on `geolocation` for each data collection
 //
 // Date fields in the fixtures are written as ISO-8601 strings to keep the
 // JSON readable; the seeder converts them to BSON DateTimes here, since
@@ -26,6 +27,7 @@ use std::env;
 const TIMESERIES_META_FIXTURE: &str =
     include_str!("../../fixtures/timeseriesMeta.json");
 const BSOSE_FIXTURE: &str = include_str!("../../fixtures/bsose.json");
+const NOAA_OISST_FIXTURE: &str = include_str!("../../fixtures/noaaOIsst.json");
 
 const DB_NAME: &str = "argo";
 
@@ -37,7 +39,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let client = Client::with_options(opts)?;
     let db = client.database(DB_NAME);
 
-    // timeseriesMeta has BSON dates in two fields
+    // timeseriesMeta carries both BSOSE and OI SST meta docs;
+    // `date_updated_argovis` and `timeseries` are BSON-date fields on both.
     seed_collection(
         &db,
         "timeseriesMeta",
@@ -46,17 +49,20 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     )
     .await?;
 
-    // bsose has no top-level date fields
-    seed_collection(&db, "bsose", BSOSE_FIXTURE, &[]).await?;
-
-    // Geospatial queries (`$geoWithin`, `$near`) require a 2dsphere index on
-    // the GeoJSON field. MongoDB picks a default index name from the keys.
-    let geo_index = IndexModel::builder()
-        .keys(bson::doc! { "geolocation": "2dsphere" })
-        .build();
-    db.collection::<Document>("bsose")
-        .create_index(geo_index, None)
-        .await?;
+    // Data collections — no top-level date fields. Each gets its own
+    // 2dsphere index on `geolocation` for `$geoWithin` / `$near` queries.
+    for (name, fixture) in [
+        ("bsose", BSOSE_FIXTURE),
+        ("noaaOIsst", NOAA_OISST_FIXTURE),
+    ] {
+        seed_collection(&db, name, fixture, &[]).await?;
+        let geo_index = IndexModel::builder()
+            .keys(bson::doc! { "geolocation": "2dsphere" })
+            .build();
+        db.collection::<Document>(name)
+            .create_index(geo_index, None)
+            .await?;
+    }
 
     println!("Seed complete: {} populated.", DB_NAME);
     Ok(())

From dbfdbcc2d58666cf3009c4a9d1d2619094eb08c3 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Tue, 26 May 2026 19:33:53 -0400
Subject: [PATCH 05/13] test fixtures for noaa oisst

---
 api/fixtures/noaaOIsst.json | 42 +++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 api/fixtures/noaaOIsst.json

diff --git a/api/fixtures/noaaOIsst.json b/api/fixtures/noaaOIsst.json
new file mode 100644
index 0000000..1582668
--- /dev/null
+++ b/api/fixtures/noaaOIsst.json
@@ -0,0 +1,42 @@
+[
+  {
+    "_id": "noaaoisst_doc_001",
+    "metadata": ["noaa-oi-sst-v2-high-res"],
+    "basin": 1.0,
+    "geolocation": { "type": "Point", "coordinates": [20.0, -50.0] },
+    "level": 0.0,
+    "data": [
+      [10.1, 11.2, 12.3, 13.4]
+    ]
+  },
+  {
+    "_id": "noaaoisst_doc_002",
+    "metadata": ["noaa-oi-sst-v2-high-res"],
+    "basin": 1.0,
+    "geolocation": { "type": "Point", "coordinates": [40.0, -40.0] },
+    "level": 0.0,
+    "data": [
+      [15.0, 16.0, 17.0, 18.0]
+    ]
+  },
+  {
+    "_id": "noaaoisst_doc_003",
+    "metadata": ["noaa-oi-sst-v2-high-res"],
+    "basin": 2.0,
+    "geolocation": { "type": "Point", "coordinates": [-170.0, -55.0] },
+    "level": 0.0,
+    "data": [
+      [-1.0, 0.0, 1.0, 2.0]
+    ]
+  },
+  {
+    "_id": "noaaoisst_doc_004",
+    "metadata": ["noaa-oi-sst-v2-high-res"],
+    "basin": 1.0,
+    "geolocation": { "type": "Point", "coordinates": [0.0, 45.0] },
+    "level": 0.0,
+    "data": [
+      [12.5, 13.5, 14.5, 15.5]
+    ]
+  }
+]

From 1d42f365ab09de850821582aeb23f58ea938c3e2 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Tue, 26 May 2026 21:50:58 -0400
Subject: [PATCH 06/13] turn on datasets with runtime env

---
 .github/workflows/test.yml        |   6 +
 api/PAGINATION.md                 |  36 +++++-
 api/src/helpers/dataset_config.rs |  40 ++++---
 api/src/main.rs                   | 185 ++++++++++++++++++++----------
 api/tests/integration.rs          |  30 +++--
 5 files changed, 209 insertions(+), 88 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e51268d..830efa2 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -24,7 +24,13 @@ jobs:
           --health-retries 12
 
     env:
+      # MONGODB_URI is used by the seeder binary. The API itself reads a
+      # per-dataset URI per dataset it serves — URI presence is the enable
+      # signal. Both point at the same Mongo container in CI; in production
+      # each dataset is typically deployed against its own Mongo.
       MONGODB_URI: mongodb://localhost:27017
+      MONGODB_URI_BSOSE: mongodb://localhost:27017
+      MONGODB_URI_NOAAOISST: mongodb://localhost:27017
       API_URL: http://localhost:8080
       CARGO_TERM_COLOR: always
       RUST_BACKTRACE: 1
diff --git a/api/PAGINATION.md b/api/PAGINATION.md
index acebde8..a54c7f4 100644
--- a/api/PAGINATION.md
+++ b/api/PAGINATION.md
@@ -160,12 +160,42 @@ implements `IsTimeseries`.
    discretisation is non-trivial) in `dataset_config.rs`.
 3. Add `<NAME>_SOURCE: Lazy<Mutex<Option<DatasetSource>>>` in
    `main.rs`, next to the existing dataset statics.
-4. In `main()`, load the source: `let x = load_dataset_source::<M>(...)
-   .await?; *<NAME>_SOURCE.lock().unwrap() = Some(x);`.
+4. In `main()`, gate on the dataset's URI env var and load conditionally:
+   ```
+   let mut enabled_<name> = false;
+   if let Some(client) = dataset_client("MONGODB_URI_<NAME>").await {
+       let x = load_dataset_source::<M>(client, ...).await?;
+       *<NAME>_SOURCE.lock().unwrap() = Some(x);
+       enabled_<name> = true;
+   }
+   ```
 5. Add a 4-line route handler annotated with `#[get("/timeseries/<name>")]`
    that clones the source out of the lock and forwards to
    `serve_timeseries::<S>`.
-6. Register the handler with `.service(<name>_handler)` on the `App`.
+6. Register the handler inside the `App::configure` callback, gated on
+   `enabled_<name>`. A dataset whose URI env var is unset stays
+   unregistered: no route, no startup work, no panic.
+
+### Per-deployment configuration
+
+Each dataset is enabled iff its `MONGODB_URI_<DATASET>` env var is set
+when `main()` runs. URI presence is the enable signal — there's no
+separate `DATASETS=` list to keep in sync. A deployment serving only
+one dataset just sets one env var:
+
+```
+MONGODB_URI_BSOSE=mongodb://bsose-mongo/ cargo run     # BSOSE-only
+MONGODB_URI_NOAAOISST=mongodb://noaa-mongo/ cargo run  # OI SST-only
+```
+
+For local dev / tests where one Mongo serves both, point both env vars
+at the same URI:
+
+```
+MONGODB_URI_BSOSE=mongodb://localhost:27017 \
+MONGODB_URI_NOAAOISST=mongodb://localhost:27017 \
+  cargo run
+```
 
 ### `data_info` precedence rule
 
diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs
index 0ec6cd0..5ab4084 100644
--- a/api/src/helpers/dataset_config.rs
+++ b/api/src/helpers/dataset_config.rs
@@ -10,15 +10,17 @@
 //! actually lives inside, so we skip probing tiles outside it.
 //!
 //! `DatasetSource` is the sibling struct that names *where* each dataset
-//! lives in Mongo (db, collection, meta collection, meta discriminator)
-//! and carries the startup-loaded metadata (the timeseries axis, the
-//! meta default `data_info`). It can't be `const`/`static` directly
-//! because the metadata fields are loaded from Mongo at runtime; it's
-//! built once in `main()` via `load_dataset_source` and stashed in a
-//! top-level `Lazy<Mutex<Option<DatasetSource>>>` (the same pattern the
-//! existing Mongo `CLIENT` static uses, so we don't drag in new
-//! initialization vocabulary). Handlers clone it out of the lock at the
-//! top of each request, then read its fields as plain owned data.
+//! lives in Mongo (db, collection, meta collection, meta discriminator),
+//! carries the per-dataset `mongodb::Client`, and carries the
+//! startup-loaded metadata (the timeseries axis, the meta default
+//! `data_info`). One `mongodb::Client` per dataset because the deployment
+//! topology may put each dataset in a different Mongo instance (env vars
+//! `MONGODB_URI_<DATASET>` per dataset). It can't be `const`/`static`
+//! directly because the metadata fields are loaded from Mongo at
+//! runtime; it's built once in `main()` via `load_dataset_source` and
+//! stashed in a top-level `Lazy<Mutex<Option<DatasetSource>>>`. Handlers
+//! clone it out of the lock at the top of each request, then read its
+//! fields as plain owned data.
 
 use mongodb::bson::DateTime as BsonDateTime;
 
@@ -55,9 +57,17 @@ pub struct DatasetConfig {
     pub coverage_bbox: Option<BoundingBox>,
 }
 
-/// Per-dataset identity + startup-loaded metadata, built once in `main()`.
+/// Per-dataset identity + Mongo client + startup-loaded metadata, built
+/// once in `main()`.
 ///
-/// The first four fields are the dataset's Mongo identity:
+/// `client` is this dataset's dedicated `mongodb::Client`. Each dataset
+/// gets its own URI (env `MONGODB_URI_<DATASET>`) and its own client —
+/// the deployment topology we expect is "BSOSE deployed against one
+/// Mongo, OI SST against another," so a single shared client doesn't
+/// fit. `mongodb::Client` is internally Arc-backed, so cloning it (and
+/// therefore cloning the whole `DatasetSource`) is cheap.
+///
+/// The next four fields are the dataset's Mongo identity:
 ///   - `db_name` / `collection`: where the data docs live.
 ///   - `meta_collection`: where the metadata doc lives. May or may not be
 ///     shared across datasets; today everything is in `timeseriesMeta`,
@@ -65,13 +75,14 @@ pub struct DatasetConfig {
 ///   - `meta_data_type`: the `data_type` discriminator that selects this
 ///     dataset's meta doc out of `meta_collection`.
 ///
-/// The remaining two fields are values we load *once* at startup from the
+/// The last two fields are values we load *once* at startup from the
 /// meta doc and read on every request. Plain owned types — no cells.
 ///
 /// `Clone` is derived so handlers can copy a `DatasetSource` out of the
 /// `Lazy<Mutex<Option<_>>>` static and use it locally without holding
-/// the mutex across `.await` points. The clone is small (a few KB of
-/// dates plus a few short strings).
+/// the mutex across `.await` points. The clone is cheap: the client is
+/// Arc-backed, identity strings are `&'static str`, and the metadata
+/// fields are a few KB of dates plus a few short strings.
 ///
 /// `data_info` is the *meta-level default* for the dataset. Per the
 /// precedence rule documented on `transforms::transform_timeseries`, a
@@ -82,6 +93,7 @@ pub struct DatasetConfig {
 /// it's still populated, just never consulted.
 #[derive(Clone)]
 pub struct DatasetSource {
+    pub client: mongodb::Client,
     pub db_name: &'static str,
     pub collection: &'static str,
     pub meta_collection: &'static str,
diff --git a/api/src/main.rs b/api/src/main.rs
index 9857e17..5fca235 100644
--- a/api/src/main.rs
+++ b/api/src/main.rs
@@ -36,17 +36,22 @@ use serde_json::{json, Value};
 
 use dataset_config::{DatasetConfig, DatasetSource};
 
-static CLIENT: Lazy<Mutex<Option<mongodb::Client>>> = Lazy::new(|| Mutex::new(None));
-
-// Per-dataset source-of-truth: identity strings + values loaded from the
-// dataset's meta doc at startup. Same pattern as CLIENT above — the
-// `Lazy<Mutex<Option<T>>>` is what lets us declare a static that's
-// initialized once, after main() reads the value out of Mongo. Handlers
-// clone the inner DatasetSource out of the lock at the top of each
-// request and use it locally, so the mutex is never held across `.await`.
+// Per-dataset source-of-truth: identity strings + the dataset's Mongo
+// client + values loaded from the dataset's meta doc at startup. The
+// `Lazy<Mutex<Option<T>>>` shape is what lets us declare a static that's
+// initialized once, after main() connects to Mongo and reads the meta
+// doc. Handlers clone the inner DatasetSource out of the lock at the top
+// of each request and use it locally, so the mutex is never held across
+// `.await`.
 //
-// One static per dataset. Adding a new dataset is one more static here
-// plus one more load_dataset_source/ load-and-set block in main().
+// One static per dataset. A dataset is "enabled" iff its
+// `MONGODB_URI_<DATASET>` env var is set; if so, main() loads it and
+// fills this static, and the route handler is registered on the App.
+// If the env var is unset, the static stays `None` and the handler is
+// never registered (no route, no surprises).
+//
+// Adding a new dataset is one more static here plus one more
+// load-and-register block in main() and a route handler above.
 static BSOSE_SOURCE: Lazy<Mutex<Option<DatasetSource>>> = Lazy::new(|| Mutex::new(None));
 static OISST_SOURCE: Lazy<Mutex<Option<DatasetSource>>> = Lazy::new(|| Mutex::new(None));
 
@@ -192,6 +197,7 @@ where
         let options = FindOptions::builder().build();
 
         let mut cursor = match generate_cursor::<S>(
+            &source.client,
             source.db_name,
             source.collection,
             filter,
@@ -238,6 +244,7 @@ where
                 "_id": { "$in": unique_metadata.into_iter().collect::<Vec<_>>() }
             };
             let meta_cursor = match generate_cursor::<Document>(
+                &source.client,
                 source.db_name,
                 source.meta_collection,
                 meta_filter,
@@ -393,17 +400,19 @@ fn next_url_value(
 
 /// Build a `DatasetSource` by reading the dataset's meta doc out of Mongo.
 ///
-/// Called once per dataset at server startup. The four `'static str`
-/// arguments are the dataset's Mongo identity; the resulting struct
-/// bundles those identity strings with the values we read out of the
-/// meta doc (`timeseries`, `data_info`). `main()` then stashes the
-/// returned struct into the dataset's `*_SOURCE` static for handlers
-/// to read on each request.
+/// Called once per dataset at server startup. `client` is the dataset's
+/// dedicated Mongo client (constructed from its `MONGODB_URI_<DATASET>`
+/// env var); the four `'static str` arguments are the dataset's Mongo
+/// identity. The resulting struct bundles the client + identity strings
+/// with the values we read out of the meta doc (`timeseries`,
+/// `data_info`). `main()` then stashes the returned struct into the
+/// dataset's `*_SOURCE` static for handlers to read on each request.
 ///
 /// Panics if the meta doc can't be found or the cursor errors. Both
 /// indicate startup misconfiguration that should fail loudly rather
 /// than serve stale or partial data.
 async fn load_dataset_source<M>(
+    client: mongodb::Client,
     db_name: &'static str,
     collection: &'static str,
     meta_collection: &'static str,
@@ -415,7 +424,7 @@ where
     let filter = mongodb::bson::doc! {"data_type": meta_data_type};
     let options = FindOptions::builder().limit(1).build();
     let mut cursor =
-        generate_cursor::<M>(db_name, meta_collection, filter, Some(options)).await?;
+        generate_cursor::<M>(&client, db_name, meta_collection, filter, Some(options)).await?;
 
     let meta = match cursor.next().await {
         Some(Ok(m)) => m,
@@ -430,6 +439,7 @@ where
     };
 
     Ok(DatasetSource {
+        client,
         db_name,
         collection,
         meta_collection,
@@ -439,59 +449,110 @@ where
     })
 }
 
+/// Read a per-dataset Mongo URI env var (e.g. `MONGODB_URI_BSOSE`) and
+/// build a `mongodb::Client` from it. Returns `Some(client)` if the var
+/// is set and parses cleanly, `None` if the var is unset or empty (the
+/// dataset is simply disabled for this deployment), or panics on a
+/// malformed URI / unreachable Mongo (treated as deployment-config
+/// bugs that should fail loudly at startup).
+async fn dataset_client(env_var: &str) -> Option<mongodb::Client> {
+    let uri = match env::var(env_var) {
+        Ok(u) if !u.is_empty() => u,
+        _ => return None,
+    };
+    let opts = mongodb::options::ClientOptions::parse(&uri)
+        .await
+        .unwrap_or_else(|e| panic!("invalid {}: {}", env_var, e));
+    let client = mongodb::Client::with_options(opts)
+        .unwrap_or_else(|e| panic!("could not build Mongo client for {}: {}", env_var, e));
+    Some(client)
+}
+
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
 
-    // Initialize the MongoDB client
-    let client_options = mongodb::options::ClientOptions::parse(env::var("MONGODB_URI").unwrap()).await.unwrap();
-    let client = mongodb::Client::with_options(client_options).unwrap();
-    *CLIENT.lock().unwrap() = Some(client);
-
-    // Load each dataset's source-of-truth and stash it in the static for
-    // handlers to read. The Mongo identity strings live here at the call
-    // site — one place per dataset — and the returned struct bundles
-    // them with the values read from the meta doc. Adding a new dataset
-    // is one more load-and-set block here.
-    let bsose = load_dataset_source::<schema::BsoseMeta>(
-        "argo",
-        "bsose",
-        "timeseriesMeta",
-        "BSOSE-profile",
-    )
-    .await
-    .expect("failed to load BSOSE dataset source at startup");
-    *BSOSE_SOURCE.lock().unwrap() = Some(bsose);
-
-    let oisst = load_dataset_source::<schema::OisstMeta>(
-        "argo",
-        "noaaOIsst",
-        "timeseriesMeta",
-        "noaa-oi-sst-v2-high-res",
-    )
-    .await
-    .expect("failed to load OI SST dataset source at startup");
-    *OISST_SOURCE.lock().unwrap() = Some(oisst);
+    // Each dataset is enabled iff its `MONGODB_URI_<DATASET>` env var is
+    // set. We build a Mongo client per enabled dataset, load its meta
+    // doc into the corresponding `*_SOURCE` static, and record a bool
+    // so the HttpServer factory below knows whether to register the
+    // dataset's route. An unset env var means "this deployment doesn't
+    // serve that dataset" — no load, no route, no surprises.
+    let mut enabled_bsose = false;
+    if let Some(client) = dataset_client("MONGODB_URI_BSOSE").await {
+        let bsose = load_dataset_source::<schema::BsoseMeta>(
+            client,
+            "argo",
+            "bsose",
+            "timeseriesMeta",
+            "BSOSE-profile",
+        )
+        .await
+        .expect("failed to load BSOSE dataset source at startup");
+        *BSOSE_SOURCE.lock().unwrap() = Some(bsose);
+        enabled_bsose = true;
+    }
 
-    HttpServer::new(|| {
-        App::new()
-            .service(bsose_handler)
-            .service(oisst_handler)
+    let mut enabled_oisst = false;
+    if let Some(client) = dataset_client("MONGODB_URI_NOAAOISST").await {
+        let oisst = load_dataset_source::<schema::OisstMeta>(
+            client,
+            "argo",
+            "noaaOIsst",
+            "timeseriesMeta",
+            "noaa-oi-sst-v2-high-res",
+        )
+        .await
+        .expect("failed to load OI SST dataset source at startup");
+        *OISST_SOURCE.lock().unwrap() = Some(oisst);
+        enabled_oisst = true;
+    }
+
+    if !enabled_bsose && !enabled_oisst {
+        eprintln!(
+            "warning: no datasets enabled. Set at least one of \
+             MONGODB_URI_BSOSE / MONGODB_URI_NOAAOISST."
+        );
+    } else {
+        println!(
+            "Datasets enabled:{}{}",
+            if enabled_bsose { " bsose" } else { "" },
+            if enabled_oisst { " noaaoisst" } else { "" },
+        );
+    }
+
+    // `App::configure` lets us register routes conditionally without
+    // running into the "each `.service(...)` call returns a new App
+    // type" problem.
+    HttpServer::new(move || {
+        App::new().configure(move |cfg| {
+            if enabled_bsose {
+                cfg.service(bsose_handler);
+            }
+            if enabled_oisst {
+                cfg.service(oisst_handler);
+            }
+        })
     })
     .bind(("0.0.0.0", 8080))?
     .run()
     .await
 }
 
-async fn generate_cursor<T: DeserializeOwned>(db_name: &str, collection_name: &str, filter: Document, options: Option<FindOptions>) -> Result<mongodb::Cursor<T>> {
-    let client = {
-        let guard = match CLIENT.lock() {
-            Ok(guard) => guard,
-            Err(poisoned) => poisoned.into_inner(),
-        };
-        match guard.as_ref() {
-            Some(client) => client.clone(),
-            None => return Err(mongodb::error::Error::from(std::io::Error::new(std::io::ErrorKind::Other, "Client is None"))),
-        }
-    };
-    client.database(db_name).collection::<T>(collection_name).find(filter, options).await
+/// Thin wrapper around `client.database(...).collection(...).find(...)`.
+/// Takes the Mongo client by reference so callers can keep one client
+/// per dataset (see `DatasetSource::client`) instead of reaching into a
+/// global. The wrapper exists mostly so the call sites read uniformly
+/// across the codebase.
+async fn generate_cursor<T: DeserializeOwned>(
+    client: &mongodb::Client,
+    db_name: &str,
+    collection_name: &str,
+    filter: Document,
+    options: Option<FindOptions>,
+) -> Result<mongodb::Cursor<T>> {
+    client
+        .database(db_name)
+        .collection::<T>(collection_name)
+        .find(filter, options)
+        .await
 }
\ No newline at end of file
diff --git a/api/tests/integration.rs b/api/tests/integration.rs
index d8e8f3d..3be701e 100644
--- a/api/tests/integration.rs
+++ b/api/tests/integration.rs
@@ -1,19 +1,31 @@
 // Integration tests against a live API + MongoDB.
 //
 // Preconditions:
-//   * `cargo run --bin seed_test_db` has been run against the same MongoDB
-//     the API is connected to.
-//   * The API has been (re)started AFTER the seed, so it caches the right
-//     `timeseriesMeta.timeseries` vector at startup.
-//   * API_URL points at the running API (default: http://localhost:8080).
-//   * MONGODB_URI is reachable (default: mongodb://localhost:27017). It is
-//     not used directly by these tests but is read so that misconfigured
-//     environments fail loudly.
+//   * `cargo run --bin seed_test_db` has been run against the MongoDB the
+//     API is connected to. The seeder uses its own `MONGODB_URI` env var
+//     (distinct from the per-dataset URIs the API uses) to know where
+//     to write fixtures.
+//   * The API has been (re)started AFTER the seed, so it caches the
+//     right `timeseriesMeta` documents at startup.
+//   * The API process has per-dataset Mongo URIs set: at minimum
+//     `MONGODB_URI_BSOSE`, and `MONGODB_URI_NOAAOISST` if OI SST tests
+//     are enabled. A dataset whose env var is unset is simply not
+//     served by that deployment.
+//   * `API_URL` points at the running API (default
+//     `http://localhost:8080`). `MONGODB_URI` is read by these tests
+//     only as a sanity-check that the environment is configured;
+//     they don't connect to Mongo themselves.
 //
 // Run with:
-//   API_URL=http://localhost:8080 MONGODB_URI=mongodb://localhost:27017 \
+//   API_URL=http://localhost:8080 \
+//   MONGODB_URI=mongodb://localhost:27017 \
 //     cargo test --test integration -- --test-threads=1
 //
+// And start the API with, e.g. (same Mongo for both datasets locally):
+//   MONGODB_URI_BSOSE=mongodb://localhost:27017 \
+//   MONGODB_URI_NOAAOISST=mongodb://localhost:27017 \
+//     cargo run
+//
 // Every response is the paginated envelope:
 //   { "docs": [...], "next_url": "<rel path?…>" | null, "message": "..." }
 // Multi-tile queries (anything spanning multiple grid cells or vertical

From 888d2294ebb9b748504639f559fa54e9a939c521 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Tue, 26 May 2026 23:47:48 -0400
Subject: [PATCH 07/13] dont append data_info and timeseries if we dont need
 them

---
 api/PAGINATION.md             |  19 ++-
 api/src/helpers/schema.rs     |  49 ++++---
 api/src/helpers/transforms.rs | 243 ++++++++++++++++++++++++----------
 3 files changed, 223 insertions(+), 88 deletions(-)

diff --git a/api/PAGINATION.md b/api/PAGINATION.md
index a54c7f4..a89b25b 100644
--- a/api/PAGINATION.md
+++ b/api/PAGINATION.md
@@ -197,7 +197,24 @@ MONGODB_URI_NOAAOISST=mongodb://localhost:27017 \
   cargo run
 ```
 
-### `data_info` precedence rule
+### Response-shape rule for `data_info` and `timeseries`
+
+The fields `data_info` and `timeseries` on a response doc appear *only
+when the user's query has materially altered the dataset-wide defaults*:
+
+- `data_info` appears iff the user supplied `data=`. Without `data=`,
+  the column layout matches the dataset default and the field is
+  omitted (clients fall back to the meta endpoint).
+- `timeseries` appears iff the user supplied `startDate` or `endDate`.
+  Without either, the time axis matches the dataset default and the
+  field is omitted.
+
+The intent is response slimness: the data response carries only what
+*differs* from the meta endpoint's dataset-wide values. With both
+qsps unset, response docs are short (just `_id`, geolocation, level,
+metadata, and an empty `data` array).
+
+### `data_info` precedence rule (when `data=` is set)
 
 `data_info` (variable names, units, per-variable descriptors) may
 appear on either the data doc, the meta doc, or both:
diff --git a/api/src/helpers/schema.rs b/api/src/helpers/schema.rs
index 4883bb9..abfb6af 100644
--- a/api/src/helpers/schema.rs
+++ b/api/src/helpers/schema.rs
@@ -34,8 +34,14 @@ pub trait IsTimeseries {
     fn set_data(&mut self, data: Vec<Vec<f64>>);
     fn timeseries(&mut self) -> Option<&mut Vec<String>>;
     fn set_timeseries(&mut self, timeseries: Vec<String>);
-    fn data_info(&mut self) -> DataInfo;
-    fn set_data_info(&mut self, data_info: DataInfo);
+    /// `data_info` is `Option` because the response carries it only when
+    /// the user's query has *materially altered* the data layout —
+    /// concretely, when the `data=` qsp triggers column filtering. In
+    /// the no-`data=` case `transform_timeseries` writes `None` here so
+    /// the response omits the field, and clients fall back to the
+    /// dataset-wide `data_info` on the meta endpoint.
+    fn data_info(&mut self) -> Option<DataInfo>;
+    fn set_data_info(&mut self, data_info: Option<DataInfo>);
     fn _id(&self) -> String;
     fn longitude(&self) -> f64;
     fn latitude(&self) -> f64;
@@ -73,9 +79,18 @@ pub struct BsoseSchema {
     pub(crate) cell_z_size: f64,
     pub(crate) reference_density_profile: f64,
     pub(crate) data: Vec<Vec<f64>>,
-    // Not present in the source collection — gets populated by transforms.
+    // Both of the next two are response-shape-driven: present iff the
+    // user's query made the dataset-wide default insufficient. Per the
+    // response-shape rule, `data_info` appears only when `data=` qsp
+    // triggered column filtering; `timeseries` appears only when the
+    // user cut the time axis with `startDate` / `endDate`. Both
+    // serialize-absent when None and default to None / empty on
+    // deserialization, so a BSOSE source doc that carries `data_info`
+    // (the per-cell default for BSOSE today) still reads cleanly.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
     pub(crate) timeseries: Option<Vec<String>>,
-    pub(crate) data_info: DataInfo,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub(crate) data_info: Option<DataInfo>,
 }
 
 impl IsTimeseries for BsoseSchema {
@@ -99,11 +114,11 @@ impl IsTimeseries for BsoseSchema {
         self.timeseries = Some(timeseries);
     }
 
-    fn data_info(&mut self) -> DataInfo {
+    fn data_info(&mut self) -> Option<DataInfo> {
         self.data_info.clone()
     }
 
-    fn set_data_info(&mut self, data_info: DataInfo) {
+    fn set_data_info(&mut self, data_info: Option<DataInfo>) {
         self.data_info = data_info;
     }
 
@@ -209,16 +224,16 @@ pub struct OisstSchema {
     pub(crate) level: f64,
     pub(crate) data: Vec<Vec<f64>>,
     // OI SST data docs don't carry `timeseries` or `data_info` of their
-    // own — both are populated at request time. `timeseries` is filled
-    // by `slice_timerange`; `data_info` is stamped from the per-dataset
-    // cached default by `transform_timeseries` (the meta doc holds the
-    // dataset-wide variable info, single-variable for OI SST). Both
-    // default at deserialization so an absent field in the source doc
-    // produces an empty/None value rather than a parse error.
-    #[serde(default)]
+    // own — both are populated at request time per the response-shape
+    // rule. `timeseries` is filled by `slice_timerange` iff the user
+    // set `startDate` / `endDate`; `data_info` is stamped (from the
+    // per-dataset cached default) by `transform_timeseries` iff the
+    // user set `data=`. Both serialize-absent when None and default to
+    // None on deserialization.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
     pub(crate) timeseries: Option<Vec<String>>,
-    #[serde(default)]
-    pub(crate) data_info: DataInfo,
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub(crate) data_info: Option<DataInfo>,
 }
 
 impl IsTimeseries for OisstSchema {
@@ -242,11 +257,11 @@ impl IsTimeseries for OisstSchema {
         self.timeseries = Some(timeseries);
     }
 
-    fn data_info(&mut self) -> DataInfo {
+    fn data_info(&mut self) -> Option<DataInfo> {
         self.data_info.clone()
     }
 
-    fn set_data_info(&mut self, data_info: DataInfo) {
+    fn set_data_info(&mut self, data_info: Option<DataInfo>) {
         self.data_info = data_info;
     }
 
diff --git a/api/src/helpers/transforms.rs b/api/src/helpers/transforms.rs
index adbda37..c33a71a 100644
--- a/api/src/helpers/transforms.rs
+++ b/api/src/helpers/transforms.rs
@@ -6,22 +6,33 @@ use mongodb::bson::DateTime as BsonDateTime;
 /// timeseries document. Returns `None` if the document was filtered out
 /// entirely (e.g. `data=somefield` produced no matching columns).
 ///
-/// `cached_data_info` is the *meta-level default* `data_info` for this
-/// dataset — read once at startup from the meta doc and stashed on the
-/// dataset's `DatasetSource`. Precedence rule for `data_info`:
+/// Response-shape rules enforced here:
+///
+///   - `timeseries` appears on the returned doc *iff* the user supplied
+///     `startDate` or `endDate`. Otherwise the field stays `None`, and
+///     clients fall back to the dataset-wide timeseries on the meta
+///     endpoint. (Mechanically: `slice_timerange` populates the field
+///     only when invoked, which only happens for date-bounded queries.)
+///
+///   - `data_info` appears on the returned doc *iff* the user supplied
+///     `data=`. With `data=` set we materialise the working `data_info`
+///     (see precedence rule below) onto the doc so `slice_data` can
+///     filter, and the resulting filtered `data_info` rides along in
+///     the response. Without `data=`, we scrub `data_info` to `None`
+///     (even if the source doc carried one — the BSOSE case) so the
+///     response stays slim and clients defer to the meta endpoint.
+///
+/// Precedence rule for the working `data_info` when `data=` is set:
 ///
 ///   - If the data doc carries its own non-empty `data_info` (the BSOSE
-///     case today), it wins; the cached default is ignored.
+///     case today), it wins; the cached meta-level default is ignored.
 ///   - If the data doc has no `data_info` (the OI SST case — single
 ///     variable, info kept on the meta doc), the cached default is
 ///     stamped onto the doc before `slice_data` runs.
-///   - If both are empty, `slice_data` will return `None` for any
-///     specific-variable request (no columns to match) and an
-///     empty-data passthrough for `data=` / `data=all` — same as today.
 ///
-/// The cache passes through as `&DataInfo` rather than `Option`: an empty
-/// tuple is its own "no default" sentinel, matching the convention
-/// `slice_data` already uses for "no fields".
+/// The cache parameter is `&DataInfo` (not `Option`): an empty tuple
+/// is its own "no default" sentinel, so a dataset whose meta doc has
+/// no `data_info` field just lands an empty tuple in the cache.
 pub fn transform_timeseries<T: schema::IsTimeseries>(
     params: &serde_json::Value,
     ts: &[BsonDateTime],
@@ -45,15 +56,25 @@ pub fn transform_timeseries<T: schema::IsTimeseries>(
         slice_timerange(start_date, end_date, ts, &mut doc);
     }
 
-    // Doc-level data_info takes precedence; only fall back to the cached
-    // meta-level default when the doc itself carries no variable names.
-    // `data_info.0` is the variable-names vector, so its emptiness is the
-    // canonical "no data_info" check.
-    if doc.data_info().0.is_empty() && !cached_data_info.0.is_empty() {
-        doc.set_data_info(cached_data_info.clone());
+    if data.is_empty() {
+        // No `data=` qsp. Clear `data` (the historical behaviour for
+        // this code path) and scrub `data_info` so the response omits
+        // it. Clients that need variable info read the meta endpoint.
+        doc.set_data(Vec::new());
+        doc.set_data_info(None);
+        return Some(doc);
     }
 
-    slice_data(&data, doc)
+    // `data=` qsp set. Materialise the working `data_info`: doc-level
+    // wins over cache, and an empty (or absent) doc-level value falls
+    // back to the meta-level cached default.
+    let working_info: schema::DataInfo = doc
+        .data_info()
+        .filter(|di| !di.0.is_empty())
+        .unwrap_or_else(|| cached_data_info.clone());
+    doc.set_data_info(Some(working_info.clone()));
+
+    slice_data(&data, &working_info, doc)
 }
 
 /// Slice the document's data columns and timeseries field to the time window
@@ -91,30 +112,32 @@ pub fn slice_timerange<T: schema::IsTimeseries>(
     }
 }
 
-/// Apply the `data=` query parameter to a single document. Behaviour mirrors
-/// the previous Vec-based implementation:
+/// Apply the `data=` query parameter to a single document. Called only
+/// when `data=` was actually supplied — the empty-`data` case is handled
+/// by `transform_timeseries` before getting here.
+///
+///   - `data` contains "all": leaves data/data_info untouched.
+///   - otherwise: filters the data columns down to the named variables
+///     and writes the filtered `data_info` back onto the doc; returns
+///     `None` if no requested variables match (caller drops the row).
+///   - if `data` also contains "except_data_values", clears the `data`
+///     field after column-filtering, but keeps the row (the filtered
+///     `data_info` is what the caller wanted to see).
 ///
-///   - empty `data`: clears the document's `data` field but keeps the row.
-///   - `data` contains "all": leaves everything untouched.
-///   - otherwise: filters the data columns down to the named variables;
-///     returns `None` if no requested variables match (caller drops the row).
-///   - if `data` also contains "except_data_values", clears the data field
-///     after column-filtering, but keeps the row.
+/// `data_info` is passed in as an explicit `&DataInfo` rather than read
+/// off the doc — the caller (`transform_timeseries`) is responsible
+/// for deciding which `data_info` is in effect (doc-level vs cached
+/// meta-level default) and putting it on the doc before calling here.
 pub fn slice_data<T: schema::IsTimeseries>(
     data: &[String],
+    data_info: &schema::DataInfo,
     mut doc: T,
 ) -> Option<T> {
-    if data.is_empty() {
-        doc.set_data(Vec::new());
-        return Some(doc);
-    }
-
     if data.iter().any(|s| s == "all") {
         return Some(doc);
     }
 
     // Specific fields requested — filter columns.
-    let data_info = doc.data_info();
     let indexes: Vec<usize> = data
         .iter()
         .filter_map(|item| data_info.0.iter().position(|x| x == item))
@@ -126,12 +149,12 @@ pub fn slice_data<T: schema::IsTimeseries>(
         .collect();
     doc.set_data(filtered_data);
 
-    let filtered_data_info: (Vec<String>, Vec<String>, Vec<Vec<String>>) = (
+    let filtered_data_info: schema::DataInfo = (
         indexes.iter().filter_map(|&i| data_info.0.get(i).cloned()).collect(),
         data_info.1.clone(),
         indexes.iter().filter_map(|&i| data_info.2.get(i).cloned()).collect(),
     );
-    doc.set_data_info(filtered_data_info);
+    doc.set_data_info(Some(filtered_data_info));
 
     // No matching columns -> caller drops the row.
     if doc.data().is_empty() {
@@ -160,21 +183,33 @@ pub fn timeseries_stub<T: schema::IsTimeseries>(result: &T) -> schema::Timeserie
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::helpers::schema::{BsoseSchema, GeoJSONPoint, IsTimeseries};
+    use crate::helpers::schema::{BsoseSchema, DataInfo, GeoJSONPoint, IsTimeseries};
     use serde_json::json;
 
-    // Helper: construct a BsoseSchema directly. Field-level visibility is
-    // `pub(crate)` so this struct literal works from any test in the crate
-    // and gets compile-time field checking — if a schema field is renamed,
-    // the test stops compiling.
-    fn make_bsose(id: &str, data: Vec<Vec<f64>>, var_names: &[&str]) -> BsoseSchema {
+    // Helper: construct a DataInfo from variable names. Used both inside
+    // `make_bsose` and at slice_data call sites that need to pass an
+    // explicit DataInfo.
+    fn make_data_info(var_names: &[&str]) -> DataInfo {
         let names: Vec<String> = var_names.iter().map(|s| s.to_string()).collect();
         let units = vec!["units".to_string(), "long_name".to_string()];
         let per_var_info: Vec<Vec<String>> = names
             .iter()
             .map(|n| vec!["u".to_string(), n.clone()])
             .collect();
+        (names, units, per_var_info)
+    }
+
+    /// Empty DataInfo sentinel — used as the "no meta-level default" cache
+    /// value in tests that exercise the BSOSE-style doc-level precedence.
+    fn empty_data_info() -> DataInfo {
+        (vec![], vec![], vec![])
+    }
 
+    // Helper: construct a BsoseSchema directly. Field-level visibility is
+    // `pub(crate)` so this struct literal works from any test in the crate
+    // and gets compile-time field checking — if a schema field is renamed,
+    // the test stops compiling.
+    fn make_bsose(id: &str, data: Vec<Vec<f64>>, var_names: &[&str]) -> BsoseSchema {
         BsoseSchema {
             _id: id.to_string(),
             metadata: vec!["meta1".to_string()],
@@ -190,7 +225,7 @@ mod tests {
             reference_density_profile: 1.0,
             data,
             timeseries: None,
-            data_info: (names, units, per_var_info),
+            data_info: Some(make_data_info(var_names)),
         }
     }
 
@@ -243,60 +278,51 @@ mod tests {
     }
 
     // ---- slice_data ----------------------------------------------------------
-
-    #[test]
-    fn slice_data_empty_request_drops_data() {
-        let doc = make_bsose(
-            "doc1",
-            vec![vec![1.0, 2.0], vec![3.0, 4.0]],
-            &["temp", "salinity"],
-        );
-        let mut out = slice_data(&[], doc).expect("empty data param keeps the row");
-        assert!(out.data().is_empty());
-    }
+    //
+    // `slice_data` is only called from `transform_timeseries` in the
+    // `data=` qsp case, so these tests always supply a non-empty `data`
+    // vector. The empty-data case is exercised through the
+    // `transform_timeseries` tests below.
 
     #[test]
     fn slice_data_all_keeps_everything() {
+        let info = make_data_info(&["temp", "salinity"]);
         let doc = make_bsose(
             "doc1",
             vec![vec![1.0, 2.0], vec![3.0, 4.0]],
             &["temp", "salinity"],
         );
-        let mut out = slice_data(&["all".to_string()], doc).unwrap();
+        let mut out = slice_data(&["all".to_string()], &info, doc).unwrap();
         assert_eq!(*out.data(), vec![vec![1.0, 2.0], vec![3.0, 4.0]]);
     }
 
     #[test]
     fn slice_data_specific_field_filters_columns() {
+        let info = make_data_info(&["temp", "salinity"]);
         let doc = make_bsose(
             "doc1",
             vec![vec![1.0, 2.0], vec![3.0, 4.0]],
             &["temp", "salinity"],
         );
-        let mut out = slice_data(&["salinity".to_string()], doc).unwrap();
+        let mut out = slice_data(&["salinity".to_string()], &info, doc).unwrap();
         assert_eq!(*out.data(), vec![vec![3.0, 4.0]]);
     }
 
     #[test]
     fn slice_data_unknown_field_drops_result() {
-        let doc = make_bsose(
-            "doc1",
-            vec![vec![1.0, 2.0]],
-            &["temp"],
-        );
-        let out = slice_data(&["nonexistent".to_string()], doc);
+        let info = make_data_info(&["temp"]);
+        let doc = make_bsose("doc1", vec![vec![1.0, 2.0]], &["temp"]);
+        let out = slice_data(&["nonexistent".to_string()], &info, doc);
         assert!(out.is_none(), "no matching columns -> row is dropped");
     }
 
     #[test]
     fn slice_data_except_data_values_clears_after_filtering() {
-        let doc = make_bsose(
-            "doc1",
-            vec![vec![1.0, 2.0]],
-            &["temp"],
-        );
+        let info = make_data_info(&["temp"]);
+        let doc = make_bsose("doc1", vec![vec![1.0, 2.0]], &["temp"]);
         let mut out = slice_data(
             &["temp".to_string(), "except_data_values".to_string()],
+            &info,
             doc,
         )
         .unwrap();
@@ -323,8 +349,7 @@ mod tests {
         // Empty cached_data_info — this doc already carries its own
         // data_info, so the precedence rule means the cache is never
         // consulted regardless.
-        let empty_cache: schema::DataInfo = (vec![], vec![], vec![]);
-        let mut out = transform_timeseries(&params, &timeseries, &empty_cache, doc).unwrap();
+        let mut out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc).unwrap();
         assert_eq!(*out.data(), vec![vec![20.0, 30.0]]);
     }
 
@@ -339,10 +364,10 @@ mod tests {
         let mut doc = make_bsose(
             "doc1",
             vec![vec![1.0, 2.0]],
-            &["sst"], // gets cleared below to simulate a docless data_info
+            &["sst"], // overridden to None below to simulate an OI SST doc
         );
         // Clear the doc's data_info so the cache fallback kicks in.
-        doc.data_info = (vec![], vec![], vec![]);
+        doc.data_info = None;
 
         let cache: schema::DataInfo = (
             vec!["sst".to_string()],
@@ -357,7 +382,7 @@ mod tests {
         assert_eq!(*out.data(), vec![vec![1.0, 2.0]]);
         // data_info has been stamped from the cache, then column-filtered
         // by slice_data — should still list sst.
-        let info = out.data_info();
+        let info = out.data_info().expect("data_info present when data= is set");
         assert_eq!(info.0, vec!["sst".to_string()]);
     }
 
@@ -386,8 +411,86 @@ mod tests {
         let mut out =
             transform_timeseries(&params, &timeseries, &cache, doc).expect("should resolve");
         assert_eq!(*out.data(), vec![vec![1.0, 2.0]]);
-        let info = out.data_info();
+        let info = out.data_info().expect("data_info present when data= is set");
+        assert_eq!(info.0, vec!["temp".to_string()]);
+    }
+
+    // ---- response-shape rule: data_info & timeseries omitted unless munged --
+
+    #[test]
+    fn transform_omits_data_info_when_no_data_qsp() {
+        // No `data=` in params: the response doc should carry no
+        // data_info — clients fall back to the meta endpoint.
+        let timeseries = ts(&[1, 2]);
+        let doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0], vec![3.0, 4.0]],
+            &["temp", "salinity"],
+        );
+        let params = json!({}); // no data=, no dates
+        let mut out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc)
+            .expect("doc should survive");
+        assert!(
+            out.data_info().is_none(),
+            "data_info should be absent when data= qsp is unset, got {:?}",
+            out.data_info()
+        );
+        // data field is cleared in the no-data= branch (historical behaviour).
+        assert!(out.data().is_empty());
+    }
+
+    #[test]
+    fn transform_omits_timeseries_when_no_date_qsp() {
+        // No `startDate` / `endDate`: the response doc should carry no
+        // timeseries field — clients fall back to the meta endpoint's
+        // dataset-wide timeseries.
+        let timeseries = ts(&[1, 2]);
+        let doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0], vec![3.0, 4.0]],
+            &["temp", "salinity"],
+        );
+        let params = json!({"data": "all"});
+        let mut out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc)
+            .expect("doc should survive");
+        assert!(
+            out.timeseries().is_none(),
+            "timeseries should be absent when neither startDate nor endDate is set"
+        );
+    }
+
+    #[test]
+    fn transform_includes_timeseries_when_start_date_set() {
+        // startDate alone is enough to trigger timeseries on the response.
+        let timeseries = ts(&[1, 2, 3]);
+        let doc = make_bsose("doc1", vec![vec![1.0, 2.0, 3.0]], &["temp"]);
+        let params = json!({"startDate": "2020-02-01T00:00:00Z"});
+        let mut out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc)
+            .expect("doc should survive");
+        let ts_field = out.timeseries().expect("timeseries present when startDate set");
+        assert_eq!(ts_field.len(), 2);
+        assert!(ts_field[0].starts_with("2020-02-01"));
+        // Without `data=`, data_info still absent.
+        assert!(out.data_info().is_none());
+    }
+
+    #[test]
+    fn transform_includes_data_info_when_data_qsp_set() {
+        // `data=` alone is enough to trigger data_info on the response,
+        // even without date params.
+        let timeseries = ts(&[1, 2]);
+        let doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0], vec![3.0, 4.0]],
+            &["temp", "salinity"],
+        );
+        let params = json!({"data": "temp"});
+        let mut out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc)
+            .expect("doc should survive");
+        let info = out.data_info().expect("data_info present when data= is set");
         assert_eq!(info.0, vec!["temp".to_string()]);
+        // Without dates, timeseries still absent.
+        assert!(out.timeseries().is_none());
     }
 
     // ---- timeseries_stub -----------------------------------------------------

From 0b6413b2650da17d42bd47246cc840876845b100 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Wed, 27 May 2026 12:43:06 -0400
Subject: [PATCH 08/13] manage presence / nonpresence of data prop

---
 api/PAGINATION.md             |  28 +++++++---
 api/src/helpers/schema.rs     |   9 +++
 api/src/helpers/transforms.rs | 101 +++++++++++++++++++++++++++++++++-
 3 files changed, 129 insertions(+), 9 deletions(-)

diff --git a/api/PAGINATION.md b/api/PAGINATION.md
index a89b25b..242dc01 100644
--- a/api/PAGINATION.md
+++ b/api/PAGINATION.md
@@ -95,7 +95,7 @@ antimeridian / north-pole docs aren't lost.
 | `center` + `radius` | JSON `[lon, lat]` + meters | Disk query. Radius capped at the dataset's `max_radius_meters`. |
 | `verticalRange` | JSON `[lo, hi]` | Half-open depth range applied on top of tile-level pagination. |
 | `startDate` / `endDate` | RFC-3339 string | Slices each doc's timeseries to this window. |
-| `data` | comma-separated | Variables to include. `all` keeps everything. `except_data_values` keeps the schema but clears values. |
+| `data` | comma-separated | Variables to include. `all` keeps everything; a specific list filters columns. If the doc has no data to return after filtering (no matching columns, or a time window that collapsed to zero points), the whole doc is dropped from the response. Omitting `data=` entirely also omits the `data` field from each response doc — use that for slim listings. `except_data_values` in the list keeps the row but clears the values (schema-only mode) — that's the one case where an empty `data` array doesn't drop the doc. |
 | `compression` | `minimal` | See mode flags. |
 | `batchmeta` | any | See mode flags. |
 | `tile_index` | non-negative integer | Pagination cursor. Default `0`. Almost always supplied by the previous response's `next_url`. |
@@ -197,11 +197,15 @@ MONGODB_URI_NOAAOISST=mongodb://localhost:27017 \
   cargo run
 ```
 
-### Response-shape rule for `data_info` and `timeseries`
+### Response-shape rules
 
-The fields `data_info` and `timeseries` on a response doc appear *only
-when the user's query has materially altered the dataset-wide defaults*:
+Three fields on a response doc — `data`, `data_info`, `timeseries` —
+appear *only when the user's query has materially asked for or altered
+them*:
 
+- `data` appears iff the user supplied `data=`. Without `data=`,
+  the field is omitted entirely and clients are signalled that they
+  asked for no per-cell values. Use this for slim listings.
 - `data_info` appears iff the user supplied `data=`. Without `data=`,
   the column layout matches the dataset default and the field is
   omitted (clients fall back to the meta endpoint).
@@ -209,10 +213,18 @@ when the user's query has materially altered the dataset-wide defaults*:
   Without either, the time axis matches the dataset default and the
   field is omitted.
 
-The intent is response slimness: the data response carries only what
-*differs* from the meta endpoint's dataset-wide values. With both
-qsps unset, response docs are short (just `_id`, geolocation, level,
-metadata, and an empty `data` array).
+A consequence of the `data` rule: if `data=` *is* supplied but the
+resulting data is empty (no columns survived filtering, or every
+column's time window collapsed to zero points), the whole doc is
+dropped from the response rather than serialized with an empty
+array. This keeps responses honest — a doc in the response always
+carries something the user asked for. The one exception is
+`except_data_values`: when present in the `data=` list, the user has
+explicitly opted into a schema-only response, so an empty `data`
+array is what they asked for and the doc stays.
+
+With all three qsps unset, response docs are short: `_id`,
+geolocation, level, metadata.
 
 ### `data_info` precedence rule (when `data=` is set)
 
diff --git a/api/src/helpers/schema.rs b/api/src/helpers/schema.rs
index abfb6af..8a7114b 100644
--- a/api/src/helpers/schema.rs
+++ b/api/src/helpers/schema.rs
@@ -78,6 +78,12 @@ pub struct BsoseSchema {
     pub(crate) sea_binary_mask_at_t_locaiton: bool,
     pub(crate) cell_z_size: f64,
     pub(crate) reference_density_profile: f64,
+    // `data` is the per-variable per-timestep array. Omitted from the
+    // response when empty so the no-`data=` qsp response stays slim
+    // (transform_timeseries clears it in that branch). When `data=` is
+    // set and `data` ends up empty after filtering, the whole doc gets
+    // dropped before serialization, so an empty array never ships.
+    #[serde(skip_serializing_if = "Vec::is_empty")]
     pub(crate) data: Vec<Vec<f64>>,
     // Both of the next two are response-shape-driven: present iff the
     // user's query made the dataset-wide default insufficient. Per the
@@ -222,6 +228,9 @@ pub struct OisstSchema {
     pub(crate) basin: f64,
     pub(crate) geolocation: GeoJSONPoint,
     pub(crate) level: f64,
+    // Omitted from the response when empty (no `data=` qsp); see the
+    // matching annotation on `BsoseSchema.data` for the full reasoning.
+    #[serde(skip_serializing_if = "Vec::is_empty")]
     pub(crate) data: Vec<Vec<f64>>,
     // OI SST data docs don't carry `timeseries` or `data_info` of their
     // own — both are populated at request time per the response-shape
diff --git a/api/src/helpers/transforms.rs b/api/src/helpers/transforms.rs
index c33a71a..f17ed9e 100644
--- a/api/src/helpers/transforms.rs
+++ b/api/src/helpers/transforms.rs
@@ -74,7 +74,27 @@ pub fn transform_timeseries<T: schema::IsTimeseries>(
         .unwrap_or_else(|| cached_data_info.clone());
     doc.set_data_info(Some(working_info.clone()));
 
-    slice_data(&data, &working_info, doc)
+    let mut sliced = slice_data(&data, &working_info, doc)?;
+
+    // Drop-on-empty: when the user asked for data and the surviving
+    // data is empty — whether the outer Vec is empty (no matching
+    // columns survived) OR the outer is non-empty but every inner is
+    // empty (e.g. a time window that collapsed every column to zero
+    // points) — the doc has nothing useful to convey, so drop it.
+    // `iter().all(|inner| inner.is_empty())` returns true for both
+    // shapes (vacuously true on an empty outer), so a single check
+    // covers both.
+    //
+    // Exception: `except_data_values` in the data list is an explicit
+    // "I want the (filtered) data_info but not the values" signal, so
+    // an empty data array is what the user asked for, not a sign that
+    // we should drop. Skip the drop check in that case.
+    let user_wants_empty_data = data.iter().any(|s| s == "except_data_values");
+    if !user_wants_empty_data && sliced.data().iter().all(|inner| inner.is_empty()) {
+        return None;
+    }
+
+    Some(sliced)
 }
 
 /// Slice the document's data columns and timeseries field to the time window
@@ -493,6 +513,85 @@ mod tests {
         assert!(out.timeseries().is_none());
     }
 
+    // ---- drop-on-empty rule (data= set but data ends up empty) --------------
+
+    #[test]
+    fn transform_drops_doc_when_no_matching_columns() {
+        // User asked for a variable that doesn't exist on this doc.
+        // slice_data filters to an empty outer Vec → doc is dropped.
+        let timeseries = ts(&[1, 2]);
+        let doc = make_bsose("doc1", vec![vec![1.0, 2.0]], &["temp"]);
+        let params = json!({"data": "nonexistent"});
+        let out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc);
+        assert!(out.is_none(), "doc with no matching columns should be dropped");
+    }
+
+    #[test]
+    fn transform_drops_doc_when_time_window_collapses_to_empty() {
+        // User asked for a time range past the dataset's last timestamp:
+        // each column survives column-filtering but ends up with zero
+        // time points. The doc has nothing useful to report — drop.
+        let timeseries = ts(&[1, 2, 3]); // Jan, Feb, Mar 2020
+        let doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0, 3.0], vec![10.0, 20.0, 30.0]],
+            &["temp", "salinity"],
+        );
+        let params = json!({
+            "data":      "all",
+            "startDate": "2021-01-01T00:00:00Z", // well past the timeseries
+        });
+        let out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc);
+        assert!(
+            out.is_none(),
+            "doc whose time window collapsed to zero points should be dropped"
+        );
+    }
+
+    #[test]
+    fn transform_keeps_doc_with_except_data_values_despite_empty_data() {
+        // `except_data_values` is the user explicitly asking for
+        // "schema only, no values". slice_data clears the data after
+        // column-filtering; the drop-on-empty rule should skip this
+        // case rather than dropping the doc — the empty data was
+        // requested.
+        let timeseries = ts(&[1, 2]);
+        let doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0], vec![3.0, 4.0]],
+            &["temp", "salinity"],
+        );
+        let params = json!({"data": "temp,except_data_values"});
+        let mut out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc)
+            .expect("except_data_values should not trigger drop-on-empty");
+        // Data was deliberately cleared.
+        assert!(out.data().is_empty());
+        // But the filtered data_info still rides along — that's the
+        // whole point of except_data_values.
+        let info = out.data_info().expect("data_info still present");
+        assert_eq!(info.0, vec!["temp".to_string()]);
+    }
+
+    #[test]
+    fn transform_keeps_doc_when_at_least_some_data_remains() {
+        // User asked for a real column with a non-degenerate time
+        // window. The doc should survive.
+        let timeseries = ts(&[1, 2, 3, 4]);
+        let doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0, 3.0, 4.0], vec![10.0, 20.0, 30.0, 40.0]],
+            &["temp", "salinity"],
+        );
+        let params = json!({
+            "data":      "temp",
+            "startDate": "2020-02-01T00:00:00Z",
+            "endDate":   "2020-04-01T00:00:00Z",
+        });
+        let mut out = transform_timeseries(&params, &timeseries, &empty_data_info(), doc)
+            .expect("non-empty doc should survive");
+        assert_eq!(*out.data(), vec![vec![2.0, 3.0]]);
+    }
+
     // ---- timeseries_stub -----------------------------------------------------
 
     #[test]

From ec3eec990ad9b429c92db2dadebcc1883104e47d Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Wed, 27 May 2026 12:48:20 -0400
Subject: [PATCH 09/13] bugfix

---
 api/src/helpers/transforms.rs | 86 ++++++++++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 7 deletions(-)

diff --git a/api/src/helpers/transforms.rs b/api/src/helpers/transforms.rs
index f17ed9e..4f226dd 100644
--- a/api/src/helpers/transforms.rs
+++ b/api/src/helpers/transforms.rs
@@ -107,13 +107,33 @@ pub fn slice_timerange<T: schema::IsTimeseries>(
     ts: &[BsonDateTime],
     doc: &mut T,
 ) {
-    let start_index = start_date
-        .and_then(|sd| ts.iter().position(|&t| t >= sd))
-        .unwrap_or(0);
-
-    let end_index = end_date
-        .and_then(|ed| ts.iter().rposition(|&t| t < ed).map(|i| i + 1))
-        .unwrap_or(ts.len());
+    // Match on the Option directly so "no filter" and "filter present but
+    // matches nothing" land at different ends of the axis:
+    //   - `start_date = None`           → start at 0 (no lower bound).
+    //   - `start_date = Some(sd)` and no timestamp is >= sd
+    //     (the filter is past the end of the data) → start at ts.len()
+    //     so the slice collapses to empty rather than degrading to
+    //     "whole range," which a plain `.unwrap_or(0)` would do.
+    // Symmetric reasoning for `end_date`.
+    let start_index = match start_date {
+        None => 0,
+        Some(sd) => ts.iter().position(|&t| t >= sd).unwrap_or(ts.len()),
+    };
+    let end_index = match end_date {
+        None => ts.len(),
+        Some(ed) => ts
+            .iter()
+            .rposition(|&t| t < ed)
+            .map(|i| i + 1)
+            .unwrap_or(0),
+    };
+
+    // If the user passed mutually unsatisfiable dates (or startDate is
+    // past everything *and* endDate is before everything), `start_index`
+    // could exceed `end_index` — slicing `[a..b]` with `a > b` panics.
+    // Clamp `end` up to `start` so the slice is always empty rather than
+    // a panic.
+    let end_index = end_index.max(start_index);
 
     let time_window: Vec<String> = ts[start_index..end_index]
         .iter()
@@ -297,6 +317,58 @@ mod tests {
         assert_eq!(*doc.data(), vec![vec![1.0, 2.0, 3.0]]);
     }
 
+    #[test]
+    fn slice_timerange_startdate_past_everything_yields_empty_range() {
+        // startDate is past the last timestamp — the resulting window
+        // should be empty, not the whole range (which the old
+        // `.unwrap_or(0)` shape gave by accident).
+        let timeseries = ts(&[1, 2, 3]); // Jan, Feb, Mar 2020
+        let mut doc = make_bsose(
+            "doc1",
+            vec![vec![1.0, 2.0, 3.0]],
+            &["temp"],
+        );
+        let start = helpers::string2bsondate("2021-01-01T00:00:00Z");
+        slice_timerange(start, None, &timeseries, &mut doc);
+        assert!(doc.data()[0].is_empty());
+        assert!(doc.timeseries().unwrap().is_empty());
+    }
+
+    #[test]
+    fn slice_timerange_enddate_before_everything_yields_empty_range() {
+        // endDate is before the first timestamp — the resulting window
+        // should be empty, not the whole range.
+        let timeseries = ts(&[6, 7, 8]); // Jun, Jul, Aug 2020
+        let mut doc = make_bsose(
+            "doc1",
+            vec![vec![6.0, 7.0, 8.0]],
+            &["temp"],
+        );
+        let end = helpers::string2bsondate("2020-01-01T00:00:00Z");
+        slice_timerange(None, end, &timeseries, &mut doc);
+        assert!(doc.data()[0].is_empty());
+        assert!(doc.timeseries().unwrap().is_empty());
+    }
+
+    #[test]
+    fn slice_timerange_mutually_unsatisfiable_dates_collapse_to_empty() {
+        // startDate past the data AND endDate before the data: the
+        // raw indices would be start_index = ts.len(), end_index = 0,
+        // which would panic on the slice. The `end_index.max(start_index)`
+        // clamp keeps this safe (and empty).
+        let timeseries = ts(&[6, 7, 8]);
+        let mut doc = make_bsose(
+            "doc1",
+            vec![vec![6.0, 7.0, 8.0]],
+            &["temp"],
+        );
+        let start = helpers::string2bsondate("2021-01-01T00:00:00Z");
+        let end = helpers::string2bsondate("2019-01-01T00:00:00Z");
+        slice_timerange(start, end, &timeseries, &mut doc);
+        assert!(doc.data()[0].is_empty());
+        assert!(doc.timeseries().unwrap().is_empty());
+    }
+
     // ---- slice_data ----------------------------------------------------------
     //
     // `slice_data` is only called from `transform_timeseries` in the

From 4c2b932c8087c7425fb98db23dd035e4f9f45ec5 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Wed, 27 May 2026 12:54:56 -0400
Subject: [PATCH 10/13] bugfix

---
 api/tests/integration.rs | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/api/tests/integration.rs b/api/tests/integration.rs
index 3be701e..8f3cd39 100644
--- a/api/tests/integration.rs
+++ b/api/tests/integration.rs
@@ -131,13 +131,26 @@ async fn no_filters_returns_all_seeded_documents_across_pages() {
     let docs = get_paged("/timeseries/bsose", &[]).await;
     assert_eq!(docs.len(), 4, "expected all 4 seeded docs across pages");
 
-    // Without `data` set, slice_data clears the data field but keeps rows.
+    // Per the response-shape rule, a request with neither `data=` nor
+    // `startDate`/`endDate` should return slim docs: `data`, `data_info`,
+    // and `timeseries` are all omitted because the user hasn't asked to
+    // see or alter them. Clients fall back to the meta endpoint for the
+    // dataset-wide variable info and time axis.
     for row in &docs {
-        let data = row.get("data").expect("each row should have a data field");
-        let outer = data.as_array().expect("data should be an array");
         assert!(
-            outer.is_empty(),
-            "data should be cleared when `data` query param is absent"
+            row.get("data").is_none(),
+            "`data` field should be absent when no `data` qsp is supplied; got: {:?}",
+            row.get("data")
+        );
+        assert!(
+            row.get("data_info").is_none(),
+            "`data_info` field should be absent when no `data` qsp is supplied; got: {:?}",
+            row.get("data_info")
+        );
+        assert!(
+            row.get("timeseries").is_none(),
+            "`timeseries` field should be absent when no date qsp is supplied; got: {:?}",
+            row.get("timeseries")
         );
     }
 }

From 8b4581c224fd8d19ce0f12672e7b80c83c5f8c80 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Wed, 27 May 2026 13:07:04 -0400
Subject: [PATCH 11/13] validate QSPs

---
 api/PAGINATION.md          |   5 ++
 api/src/helpers/helpers.rs | 158 +++++++++++++++++++++++++++++++++++++
 2 files changed, 163 insertions(+)

diff --git a/api/PAGINATION.md b/api/PAGINATION.md
index 242dc01..913e5ef 100644
--- a/api/PAGINATION.md
+++ b/api/PAGINATION.md
@@ -102,6 +102,11 @@ antimeridian / north-pole docs aren't lost.
 
 ## Validation errors (HTTP 400)
 
+- Any query parameter not on the whitelist above. Unknown names are
+  rejected so typos (e.g. `start_Date` instead of `startDate`) fail
+  loudly instead of being silently ignored. When the typo is close to
+  a real param name, the error message includes a "did you mean ..."
+  suggestion.
 - More than one of `polygon` / `box` / `center` set.
 - `center` set without `radius`, or vice versa.
 - `radius` non-numeric, negative, non-finite, or above the dataset's cap.
diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs
index 862d65f..f3b7c81 100644
--- a/api/src/helpers/helpers.rs
+++ b/api/src/helpers/helpers.rs
@@ -40,8 +40,92 @@ pub fn create_response<T: Serialize>(results: Vec<T>) -> HttpResponse {
     }
 }
 
+/// Whitelist of query-string parameter names recognised by the
+/// `/timeseries/*` endpoints. Any qsp not in this list is rejected with
+/// a 400 so typos (e.g. `start_Date` instead of `startDate`) fail loudly
+/// with a useful suggestion rather than getting silently ignored
+/// downstream. Update this list when adding a new qsp; keep it in sync
+/// with the table in `api/PAGINATION.md`.
+const ALLOWED_QSP: &[&str] = &[
+    "id",
+    "box",
+    "polygon",
+    "center",
+    "radius",
+    "verticalRange",
+    "startDate",
+    "endDate",
+    "data",
+    "compression",
+    "batchmeta",
+    "tile_index",
+];
+
+/// Levenshtein edit distance — used to suggest a "did you mean" target
+/// when a qsp doesn't match the whitelist. Iterative table with O(m*n)
+/// space; m and n are at most ~15 chars in practice (the longest
+/// whitelist entry is `verticalRange`), so this is trivially fast and
+/// not worth optimising into the row-rolling form.
+fn edit_distance(a: &str, b: &str) -> usize {
+    let a: Vec<char> = a.chars().collect();
+    let b: Vec<char> = b.chars().collect();
+    let m = a.len();
+    let n = b.len();
+    let mut dp = vec![vec![0usize; n + 1]; m + 1];
+    for i in 0..=m {
+        dp[i][0] = i;
+    }
+    for j in 0..=n {
+        dp[0][j] = j;
+    }
+    for i in 1..=m {
+        for j in 1..=n {
+            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
+            dp[i][j] = (dp[i - 1][j] + 1)
+                .min(dp[i][j - 1] + 1)
+                .min(dp[i - 1][j - 1] + cost);
+        }
+    }
+    dp[m][n]
+}
+
+/// Return the whitelist entry closest to `unknown` if one is within
+/// edit distance 2 — covers common typo flavours (single-char
+/// substitution / deletion / insertion, capitalisation differences)
+/// without firing on totally unrelated names. `None` when nothing is
+/// within range.
+fn suggest_qsp(unknown: &str) -> Option<&'static str> {
+    ALLOWED_QSP
+        .iter()
+        .map(|allowed| (*allowed, edit_distance(unknown, allowed)))
+        .filter(|(_, dist)| *dist <= 2)
+        .min_by_key(|&(_, dist)| dist)
+        .map(|(allowed, _)| allowed)
+}
+
 pub fn validate_query_params(params: &serde_json::Value) -> Result<(), HttpResponse> {
 
+    // Reject unknown qsp names before any further validation so a typo
+    // (e.g. `start_Date` instead of `startDate`) fails loudly with a
+    // useful suggestion rather than getting silently ignored downstream.
+    if let Some(obj) = params.as_object() {
+        for key in obj.keys() {
+            if !ALLOWED_QSP.contains(&key.as_str()) {
+                let msg = match suggest_qsp(key) {
+                    Some(closest) => format!(
+                        "Unknown query parameter '{}'. Did you mean '{}'? Allowed parameters: {}",
+                        key, closest, ALLOWED_QSP.join(", ")
+                    ),
+                    None => format!(
+                        "Unknown query parameter '{}'. Allowed parameters: {}",
+                        key, ALLOWED_QSP.join(", ")
+                    ),
+                };
+                return Err(HttpResponse::BadRequest().json(json!({"error": msg})));
+            }
+        }
+    }
+
     // should have at most one of polygon, box and center.
     let mut count = 0;
     if params.get("polygon").is_some() {
@@ -339,6 +423,80 @@ mod tests {
         assert!(validate_query_params(&params).is_ok());
     }
 
+    // ---- qsp whitelist --------------------------------------------------------
+
+    #[test]
+    fn validate_rejects_unknown_qsp() {
+        let params = json!({"start_Date": "2020-01-01T00:00:00Z"});
+        let err = validate_query_params(&params).unwrap_err();
+        assert_eq!(err.status(), 400);
+    }
+
+    #[test]
+    fn validate_rejects_unknown_qsp_even_when_other_params_valid() {
+        // A request that's well-formed apart from one stray unknown
+        // param should still be rejected — the whitelist check fires
+        // before any other validation.
+        let params = json!({
+            "box": "[[0,0],[10,10]]",
+            "frobnicate": "yes",
+        });
+        let err = validate_query_params(&params).unwrap_err();
+        assert_eq!(err.status(), 400);
+    }
+
+    #[test]
+    fn validate_known_qsps_pass_through_whitelist() {
+        // Sanity check: every entry in the whitelist that doesn't have
+        // its own shape constraints should be accepted on its own.
+        // (Constrained params like `polygon` have their own dedicated
+        // tests above; this is purely the whitelist check.)
+        for (key, value) in [
+            ("id", "doc1"),
+            ("data", "all"),
+            ("compression", "minimal"),
+            ("batchmeta", "true"),
+            ("tile_index", "5"),
+            ("startDate", "2020-01-01T00:00:00Z"),
+            ("endDate", "2020-12-31T23:59:59Z"),
+        ] {
+            let params = json!({ key: value });
+            assert!(
+                validate_query_params(&params).is_ok(),
+                "whitelist should accept '{}' but rejected it",
+                key
+            );
+        }
+    }
+
+    #[test]
+    fn edit_distance_handles_common_typos() {
+        assert_eq!(edit_distance("startdate", "startDate"), 1); // case-only sub
+        assert_eq!(edit_distance("start_Date", "startDate"), 1); // single extra char
+        assert_eq!(edit_distance("verticalrange", "verticalRange"), 1); // case
+        assert_eq!(edit_distance("Box", "box"), 1);
+        assert_eq!(edit_distance("", "box"), 3); // pure insertion cost
+        assert_eq!(edit_distance("box", "box"), 0);
+        assert!(edit_distance("zzzzzzzzz", "startDate") > 2);
+    }
+
+    #[test]
+    fn suggest_qsp_finds_close_match() {
+        assert_eq!(suggest_qsp("start_Date"), Some("startDate"));
+        assert_eq!(suggest_qsp("startdate"), Some("startDate"));
+        assert_eq!(suggest_qsp("verticalrange"), Some("verticalRange"));
+        assert_eq!(suggest_qsp("Box"), Some("box"));
+    }
+
+    #[test]
+    fn suggest_qsp_returns_none_for_unrelated_input() {
+        // A name with no plausibly close match in the whitelist
+        // should return None rather than reaching for the nearest
+        // entry no matter how far.
+        assert_eq!(suggest_qsp("frobnicate"), None);
+        assert_eq!(suggest_qsp("xyzzy"), None);
+    }
+
     // ---- validate_radius_cap -------------------------------------------------
 
     /// Minimal config for radius-cap testing. tile_degrees / levels are

From 697f15373bcef3d79811025b074afa88a435e2d1 Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Wed, 27 May 2026 13:19:46 -0400
Subject: [PATCH 12/13] validate data string

---
 api/PAGINATION.md                  |   6 +-
 api/fixtures/bsose.json            |  16 +--
 api/src/helpers/dataset_config.rs  |  10 ++
 api/src/helpers/filter_composer.rs |   1 +
 api/src/helpers/helpers.rs         | 193 +++++++++++++++++++++++++++++
 api/src/helpers/tile_generator.rs  |   3 +
 api/src/main.rs                    |   3 +
 api/tests/integration.rs           |  34 ++++-
 8 files changed, 255 insertions(+), 11 deletions(-)

diff --git a/api/PAGINATION.md b/api/PAGINATION.md
index 913e5ef..1414dbb 100644
--- a/api/PAGINATION.md
+++ b/api/PAGINATION.md
@@ -95,7 +95,7 @@ antimeridian / north-pole docs aren't lost.
 | `center` + `radius` | JSON `[lon, lat]` + meters | Disk query. Radius capped at the dataset's `max_radius_meters`. |
 | `verticalRange` | JSON `[lo, hi]` | Half-open depth range applied on top of tile-level pagination. |
 | `startDate` / `endDate` | RFC-3339 string | Slices each doc's timeseries to this window. |
-| `data` | comma-separated | Variables to include. `all` keeps everything; a specific list filters columns. If the doc has no data to return after filtering (no matching columns, or a time window that collapsed to zero points), the whole doc is dropped from the response. Omitting `data=` entirely also omits the `data` field from each response doc — use that for slim listings. `except_data_values` in the list keeps the row but clears the values (schema-only mode) — that's the one case where an empty `data` array doesn't drop the doc. |
+| `data` | comma-separated | Variables to include. Each token must be: a dataset-specific variable name (BSOSE: `THETA`, `SALT`; OI SST: `sst`), the universal `all` (keep everything) or `except_data_values` (keep schema, clear values), or an integer (QC filter). Unknown tokens are rejected with a 400 + a "did you mean" suggestion. If the doc has no data to return after filtering (no matching columns, or a time window that collapsed to zero points), the whole doc is dropped from the response. Omitting `data=` entirely also omits the `data` field from each response doc — use that for slim listings. `except_data_values` in the list keeps the row but clears the values (schema-only mode) — that's the one case where an empty `data` array doesn't drop the doc. |
 | `compression` | `minimal` | See mode flags. |
 | `batchmeta` | any | See mode flags. |
 | `tile_index` | non-negative integer | Pagination cursor. Default `0`. Almost always supplied by the previous response's `next_url`. |
@@ -107,6 +107,10 @@ antimeridian / north-pole docs aren't lost.
   loudly instead of being silently ignored. When the typo is close to
   a real param name, the error message includes a "did you mean ..."
   suggestion.
+- Any token inside the `data=` list that isn't one of: the dataset's
+  declared variable names, the universal `all` / `except_data_values`,
+  or an integer (QC filter). Suggestions follow the same shape as the
+  qsp-name check.
 - More than one of `polygon` / `box` / `center` set.
 - `center` set without `radius`, or vice versa.
 - `radius` non-numeric, negative, non-finite, or above the dataset's cap.
diff --git a/api/fixtures/bsose.json b/api/fixtures/bsose.json
index 77dd0f6..adae199 100644
--- a/api/fixtures/bsose.json
+++ b/api/fixtures/bsose.json
@@ -14,9 +14,9 @@
       [34.50, 34.60, 34.70, 34.80]
     ],
     "data_info": [
-      ["temp", "salinity"],
+      ["THETA", "SALT"],
       ["units", "long_name"],
-      [["degC", "Temperature"], ["psu", "Salinity"]]
+      [["degC", "Potential Temperature"], ["psu", "Practical Salinity"]]
     ]
   },
   {
@@ -34,9 +34,9 @@
       [35.00, 35.10, 35.20, 35.30]
     ],
     "data_info": [
-      ["temp", "salinity"],
+      ["THETA", "SALT"],
       ["units", "long_name"],
-      [["degC", "Temperature"], ["psu", "Salinity"]]
+      [["degC", "Potential Temperature"], ["psu", "Practical Salinity"]]
     ]
   },
   {
@@ -54,9 +54,9 @@
       [33.10, 33.20, 33.30, 33.40]
     ],
     "data_info": [
-      ["temp", "salinity"],
+      ["THETA", "SALT"],
       ["units", "long_name"],
-      [["degC", "Temperature"], ["psu", "Salinity"]]
+      [["degC", "Potential Temperature"], ["psu", "Practical Salinity"]]
     ]
   },
   {
@@ -74,9 +74,9 @@
       [34.10, 34.20, 34.30, 34.40]
     ],
     "data_info": [
-      ["temp", "salinity"],
+      ["THETA", "SALT"],
       ["units", "long_name"],
-      [["degC", "Temperature"], ["psu", "Salinity"]]
+      [["degC", "Potential Temperature"], ["psu", "Practical Salinity"]]
     ]
   }
 ]
diff --git a/api/src/helpers/dataset_config.rs b/api/src/helpers/dataset_config.rs
index 5ab4084..3962100 100644
--- a/api/src/helpers/dataset_config.rs
+++ b/api/src/helpers/dataset_config.rs
@@ -50,11 +50,19 @@ use super::schema::DataInfo;
 /// — tile generation falls back to walking the whole globe. The
 /// rectangle is treated as inclusive on its edges; a doc lying exactly
 /// on the coverage boundary is preserved.
+///
+/// `allowed_data_vars`: the per-dataset variable names accepted in the
+/// `data=` qsp. Used by `validate_data_param` to reject typos (and to
+/// power the "did you mean" suggestion). Does not include the universal
+/// tokens (`all`, `except_data_values`) or integer QC filters —
+/// those are accepted regardless of dataset and handled by the
+/// validator directly.
 pub struct DatasetConfig {
     pub tile_degrees: f64,
     pub max_radius_meters: f64,
     pub levels: &'static [f64],
     pub coverage_bbox: Option<BoundingBox>,
+    pub allowed_data_vars: &'static [&'static str],
 }
 
 /// Per-dataset identity + Mongo client + startup-loaded metadata, built
@@ -123,6 +131,7 @@ pub const BSOSE_CONFIG: DatasetConfig = DatasetConfig {
         sw: [-180.0, -90.0],
         ne: [180.0, -30.0],
     }),
+    allowed_data_vars: &["THETA", "SALT"],
 };
 
 /// OI SST has a single vertical level (the sea surface). We model it as
@@ -148,6 +157,7 @@ pub const OISST_CONFIG: DatasetConfig = DatasetConfig {
     max_radius_meters: 100_000.0, // 100 km — relax once usage informs us
     levels: OISST_LEVELS,
     coverage_bbox: None,
+    allowed_data_vars: &["sst"],
 };
 
 #[cfg(test)]
diff --git a/api/src/helpers/filter_composer.rs b/api/src/helpers/filter_composer.rs
index 834d7aa..2b79d29 100644
--- a/api/src/helpers/filter_composer.rs
+++ b/api/src/helpers/filter_composer.rs
@@ -170,6 +170,7 @@ mod tests {
         max_radius_meters: 1.0e6,
         levels: &[100.0, 500.0],
         coverage_bbox: None,
+        allowed_data_vars: &[],
     };
 
     fn null_tile() -> TileSpec {
diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs
index f3b7c81..841c030 100644
--- a/api/src/helpers/helpers.rs
+++ b/api/src/helpers/helpers.rs
@@ -194,6 +194,86 @@ pub fn validate_query_params(params: &serde_json::Value) -> Result<(), HttpRespo
     Ok(())
 }
 
+/// Universal tokens accepted in the `data=` qsp regardless of which
+/// dataset is being queried. `all` and `except_data_values` control
+/// response shape rather than naming a variable; both are documented
+/// in `api/PAGINATION.md`.
+const UNIVERSAL_DATA_TOKENS: &[&str] = &["all", "except_data_values"];
+
+/// Validate the contents of the `data=` qsp against the dataset's
+/// `allowed_data_vars`. Each comma-separated token must be one of:
+///
+///   - a universal token (`all` or `except_data_values`),
+///   - a variable name in `config.allowed_data_vars`, or
+///   - an integer (accepted as a QC filter; the QC system is dataset-
+///     agnostic so any non-negative integer is allowed at this layer).
+///
+/// Anything else returns 400 with a Levenshtein-suggested correction
+/// when one of the named candidates is within edit distance 2.
+///
+/// No-op when the `data=` qsp is absent.
+pub fn validate_data_param(
+    params: &serde_json::Value,
+    config: &DatasetConfig,
+) -> Result<(), HttpResponse> {
+    let raw = match params.get("data").and_then(|v| v.as_str()) {
+        Some(s) => s,
+        None => return Ok(()),
+    };
+
+    for token in raw.split(',') {
+        if token.is_empty() {
+            return Err(HttpResponse::BadRequest().json(json!({
+                "error": "'data' qsp contains an empty token (likely a leading, trailing, or doubled comma)"
+            })));
+        }
+        if UNIVERSAL_DATA_TOKENS.contains(&token) {
+            continue;
+        }
+        if config.allowed_data_vars.contains(&token) {
+            continue;
+        }
+        // Integer QC filter (any signed integer; the QC layer
+        // interprets the value, this layer just gate-keeps the shape).
+        if token.parse::<i64>().is_ok() {
+            continue;
+        }
+        let msg = unknown_data_token_message(token, config);
+        return Err(HttpResponse::BadRequest().json(json!({"error": msg})));
+    }
+    Ok(())
+}
+
+fn unknown_data_token_message(token: &str, config: &DatasetConfig) -> String {
+    // Suggest only against named candidates (universal tokens + per-
+    // dataset variable names). Integer QC filters aren't suggestible
+    // — the user either types one correctly or they don't.
+    let candidates: Vec<&str> = UNIVERSAL_DATA_TOKENS
+        .iter()
+        .copied()
+        .chain(config.allowed_data_vars.iter().copied())
+        .collect();
+
+    let suggestion = candidates
+        .iter()
+        .map(|c| (*c, edit_distance(token, c)))
+        .filter(|(_, d)| *d <= 2)
+        .min_by_key(|&(_, d)| d)
+        .map(|(c, _)| c);
+
+    let allowed_list = candidates.join(", ");
+    match suggestion {
+        Some(s) => format!(
+            "Unknown 'data' value '{}'. Did you mean '{}'? Allowed values for this dataset: {} (plus any integer for QC filtering).",
+            token, s, allowed_list
+        ),
+        None => format!(
+            "Unknown 'data' value '{}'. Allowed values for this dataset: {} (plus any integer for QC filtering).",
+            token, allowed_list
+        ),
+    }
+}
+
 /// Enforce the dataset's `max_radius_meters` for `center + radius` queries.
 ///
 /// `center + radius` is the one geo mode that *isn't* spatially tiled
@@ -506,8 +586,121 @@ mod tests {
         max_radius_meters: 1_000_000.0, // 1000 km
         levels: &[0.0],
         coverage_bbox: None,
+        allowed_data_vars: &[],
     };
 
+    /// Test config with a couple of allowed variable names, used by
+    /// the `validate_data_param` tests below.
+    const DATA_TEST_CONFIG: DatasetConfig = DatasetConfig {
+        tile_degrees: 10.0,
+        max_radius_meters: 1_000_000.0,
+        levels: &[0.0],
+        coverage_bbox: None,
+        allowed_data_vars: &["THETA", "SALT"],
+    };
+
+    // ---- validate_data_param --------------------------------------------------
+
+    #[test]
+    fn validate_data_param_noop_when_data_absent() {
+        let params = json!({});
+        assert!(validate_data_param(&params, &DATA_TEST_CONFIG).is_ok());
+    }
+
+    #[test]
+    fn validate_data_param_accepts_universal_tokens() {
+        for token in ["all", "except_data_values"] {
+            let params = json!({ "data": token });
+            assert!(
+                validate_data_param(&params, &DATA_TEST_CONFIG).is_ok(),
+                "universal token '{}' should be accepted",
+                token
+            );
+        }
+    }
+
+    #[test]
+    fn validate_data_param_accepts_per_dataset_var_names() {
+        for token in ["THETA", "SALT"] {
+            let params = json!({ "data": token });
+            assert!(
+                validate_data_param(&params, &DATA_TEST_CONFIG).is_ok(),
+                "allowed var '{}' should be accepted",
+                token
+            );
+        }
+    }
+
+    #[test]
+    fn validate_data_param_accepts_integer_qc_filters() {
+        // Any signed integer should pass; the QC layer (downstream)
+        // interprets the actual value.
+        for token in ["0", "1", "42", "-1", "9999"] {
+            let params = json!({ "data": token });
+            assert!(
+                validate_data_param(&params, &DATA_TEST_CONFIG).is_ok(),
+                "integer token '{}' should be accepted",
+                token
+            );
+        }
+    }
+
+    #[test]
+    fn validate_data_param_accepts_mixed_list() {
+        let params = json!({ "data": "all,THETA,1,SALT" });
+        assert!(validate_data_param(&params, &DATA_TEST_CONFIG).is_ok());
+    }
+
+    #[test]
+    fn validate_data_param_rejects_unknown_var() {
+        let params = json!({ "data": "salinity" });
+        let err = validate_data_param(&params, &DATA_TEST_CONFIG).unwrap_err();
+        assert_eq!(err.status(), 400);
+    }
+
+    #[test]
+    fn validate_data_param_rejects_unknown_var_in_mixed_list() {
+        // First token is fine, second is the bad one — should still
+        // reject (loops over every token).
+        let params = json!({ "data": "THETA,bogus" });
+        let err = validate_data_param(&params, &DATA_TEST_CONFIG).unwrap_err();
+        assert_eq!(err.status(), 400);
+    }
+
+    #[test]
+    fn validate_data_param_rejects_empty_token() {
+        // Trailing or doubled commas yield an empty token in the
+        // split — surface that as its own clear error rather than
+        // bouncing through the suggestion path.
+        for raw in ["THETA,", ",THETA", "THETA,,SALT"] {
+            let params = json!({ "data": raw });
+            let err = validate_data_param(&params, &DATA_TEST_CONFIG).unwrap_err();
+            assert_eq!(err.status(), 400);
+        }
+    }
+
+    #[test]
+    fn unknown_data_token_message_includes_suggestion_when_close() {
+        // The single typo here is in the case of the last char; edit
+        // distance 1 → suggest THETA.
+        let msg = unknown_data_token_message("Theta", &DATA_TEST_CONFIG);
+        assert!(
+            msg.contains("Did you mean 'THETA'"),
+            "suggestion missing in: {}",
+            msg
+        );
+    }
+
+    #[test]
+    fn unknown_data_token_message_omits_suggestion_when_far() {
+        let msg = unknown_data_token_message("zzzzzzzzzzz", &DATA_TEST_CONFIG);
+        assert!(
+            !msg.contains("Did you mean"),
+            "should not suggest a far-away name: {}",
+            msg
+        );
+    }
+
     #[test]
     fn radius_cap_skipped_when_no_center() {
         // No center → mode doesn't apply, cap is moot.
diff --git a/api/src/helpers/tile_generator.rs b/api/src/helpers/tile_generator.rs
index 7997130..6061e8a 100644
--- a/api/src/helpers/tile_generator.rs
+++ b/api/src/helpers/tile_generator.rs
@@ -368,6 +368,7 @@ mod tests {
         max_radius_meters: 1.0e6,
         levels: &[0.0, 100.0],
         coverage_bbox: None,
+        allowed_data_vars: &[],
     };
 
     // ---- top-level dispatch --------------------------------------------------
@@ -658,6 +659,7 @@ mod tests {
             sw: [-180.0, -90.0],
             ne: [180.0, -30.0],
         }),
+        allowed_data_vars: &[],
     };
 
     #[test]
@@ -881,6 +883,7 @@ mod tests {
             max_radius_meters: 1.0e6,
             levels: &[],
             coverage_bbox: None,
+            allowed_data_vars: &[],
         };
         let tiles = generate_tiles(
             &json!({"box": "[[0,0],[10,10]]"}),
diff --git a/api/src/main.rs b/api/src/main.rs
index 5fca235..10bd141 100644
--- a/api/src/main.rs
+++ b/api/src/main.rs
@@ -151,6 +151,9 @@ where
     if let Err(response) = helpers::validate_radius_cap(&params, config) {
         return response;
     }
+    if let Err(response) = helpers::validate_data_param(&params, config) {
+        return response;
+    }
 
     let start_idx = match pagination::parse_tile_index(&params) {
         Ok(i) => i,
diff --git a/api/tests/integration.rs b/api/tests/integration.rs
index 8f3cd39..9f2599f 100644
--- a/api/tests/integration.rs
+++ b/api/tests/integration.rs
@@ -175,18 +175,48 @@ async fn data_all_returns_full_timeseries_across_pages() {
 
 #[tokio::test]
 async fn data_specific_field_filters_columns_across_pages() {
-    let docs = get_paged("/timeseries/bsose", &[("data", "salinity")]).await;
+    let docs = get_paged("/timeseries/bsose", &[("data", "SALT")]).await;
     assert!(!docs.is_empty());
     for row in &docs {
         let names = &row["data_info"][0];
         assert_eq!(
             names.as_array().unwrap(),
-            &vec![Value::String("salinity".to_string())]
+            &vec![Value::String("SALT".to_string())]
         );
         assert_eq!(row["data"].as_array().unwrap().len(), 1);
     }
 }
 
+#[tokio::test]
+async fn unknown_data_value_returns_400() {
+    // `salinity` was the old BSOSE variable name; the production names
+    // are THETA/SALT. The whitelist should reject the typo with a
+    // suggestion that includes the right name.
+    let resp = get("/timeseries/bsose", &[("data", "salinity")]).await;
+    assert_eq!(resp.status(), 400);
+}
+
+#[tokio::test]
+async fn integer_data_value_accepted_as_qc_filter() {
+    // Integers in the data= list are accepted as QC filters at the
+    // validation layer, regardless of whether the dataset has any
+    // matching column. `1,SALT` should not 400.
+    let body = get_envelope(
+        "/timeseries/bsose",
+        &[("id", "bsose_doc_001"), ("data", "1,SALT")],
+    )
+    .await;
+    let docs = body["docs"].as_array().unwrap();
+    assert_eq!(docs.len(), 1);
+    // SALT survives column-filtering; the `1` is silently dropped by
+    // slice_data (no matching variable name) but accepted by validation.
+    let names = &docs[0]["data_info"][0];
+    assert_eq!(
+        names.as_array().unwrap(),
+        &vec![Value::String("SALT".to_string())]
+    );
+}
+
 // ---------------------------------------------------------------------------
 // id filter
 // ---------------------------------------------------------------------------

From 5b0c7150b1bea92c911ddc8200c9c49e358d099d Mon Sep 17 00:00:00 2001
From: katieannemills <you@example.com>
Date: Wed, 27 May 2026 13:25:20 -0400
Subject: [PATCH 13/13] bugfix

---
 api/src/helpers/helpers.rs | 51 ++++++++++++++++++++++++++++++++++----
 1 file changed, 46 insertions(+), 5 deletions(-)

diff --git a/api/src/helpers/helpers.rs b/api/src/helpers/helpers.rs
index 841c030..a89a8a7 100644
--- a/api/src/helpers/helpers.rs
+++ b/api/src/helpers/helpers.rs
@@ -91,13 +91,19 @@ fn edit_distance(a: &str, b: &str) -> usize {
 
 /// Return the whitelist entry closest to `unknown` if one is within
 /// edit distance 2 — covers common typo flavours (single-char
-/// substitution / deletion / insertion, capitalisation differences)
-/// without firing on totally unrelated names. `None` when nothing is
-/// within range.
+/// substitution / deletion / insertion) without firing on totally
+/// unrelated names. `None` when nothing is within range.
+///
+/// Distance is computed on the *lowercased* forms of both sides, so a
+/// fully-uppercased typo (`STARTDATE`) or any mixed-case shout still
+/// gets a useful suggestion — case differences between every letter
+/// would otherwise push the distance well past the threshold even
+/// though the user clearly meant the same name.
 fn suggest_qsp(unknown: &str) -> Option<&'static str> {
+    let unknown_lower = unknown.to_lowercase();
     ALLOWED_QSP
         .iter()
-        .map(|allowed| (*allowed, edit_distance(unknown, allowed)))
+        .map(|allowed| (*allowed, edit_distance(&unknown_lower, &allowed.to_lowercase())))
         .filter(|(_, dist)| *dist <= 2)
         .min_by_key(|&(_, dist)| dist)
         .map(|(allowed, _)| allowed)
@@ -248,15 +254,21 @@ fn unknown_data_token_message(token: &str, config: &DatasetConfig) -> String {
     // Suggest only against named candidates (universal tokens + per-
     // dataset variable names). Integer QC filters aren't suggestible
     // — the user either types one correctly or they don't.
+    //
+    // Distance is computed case-insensitively so a user typing `theta`
+    // or `Theta` still gets pointed at `THETA`. Validation itself is
+    // case-sensitive (the user is asked to fix their request), but the
+    // suggestion should be forgiving.
     let candidates: Vec<&str> = UNIVERSAL_DATA_TOKENS
         .iter()
         .copied()
         .chain(config.allowed_data_vars.iter().copied())
         .collect();
 
+    let token_lower = token.to_lowercase();
     let suggestion = candidates
         .iter()
-        .map(|c| (*c, edit_distance(token, c)))
+        .map(|c| (*c, edit_distance(&token_lower, &c.to_lowercase())))
         .filter(|(_, d)| *d <= 2)
         .min_by_key(|&(_, d)| d)
         .map(|(c, _)| c);
@@ -568,6 +580,17 @@ mod tests {
         assert_eq!(suggest_qsp("Box"), Some("box"));
     }
 
+    #[test]
+    fn suggest_qsp_is_case_insensitive() {
+        // Fully-uppercased or arbitrarily-cased typos should still get
+        // a useful suggestion — without case-folding, every letter
+        // would count as a substitution and the distance would blow
+        // past the threshold.
+        assert_eq!(suggest_qsp("STARTDATE"), Some("startDate"));
+        assert_eq!(suggest_qsp("StArTdAtE"), Some("startDate"));
+        assert_eq!(suggest_qsp("BOX"), Some("box"));
+    }
+
     #[test]
     fn suggest_qsp_returns_none_for_unrelated_input() {
         // A name with no plausibly close match in the whitelist
@@ -701,6 +724,24 @@ mod tests {
         );
     }
 
+    #[test]
+    fn unknown_data_token_message_suggestion_is_case_insensitive() {
+        // Fully-lowercased or fully-uppercased typos of a config'd var
+        // should still produce a suggestion pointing at the correctly-
+        // cased original. Without case-folding, every letter mismatch
+        // would be a substitution and the distance would exceed the
+        // threshold for variables like `THETA`.
+        for typo in ["theta", "Theta", "ThEtA"] {
+            let msg = unknown_data_token_message(typo, &DATA_TEST_CONFIG);
+            assert!(
+                msg.contains("Did you mean 'THETA'"),
+                "expected THETA suggestion for '{}', got: {}",
+                typo,
+                msg
+            );
+        }
+    }
+
     #[test]
     fn radius_cap_skipped_when_no_center() {
         // No center → mode doesn't apply, cap is moot.