From 545e65da0396140ad50c1141e4deb60fffd1d65d Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 2 May 2025 14:45:30 +0000 Subject: [PATCH 01/49] wip: first rough draft of a GribJumpSource --- src/earthkit/data/sources/gribjump.py | 156 ++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 src/earthkit/data/sources/gribjump.py diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py new file mode 100644 index 000000000..ce5385c20 --- /dev/null +++ b/src/earthkit/data/sources/gribjump.py @@ -0,0 +1,156 @@ +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + +try: + import pygribjump as pygj +except ImportError: + raise ImportError("GribJump access requires 'pygribjump' to be installed") + +import itertools +import os +from typing import Any + +import numpy as np + +from earthkit.data.sources import Source + + +def expand_multivalued_dicts( + request: dict[str, str | list[str]], +) -> list[dict[str, str]]: + """ + Expands a dictionary with list values into multiple dictionaries, + each containing one combination of the list values. + + Example: + Input: {'a': [1, 2], 'b': [3, 4], 'c': 5} + Output: [{'a': 1, 'b': 3, 'c': 5}, {'a': 1, 'b': 4, 'c': 5}, + {'a': 2, 'b': 3, 'c': 5}, {'a': 2, 'b': 4, 'c': 5}] + + Args: + request (dict): The original dictionary containing keys and values. + + Returns: + list: A list of dictionaries, each representing a unique combination + of the list values in the original dictionary. 
+ """ + list_keywords = [k for k, v in request.items() if isinstance(v, list)] + values = [request[k] for k in list_keywords] + expanded_requests = [] + for combination in itertools.product(*values): + new_request = request.copy() + for k, v in zip(list_keywords, combination): + new_request[k] = v + expanded_requests.append(new_request) + return expanded_requests + + +class GribJumpSource(Source): + def __init__( + self, + request: dict, + *, + ranges: list[tuple[int, int]] | None = None, + mask: np.ndarray | None = None, + indices: np.ndarray | None = None, + **kwargs, + ): + super().__init__(**kwargs) + + if sum(opt is not None for opt in (ranges, mask, indices)) != 1: + raise ValueError( + "Exactly one of 'ranges', 'mask' or 'indices' must be set. " + f"Got {ranges=}, {mask=}, {indices=}" + ) + self._ranges = ranges + self._masks = mask + self._indices = indices + + self._check_env() + self._gj = pygj.GribJump() + self._requests = self._split_mars_requests(request) + + def _check_env(self): + gj_config_file = os.environ.get("GRIBJUMP_CONFIG_FILE", None) + gj_ignore_grid = os.environ.get("GRIBJUMP_IGNORE_GRID", None) + + if gj_config_file is None: + raise RuntimeError( + "Environment variable 'GRIBJUMP_CONFIG_FILE' is not set but " + "is required by GribJump. Please set it to the path of the GribJump " + "configuration file." + ) + if gj_ignore_grid is None: + # We could consider setting this automatically but this would need + # to be done carefully to not accidentally activate this for other + # gribjump accesses (e.g. through polytope). + raise RuntimeError( + "Environment variable 'GRIBJUMP_IGNORE_GRID' is not set but " + "must be set (to '1' or 'True') for the 'gribjump' source to work." + ) + + @staticmethod + def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: + """Splits request into many single requests that load one field each. 
+ + Since GribJump returns its result arrays without metadata, we need to split the + request into many single requests to later map the outputs to the correct fields. + Additionally performs some basic validation and converts all values to strings. + """ + + request = request.copy() + + # Check for invalid values and cast anything but lists to strings + for k in request.keys(): + v = request[k] + if isinstance(v, str) and "/" in v: + # TODO: Check if there are valid reasons to use '/' apart from + # lists and ranges. + raise ValueError( + f"Found unexpected '/' in value '{v}' for keyword '{k}'. " + "Use Python lists to load from multiple fields." + ) + elif not isinstance(v, list): + request[k] = str(v) + + # Expand the request into all combinations of the list values + expanded_requests = expand_multivalued_dicts(request) + return expanded_requests + + def _build_extraction_requests(self) -> list[pygj.ExtractionRequest]: + if self._ranges is not None: + requests = [ + pygj.ExtractionRequest(request, self._ranges) + for request in self._requests + ] + elif self._masks is not None: + requests = [ + pygj.ExtractionRequest.from_mask(request, self._masks) + for request in self._requests + ] + elif self._indices is not None: + requests = [ + pygj.ExtractionRequest.from_indices(request, self._indices) + for request in self._requests + ] + else: + raise ValueError( + "No valid extraction request found. " + "Please set either 'ranges', 'mask' or 'indices'." 
+ ) + return requests + + def to_numpy(self, **kwargs): + extract_iter = self._gj.extract(self._build_extraction_requests()) + flattened_arrays = [res.values_flat for res in extract_iter] + combined_array = np.stack(flattened_arrays) + return combined_array + + +source = GribJumpSource From 134086f64bb7fb21627737a9adc6fc9248c36e31 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 2 May 2025 16:13:30 +0000 Subject: [PATCH 02/49] wip: experimental tests for easier development --- tests/sources/test_gribjump.py | 172 +++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 tests/sources/test_gribjump.py diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py new file mode 100644 index 000000000..ffb179deb --- /dev/null +++ b/tests/sources/test_gribjump.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. 
+# + + +import shutil +from pathlib import Path + +import numpy as np +import pytest +import yaml + +from earthkit.data import from_source +from earthkit.data.core.temporary import temp_directory, temp_env +from earthkit.data.testing import earthkit_test_data_file + + +@pytest.fixture +def setup_fdb_with_gribjump(): + import pyfdb + + with temp_directory() as tmpdir: + fdb_dir = Path(tmpdir) / "fdb" + fdb_dir.mkdir(exist_ok=True) + + # Copy of FDB schema + fdb_schema = earthkit_test_data_file("fdb_schema.txt") + shutil.copy(fdb_schema, fdb_dir / "schema") + + # FDB config + fdb_config = { + "type": "local", + "engine": "toc", + "schema": str(fdb_dir / "schema"), + "spaces": [{"handler": "Default", "roots": [{"path": str(fdb_dir)}]}], + } + fdb_config_path = fdb_dir / "config.yaml" + fdb_config_path.write_text(yaml.dump(fdb_config)) + + # Gribjump config + gj_config = { + "plugin": { + "select": "class=(.)", + } + } + gj_config_path = fdb_dir / "gribjump.yaml" + gj_config_path.write_text(yaml.dump(gj_config)) + + with temp_env( + # FDB5_CONFIG="", + FDB5_CONFIG_FILE=str(fdb_config_path), + FDB_ENABLE_GRIBJUMP="1", + FDB_HOME="", + GRIBJUMP_CONFIG_FILE=str(gj_config_path), + GRIBJUMP_IGNORE_GRID="1", + ): + fdb = pyfdb.FDB(config=fdb_config) + yield fdb + + +@pytest.fixture +def seed_fdb(setup_fdb_with_gribjump): + ds = from_source("file", earthkit_test_data_file("t_time_series.grib")) + for f in ds: + setup_fdb_with_gribjump.archive(f.message()) + setup_fdb_with_gribjump.flush() + yield setup_fdb_with_gribjump + + +def test_gribjump_with_ranges(seed_fdb): + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + source = from_source("gribjump", request, ranges=[(0, 1), (10, 12)]) + arr = source.to_numpy() + + assert arr is not None and isinstance(arr, np.ndarray) + assert arr.shape == (2, 3) + + 
+def test_gribjump_with_mask(seed_fdb): + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + mask = np.eye(7, 12, dtype=bool) + source = from_source("gribjump", request, mask=mask) + arr = source.to_numpy() + + assert arr is not None and isinstance(arr, np.ndarray) + assert arr.shape == (2, 7) + + +def test_gribjump_with_indices(seed_fdb): + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + indices = np.array([0, 7, 14, 21, 28, 35, 42]) + source = from_source("gribjump", request, indices=indices) + arr = source.to_numpy() + + assert arr is not None and isinstance(arr, np.ndarray) + assert arr.shape == (2, 7) + + +def test_gribjump_with_invalid_options(seed_fdb): + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + with pytest.raises(ValueError, match="Exactly one of"): + from_source( + "gribjump", + request, + ranges=[(0, 1), (10, 12)], + indices=np.array([0, 7, 14, 21, 28, 35, 42]) + ) + + +if __name__ == "__main__": + from earthkit.data.testing import main + + main(__file__) From bb467a2c46e4519c3b431c7e21951efd6d734708 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 5 May 2025 08:27:13 +0000 Subject: [PATCH 03/49] format changes using pre-commit hooks and fix small bug --- src/earthkit/data/sources/gribjump.py | 18 ++++++------------ tests/sources/test_gribjump.py | 10 +++++++--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git 
a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index ce5385c20..a0fae8ab0 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -118,6 +118,8 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: ) elif not isinstance(v, list): request[k] = str(v) + else: + request[k] = [str(i) for i in v] # Expand the request into all combinations of the list values expanded_requests = expand_multivalued_dicts(request) @@ -125,24 +127,16 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: def _build_extraction_requests(self) -> list[pygj.ExtractionRequest]: if self._ranges is not None: - requests = [ - pygj.ExtractionRequest(request, self._ranges) - for request in self._requests - ] + requests = [pygj.ExtractionRequest(request, self._ranges) for request in self._requests] elif self._masks is not None: - requests = [ - pygj.ExtractionRequest.from_mask(request, self._masks) - for request in self._requests - ] + requests = [pygj.ExtractionRequest.from_mask(request, self._masks) for request in self._requests] elif self._indices is not None: requests = [ - pygj.ExtractionRequest.from_indices(request, self._indices) - for request in self._requests + pygj.ExtractionRequest.from_indices(request, self._indices) for request in self._requests ] else: raise ValueError( - "No valid extraction request found. " - "Please set either 'ranges', 'mask' or 'indices'." + "No valid extraction request found. " "Please set either 'ranges', 'mask' or 'indices'." 
) return requests diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index ffb179deb..98dd3620a 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -18,7 +18,8 @@ import yaml from earthkit.data import from_source -from earthkit.data.core.temporary import temp_directory, temp_env +from earthkit.data.core.temporary import temp_directory +from earthkit.data.core.temporary import temp_env from earthkit.data.testing import earthkit_test_data_file @@ -161,8 +162,11 @@ def test_gribjump_with_invalid_options(seed_fdb): from_source( "gribjump", request, - ranges=[(0, 1), (10, 12)], - indices=np.array([0, 7, 14, 21, 28, 35, 42]) + ) + + with pytest.raises(ValueError, match="Exactly one of"): + from_source( + "gribjump", request, ranges=[(0, 1), (10, 12)], indices=np.array([0, 7, 14, 21, 28, 35, 42]) ) From 47d0f05dec4296349242ee3c39ea2e256bb89b4a Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 5 May 2025 13:24:38 +0000 Subject: [PATCH 04/49] add prototype for the GribJumpSource based on SimpleFieldList Support for to_xarray, robust selection, efficient loading of subsets, tests, etc. still missing. This approach is inspired by FieldlistFromDicts and GribFieldListInMemory and I suspect this approach is most in line with how similar problems have been dealt with in earthkit-data by other contributors. The main issues I encountered are the following: The extracted output of GribJump is not an actual Field but an array with raw, extracted values from different grid cells in the field. Therefore, most of the existing abstractions and utilities do not fit this scenario. After trying out a few different things myself, I found the "list-of-dicts" source which theoretically also supports non-field values (numpy arrays with arbitrary metadata in dictionary form). 
I am still a bit concerned that breaking abstractions here in our case could be more confusing and bug-prone in the future. Inspired by GribFieldListInMemory, the FieldExtractList wraps the methods of its superclass SimpleFieldList that make it an iterator to lazily load the data using gribjump once it's actually needed. However, this might be a bit brittle as currently implemented and could be improved. --- src/earthkit/data/sources/gribjump.py | 137 +++++++++++++++++++++----- tests/sources/test_gribjump.py | 24 ++++- 2 files changed, 136 insertions(+), 25 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index a0fae8ab0..a283c17ee 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -18,29 +18,42 @@ import numpy as np +from earthkit.data.indexing.fieldlist import SimpleFieldList from earthkit.data.sources import Source +from earthkit.data.sources.array_list import ArrayField +from earthkit.data.utils.metadata.dict import UserMetadata def expand_multivalued_dicts( request: dict[str, str | list[str]], ) -> list[dict[str, str]]: """ - Expands a dictionary with list values into multiple dictionaries, - each containing one combination of the list values. + Expands a dictionary containing list values into multiple dictionaries representing all possible combinations. + + For each list-type value in the input dictionary, this function creates all possible combinations + with other list values, while keeping non-list values constant across all output dictionaries. + + The list keys are sorted alphabetically before generating combinations to ensure consistent + and deterministic ordering of the output dictionaries regardless of the original key order. 
Example: Input: {'a': [1, 2], 'b': [3, 4], 'c': 5} - Output: [{'a': 1, 'b': 3, 'c': 5}, {'a': 1, 'b': 4, 'c': 5}, - {'a': 2, 'b': 3, 'c': 5}, {'a': 2, 'b': 4, 'c': 5}] + Output: [ + {'a': 1, 'b': 3, 'c': 5}, + {'a': 1, 'b': 4, 'c': 5}, + {'a': 2, 'b': 3, 'c': 5}, + {'a': 2, 'b': 4, 'c': 5} + ] Args: - request (dict): The original dictionary containing keys and values. + request (dict[str, str | list[str]]): Dictionary with string keys and either string + or list of strings as values. Returns: - list: A list of dictionaries, each representing a unique combination - of the list values in the original dictionary. + list[dict[str, str]]: A list of dictionaries, where each dictionary contains one + specific combination of the input list values, with non-list values preserved. """ - list_keywords = [k for k, v in request.items() if isinstance(v, list)] + list_keywords = sorted(k for k, v in request.items() if isinstance(v, list)) values = [request[k] for k in list_keywords] expanded_requests = [] for combination in itertools.product(*values): @@ -51,6 +64,76 @@ def expand_multivalued_dicts( return expanded_requests +class FieldExtractList(SimpleFieldList): + """Lazily loaded representation of the points extrated from multiple fields using GribJump. + + For simplicity, this class currently inherits from SimpleFieldList and is + inspired by the FieldlistFromDicts and GribFieldListInMemory classes. + However, it is not a complete implementation and can break in unexpected + ways. The main reason for this is that although the arrays with the + extracted values are represented as ArrayFields, they are not truly proper + Field implementations. They are neither stored as 2D grids, nor do they + possess any geographical information or well-defined metadata. + + Known limitations: + * FieldExtractList.sel is quite brittle as any filter value must be a string. + The underlying metadata is stored as a dictionary of strings, and no + automatic type conversion is done. 
Any more complex filtering and slicing + will not work for most data types. Also, order_by and simialr methods will + perform lexicographical sorting on the string values. + * Efficient lazy loading of selections / slices only is not supported. + * Pickling / unpickling might not work. + * to_pandas and to_xarray methods are not implemented. + """ + + def __init__( + self, + gj: pygj.GribJump, + requests: list[dict[str, str]], + extraction_requests: list[pygj.ExtractionRequest], + ): + if len(requests) != len(extraction_requests): + raise ValueError( + f"Number of MARS requests ({len(requests)}) and GribJump extraction requests ({len(extraction_requests)}) must match." + ) + self._gj = gj + self._requests = requests + self._extraction_requests = extraction_requests + self._loaded = False + + super().__init__(fields=None) # The fields attribute is set lazily + + def __len__(self): + self._load() + return super().__len__() + + def __getitem__(self, n): + self._load() + return super().__getitem__(n) + + def _load(self): + if self._loaded: + return + + extraction_results = self._gj.extract(self._extraction_requests) + + fields = [] + for i, result in enumerate(extraction_results): + arr = result.values_flat + metadata = self._requests[i] + field = ArrayField(arr, UserMetadata(metadata, shape=arr.shape)) + fields.append(field) + + self.fields = fields + self._loaded = True + + def to_xarray(self, *args, **kwargs): + self._not_implemented() + + def to_pandas(self, *args, **kwargs): + self._not_implemented() + + class GribJumpSource(Source): def __init__( self, @@ -74,12 +157,22 @@ def __init__( self._check_env() self._gj = pygj.GribJump() - self._requests = self._split_mars_requests(request) + + self._mars_requests = self._split_mars_requests(request) + self._gj_extraction_requests = self._build_extraction_requests(self._mars_requests) def _check_env(self): + fdb_conf = os.environ.get("FDB5_CONFIG", None) + fdb_home = os.environ.get("FDB_HOME", None) gj_config_file = 
os.environ.get("GRIBJUMP_CONFIG_FILE", None) gj_ignore_grid = os.environ.get("GRIBJUMP_IGNORE_GRID", None) + if fdb_home is None and fdb_conf is None: + raise RuntimeError( + """Neither FDB_HOME nor FDB5_CONFIG environment variable + was set! Please define either one to access FDB. + See: https://fields-database.readthedocs.io for details about FDB.""" + ) if gj_config_file is None: raise RuntimeError( "Environment variable 'GRIBJUMP_CONFIG_FILE' is not set but " @@ -113,7 +206,7 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: # TODO: Check if there are valid reasons to use '/' apart from # lists and ranges. raise ValueError( - f"Found unexpected '/' in value '{v}' for keyword '{k}'. " + f"Found unsupported list or range using '/' in value '{v}' for keyword '{k}'. " "Use Python lists to load from multiple fields." ) elif not isinstance(v, list): @@ -121,30 +214,28 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: else: request[k] = [str(i) for i in v] - # Expand the request into all combinations of the list values expanded_requests = expand_multivalued_dicts(request) return expanded_requests - def _build_extraction_requests(self) -> list[pygj.ExtractionRequest]: + def _build_extraction_requests(self, mars_requests: list[dict[str, str]]) -> list[pygj.ExtractionRequest]: if self._ranges is not None: - requests = [pygj.ExtractionRequest(request, self._ranges) for request in self._requests] + requests = [pygj.ExtractionRequest(request, self._ranges) for request in mars_requests] elif self._masks is not None: - requests = [pygj.ExtractionRequest.from_mask(request, self._masks) for request in self._requests] + requests = [pygj.ExtractionRequest.from_mask(request, self._masks) for request in mars_requests] elif self._indices is not None: requests = [ - pygj.ExtractionRequest.from_indices(request, self._indices) for request in self._requests + pygj.ExtractionRequest.from_indices(request, self._indices) for request 
in mars_requests ] else: - raise ValueError( - "No valid extraction request found. " "Please set either 'ranges', 'mask' or 'indices'." - ) + raise ValueError("No valid extraction method specified.") return requests - def to_numpy(self, **kwargs): - extract_iter = self._gj.extract(self._build_extraction_requests()) - flattened_arrays = [res.values_flat for res in extract_iter] - combined_array = np.stack(flattened_arrays) - return combined_array + def mutate(self): + return FieldExtractList( + self._gj, + self._mars_requests, + self._gj_extraction_requests, + ) source = GribJumpSource diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 98dd3620a..74e1449a6 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -23,6 +23,27 @@ from earthkit.data.testing import earthkit_test_data_file +def test_expand_multivalued_dicts(): + from earthkit.data.sources.gribjump import expand_multivalued_dicts + + request = { + "b": ["hello", "world"], + "a": [1, 2, 3], + "c": 5, + } + expected_dicts = [ + {"a": 1, "b": "hello", "c": 5}, + {"a": 1, "b": "world", "c": 5}, + {"a": 2, "b": "hello", "c": 5}, + {"a": 2, "b": "world", "c": 5}, + {"a": 3, "b": "hello", "c": 5}, + {"a": 3, "b": "world", "c": 5}, + ] + + expanded_requests = expand_multivalued_dicts(request) + assert expanded_requests == expected_dicts + + @pytest.fixture def setup_fdb_with_gribjump(): import pyfdb @@ -55,10 +76,9 @@ def setup_fdb_with_gribjump(): gj_config_path.write_text(yaml.dump(gj_config)) with temp_env( - # FDB5_CONFIG="", FDB5_CONFIG_FILE=str(fdb_config_path), FDB_ENABLE_GRIBJUMP="1", - FDB_HOME="", + FDB_HOME=str(fdb_dir), GRIBJUMP_CONFIG_FILE=str(gj_config_path), GRIBJUMP_IGNORE_GRID="1", ): From 824421ce54fea43003adb7417f245d76133bdcb9 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 5 May 2025 14:33:58 +0000 Subject: [PATCH 05/49] tests: add a few more simple tests and add 
NO_GRIBJUMP flag for pytest --- src/earthkit/data/sources/gribjump.py | 11 ++++-- src/earthkit/data/testing.py | 2 +- tests/sources/test_gribjump.py | 51 ++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index a283c17ee..b5c7b24ac 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -53,10 +53,17 @@ def expand_multivalued_dicts( list[dict[str, str]]: A list of dictionaries, where each dictionary contains one specific combination of the input list values, with non-list values preserved. """ + if empty_list_keys := [k for k, v in request.items() if isinstance(v, list) and len(v) == 0]: + raise ValueError( + "Cannot expand dictionary with empty list. " + f"Found empty list for keys: {', '.join(empty_list_keys)}" + ) + list_keywords = sorted(k for k, v in request.items() if isinstance(v, list)) - values = [request[k] for k in list_keywords] + lists = [request[k] for k in list_keywords] + expanded_requests = [] - for combination in itertools.product(*values): + for combination in itertools.product(*lists): new_request = request.copy() for k, v in zip(list_keywords, combination): new_request[k] = v diff --git a/src/earthkit/data/testing.py b/src/earthkit/data/testing.py index e9b08d28b..d19f63445 100644 --- a/src/earthkit/data/testing.py +++ b/src/earthkit/data/testing.py @@ -120,7 +120,7 @@ def modules_installed(*modules): fdb_home = os.environ.get("FDB_HOME", None) NO_PROD_FDB = fdb_home is None - +NO_GRIBJUMP = NO_FDB or not modules_installed("pygribjump") NO_POLYTOPE = not os.path.exists(os.path.expanduser("~/.polytopeapirc")) NO_COVJSONKIT = not modules_installed("covjsonkit") NO_RIOXARRAY = not modules_installed("rioxarray") diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 74e1449a6..825b87925 100644 --- a/tests/sources/test_gribjump.py +++ 
b/tests/sources/test_gribjump.py @@ -13,16 +13,17 @@ import shutil from pathlib import Path -import numpy as np import pytest import yaml from earthkit.data import from_source from earthkit.data.core.temporary import temp_directory from earthkit.data.core.temporary import temp_env +from earthkit.data.testing import NO_GRIBJUMP from earthkit.data.testing import earthkit_test_data_file +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_expand_multivalued_dicts(): from earthkit.data.sources.gribjump import expand_multivalued_dicts @@ -43,6 +44,13 @@ def test_expand_multivalued_dicts(): expanded_requests = expand_multivalued_dicts(request) assert expanded_requests == expected_dicts + assert expand_multivalued_dicts({}) == [{}] + assert expand_multivalued_dicts({"a": 1}) == [{"a": 1}] + assert expand_multivalued_dicts({"a": 1, "b": 2}) == [{"a": 1, "b": 2}] + + with pytest.raises(ValueError, match="Cannot expand dictionary with empty list"): + expand_multivalued_dicts({"a": 1, "b": []}) + @pytest.fixture def setup_fdb_with_gribjump(): @@ -95,7 +103,10 @@ def seed_fdb(setup_fdb_with_gribjump): yield setup_fdb_with_gribjump +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_ranges(seed_fdb): + import numpy as np + request = { "class": "od", "date": "20201221", @@ -117,7 +128,10 @@ def test_gribjump_with_ranges(seed_fdb): assert arr.shape == (2, 3) +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_mask(seed_fdb): + import numpy as np + request = { "class": "od", "date": "20201221", @@ -140,7 +154,10 @@ def test_gribjump_with_mask(seed_fdb): assert arr.shape == (2, 7) +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_indices(seed_fdb): + import numpy as np + request = { "class": "od", "date": "20201221", @@ -163,7 +180,39 @@ def test_gribjump_with_indices(seed_fdb): assert arr.shape == (2, 
7) +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +def test_gribjump_source_against_manually_masked_grid(seed_fdb): + import numpy as np + + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + mask = (np.eye(7, 12, dtype=bool) | np.eye(7, 12, k=1, dtype=bool)).ravel() + + gj_source = from_source("gribjump", request, mask=mask) + file_source = from_source("file", earthkit_test_data_file("t_time_series.grib")) + + expected_arr = file_source.sel(step=[0, 6], param="z").to_numpy().reshape(2, -1)[:, mask] + extracted_arr = gj_source.to_numpy() + + assert np.allclose(expected_arr, extracted_arr) + + +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_invalid_options(seed_fdb): + import numpy as np + request = { "class": "od", "date": "20201221", From 5d2bb856b7d624cff34127480c575228bad7fb4a Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 5 May 2025 16:25:08 +0000 Subject: [PATCH 06/49] tidy: small cleanup, improve variable naming and fix type hints I might remove the type hints as I noticed that they aren't used anywhere else in the codebase. 
--- src/earthkit/data/sources/gribjump.py | 29 +++++++++++++++------------ tests/sources/test_gribjump.py | 14 ++++++------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index b5c7b24ac..548a0c70d 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -15,6 +15,8 @@ import itertools import os from typing import Any +from typing import Optional +from typing import Union import numpy as np @@ -24,8 +26,8 @@ from earthkit.data.utils.metadata.dict import UserMetadata -def expand_multivalued_dicts( - request: dict[str, str | list[str]], +def expand_dict_with_lists( + request: dict[str, Union[str, list[str]]], ) -> list[dict[str, str]]: """ Expands a dictionary containing list values into multiple dictionaries representing all possible combinations. @@ -46,7 +48,7 @@ def expand_multivalued_dicts( ] Args: - request (dict[str, str | list[str]]): Dictionary with string keys and either string + request (dict[str, Union[str, list[str]]]): Dictionary with string keys and either string or list of strings as values. Returns: @@ -72,7 +74,7 @@ def expand_multivalued_dicts( class FieldExtractList(SimpleFieldList): - """Lazily loaded representation of the points extrated from multiple fields using GribJump. + """Lazily loaded representation of the points extracted from multiple fields using GribJump. For simplicity, this class currently inherits from SimpleFieldList and is inspired by the FieldlistFromDicts and GribFieldListInMemory classes. @@ -86,7 +88,7 @@ class FieldExtractList(SimpleFieldList): * FieldExtractList.sel is quite brittle as any filter value must be a string. The underlying metadata is stored as a dictionary of strings, and no automatic type conversion is done. Any more complex filtering and slicing - will not work for most data types. Also, order_by and simialr methods will + will not work for most data types. 
Also, order_by and similar methods will perform lexicographical sorting on the string values. * Efficient lazy loading of selections / slices only is not supported. * Pickling / unpickling might not work. @@ -146,9 +148,9 @@ def __init__( self, request: dict, *, - ranges: list[tuple[int, int]] | None = None, - mask: np.ndarray | None = None, - indices: np.ndarray | None = None, + ranges: Optional[list[tuple[int, int]]] = None, + mask: Optional[np.ndarray] = None, + indices: Optional[np.ndarray] = None, **kwargs, ): super().__init__(**kwargs) @@ -159,7 +161,7 @@ def __init__( f"Got {ranges=}, {mask=}, {indices=}" ) self._ranges = ranges - self._masks = mask + self._mask = mask self._indices = indices self._check_env() @@ -201,7 +203,8 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: Since GribJump returns its result arrays without metadata, we need to split the request into many single requests to later map the outputs to the correct fields. - Additionally performs some basic validation and converts all values to strings. + Additionally performs some basic validation and converts all values to strings since + GribJump only supports string values in the request. 
""" request = request.copy() @@ -221,14 +224,14 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: else: request[k] = [str(i) for i in v] - expanded_requests = expand_multivalued_dicts(request) + expanded_requests = expand_dict_with_lists(request) return expanded_requests def _build_extraction_requests(self, mars_requests: list[dict[str, str]]) -> list[pygj.ExtractionRequest]: if self._ranges is not None: requests = [pygj.ExtractionRequest(request, self._ranges) for request in mars_requests] - elif self._masks is not None: - requests = [pygj.ExtractionRequest.from_mask(request, self._masks) for request in mars_requests] + elif self._mask is not None: + requests = [pygj.ExtractionRequest.from_mask(request, self._mask) for request in mars_requests] elif self._indices is not None: requests = [ pygj.ExtractionRequest.from_indices(request, self._indices) for request in mars_requests diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 825b87925..e64dc2333 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -24,8 +24,8 @@ @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -def test_expand_multivalued_dicts(): - from earthkit.data.sources.gribjump import expand_multivalued_dicts +def test_expand_dict_with_lists(): + from earthkit.data.sources.gribjump import expand_dict_with_lists request = { "b": ["hello", "world"], @@ -41,15 +41,15 @@ def test_expand_multivalued_dicts(): {"a": 3, "b": "world", "c": 5}, ] - expanded_requests = expand_multivalued_dicts(request) + expanded_requests = expand_dict_with_lists(request) assert expanded_requests == expected_dicts - assert expand_multivalued_dicts({}) == [{}] - assert expand_multivalued_dicts({"a": 1}) == [{"a": 1}] - assert expand_multivalued_dicts({"a": 1, "b": 2}) == [{"a": 1, "b": 2}] + assert expand_dict_with_lists({}) == [{}] + assert expand_dict_with_lists({"a": 1}) == [{"a": 1}] + assert 
expand_dict_with_lists({"a": 1, "b": 2}) == [{"a": 1, "b": 2}] with pytest.raises(ValueError, match="Cannot expand dictionary with empty list"): - expand_multivalued_dicts({"a": 1, "b": []}) + expand_dict_with_lists({"a": 1, "b": []}) @pytest.fixture From 8a4fe3176d00f63737d051bf261654f38811bfb2 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 15 May 2025 09:53:20 +0000 Subject: [PATCH 07/49] use original type in request dictionaries to make .sel more intuitive Before, we immediately cast all values in the users' request dictionary to strings since GribJump mandates that for its requests. However, this also meant that each field's metadata would only have string values, making any filtering only work with strings. This commit only does this casting for the actual dictionaries passed to GribJump. Example: request = {"a": [1, 2]} from_source("gribjump", request).sel(a=1) # before this commit -> empty from_source("gribjump", request).sel(a=1) # after this commit -> fields for {"a": 1} --- src/earthkit/data/sources/gribjump.py | 28 +++++++------- tests/sources/test_gribjump.py | 54 ++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 15 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 548a0c70d..8aef51ea5 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -85,11 +85,10 @@ class FieldExtractList(SimpleFieldList): possess any geographical information or well-defined metadata. Known limitations: - * FieldExtractList.sel is quite brittle as any filter value must be a string. - The underlying metadata is stored as a dictionary of strings, and no - automatic type conversion is done. Any more complex filtering and slicing - will not work for most data types. Also, order_by and similar methods will - perform lexicographical sorting on the string values.
+ * FieldExtractList.sel is quite brittle as any filter values must have the same type + as the metadata in the user's request dictionary. The actual type of the underlying + MARS keyword is not respected. So ".sel(step=0) would not work with a request + {"step": "0"} but only {"step": 0}. * Efficient lazy loading of selections / slices only is not supported. * Pickling / unpickling might not work. * to_pandas and to_xarray methods are not implemented. @@ -203,31 +202,32 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: Since GribJump returns its result arrays without metadata, we need to split the request into many single requests to later map the outputs to the correct fields. - Additionally performs some basic validation and converts all values to strings since - GribJump only supports string values in the request. + Additionally performs some basic validation. """ request = request.copy() - # Check for invalid values and cast anything but lists to strings + # Check if user passed unspoorted lists and ranges as strings using "/" for k in request.keys(): v = request[k] if isinstance(v, str) and "/" in v: - # TODO: Check if there are valid reasons to use '/' apart from - # lists and ranges. raise ValueError( f"Found unsupported list or range using '/' in value '{v}' for keyword '{k}'. " "Use Python lists to load from multiple fields." 
) - elif not isinstance(v, list): - request[k] = str(v) - else: - request[k] = [str(i) for i in v] + elif isinstance(v, list) and len({type(v_) for v_ in v}) != 1: + raise TypeError( + f"All list values must share the same type but found types {set(map(type, v))} " + f"in {k}={v}" + ) expanded_requests = expand_dict_with_lists(request) return expanded_requests def _build_extraction_requests(self, mars_requests: list[dict[str, str]]) -> list[pygj.ExtractionRequest]: + # GribJump currently only supports strings as request values + mars_requests = [{k: str(v) for (k, v) in req.items()} for req in mars_requests] + if self._ranges is not None: requests = [pygj.ExtractionRequest(request, self._ranges) for request in mars_requests] elif self._mask is not None: diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index e64dc2333..b9baedcce 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -209,6 +209,55 @@ def test_gribjump_source_against_manually_masked_grid(seed_fdb): assert np.allclose(expected_arr, extracted_arr) +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +def test_gribjump_selection(seed_fdb): + import numpy as np + + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + indices = np.array([0, 7, 14, 21, 28, 35, 42]) + source = from_source("gribjump", request, indices=indices) + + arr_orig = source.to_numpy() + arr_subset = source.sel(step=6).to_numpy() + + assert arr_subset.shape == (1, 7) + assert np.allclose(arr_orig[[1]], arr_subset) + + +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +def test_gribjump_with_mixed_types_in_lists(seed_fdb): + + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + 
"param": "129", + "step": [0, "6"], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + with pytest.raises(TypeError): + from_source("gribjump", request, ranges=[(1, 2)]) + + @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_invalid_options(seed_fdb): import numpy as np @@ -235,7 +284,10 @@ def test_gribjump_with_invalid_options(seed_fdb): with pytest.raises(ValueError, match="Exactly one of"): from_source( - "gribjump", request, ranges=[(0, 1), (10, 12)], indices=np.array([0, 7, 14, 21, 28, 35, 42]) + "gribjump", + request, + ranges=[(0, 1), (10, 12)], + indices=np.array([0, 7, 14, 21, 28, 35, 42]), ) From 86249d8d33fbce37ebae921dce39b3e19ec21788 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 19 May 2025 09:46:13 +0000 Subject: [PATCH 08/49] use SimpleFieldList.to_xarray method for GribJumpSource. Recently, SimpleFieldList.to_xarray was extended to allow also user-supplied metadata dictionaries to be used for xarray dataset construction. This commit removes the marking of FieldExtractList.to_xarray as not implemented, allowing that class to use the implementation from SimpleFieldList. --- src/earthkit/data/sources/gribjump.py | 4 +--- tests/sources/test_gribjump.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 8aef51ea5..ed04c4702 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -129,15 +129,13 @@ def _load(self): for i, result in enumerate(extraction_results): arr = result.values_flat metadata = self._requests[i] + # TODO: Allow modifying user metadata (e.g. 
to use hdate as forecast reference time) field = ArrayField(arr, UserMetadata(metadata, shape=arr.shape)) fields.append(field) self.fields = fields self._loaded = True - def to_xarray(self, *args, **kwargs): - self._not_implemented() - def to_pandas(self, *args, **kwargs): self._not_implemented() diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index b9baedcce..78de8a3d0 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -131,6 +131,7 @@ def test_gribjump_with_ranges(seed_fdb): @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_mask(seed_fdb): import numpy as np + import xarray as xr request = { "class": "od", @@ -149,9 +150,29 @@ def test_gribjump_with_mask(seed_fdb): mask = np.eye(7, 12, dtype=bool) source = from_source("gribjump", request, mask=mask) arr = source.to_numpy() + ds = source.to_xarray() + + ds_expected = xr.Dataset( + {"129": (("step", "values"), arr)}, + coords={"step": np.array([0, 21600000000000], dtype="timedelta64[ns]")}, + attrs={ + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "stream": "oper", + "time": "1200", + "type": "fc", + "Conventions:": "CF-1.8", + "institution": "ECMWF", + }, + ) assert arr is not None and isinstance(arr, np.ndarray) assert arr.shape == (2, 7) + assert ds.equals(ds_expected) @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") From 2f8c1b013e4109531f3653928d481241d04d800d Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 3 Jun 2025 09:35:31 +0000 Subject: [PATCH 09/49] assign grid index to each value in to_xarray When users create an xarray dataset for the values retrieved via GribJump, we think it is important for users to explicitly attach the indices from the original grid to each returned value. 
Unfortunately, this is currently not easy to do directly via the UserMetadata / UserGeometry, which is why we manually add this information in FieldExtractList.to_xarray for now. This should be cleaned up and likely should be natively supported in earthkit-data. Also, although returning the index in the flattened, original grid works for now, we would likely rather include the original (x, y) index and ultimately (lat, lon) coordinates instead (or additionally). The change in pygribjump that enables this lives in [this PR](https://github.com/ecmwf/gribjump/pull/60). --- src/earthkit/data/sources/gribjump.py | 22 ++++++++++++++++++++-- tests/sources/test_gribjump.py | 9 ++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index ed04c4702..455d8cc9e 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -108,6 +108,7 @@ def __init__( self._requests = requests self._extraction_requests = extraction_requests self._loaded = False + self._grid_indices = None super().__init__(fields=None) # The fields attribute is set lazily @@ -126,19 +127,36 @@ def _load(self): extraction_results = self._gj.extract(self._extraction_requests) fields = [] - for i, result in enumerate(extraction_results): + indices = None + for i, (request, result) in enumerate(zip(self._extraction_requests, extraction_results)): arr = result.values_flat + if indices is None: + indices = request.indices() + else: + if not np.array_equal(indices, request.indices()): + raise ValueError( + "Found GribJump result with different indices. " + "All requests must specify the same ranges to construct an xarray Dataset" + ) metadata = self._requests[i] - # TODO: Allow modifying user metadata (e.g.
to use hdate as forecast reference time) field = ArrayField(arr, UserMetadata(metadata, shape=arr.shape)) fields.append(field) self.fields = fields self._loaded = True + self._grid_indices = indices def to_pandas(self, *args, **kwargs): self._not_implemented() + def to_xarray(self, *args, **kwargs): + assert ( + self._grid_indices is not None + ), f"Grid indices must be known before converting to xarray. {self._grid_indices=}" + ds = super().to_xarray(*args, **kwargs) + ds = ds.rename_dims({"values": "index"}).assign_coords({"index": self._grid_indices}) + return ds + class GribJumpSource(Source): def __init__( diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 78de8a3d0..76c644882 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -153,8 +153,11 @@ def test_gribjump_with_mask(seed_fdb): ds = source.to_xarray() ds_expected = xr.Dataset( - {"129": (("step", "values"), arr)}, - coords={"step": np.array([0, 21600000000000], dtype="timedelta64[ns]")}, + {"129": (("step", "index"), arr)}, + coords={ + "step": np.array([0, 21600000000000], dtype="timedelta64[ns]"), + "index": np.array([0, 13, 26, 39, 52, 65, 78]), + }, attrs={ "class": "od", "date": "20201221", @@ -172,7 +175,7 @@ def test_gribjump_with_mask(seed_fdb): assert arr is not None and isinstance(arr, np.ndarray) assert arr.shape == (2, 7) - assert ds.equals(ds_expected) + xr.testing.assert_equal(ds, ds_expected) @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") From e274ae7e365b85ffd0a26d6692a9667640252f62 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 3 Jun 2025 13:10:56 +0000 Subject: [PATCH 10/49] tidy: add some more error handling and improve tests --- src/earthkit/data/sources/gribjump.py | 30 ++--- tests/sources/test_gribjump.py | 157 ++++++++++++++------------ 2 files changed, 103 insertions(+), 84 deletions(-) diff --git 
a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 455d8cc9e..905b142f5 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -104,6 +104,16 @@ def __init__( raise ValueError( f"Number of MARS requests ({len(requests)}) and GribJump extraction requests ({len(extraction_requests)}) must match." ) + if len(requests) == 0: + raise ValueError( + "FieldExtractList requires at least one extraction request, but received an empty list" + ) + ranges = extraction_requests[0].ranges + if invalid_requests := [req for req in extraction_requests if req.ranges != ranges]: + raise ValueError( + f"ExtractionRequests must request same ranges but found {len(invalid_requests)} requests with different ranges" + ) + self._gj = gj self._requests = requests self._extraction_requests = extraction_requests @@ -129,15 +139,12 @@ def _load(self): fields = [] indices = None for i, (request, result) in enumerate(zip(self._extraction_requests, extraction_results)): - arr = result.values_flat if indices is None: + # We can assume that all arrays reference the same indices + # because we checked in the constructor that all extraction + # requests share the same ranges. indices = request.indices() - else: - if not np.array_equal(indices, request.indices()): - raise ValueError( - "Found GribJump result with different indices. " - "All requests must specify the same ranges to construct an xarray Dataset" - ) + arr = result.values_flat metadata = self._requests[i] field = ArrayField(arr, UserMetadata(metadata, shape=arr.shape)) fields.append(field) @@ -146,17 +153,14 @@ def _load(self): self._loaded = True self._grid_indices = indices - def to_pandas(self, *args, **kwargs): - self._not_implemented() - def to_xarray(self, *args, **kwargs): - assert ( - self._grid_indices is not None - ), f"Grid indices must be known before converting to xarray. 
{self._grid_indices=}" ds = super().to_xarray(*args, **kwargs) ds = ds.rename_dims({"values": "index"}).assign_coords({"index": self._grid_indices}) return ds + def to_pandas(self, *args, **kwargs): + self._not_implemented() + class GribJumpSource(Source): def __init__( diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 76c644882..242982deb 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -103,11 +103,36 @@ def seed_fdb(setup_fdb_with_gribjump): yield setup_fdb_with_gribjump +@pytest.fixture +def ranges(): + return dict(ranges=[(0, 1), (5, 9), (25, 27)]) + + +@pytest.fixture +def indices(): + import numpy as np + + return dict(indices=np.array([0, 5, 6, 7, 8, 25, 26])) + + +@pytest.fixture +def mask(): + import numpy as np + + mask = np.zeros((7, 12), dtype=bool) + mask[0, 0] = True + mask[0, 5:9] = True + mask[2, 1:3] = True + return dict(mask=mask) + + @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -def test_gribjump_with_ranges(seed_fdb): +@pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) +def test_gribjump_to_numpy(seed_fdb, method, request): import numpy as np - request = { + kwargs = request.getfixturevalue(method) + mars_request = { "class": "od", "date": "20201221", "domain": "g", @@ -121,19 +146,44 @@ def test_gribjump_with_ranges(seed_fdb): "type": "fc", } - source = from_source("gribjump", request, ranges=[(0, 1), (10, 12)]) + arr_expected = np.array( + [ + [ + 1743.06591797, + 1743.06591797, + 1743.06591797, + 1743.06591797, + 1743.06591797, + 1607.31591797, + 1721.81591797, + ], + [ + 1641.43701172, + 1641.43701172, + 1641.43701172, + 1641.43701172, + 1641.43701172, + 1702.31201172, + 1887.18701172, + ], + ] + ) + source = from_source("gribjump", mars_request, **kwargs) arr = source.to_numpy() assert arr is not None and isinstance(arr, np.ndarray) - assert arr.shape == (2, 3) + assert arr.shape == (2, 7) + 
np.testing.assert_array_almost_equal(arr, arr_expected) @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -def test_gribjump_with_mask(seed_fdb): +@pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) +def test_gribjump_to_xarray(seed_fdb, method, request): import numpy as np import xarray as xr - request = { + kwargs = request.getfixturevalue(method) + mars_request = { "class": "od", "date": "20201221", "domain": "g", @@ -147,16 +197,36 @@ def test_gribjump_with_mask(seed_fdb): "type": "fc", } - mask = np.eye(7, 12, dtype=bool) - source = from_source("gribjump", request, mask=mask) - arr = source.to_numpy() + arr_expected = np.array( + [ + [ + 1743.06591797, + 1743.06591797, + 1743.06591797, + 1743.06591797, + 1743.06591797, + 1607.31591797, + 1721.81591797, + ], + [ + 1641.43701172, + 1641.43701172, + 1641.43701172, + 1641.43701172, + 1641.43701172, + 1702.31201172, + 1887.18701172, + ], + ] + ) + source = from_source("gribjump", mars_request, **kwargs) ds = source.to_xarray() ds_expected = xr.Dataset( - {"129": (("step", "index"), arr)}, + {"129": (("step", "index"), arr_expected)}, coords={ "step": np.array([0, 21600000000000], dtype="timedelta64[ns]"), - "index": np.array([0, 13, 26, 39, 52, 65, 78]), + "index": np.array([0, 5, 6, 7, 8, 25, 26]), }, attrs={ "class": "od", @@ -166,71 +236,16 @@ def test_gribjump_with_mask(seed_fdb): "levelist": "1000", "levtype": "pl", "stream": "oper", + "param": "129", "time": "1200", "type": "fc", - "Conventions:": "CF-1.8", + "Conventions": "CF-1.8", "institution": "ECMWF", }, ) - - assert arr is not None and isinstance(arr, np.ndarray) - assert arr.shape == (2, 7) - xr.testing.assert_equal(ds, ds_expected) - - -@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -def test_gribjump_with_indices(seed_fdb): - import numpy as np - - request = { - "class": "od", - "date": "20201221", - "domain": "g", - "expver": "0001", - "levelist": "1000", - "levtype": "pl", - 
"param": "129", - "step": [0, 6], - "stream": "oper", - "time": "1200", - "type": "fc", - } - - indices = np.array([0, 7, 14, 21, 28, 35, 42]) - source = from_source("gribjump", request, indices=indices) - arr = source.to_numpy() - - assert arr is not None and isinstance(arr, np.ndarray) - assert arr.shape == (2, 7) - - -@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -def test_gribjump_source_against_manually_masked_grid(seed_fdb): - import numpy as np - - request = { - "class": "od", - "date": "20201221", - "domain": "g", - "expver": "0001", - "levelist": "1000", - "levtype": "pl", - "param": "129", - "step": [0, 6], - "stream": "oper", - "time": "1200", - "type": "fc", - } - - mask = (np.eye(7, 12, dtype=bool) | np.eye(7, 12, k=1, dtype=bool)).ravel() - - gj_source = from_source("gribjump", request, mask=mask) - file_source = from_source("file", earthkit_test_data_file("t_time_series.grib")) - - expected_arr = file_source.sel(step=[0, 6], param="z").to_numpy().reshape(2, -1)[:, mask] - extracted_arr = gj_source.to_numpy() - - assert np.allclose(expected_arr, extracted_arr) + xr.testing.assert_allclose(ds, ds_expected) + assert ds_expected.attrs == ds.attrs + assert set(ds_expected.coords.keys()) == set(ds.coords.keys()) @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") From 711da54edac1c7b19a18c8ebd671210b6e0870de Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 30 Jun 2025 09:04:46 +0000 Subject: [PATCH 11/49] refactor: introduce ExtractionRequest wrapper that combines pygribjump.ExtractionRequest and original fdb request dict This makes the code a bit easier to read since we don't have to track both as separate lists --- src/earthkit/data/sources/gribjump.py | 67 +++++++++++++++++---------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 
905b142f5..207e2b27f 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -12,6 +12,7 @@ except ImportError: raise ImportError("GribJump access requires 'pygribjump' to be installed") +import dataclasses import itertools import os from typing import Any @@ -73,6 +74,27 @@ def expand_dict_with_lists( return expanded_requests +@dataclasses.dataclass +class ExtractionRequest: + """ + Simple wrapper of pygribjump.ExtractionRequest and the original FDB request dict. + + Can be removed once pygribjump.ExtractionRequest provides a reference to the request dictionary. + """ + + extraction_request: pygj.ExtractionRequest + request: dict[str, str] + + @property + def ranges(self) -> list[tuple[int, int]]: + """Returns the ranges of the extraction request.""" + return self.extraction_request.ranges + + def indices(self) -> np.ndarray: + """Returns the indices of the extraction request.""" + return self.extraction_request.indices() + + class FieldExtractList(SimpleFieldList): """Lazily loaded representation of the points extracted from multiple fields using GribJump. @@ -91,32 +113,25 @@ class FieldExtractList(SimpleFieldList): {"step": "0"} but only {"step": 0}. * Efficient lazy loading of selections / slices only is not supported. * Pickling / unpickling might not work. - * to_pandas and to_xarray methods are not implemented. """ def __init__( self, gj: pygj.GribJump, - requests: list[dict[str, str]], - extraction_requests: list[pygj.ExtractionRequest], + requests: list[ExtractionRequest], ): - if len(requests) != len(extraction_requests): - raise ValueError( - f"Number of MARS requests ({len(requests)}) and GribJump extraction requests ({len(extraction_requests)}) must match." 
- ) if len(requests) == 0: raise ValueError( "FieldExtractList requires at least one extraction request, but received an empty list" ) - ranges = extraction_requests[0].ranges - if invalid_requests := [req for req in extraction_requests if req.ranges != ranges]: + ranges = requests[0].ranges + if invalid_requests := [req for req in requests if req.ranges != ranges]: raise ValueError( f"ExtractionRequests must request same ranges but found {len(invalid_requests)} requests with different ranges" ) self._gj = gj self._requests = requests - self._extraction_requests = extraction_requests self._loaded = False self._grid_indices = None @@ -134,18 +149,19 @@ def _load(self): if self._loaded: return - extraction_results = self._gj.extract(self._extraction_requests) + extraction_requests = [req.extraction_request for req in self._requests] + extraction_results = self._gj.extract(extraction_requests) fields = [] indices = None - for i, (request, result) in enumerate(zip(self._extraction_requests, extraction_results)): + for i, (request, result) in enumerate(zip(self._requests, extraction_results)): if indices is None: # We can assume that all arrays reference the same indices # because we checked in the constructor that all extraction # requests share the same ranges. 
indices = request.indices() arr = result.values_flat - metadata = self._requests[i] + metadata = self._requests[i].request field = ArrayField(arr, UserMetadata(metadata, shape=arr.shape)) fields.append(field) @@ -187,7 +203,7 @@ def __init__( self._gj = pygj.GribJump() self._mars_requests = self._split_mars_requests(request) - self._gj_extraction_requests = self._build_extraction_requests(self._mars_requests) + self._extraction_requests = self._build_extraction_requests(self._mars_requests) def _check_env(self): fdb_conf = os.environ.get("FDB5_CONFIG", None) @@ -244,27 +260,30 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: expanded_requests = expand_dict_with_lists(request) return expanded_requests - def _build_extraction_requests(self, mars_requests: list[dict[str, str]]) -> list[pygj.ExtractionRequest]: + def _build_extraction_request(self, request: dict[str, str]) -> ExtractionRequest: + """Builds a single extraction request from the given request dictionary.""" # GribJump currently only supports strings as request values - mars_requests = [{k: str(v) for (k, v) in req.items()} for req in mars_requests] + stringified_request_dict = {k: str(v) for (k, v) in request.items()} if self._ranges is not None: - requests = [pygj.ExtractionRequest(request, self._ranges) for request in mars_requests] + extraction_request = pygj.ExtractionRequest(stringified_request_dict, self._ranges) elif self._mask is not None: - requests = [pygj.ExtractionRequest.from_mask(request, self._mask) for request in mars_requests] + extraction_request = pygj.ExtractionRequest.from_mask(stringified_request_dict, self._mask) elif self._indices is not None: - requests = [ - pygj.ExtractionRequest.from_indices(request, self._indices) for request in mars_requests - ] + extraction_request = pygj.ExtractionRequest.from_indices(stringified_request_dict, self._indices) else: raise ValueError("No valid extraction method specified.") - return requests + return 
ExtractionRequest(extraction_request, request) + + def _build_extraction_requests(self, mars_requests: list[dict[str, str]]) -> list[ExtractionRequest]: + """Builds extraction requests from the given MARS requests.""" + extraction_requests = [self._build_extraction_request(request) for request in mars_requests] + return extraction_requests def mutate(self): return FieldExtractList( self._gj, - self._mars_requests, - self._gj_extraction_requests, + self._extraction_requests, ) From 9c31181386bee0f1d1e7de26b1c4ad0a2a90f081 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 30 Jun 2025 09:22:26 +0000 Subject: [PATCH 12/49] feat(test): modify (now failing) test to expect latitude and longitude information --- tests/sources/test_gribjump.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 242982deb..5993416b4 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -123,6 +123,7 @@ def mask(): mask[0, 0] = True mask[0, 5:9] = True mask[2, 1:3] = True + mask = mask.ravel() return dict(mask=mask) @@ -219,6 +220,29 @@ def test_gribjump_to_xarray(seed_fdb, method, request): ], ] ) + latitude_expected = np.array( + [ + 90.0, + 90.0, + 90.0, + 90.0, + 90.0, + 30.0, + 30.0, + ] + ) + longitude_expected = np.array( + [ + 0.0, + 150.0, + 180.0, + 210.0, + 240.0, + 30.0, + 60.0, + ] + ) + source = from_source("gribjump", mars_request, **kwargs) ds = source.to_xarray() @@ -227,6 +251,8 @@ def test_gribjump_to_xarray(seed_fdb, method, request): coords={ "step": np.array([0, 21600000000000], dtype="timedelta64[ns]"), "index": np.array([0, 5, 6, 7, 8, 25, 26]), + "latitude": ("index", latitude_expected), + "longitude": ("index", longitude_expected), }, attrs={ "class": "od", From 162877d04355ed81459e3c19d369b6a62bc4d6b8 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger 
<18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 30 Jun 2025 10:08:34 +0000 Subject: [PATCH 13/49] feat: wip: allow reference lat/lons to be loaded from an fdb reference field This commit still needs to be cleaned up and a few implementation details to be ironed out. --- src/earthkit/data/sources/gribjump.py | 46 +++++++++++++++++++++++++-- tests/sources/test_gribjump.py | 2 +- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 207e2b27f..a3f5b2331 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -22,7 +22,9 @@ import numpy as np from earthkit.data.indexing.fieldlist import SimpleFieldList +from earthkit.data.readers.grib.metadata import GribMetadata from earthkit.data.sources import Source +from earthkit.data.sources import from_source from earthkit.data.sources.array_list import ArrayField from earthkit.data.utils.metadata.dict import UserMetadata @@ -80,6 +82,13 @@ class ExtractionRequest: Simple wrapper of pygribjump.ExtractionRequest and the original FDB request dict. Can be removed once pygribjump.ExtractionRequest provides a reference to the request dictionary. + + Parameters + ---------- + extraction_request : pygj.ExtractionRequest + The GribJump extraction request object. + request : dict[str, str] + The original request dictionary used to create the extraction request. 
""" extraction_request: pygj.ExtractionRequest @@ -119,6 +128,7 @@ def __init__( self, gj: pygj.GribJump, requests: list[ExtractionRequest], + reference_metadata: Optional[GribMetadata] = None, ): if len(requests) == 0: raise ValueError( @@ -134,6 +144,7 @@ def __init__( self._requests = requests self._loaded = False self._grid_indices = None + self._reference_metadata = reference_metadata super().__init__(fields=None) # The fields attribute is set lazily @@ -151,6 +162,7 @@ def _load(self): extraction_requests = [req.extraction_request for req in self._requests] extraction_results = self._gj.extract(extraction_requests) + geography = {} fields = [] indices = None @@ -161,8 +173,25 @@ def _load(self): # requests share the same ranges. indices = request.indices() arr = result.values_flat - metadata = self._requests[i].request - field = ArrayField(arr, UserMetadata(metadata, shape=arr.shape)) + + if self._reference_metadata is not None and not geography: + reference_geography = self._reference_metadata.geography + grid_latitudes = reference_geography.latitudes()[indices] + grid_longitudes = reference_geography.longitudes()[indices] + geography = { + "latitudes": grid_latitudes, + "longitudes": grid_longitudes, + } + + metadata = UserMetadata( + { + **geography, + **self._requests[i].request, + }, + shape=arr.shape, + ) + + field = ArrayField(arr, metadata) fields.append(field) self.fields = fields @@ -186,6 +215,7 @@ def __init__( ranges: Optional[list[tuple[int, int]]] = None, mask: Optional[np.ndarray] = None, indices: Optional[np.ndarray] = None, + coords_from_fdb: bool = False, **kwargs, ): super().__init__(**kwargs) @@ -202,6 +232,7 @@ def __init__( self._check_env() self._gj = pygj.GribJump() + self._coords_from_fdb = coords_from_fdb self._mars_requests = self._split_mars_requests(request) self._extraction_requests = self._build_extraction_requests(self._mars_requests) @@ -281,9 +312,20 @@ def _build_extraction_requests(self, mars_requests: list[dict[str, 
str]]) -> lis return extraction_requests def mutate(self): + # TODO: Find a more elegant way to load the reference metadata lazily + # and in the right place. + reference_metadata: GribMetadata | None = None + if self._coords_from_fdb: + fdb_source = from_source("fdb", self._mars_requests[0], stream=False) + fdb_metadatas = fdb_source.metadata() + if not fdb_metadatas: + # TODO: This should be handled more gracefully + raise ValueError("FDB source returned no metadata.") + reference_metadata = fdb_metadatas[0] return FieldExtractList( self._gj, self._extraction_requests, + reference_metadata=reference_metadata, ) diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 5993416b4..5a28b42c5 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -243,7 +243,7 @@ def test_gribjump_to_xarray(seed_fdb, method, request): ] ) - source = from_source("gribjump", mars_request, **kwargs) + source = from_source("gribjump", mars_request, coords_from_fdb=True, **kwargs) ds = source.to_xarray() ds_expected = xr.Dataset( From 08194b221ab5eb88058fc2706602e963ab59eeaa Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 30 Jun 2025 10:27:33 +0000 Subject: [PATCH 14/49] refactor: move hardcoded test fixtures into pytest fixtures Probably I'll change it soon to just load the expected data directly from the grib files instead of hardcoding it though. 
--- tests/sources/test_gribjump.py | 140 ++++++++++++++++++++++----------- 1 file changed, 94 insertions(+), 46 deletions(-) diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 5a28b42c5..7a87d7624 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -127,26 +127,10 @@ def mask(): return dict(mask=mask) -@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -@pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) -def test_gribjump_to_numpy(seed_fdb, method, request): +@pytest.fixture +def arr_expected(): import numpy as np - kwargs = request.getfixturevalue(method) - mars_request = { - "class": "od", - "date": "20201221", - "domain": "g", - "expver": "0001", - "levelist": "1000", - "levtype": "pl", - "param": "129", - "step": [0, 6], - "stream": "oper", - "time": "1200", - "type": "fc", - } - arr_expected = np.array( [ [ @@ -169,35 +153,14 @@ def test_gribjump_to_numpy(seed_fdb, method, request): ], ] ) - source = from_source("gribjump", mars_request, **kwargs) - arr = source.to_numpy() - - assert arr is not None and isinstance(arr, np.ndarray) - assert arr.shape == (2, 7) - np.testing.assert_array_almost_equal(arr, arr_expected) + return arr_expected -@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -@pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) -def test_gribjump_to_xarray(seed_fdb, method, request): +@pytest.fixture +def ds_expected_with_coords(): import numpy as np import xarray as xr - kwargs = request.getfixturevalue(method) - mars_request = { - "class": "od", - "date": "20201221", - "domain": "g", - "expver": "0001", - "levelist": "1000", - "levtype": "pl", - "param": "129", - "step": [0, 6], - "stream": "oper", - "time": "1200", - "type": "fc", - } - arr_expected = np.array( [ [ @@ -242,10 +205,6 @@ def test_gribjump_to_xarray(seed_fdb, method, request): 60.0, ] ) - - source = from_source("gribjump", mars_request, 
coords_from_fdb=True, **kwargs) - ds = source.to_xarray() - ds_expected = xr.Dataset( {"129": (("step", "index"), arr_expected)}, coords={ @@ -269,11 +228,100 @@ def test_gribjump_to_xarray(seed_fdb, method, request): "institution": "ECMWF", }, ) + return ds_expected + + +@pytest.fixture +def ds_expected(ds_expected_with_coords): + # Remove coordinates to match the expected output + ds = ds_expected_with_coords.drop_vars(["latitude", "longitude"]) + return ds + + +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +@pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) +def test_gribjump_to_numpy(seed_fdb, arr_expected, method, request): + import numpy as np + + kwargs = request.getfixturevalue(method) + mars_request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + source = from_source("gribjump", mars_request, **kwargs) + arr = source.to_numpy() + + assert arr is not None and isinstance(arr, np.ndarray) + assert arr.shape == (2, 7) + np.testing.assert_array_almost_equal(arr, arr_expected) + + +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +@pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) +def test_gribjump_to_xarray_without_coords(seed_fdb, ds_expected, method, request): + import xarray as xr + + kwargs = request.getfixturevalue(method) + mars_request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + source = from_source("gribjump", mars_request, **kwargs) + ds = source.to_xarray() + xr.testing.assert_allclose(ds, ds_expected) assert ds_expected.attrs == ds.attrs assert set(ds_expected.coords.keys()) == set(ds.coords.keys()) 
+@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +@pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) +def test_gribjump_to_xarray_with_coords(seed_fdb, ds_expected_with_coords, method, request): + import xarray as xr + + kwargs = request.getfixturevalue(method) + mars_request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + source = from_source("gribjump", mars_request, coords_from_fdb=True, **kwargs) + ds = source.to_xarray() + + xr.testing.assert_allclose(ds, ds_expected_with_coords) + assert ds_expected_with_coords.attrs == ds.attrs + assert set(ds_expected_with_coords.coords.keys()) == set(ds.coords.keys()) + + @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_selection(seed_fdb): import numpy as np From 941269adbd9364c7b5207165454239187cf94081 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 30 Jun 2025 12:25:05 +0000 Subject: [PATCH 15/49] test: add failing test showing bug with geography for gridded extracts --- tests/sources/test_gribjump.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 7a87d7624..2a1622926 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -350,6 +350,26 @@ def test_gribjump_selection(seed_fdb): assert np.allclose(arr_orig[[1]], arr_subset) +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +def test_gribjump_to_xarray_with_coords_does_not_fail_for_grids(seed_fdb): + mars_request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": 
"1200", + "type": "fc", + } + + source = from_source("gribjump", mars_request, coords_from_fdb=True, indices=[0]) + source.to_xarray() + + @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_mixed_types_in_lists(seed_fdb): From 1016cefe9a343822c8f437e80a031ce3a4eb63a3 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Mon, 30 Jun 2025 12:27:08 +0000 Subject: [PATCH 16/49] docs: add notebook draft with example usage of gribjump source --- docs/examples/gribjump.ipynb | 794 +++++++++++++++++++++++++++++++++++ 1 file changed, 794 insertions(+) create mode 100644 docs/examples/gribjump.ipynb diff --git a/docs/examples/gribjump.ipynb b/docs/examples/gribjump.ipynb new file mode 100644 index 000000000..a54927cb9 --- /dev/null +++ b/docs/examples/gribjump.ipynb @@ -0,0 +1,794 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf28ddeb", + "metadata": {}, + "source": [ + "## Retrieving subsets from Grib files via GribJump\n", + "\n", + "This example demonstrates how the experimental `gribjump` source allows efficient retrieval of individual grid cells from Grib messages stored in an FDB. The source is a thin wrapper around the Python bindings of [GribJump](https://github.com/ecmwf/gribjump)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "06c4aefb", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import earthkit.data" + ] + }, + { + "cell_type": "markdown", + "id": "0e7e19c7", + "metadata": {}, + "source": [ + "GribJump can retrieve ranges of grid cells for GRIB files in an FDB that were\n", + "previously indexed by GribJump (e.g. using `gribjump-scan`). 
To use the\n", + "`gribjump` source in earthkit-data, the environment must point to an FDB in\n", + "addition to GribJump-specific environment variables.\n", + "\n", + "⚠️ Please be aware that this source currently does not perform any validation\n", + "that the grid indices specified by the user actually correspond to the fields'\n", + "underlying grids. Please make sure that any fields referenced by the specified\n", + "FDB requests will result in your expected grid. Because of this, we also need to\n", + "tell GribJump to ignore any missing grid validation information via the\n", + "`GRIBJUMP_IGNORE_GRID` environment variable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffc76940", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Configure FDB either via FDB_HOME or FDB5_CONFIG_FILE environment variable.\n", + "# os.environ.setdefault(\"FDB_HOME\", \"\")\n", + "os.environ.setdefault(\"FDB5_CONFIG_FILE\", \"\")\n", + "os.environ.setdefault(\"GRIBJUMP_CONFIG_FILE\", \"\")\n", + "os.environ.setdefault(\"GRIBJUMP_IGNORE_GRID\", \"1\")" + ] + }, + { + "cell_type": "markdown", + "id": "d0695a44", + "metadata": {}, + "source": [ + "### How To Use\n", + "\n", + "The `gribjump` source works similarly to the `fdb` source and receives a dictionary with an fdb/mars request.\n", + "Please note that the mars syntax for ranges and lists using \"/\" is not supported. Only scalar values and\n", + "Python lists are supported.\n", + "\n", + "The second required parameter is then one of `ranges`, `indices`, and `mask`, selecting the grid cells which should\n", + "be extracted.
For convenience, one can set an additional parameter `coords_from_fdb=True` to make an additional\n", + "request directly to the fdb to retrieve latitude and longitude information for the retrieved cells and include\n", + "them in the retrieved cells' metadata." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cd0c1962", + "metadata": {}, + "outputs": [], + "source": [ + "source = earthkit.data.from_source(\n", + " \"gribjump\",\n", + " {\n", + " \"class\": \"ce\",\n", + " \"expver\": \"0001\",\n", + " \"stream\": \"efcl\",\n", + " \"date\": \"20230101\",\n", + " \"model\": \"lisflood\",\n", + " \"domain\": \"g\",\n", + " \"origin\": \"ecmf\",\n", + " \"step\": 6,\n", + " \"type\": \"sfo\",\n", + " \"levtype\": \"sfc\",\n", + " \"param\": \"240023\",\n", + " \"time\": [\"0000\", \"0600\"],\n", + " \"hdate\": [\"20200101\", \"20200102\"],\n", + " },\n", + " ranges=[(1234, 2345)],\n", + " coords_from_fdb=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "eb808136", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gribjump Engine: Built file map: 0.009258 second elapsed, 0.008298 second cpu\n", + "Gribjump Progress: 1 of 1 tasks complete\n", + "Gribjump Engine: All tasks finished: 0.167958 second elapsed, 0.154583 second cpu\n", + "Gribjump Engine: Repackaged results: 1.1e-05 second elapsed, 1e-05 second cpu\n", + "Engine::extract: 1.7e-05 second elapsed, 1.7e-05 second cpu\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paramlevelbase_datetimevalid_datetimestepnumber
0240023None2020-01-01T00:00:002020-01-01T06:00:006None
1240023None2020-01-01T06:00:002020-01-01T12:00:006None
2240023None2020-01-02T00:00:002020-01-02T06:00:006None
3240023None2020-01-02T06:00:002020-01-02T12:00:006None
\n", + "
" + ], + "text/plain": [ + " param level base_datetime valid_datetime step number\n", + "0 240023 None 2020-01-01T00:00:00 2020-01-01T06:00:00 6 None\n", + "1 240023 None 2020-01-01T06:00:00 2020-01-01T12:00:00 6 None\n", + "2 240023 None 2020-01-02T00:00:00 2020-01-02T06:00:00 6 None\n", + "3 240023 None 2020-01-02T06:00:00 2020-01-02T12:00:00 6 None" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source.ls()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7eff5b19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 62kB\n",
+       "Dimensions:                  (forecast_reference_time: 4, index: 1111)\n",
+       "Coordinates:\n",
+       "  * forecast_reference_time  (forecast_reference_time) datetime64[ns] 32B 202...\n",
+       "    latitude                 (index) float64 9kB ...\n",
+       "    longitude                (index) float64 9kB ...\n",
+       "  * index                    (index) int64 9kB 1234 1235 1236 ... 2342 2343 2344\n",
+       "Data variables:\n",
+       "    240023                   (forecast_reference_time, index) float64 36kB ...\n",
+       "Attributes: (12/13)\n",
+       "    param:        240023\n",
+       "    class:        ce\n",
+       "    stream:       efcl\n",
+       "    levtype:      sfc\n",
+       "    type:         sfo\n",
+       "    expver:       0001\n",
+       "    ...           ...\n",
+       "    hdate:        20200101\n",
+       "    time:         0000\n",
+       "    origin:       ecmf\n",
+       "    domain:       g\n",
+       "    Conventions:  CF-1.8\n",
+       "    institution:  ECMWF
" + ], + "text/plain": [ + " Size: 62kB\n", + "Dimensions: (forecast_reference_time: 4, index: 1111)\n", + "Coordinates:\n", + " * forecast_reference_time (forecast_reference_time) datetime64[ns] 32B 202...\n", + " latitude (index) float64 9kB ...\n", + " longitude (index) float64 9kB ...\n", + " * index (index) int64 9kB 1234 1235 1236 ... 2342 2343 2344\n", + "Data variables:\n", + " 240023 (forecast_reference_time, index) float64 36kB ...\n", + "Attributes: (12/13)\n", + " param: 240023\n", + " class: ce\n", + " stream: efcl\n", + " levtype: sfc\n", + " type: sfo\n", + " expver: 0001\n", + " ... ...\n", + " hdate: 20200101\n", + " time: 0000\n", + " origin: ecmf\n", + " domain: g\n", + " Conventions: CF-1.8\n", + " institution: ECMWF" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = source.to_xarray()\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "042e6382", + "metadata": {}, + "source": [ + "### Selection and Groupings\n", + "\n", + "The `gribjump` source offers limited support for selection methods (`.sel()` and\n", + "`.isel()`) and grouping method (`.group_by()`) and anything else implemented for a\n", + "`SimpleFieldList`. However, please keep in mind that the only available metadata\n", + "for these operations comes from the specified fdb request dictionary. Any\n", + "selection value must match the type in this dictionary supplied by the user." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2c1e99ae", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "data=SimpleFieldList(2) 2\n", + "SimpleFieldList(1) (1, 1111) ['2020-01-01T00:00:00']\n", + "SimpleFieldList(1) (1, 1111) ['2020-01-01T06:00:00']\n" + ] + } + ], + "source": [ + "groups = source.sel(hdate=\"20200101\").group_by(\"time\")\n", + "for group in groups:\n", + " print(group, group.to_numpy().shape, group.metadata('base_datetime'))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f660a118d82e056bb16d85dcd4f9c6e152920cc8 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 1 Jul 2025 15:56:25 +0000 Subject: [PATCH 17/49] tidy: move validation that extract request share the same ranges --- src/earthkit/data/sources/gribjump.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index a3f5b2331..76922a70b 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -130,21 +130,12 @@ def __init__( requests: list[ExtractionRequest], reference_metadata: Optional[GribMetadata] = None, ): - if len(requests) == 0: - raise ValueError( - "FieldExtractList requires at least one extraction request, but received an empty list" - ) - ranges = requests[0].ranges - if invalid_requests := [req for req in requests if req.ranges != ranges]: - raise ValueError( - f"ExtractionRequests must request same ranges 
but found {len(invalid_requests)} requests with different ranges" - ) - self._gj = gj self._requests = requests + self._reference_metadata = reference_metadata + self._loaded = False self._grid_indices = None - self._reference_metadata = reference_metadata super().__init__(fields=None) # The fields attribute is set lazily @@ -166,12 +157,16 @@ def _load(self): fields = [] indices = None + ranges = None for i, (request, result) in enumerate(zip(self._requests, extraction_results)): - if indices is None: - # We can assume that all arrays reference the same indices - # because we checked in the constructor that all extraction - # requests share the same ranges. + if ranges is None: + ranges = request.ranges indices = request.indices() + else: + if request.ranges != ranges: + raise ValueError( + f"Extraction request {i} has different ranges than the first request: {request.ranges} != {ranges}" + ) arr = result.values_flat if self._reference_metadata is not None and not geography: From 617cb4e839186603b03d9f823cf3f8065b8004f8 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 4 Jul 2025 09:50:14 +0000 Subject: [PATCH 18/49] docs: add documentation for gribjump source --- docs/guide/sources.rst | 62 ++++++++++++++++++++++++++++++++++++++++++ docs/install.rst | 8 ++++++ 2 files changed, 70 insertions(+) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index 6599d892d..f275b5b1c 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1229,6 +1229,68 @@ wekeocds - :ref:`/examples/wekeo.ipynb` +.. _data-sources-gribjump: + +gribjump +-------- + +.. py:function:: from_source("gribjump", request, *, ranges=None, mask=None, indices=None, coords_from_fdb=False, **kwargs) + :noindex: + + The ``gribjump`` source enables fast retrieval of subsets of GRIB messages from the `FDB (Fields DataBase)`_ using the `gribjump`_ library. 
+ It requires both the `pygribjump`_ and `pyfdb`_ packages to be installed. + Exactly one of the parameters ``ranges``, ``mask`` or ``indices`` must be specified at a time. + + :param dict request: the fdb request as a dict + :param list ranges: a list of tuples specifying the ranges of 1D grid indices to retrieve in the form + [(start1, end1), (start2, end2), ...]. Ranges are exclusive, meaning that the end index is not included in the range + :param numpy.array mask: a 1D boolean mask specifying which grid points to retrieve + :param numpy.array indices: a 1D array of grid indices to retrieve + :param bool coords_from_fdb: if ``True``, loads the full first message from + the FDB to extract the coordinates at the specified indices. This is useful + when the coordinates are needed for the retrieved data. If ``False``, the + coordinates are not loaded, which can speed up the retrieval process. + Default is ``False``. Please note that no validation is performed to + ensure that all retrieved fields share the same grid and therefore coordinates. + + ::note:: + + This source is experimental and may change in future versions. + There is no mechanism to verify that the accessed GRIB messages use the grid + expected by the user. The provided ranges might, therefore, correspond to unexpected + points on the grid. + + The following example retrieves a subset from a GRIB message in the FDB using a boolean mask: + .. code-block:: python + + import earthkit.data as ekd + import numpy as np + + request = { + "class": "od", + "type": "fc", + "stream": "oper", + "expver": "0001", + "repres": "gg", + "levtype": "sfc", + "param": "2t", + "date": "20250703", + "time": 0, + "step": list(range(0, 24, 6)), + "domain": "g", + } + + ranges = [(0, 10), (20, 30)] + + source = ekd.from_source("gribjump", request, ranges=ranges) + ds = source.to_xarray() + + + Further examples: + + - :ref:`/examples/gribjump.ipynb` + + .. _MARS catalog: https://apps.ecmwf.int/archive-catalogue/ .. 
_MARS user documentation: https://confluence.ecmwf.int/display/UDOC/MARS+user+documentation .. _web API: https://www.ecmwf.int/en/forecasts/access-forecasts/ecmwf-web-api diff --git a/docs/install.rst b/docs/install.rst index 260825fe3..d5a52e3d7 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -51,6 +51,8 @@ Alternatively, you can install the following components: - covjsonkit: provides access to CoverageJSON data served by the :ref:`data-sources-polytope` source - s3: provides access to non-public :ref:`s3 ` buckets (new in version *0.11.0*) - geotiff: adds GeoTIFF support (new in version *0.11.0*). Please note that this is not included in the ``[all]`` option and has to be invoked separately. + - gribjump: provides access to the :ref:`data-sources-gribjump` source + E.g. to add :ref:`data-sources-mars` support you can use: @@ -105,3 +107,9 @@ FDB +++++ For FDB (Fields DataBase) access FDB5 must be installed on the system. See the `FDB documentation `_ for details. + + +GribJump +++++++++++++ + +For FDB access with GribJump, both FDB5 and GribJump must be installed on the system. See the `GribJump project `_ for details. 
From 767f2a6f566540c2a2d0a6fc456bd762d8c7d651 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 4 Jul 2025 10:10:30 +0000 Subject: [PATCH 19/49] docs: small fixes of markdown syntax --- docs/guide/sources.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index f275b5b1c..bdc7bc4d1 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -66,6 +66,8 @@ We can get data from a given source by using :func:`from_source`: - retrieve data from `WEkEO`_ using the WEkEO grammar * - :ref:`data-sources-wekeocds` - retrieve `CDS `_ data stored on `WEkEO`_ using the `cdsapi`_ grammar + * - :ref:`data-sources-gribjump` + - retrieve data from the `FDB (Fields DataBase)`_ using the `gribjump`_ library ---------------------------------- @@ -1253,14 +1255,15 @@ gribjump Default is ``False``. Please note that no validation is performed to ensure that all retrieved fields share the same grid and therefore coordinates. - ::note:: + .. note:: - This source is experimental and may change in future versions. - There is no mechanism to verify that the accessed GRIB messages use the grid - expected by the user. The provided ranges might, therefore, correspond to unexpected - points on the grid. + This source is experimental and may change in future versions. + There is no mechanism to verify that the accessed GRIB messages use the grid + expected by the user. The provided ranges might, therefore, correspond to unexpected + points on the grid. The following example retrieves a subset from a GRIB message in the FDB using a boolean mask: + .. 
code-block:: python import earthkit.data as ekd @@ -1285,7 +1288,6 @@ gribjump source = ekd.from_source("gribjump", request, ranges=ranges) ds = source.to_xarray() - Further examples: - :ref:`/examples/gribjump.ipynb` From 5e6e475d7a0ca1efd268523fbb8c0c62619fe5bb Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 4 Jul 2025 12:57:43 +0000 Subject: [PATCH 20/49] feat: wip experiment to verify gridspec of reference field --- src/earthkit/data/sources/gribjump.py | 32 ++++++++++++++++++-- tests/sources/test_gribjump.py | 42 +++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 76922a70b..41cf72515 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -21,6 +21,7 @@ import numpy as np +from earthkit.data.core.gridspec import GridSpec from earthkit.data.indexing.fieldlist import SimpleFieldList from earthkit.data.readers.grib.metadata import GribMetadata from earthkit.data.sources import Source @@ -76,6 +77,19 @@ def expand_dict_with_lists( return expanded_requests +def verify_gridspec(expected: dict, actual: GridSpec) -> None: + actual = dict(actual) + for key, value in expected.items(): + if key not in actual: + raise ValueError(f"Gridspec mismatch for key '{key}': expected {value}, got None") + if isinstance(value, (list, np.ndarray)): + if not np.array_equal(value, actual[key]): + raise ValueError(f"Gridspec mismatch for key '{key}': expected {value}, got {actual[key]}") + else: + if value != actual[key]: + raise ValueError(f"Gridspec mismatch for key '{key}': expected {value}, got {actual[key]}") + + @dataclasses.dataclass class ExtractionRequest: """ @@ -158,14 +172,14 @@ def _load(self): fields = [] indices = None ranges = None - for i, (request, result) in enumerate(zip(self._requests, extraction_results)): + for request, result in 
zip(self._requests, extraction_results): if ranges is None: ranges = request.ranges indices = request.indices() else: if request.ranges != ranges: raise ValueError( - f"Extraction request {i} has different ranges than the first request: {request.ranges} != {ranges}" + f"Extraction request has different ranges than the first request: {request.ranges} != {ranges}" ) arr = result.values_flat @@ -181,7 +195,7 @@ def _load(self): metadata = UserMetadata( { **geography, - **self._requests[i].request, + **request.request, }, shape=arr.shape, ) @@ -211,6 +225,7 @@ def __init__( mask: Optional[np.ndarray] = None, indices: Optional[np.ndarray] = None, coords_from_fdb: bool = False, + verify_gridspec: Optional[dict] = None, **kwargs, ): super().__init__(**kwargs) @@ -220,6 +235,11 @@ def __init__( "Exactly one of 'ranges', 'mask' or 'indices' must be set. " f"Got {ranges=}, {mask=}, {indices=}" ) + if verify_gridspec is not None and not coords_from_fdb: + raise ValueError( + "If 'verify_gridspec' is set, 'coords_from_fdb' must also be set to True. 
" + f"Got {coords_from_fdb=}, {verify_gridspec=}" + ) self._ranges = ranges self._mask = mask self._indices = indices @@ -228,6 +248,7 @@ def __init__( self._gj = pygj.GribJump() self._coords_from_fdb = coords_from_fdb + self._verify_gridspec = verify_gridspec self._mars_requests = self._split_mars_requests(request) self._extraction_requests = self._build_extraction_requests(self._mars_requests) @@ -317,6 +338,11 @@ def mutate(self): # TODO: This should be handled more gracefully raise ValueError("FDB source returned no metadata.") reference_metadata = fdb_metadatas[0] + verify_gridspec( + self._verify_gridspec or {}, + reference_metadata.gridspec, + ) + return FieldExtractList( self._gj, self._extraction_requests, diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 2a1622926..82403ca29 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -370,6 +370,48 @@ def test_gribjump_to_xarray_with_coords_does_not_fail_for_grids(seed_fdb): source.to_xarray() +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +def test_gribjump_verifies_gridspec(seed_fdb): + + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + def assert_okay(gridspec): + source = from_source( + "gribjump", request, ranges=[(1, 5)], coords_from_fdb=True, verify_gridspec=gridspec + ) + source.to_xarray() + + def assert_raises(expected_error, gridspec): + with pytest.raises(expected_error): + source = from_source( + "gribjump", request, ranges=[(1, 5)], coords_from_fdb=True, verify_gridspec=gridspec + ) + source.to_xarray() + + assert_okay({}) + assert_okay({"type": "regular_ll"}) + assert_okay({"type": "regular_ll", "grid": [30.0, 30.0]}) + assert_raises(ValueError, {"grid": [35.0, 30.0]}) + assert_raises(ValueError, {"type": "regular_ll", "grid": 
[35.0, 30.0]}) + assert_raises(ValueError, {"grid": "O320"}) + assert_raises( + ValueError, + {"type": "regular_ll", "grid": [30.0, 30.0], "projection": "lambert"}, + ) + + @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_mixed_types_in_lists(seed_fdb): From e477a6ffbf559cd0bc45d09162ee0a616b48fd8d Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Wed, 16 Jul 2025 13:19:08 +0000 Subject: [PATCH 21/49] fix: force flattened array in xarray dataset creation --- src/earthkit/data/sources/gribjump.py | 17 ++++++++++++++++- tests/sources/test_gribjump.py | 4 +++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 41cf72515..061187740 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -208,8 +208,23 @@ def _load(self): self._grid_indices = indices def to_xarray(self, *args, **kwargs): + kwargs = kwargs.copy() + + flatten_values = kwargs.setdefault("flatten_values", True) + rename_dims = kwargs.setdefault("rename_dims", {"values": "index"}) + if not flatten_values: + raise ValueError( + "GribJump source only supports flattening of values. " + "Please skip the 'flatten_values' argument or set it to True." + ) + if rename_dims.get("values") != "index": + raise ValueError( + "GribJump source does not support renaming 'values' dimension. " + "Please remove 'values' from 'rename_dims' argument." 
+ ) + ds = super().to_xarray(*args, **kwargs) - ds = ds.rename_dims({"values": "index"}).assign_coords({"index": self._grid_indices}) + ds = ds.assign_coords({"index": self._grid_indices}) return ds def to_pandas(self, *args, **kwargs): diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 82403ca29..3f41a3247 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -367,7 +367,9 @@ def test_gribjump_to_xarray_with_coords_does_not_fail_for_grids(seed_fdb): } source = from_source("gribjump", mars_request, coords_from_fdb=True, indices=[0]) - source.to_xarray() + ds = source.to_xarray() + assert set(ds.dims) == {"step", "index"} + assert set(ds.coords) == {"step", "index", "latitude", "longitude"} @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") From bc4ffc4a82bf4fefa3c2272487a146c181d1f64c Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Wed, 16 Jul 2025 15:16:25 +0000 Subject: [PATCH 22/49] refactor: tidy up the metadata enrichment a bit --- src/earthkit/data/sources/gribjump.py | 36 ++++++++++++++------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 061187740..f3f3e8e5b 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -167,7 +167,6 @@ def _load(self): extraction_requests = [req.extraction_request for req in self._requests] extraction_results = self._gj.extract(extraction_requests) - geography = {} fields = [] indices = None @@ -182,23 +181,10 @@ def _load(self): f"Extraction request has different ranges than the first request: {request.ranges} != {ranges}" ) arr = result.values_flat + shape = arr.shape - if self._reference_metadata is not None and not geography: - reference_geography = self._reference_metadata.geography - grid_latitudes = 
reference_geography.latitudes()[indices] - grid_longitudes = reference_geography.longitudes()[indices] - geography = { - "latitudes": grid_latitudes, - "longitudes": grid_longitudes, - } - - metadata = UserMetadata( - { - **geography, - **request.request, - }, - shape=arr.shape, - ) + metadata = UserMetadata(request.request, shape=shape) + metadata = self._enrich_metadata_with_coordinates(indices, metadata) field = ArrayField(arr, metadata) fields.append(field) @@ -207,6 +193,22 @@ def _load(self): self._loaded = True self._grid_indices = indices + def _enrich_metadata_with_coordinates(self, indices: np.ndarray, metadata: UserMetadata) -> UserMetadata: + """Enriches the metadata with coordinates if reference metadata is available.""" + if self._reference_metadata is None: + return metadata + + reference_geography = self._reference_metadata.geography + grid_latitudes = reference_geography.latitudes()[indices] + grid_longitudes = reference_geography.longitudes()[indices] + metadata = metadata.override( + { + "latitudes": grid_latitudes, + "longitudes": grid_longitudes, + } + ) + return metadata + def to_xarray(self, *args, **kwargs): kwargs = kwargs.copy() From f8f463053e66c900fe12e42f92fc0b1cdc1759e3 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Wed, 16 Jul 2025 15:42:26 +0000 Subject: [PATCH 23/49] refactor: create ExtractionRequestCollection --- src/earthkit/data/sources/gribjump.py | 92 ++++++++++++++++++++------- 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index f3f3e8e5b..829f377f0 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -15,6 +15,7 @@ import dataclasses import itertools import os +from collections import UserList from typing import Any from typing import Optional from typing import Union @@ -118,6 +119,65 @@ def indices(self) -> np.ndarray: 
return self.extraction_request.indices() +def build_extraction_request( + request: dict[str, str], + ranges: Optional[list[tuple[int, int]]] = None, + mask: Optional[np.ndarray] = None, + indices: Optional[np.ndarray] = None, +) -> ExtractionRequest: + """ + Builds an ExtractionRequest from the given request dictionary and optional parameters. + + Parameters + ---------- + request : dict[str, str] + The request dictionary containing MARS keywords. + ranges : Optional[list[tuple[int, int]]], optional + The ranges for the extraction request, by default None. + mask : Optional[np.ndarray], optional + The mask for the extraction request, by default None. + indices : Optional[np.ndarray], optional + The indices for the extraction request, by default None. + + Returns + ------- + ExtractionRequest + The constructed ExtractionRequest object. + """ + stringified_request_dict = {k: str(v) for (k, v) in request.items()} + + if sum(opt is not None for opt in (ranges, mask, indices)) != 1: + raise ValueError( + "Exactly one of 'ranges', 'mask' or 'indices' must be set. " f"Got {ranges=}, {mask=}, {indices=}" + ) + + if ranges is not None: + extraction_request = pygj.ExtractionRequest(stringified_request_dict, ranges) + elif mask is not None: + extraction_request = pygj.ExtractionRequest.from_mask(stringified_request_dict, mask) + elif indices is not None: + extraction_request = pygj.ExtractionRequest.from_indices(stringified_request_dict, indices) + else: + raise ValueError("No valid extraction method specified. 
Provide either ranges, mask, or indices.") + + return ExtractionRequest(extraction_request, request) + + +class ExtractionRequestCollection(UserList): + + @classmethod + def from_mars_requests( + cls, + mars_requests: list[dict[str, str]], + ranges: Optional[list[tuple[int, int]]] = None, + mask: Optional[np.ndarray] = None, + indices: Optional[np.ndarray] = None, + ) -> "ExtractionRequestCollection": + """Creates an ExtractionRequestCollection from MARS requests.""" + extraction_requests = [build_extraction_request(req, ranges, mask, indices) for req in mars_requests] + return cls(extraction_requests) + + class FieldExtractList(SimpleFieldList): """Lazily loaded representation of the points extracted from multiple fields using GribJump. @@ -141,7 +201,7 @@ class FieldExtractList(SimpleFieldList): def __init__( self, gj: pygj.GribJump, - requests: list[ExtractionRequest], + requests: ExtractionRequestCollection, reference_metadata: Optional[GribMetadata] = None, ): self._gj = gj @@ -267,7 +327,6 @@ def __init__( self._coords_from_fdb = coords_from_fdb self._verify_gridspec = verify_gridspec self._mars_requests = self._split_mars_requests(request) - self._extraction_requests = self._build_extraction_requests(self._mars_requests) def _check_env(self): fdb_conf = os.environ.get("FDB5_CONFIG", None) @@ -324,26 +383,6 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: expanded_requests = expand_dict_with_lists(request) return expanded_requests - def _build_extraction_request(self, request: dict[str, str]) -> ExtractionRequest: - """Builds a single extraction request from the given request dictionary.""" - # GribJump currently only supports strings as request values - stringified_request_dict = {k: str(v) for (k, v) in request.items()} - - if self._ranges is not None: - extraction_request = pygj.ExtractionRequest(stringified_request_dict, self._ranges) - elif self._mask is not None: - extraction_request = 
pygj.ExtractionRequest.from_mask(stringified_request_dict, self._mask) - elif self._indices is not None: - extraction_request = pygj.ExtractionRequest.from_indices(stringified_request_dict, self._indices) - else: - raise ValueError("No valid extraction method specified.") - return ExtractionRequest(extraction_request, request) - - def _build_extraction_requests(self, mars_requests: list[dict[str, str]]) -> list[ExtractionRequest]: - """Builds extraction requests from the given MARS requests.""" - extraction_requests = [self._build_extraction_request(request) for request in mars_requests] - return extraction_requests - def mutate(self): # TODO: Find a more elegant way to load the reference metadata lazily # and in the right place. @@ -360,9 +399,16 @@ def mutate(self): reference_metadata.gridspec, ) + extraction_requests = ExtractionRequestCollection.from_mars_requests( + self._mars_requests, + ranges=self._ranges, + mask=self._mask, + indices=self._indices, + ) + return FieldExtractList( self._gj, - self._extraction_requests, + requests=extraction_requests, reference_metadata=reference_metadata, ) From 7b1548e45e21854a119d74dcaa9a5293fb85f3be Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Wed, 16 Jul 2025 15:49:57 +0000 Subject: [PATCH 24/49] refactor: use FDBRetriever to load reference metadata --- src/earthkit/data/sources/gribjump.py | 33 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 829f377f0..ebab881bf 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -28,6 +28,7 @@ from earthkit.data.sources import Source from earthkit.data.sources import from_source from earthkit.data.sources.array_list import ArrayField +from earthkit.data.sources.fdb import FDBRetriever from earthkit.data.utils.metadata.dict import UserMetadata @@ 
-202,14 +203,15 @@ def __init__( self, gj: pygj.GribJump, requests: ExtractionRequestCollection, - reference_metadata: Optional[GribMetadata] = None, + fdb_retriever: Optional[FDBRetriever] = None, ): self._gj = gj self._requests = requests - self._reference_metadata = reference_metadata + self._fdb_retriever = fdb_retriever self._loaded = False self._grid_indices = None + self._reference_metadata: Optional[GribMetadata] = None super().__init__(fields=None) # The fields attribute is set lazily @@ -253,12 +255,30 @@ def _load(self): self._loaded = True self._grid_indices = indices + def _load_reference_metadata(self): + """Loads the reference metadata from the FDB retriever if available.""" + if self._fdb_retriever is None: + return None + if self._reference_metadata is not None: + return self._reference_metadata + + fields = self._fdb_retriever.get(self._requests[0].request) + metadatas = fields.metadata() + if not metadatas: + raise ValueError("FDB retriever returned no metadata.") + if len(metadatas) != 1: + raise ValueError(f"Expected exactly one metadata for the first request, got {len(metadatas)}.") + metadata = metadatas[0] + assert isinstance(metadata, GribMetadata), type(metadata) + self._reference_metadata = metadata + return metadata + def _enrich_metadata_with_coordinates(self, indices: np.ndarray, metadata: UserMetadata) -> UserMetadata: """Enriches the metadata with coordinates if reference metadata is available.""" - if self._reference_metadata is None: + if (reference_metadata := self._load_reference_metadata()) is None: return metadata - reference_geography = self._reference_metadata.geography + reference_geography = reference_metadata.geography grid_latitudes = reference_geography.latitudes()[indices] grid_longitudes = reference_geography.longitudes()[indices] metadata = metadata.override( @@ -406,10 +426,13 @@ def mutate(self): indices=self._indices, ) + # TODO: Allow proper configuration of the FDB retriever + fdb_retriever = FDBRetriever({}) if 
self._coords_from_fdb else None + return FieldExtractList( self._gj, requests=extraction_requests, - reference_metadata=reference_metadata, + fdb_retriever=fdb_retriever, ) From 90a6d6e04bee24c9eb204979235fe98687465cff Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 08:28:41 +0000 Subject: [PATCH 25/49] docs: add example for masks and indices to notebook --- docs/examples/gribjump.ipynb | 211 ++++++++++++++++++++++++++++++++--- 1 file changed, 195 insertions(+), 16 deletions(-) diff --git a/docs/examples/gribjump.ipynb b/docs/examples/gribjump.ipynb index a54927cb9..cf563b1eb 100644 --- a/docs/examples/gribjump.ipynb +++ b/docs/examples/gribjump.ipynb @@ -12,12 +12,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "06c4aefb", "metadata": {}, "outputs": [], "source": [ "import os\n", + "import numpy as np\n", "import earthkit.data" ] }, @@ -51,7 +52,7 @@ "'1'" ] }, - "execution_count": 7, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -83,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "id": "cd0c1962", "metadata": {}, "outputs": [], @@ -112,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "id": "eb808136", "metadata": {}, "outputs": [ @@ -120,11 +121,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Gribjump Engine: Built file map: 0.009258 second elapsed, 0.008298 second cpu\n", + "Gribjump Engine: Built file map: 0.022177 second elapsed, 0.011457 second cpu\n", + "Starting 8 threads\n", "Gribjump Progress: 1 of 1 tasks complete\n", - "Gribjump Engine: All tasks finished: 0.167958 second elapsed, 0.154583 second cpu\n", - "Gribjump Engine: Repackaged results: 1.1e-05 second elapsed, 1e-05 second cpu\n", - "Engine::extract: 1.7e-05 second elapsed, 1.7e-05 second cpu\n" + "Gribjump Engine: All tasks finished: 0.334884 second elapsed, 
0.162512 second cpu\n", + "Gribjump Engine: Repackaged results: 8e-06 second elapsed, 7e-06 second cpu\n", + "Engine::extract: 1.7e-05 second elapsed, 1.5e-05 second cpu\n" ] }, { @@ -205,7 +207,7 @@ "3 240023 None 2020-01-02T06:00:00 2020-01-02T12:00:00 6 None" ] }, - "execution_count": 12, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -216,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "id": "7eff5b19", "metadata": {}, "outputs": [ @@ -688,14 +690,14 @@ " origin: ecmf\n", " domain: g\n", " Conventions: CF-1.8\n", - " institution: ECMWF
  • param :
    240023
    class :
    ce
    stream :
    efcl
    levtype :
    sfc
    type :
    sfo
    expver :
    0001
    date :
    20230101
    hdate :
    20200101
    time :
    0000
    origin :
    ecmf
    domain :
    g
    Conventions :
    CF-1.8
    institution :
    ECMWF
  • " ], "text/plain": [ " Size: 62kB\n", @@ -723,7 +725,7 @@ " institution: ECMWF" ] }, - "execution_count": 13, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -749,7 +751,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 6, "id": "2c1e99ae", "metadata": {}, "outputs": [ @@ -768,6 +770,183 @@ "for group in groups:\n", " print(group, group.to_numpy().shape, group.metadata('base_datetime'))" ] + }, + { + "cell_type": "markdown", + "id": "8e1626a9", + "metadata": {}, + "source": [ + "### Extraction Options\n", + "\n", + "You can specify the extraction points through one of three options. GribJump\n", + "treats all fields as flattened 1D arrays and all coordinates on the grid must\n", + "assume this representation.\n", + "\n", + "* **Ranges:** A list of tuples `(start, end)` defining contiguous ranges of grid\n", + " points to extract. As shown in the example above, each tuple specifies a start\n", + " index (inclusive) and end index (exclusive) in the flattened 1D array\n", + " representation of the grid. For example, `[(0, 100), (200, 300)]` would extract\n", + " grid points 0-99 and 200-299.\n", + "\n", + "* **Indices:** A 1D numpy array or list of specific grid point indices to extract\n", + " from the flattened grid. This allows for non-contiguous extraction of\n", + " individual grid points. For example, `np.array([5, 10, 15, 20])` would extract\n", + " exactly those four grid points.\n", + "\n", + "* **Masks:** A numpy boolean array where `True` indicates grid points to extract\n", + " and `False` indicates points to skip. The mask must have the same length as\n", + " the total number of grid points in the field. However, no such validation is\n", + " performed, passing a mask with an invalid shape will silently return wrong\n", + " results.\n", + "\n", + "Only one of these methods can be used at a time. Please also note that GribJump\n", + "uses ranges internally despite of what the user specifies. 
Converting the\n", + "representation chosen by the user to ranges can be expensive when multiple\n", + "fields are accessed at the same time." + ] + }, + { + "cell_type": "markdown", + "id": "6fe61883", + "metadata": {}, + "source": [ + "#### Code Examples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60165c68", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gribjump Engine: Built file map: 0.010474 second elapsed, 0.008713 second cpu\n", + "Gribjump Progress: 1 of 1 tasks complete\n", + "Gribjump Engine: All tasks finished: 0.039335 second elapsed, 0.039178 second cpu\n", + "Gribjump Engine: Repackaged results: 6e-06 second elapsed, 5e-06 second cpu\n", + "Engine::extract: 2e-05 second elapsed, 2e-05 second cpu\n", + "Extracted dataset (ranges): Size: 36kB\n", + "Dimensions: (index: 2222)\n", + "Coordinates:\n", + " * index (index) int64 18kB 1234 1235 1236 1237 1238 ... 4563 4564 4565 4566\n", + "Data variables:\n", + " 240023 (index) float64 18kB ...\n", + "Attributes: (12/13)\n", + " param: 240023\n", + " class: ce\n", + " stream: efcl\n", + " levtype: sfc\n", + " type: sfo\n", + " expver: 0001\n", + " ... 
...\n", + " hdate: 20200101\n", + " time: 0000\n", + " origin: ecmf\n", + " domain: g\n", + " Conventions: CF-1.8\n", + " institution: ECMWF\n", + "Gribjump Engine: Built file map: 0.009283 second elapsed, 0.007779 second cpu\n", + "Gribjump Progress: 1 of 1 tasks complete\n", + "Gribjump Engine: All tasks finished: 0.039215 second elapsed, 0.038721 second cpu\n", + "Gribjump Engine: Repackaged results: 5e-06 second elapsed, 5e-06 second cpu\n", + "Engine::extract: 2.3e-05 second elapsed, 2.2e-05 second cpu\n", + "Extracted dataset (indices): Size: 80B\n", + "Dimensions: (index: 5)\n", + "Coordinates:\n", + " * index (index) int64 40B 10 50 100 150 200\n", + "Data variables:\n", + " 240023 (index) float64 40B ...\n", + "Attributes: (12/13)\n", + " param: 240023\n", + " class: ce\n", + " stream: efcl\n", + " levtype: sfc\n", + " type: sfo\n", + " expver: 0001\n", + " ... ...\n", + " hdate: 20200101\n", + " time: 0000\n", + " origin: ecmf\n", + " domain: g\n", + " Conventions: CF-1.8\n", + " institution: ECMWF\n", + "Gribjump Engine: Built file map: 0.012851 second elapsed, 0.009124 second cpu\n", + "Gribjump Progress: 1 of 1 tasks complete\n", + "Gribjump Engine: All tasks finished: 1 second elapsed, 1 second cpu\n", + "Gribjump Engine: Repackaged results: 6e-06 second elapsed, 6e-06 second cpu\n", + "Engine::extract: 2.7e-05 second elapsed, 2.6e-05 second cpu\n", + "Extracted dataset (mask): Size: 11MB\n", + "Dimensions: (index: 672975)\n", + "Coordinates:\n", + " * index (index) int64 5MB 10 11 32 41 ... 13454079 13454087 13454093\n", + "Data variables:\n", + " 240023 (index) float64 5MB ...\n", + "Attributes: (12/13)\n", + " param: 240023\n", + " class: ce\n", + " stream: efcl\n", + " levtype: sfc\n", + " type: sfo\n", + " expver: 0001\n", + " ... 
...\n", + " hdate: 20200101\n", + " time: 0000\n", + " origin: ecmf\n", + " domain: g\n", + " Conventions: CF-1.8\n", + " institution: ECMWF\n" + ] + } + ], + "source": [ + "request = {\n", + " \"class\": \"ce\",\n", + " \"expver\": \"0001\",\n", + " \"stream\": \"efcl\",\n", + " \"date\": \"20230101\",\n", + " \"model\": \"lisflood\",\n", + " \"domain\": \"g\",\n", + " \"origin\": \"ecmf\",\n", + " \"step\": 6,\n", + " \"type\": \"sfo\",\n", + " \"levtype\": \"sfc\",\n", + " \"param\": \"240023\",\n", + " \"time\": \"0000\",\n", + " \"hdate\": \"20200101\",\n", + "}\n", + "\n", + "# Example 1: Using ranges\n", + "source_ranges = earthkit.data.from_source(\n", + " \"gribjump\",\n", + " request,\n", + " ranges=[(1234, 2345), (3456, 4567)],\n", + ")\n", + "ds = source_ranges.to_xarray()\n", + "print(\"Extracted dataset (ranges):\", ds)\n", + "\n", + "# Example 2: Using indices to extract specific grid points\n", + "indices = np.array([10, 50, 100, 150, 200])\n", + "source_indices = earthkit.data.from_source(\n", + " \"gribjump\",\n", + " request,\n", + " indices=indices,\n", + ")\n", + "print(\"Extracted dataset (indices):\", source_indices.to_xarray())\n", + "\n", + "# Example 3: Using a boolean mask with random selection\n", + "shape = 4530 * 2970 # Depends on your grid size\n", + "mask = np.random.choice([True, False], size=shape, p=[0.05, 0.95])\n", + "\n", + "source_mask = earthkit.data.from_source(\n", + " \"gribjump\",\n", + " request,\n", + " mask=mask,\n", + ")\n", + "print(\"Extracted dataset (mask):\", source_mask.to_xarray())" + ] } ], "metadata": { From de5a8fcd250766ffc3f0b5634574734a07751ec0 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 08:38:55 +0000 Subject: [PATCH 26/49] feat: enforce that masks are 1D boolean arrays --- src/earthkit/data/sources/gribjump.py | 46 ++++++++++++++++----------- tests/sources/test_gribjump.py | 43 +++++++++++++++++++++++++ 2 files changed, 
70 insertions(+), 19 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index ebab881bf..1409d0d1d 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -26,7 +26,6 @@ from earthkit.data.indexing.fieldlist import SimpleFieldList from earthkit.data.readers.grib.metadata import GribMetadata from earthkit.data.sources import Source -from earthkit.data.sources import from_source from earthkit.data.sources.array_list import ArrayField from earthkit.data.sources.fdb import FDBRetriever from earthkit.data.utils.metadata.dict import UserMetadata @@ -155,6 +154,12 @@ def build_extraction_request( if ranges is not None: extraction_request = pygj.ExtractionRequest(stringified_request_dict, ranges) elif mask is not None: + if not isinstance(mask, np.ndarray): + raise TypeError(f"Expected 'mask' to be a numpy array, got {type(mask)}") + if not np.issubdtype(mask.dtype, np.bool_): + raise ValueError(f"Expected 'mask' to be a boolean array, got {mask.dtype}") + if mask.ndim != 1: + raise ValueError(f"Expected 'mask' to be a 1D numpy array, got {mask.ndim}D") extraction_request = pygj.ExtractionRequest.from_mask(stringified_request_dict, mask) elif indices is not None: extraction_request = pygj.ExtractionRequest.from_indices(stringified_request_dict, indices) @@ -174,7 +179,25 @@ def from_mars_requests( mask: Optional[np.ndarray] = None, indices: Optional[np.ndarray] = None, ) -> "ExtractionRequestCollection": - """Creates an ExtractionRequestCollection from MARS requests.""" + """Creates an ExtractionRequestCollection from MARS requests. + + One of the parameters `ranges`, `mask`, or `indices` must be provided. + + Parameters + ---------- + mars_requests : list[dict[str, str]] + List of MARS requests, each represented as a dictionary of keywords. + ranges : Optional[list[tuple[int, int]]], optional + The ranges for the extraction requests, by default None. 
+ mask : Optional[np.ndarray], optional + The mask for the extraction requests, by default None. + indices : Optional[np.ndarray], optional + The indices for the extraction requests, by default None. + Returns + ------- + ExtractionRequestCollection + A collection of ExtractionRequest objects created from the MARS requests. + """ extraction_requests = [build_extraction_request(req, ranges, mask, indices) for req in mars_requests] return cls(extraction_requests) @@ -404,20 +427,8 @@ def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: return expanded_requests def mutate(self): - # TODO: Find a more elegant way to load the reference metadata lazily - # and in the right place. - reference_metadata: GribMetadata | None = None - if self._coords_from_fdb: - fdb_source = from_source("fdb", self._mars_requests[0], stream=False) - fdb_metadatas = fdb_source.metadata() - if not fdb_metadatas: - # TODO: This should be handled more gracefully - raise ValueError("FDB source returned no metadata.") - reference_metadata = fdb_metadatas[0] - verify_gridspec( - self._verify_gridspec or {}, - reference_metadata.gridspec, - ) + # TODO: Allow proper configuration of the FDB retriever + fdb_retriever = FDBRetriever({}) if self._coords_from_fdb else None extraction_requests = ExtractionRequestCollection.from_mars_requests( self._mars_requests, @@ -426,9 +437,6 @@ def mutate(self): indices=self._indices, ) - # TODO: Allow proper configuration of the FDB retriever - fdb_retriever = FDBRetriever({}) if self._coords_from_fdb else None - return FieldExtractList( self._gj, requests=extraction_requests, diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 3f41a3247..fe0391350 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -468,6 +468,49 @@ def test_gribjump_with_invalid_options(seed_fdb): ) +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +def 
test_gribjump_with_invalid_mask(seed_fdb): + import numpy as np + + request = { + "class": "od", + "date": "20201221", + "domain": "g", + "expver": "0001", + "levelist": "1000", + "levtype": "pl", + "param": "129", + "step": [0, 6], + "stream": "oper", + "time": "1200", + "type": "fc", + } + + with pytest.raises(ValueError, match="Expected 'mask' to be a 1D numpy array"): + mask = np.array([[True, False], [False, True]], dtype=bool) + from_source( + "gribjump", + request, + mask=mask, + ) + + with pytest.raises(ValueError, match="Expected 'mask' to be a boolean array"): + mask = np.array([1, 0, 1], dtype=int) + from_source( + "gribjump", + request, + mask=mask, + ) + + with pytest.raises(TypeError, match="Expected 'mask' to be a numpy array"): + mask = [True, False, True] + from_source( + "gribjump", + request, + mask=mask, + ) + + if __name__ == "__main__": from earthkit.data.testing import main From e5447d49b8911d67b27bf651dcfbf3211b14a630 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 08:56:37 +0000 Subject: [PATCH 27/49] refactor: simplify by condensing request splitting utilities into one function --- src/earthkit/data/sources/gribjump.py | 115 ++++++++++++-------------- tests/sources/test_gribjump.py | 45 ++++++---- 2 files changed, 79 insertions(+), 81 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 1409d0d1d..132d0c937 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -18,7 +18,6 @@ from collections import UserList from typing import Any from typing import Optional -from typing import Union import numpy as np @@ -31,40 +30,58 @@ from earthkit.data.utils.metadata.dict import UserMetadata -def expand_dict_with_lists( - request: dict[str, Union[str, list[str]]], -) -> list[dict[str, str]]: - """ - Expands a dictionary containing list values into multiple dictionaries 
representing all possible combinations. - - For each list-type value in the input dictionary, this function creates all possible combinations - with other list values, while keeping non-list values constant across all output dictionaries. - - The list keys are sorted alphabetically before generating combinations to ensure consistent - and deterministic ordering of the output dictionaries regardless of the original key order. - - Example: - Input: {'a': [1, 2], 'b': [3, 4], 'c': 5} - Output: [ - {'a': 1, 'b': 3, 'c': 5}, - {'a': 1, 'b': 4, 'c': 5}, - {'a': 2, 'b': 3, 'c': 5}, - {'a': 2, 'b': 4, 'c': 5} - ] - - Args: - request (dict[str, Union[str, list[str]]]): Dictionary with string keys and either string - or list of strings as values. - - Returns: - list[dict[str, str]]: A list of dictionaries, where each dictionary contains one - specific combination of the input list values, with non-list values preserved. +def split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: + """Splits a MARS request into individual single-field requests by expanding list values. + + Creates all possible combinations of list values in the request dictionary, + generating separate requests for each field. This is required because GribJump + returns result arrays without metadata, so each field must be requested individually + to map outputs correctly. + + Parameters + ---------- + request : dict[str, Any] + The request dictionary containing MARS keywords with potentially list values. + List keys are sorted alphabetically to ensure deterministic ordering. + + Returns + ------- + list[dict[str, str]] + A list of individual request dictionaries, each representing a single field. + All values are converted to strings. + + Raises + ------ + ValueError + If the request contains unsupported "/" syntax for lists/ranges or empty lists. + TypeError + If list values contain mixed types. 
+ + Examples + -------- + >>> split_mars_requests({"param": ["2t", "msl"], "date": "20230101"}) + [{'param': '2t', 'date': '20230101'}, {'param': 'msl', 'date': '20230101'}] + + >>> split_mars_requests({"param": ["2t", "msl"], "step": [0, 6]}) + [{'param': '2t', 'step': '0'}, {'param': '2t', 'step': '6'}, + {'param': 'msl', 'step': '0'}, {'param': 'msl', 'step': '6'}] """ - if empty_list_keys := [k for k, v in request.items() if isinstance(v, list) and len(v) == 0]: - raise ValueError( - "Cannot expand dictionary with empty list. " - f"Found empty list for keys: {', '.join(empty_list_keys)}" - ) + request = request.copy() + + # Validation + for k in request.keys(): + v = request[k] + if isinstance(v, str) and "/" in v: + raise ValueError( + f"Found unsupported list or range using '/' in value '{v}' for keyword '{k}'. " + "Use Python lists to load from multiple fields." + ) + elif isinstance(v, list) and len(v) == 0: + raise ValueError(f"Cannot expand dictionary with empty list. " f"Found empty list for key '{k}'.") + elif isinstance(v, list) and len({type(v_) for v_ in v}) != 1: + raise TypeError( + f"All list values must share the same type but found types {set(map(type, v))} " f"in {k}={v}" + ) list_keywords = sorted(k for k, v in request.items() if isinstance(v, list)) lists = [request[k] for k in list_keywords] @@ -369,7 +386,7 @@ def __init__( self._coords_from_fdb = coords_from_fdb self._verify_gridspec = verify_gridspec - self._mars_requests = self._split_mars_requests(request) + self._mars_requests = split_mars_requests(request) def _check_env(self): fdb_conf = os.environ.get("FDB5_CONFIG", None) @@ -398,34 +415,6 @@ def _check_env(self): "must be set (to '1' or 'True') for the 'gribjump' source to work." ) - @staticmethod - def _split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: - """Splits request into many single requests that load one field each. 
- - Since GribJump returns its result arrays without metadata, we need to split the - request into many single requests to later map the outputs to the correct fields. - Additionally performs some basic validation. - """ - - request = request.copy() - - # Check if user passed unspoorted lists and ranges as strings using "/" - for k in request.keys(): - v = request[k] - if isinstance(v, str) and "/" in v: - raise ValueError( - f"Found unsupported list or range using '/' in value '{v}' for keyword '{k}'. " - "Use Python lists to load from multiple fields." - ) - elif isinstance(v, list) and len({type(v_) for v_ in v}) != 1: - raise TypeError( - f"All list values must share the same type but found types {set(map(type, v))} " - f"in {k}={v}" - ) - - expanded_requests = expand_dict_with_lists(request) - return expanded_requests - def mutate(self): # TODO: Allow proper configuration of the FDB retriever fdb_retriever = FDBRetriever({}) if self._coords_from_fdb else None diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index fe0391350..c03669f3f 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -25,31 +25,40 @@ @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_expand_dict_with_lists(): - from earthkit.data.sources.gribjump import expand_dict_with_lists + from earthkit.data.sources.gribjump import split_mars_requests request = { - "b": ["hello", "world"], - "a": [1, 2, 3], - "c": 5, + "step": [0, 6, 12], + "param": ["129", "130"], + "date": "20230101", + "time": "1200", } - expected_dicts = [ - {"a": 1, "b": "hello", "c": 5}, - {"a": 1, "b": "world", "c": 5}, - {"a": 2, "b": "hello", "c": 5}, - {"a": 2, "b": "world", "c": 5}, - {"a": 3, "b": "hello", "c": 5}, - {"a": 3, "b": "world", "c": 5}, + result = split_mars_requests(request) + assert result == [ + {"param": "129", "step": 0, "date": "20230101", "time": "1200"}, + {"param": "129", "step": 6, "date": "20230101", "time": 
"1200"}, + {"param": "129", "step": 12, "date": "20230101", "time": "1200"}, + {"param": "130", "step": 0, "date": "20230101", "time": "1200"}, + {"param": "130", "step": 6, "date": "20230101", "time": "1200"}, + {"param": "130", "step": 12, "date": "20230101", "time": "1200"}, ] - expanded_requests = expand_dict_with_lists(request) - assert expanded_requests == expected_dicts - - assert expand_dict_with_lists({}) == [{}] - assert expand_dict_with_lists({"a": 1}) == [{"a": 1}] - assert expand_dict_with_lists({"a": 1, "b": 2}) == [{"a": 1, "b": 2}] + assert split_mars_requests({}) == [{}] + assert split_mars_requests({"a": 1}) == [{"a": 1}] + assert split_mars_requests({"a": 1, "b": 2}) == [{"a": 1, "b": 2}] + # Error: empty list with pytest.raises(ValueError, match="Cannot expand dictionary with empty list"): - expand_dict_with_lists({"a": 1, "b": []}) + split_mars_requests({"a": 1, "b": []}) + + # Error: unsupported "/" syntax + with pytest.raises(ValueError, match="Found unsupported list or range using '/'"): + split_mars_requests({"param": "129/130", "date": "20230101"}) + + # Error: mixed types in lists + request_mixed = {"param": [129, "130"], "date": "20230101"} + with pytest.raises(TypeError, match="All list values must share the same type"): + split_mars_requests(request_mixed) @pytest.fixture From c0b8a3ae3c6ec261303aac69ca82bfda46327cd7 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 09:12:02 +0000 Subject: [PATCH 28/49] feat: remove verifiation functionality for now, to be added later --- src/earthkit/data/sources/gribjump.py | 21 -------------- tests/sources/test_gribjump.py | 42 --------------------------- 2 files changed, 63 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 132d0c937..4954121aa 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -21,7 +21,6 @@ import 
numpy as np -from earthkit.data.core.gridspec import GridSpec from earthkit.data.indexing.fieldlist import SimpleFieldList from earthkit.data.readers.grib.metadata import GribMetadata from earthkit.data.sources import Source @@ -95,19 +94,6 @@ def split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: return expanded_requests -def verify_gridspec(expected: dict, actual: GridSpec) -> None: - actual = dict(actual) - for key, value in expected.items(): - if key not in actual: - raise ValueError(f"Gridspec mismatch for key '{key}': expected {value}, got None") - if isinstance(value, (list, np.ndarray)): - if not np.array_equal(value, actual[key]): - raise ValueError(f"Gridspec mismatch for key '{key}': expected {value}, got {actual[key]}") - else: - if value != actual[key]: - raise ValueError(f"Gridspec mismatch for key '{key}': expected {value}, got {actual[key]}") - - @dataclasses.dataclass class ExtractionRequest: """ @@ -362,7 +348,6 @@ def __init__( mask: Optional[np.ndarray] = None, indices: Optional[np.ndarray] = None, coords_from_fdb: bool = False, - verify_gridspec: Optional[dict] = None, **kwargs, ): super().__init__(**kwargs) @@ -372,11 +357,6 @@ def __init__( "Exactly one of 'ranges', 'mask' or 'indices' must be set. " f"Got {ranges=}, {mask=}, {indices=}" ) - if verify_gridspec is not None and not coords_from_fdb: - raise ValueError( - "If 'verify_gridspec' is set, 'coords_from_fdb' must also be set to True. 
" - f"Got {coords_from_fdb=}, {verify_gridspec=}" - ) self._ranges = ranges self._mask = mask self._indices = indices @@ -385,7 +365,6 @@ def __init__( self._gj = pygj.GribJump() self._coords_from_fdb = coords_from_fdb - self._verify_gridspec = verify_gridspec self._mars_requests = split_mars_requests(request) def _check_env(self): diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index c03669f3f..5edde7888 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -381,48 +381,6 @@ def test_gribjump_to_xarray_with_coords_does_not_fail_for_grids(seed_fdb): assert set(ds.coords) == {"step", "index", "latitude", "longitude"} -@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -def test_gribjump_verifies_gridspec(seed_fdb): - - request = { - "class": "od", - "date": "20201221", - "domain": "g", - "expver": "0001", - "levelist": "1000", - "levtype": "pl", - "param": "129", - "step": [0, 6], - "stream": "oper", - "time": "1200", - "type": "fc", - } - - def assert_okay(gridspec): - source = from_source( - "gribjump", request, ranges=[(1, 5)], coords_from_fdb=True, verify_gridspec=gridspec - ) - source.to_xarray() - - def assert_raises(expected_error, gridspec): - with pytest.raises(expected_error): - source = from_source( - "gribjump", request, ranges=[(1, 5)], coords_from_fdb=True, verify_gridspec=gridspec - ) - source.to_xarray() - - assert_okay({}) - assert_okay({"type": "regular_ll"}) - assert_okay({"type": "regular_ll", "grid": [30.0, 30.0]}) - assert_raises(ValueError, {"grid": [35.0, 30.0]}) - assert_raises(ValueError, {"type": "regular_ll", "grid": [35.0, 30.0]}) - assert_raises(ValueError, {"grid": "O320"}) - assert_raises( - ValueError, - {"type": "regular_ll", "grid": [30.0, 30.0], "projection": "lambert"}, - ) - - @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") def test_gribjump_with_mixed_types_in_lists(seed_fdb): From 
3821bb3bdd560e5ddc82173cbee78b754a963dec Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 09:54:59 +0000 Subject: [PATCH 29/49] tidy: small renamings and docstrings --- docs/examples/gribjump.ipynb | 10 +-- docs/guide/sources.rst | 28 +++++---- src/earthkit/data/sources/gribjump.py | 87 +++++++++++++++++++-------- tests/sources/test_gribjump.py | 4 +- 4 files changed, 85 insertions(+), 44 deletions(-) diff --git a/docs/examples/gribjump.ipynb b/docs/examples/gribjump.ipynb index cf563b1eb..eedb5a3af 100644 --- a/docs/examples/gribjump.ipynb +++ b/docs/examples/gribjump.ipynb @@ -42,7 +42,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "ffc76940", "metadata": {}, "outputs": [ @@ -77,14 +77,14 @@ "Python lists are supported.\n", "\n", "The second required parameters is then one of `ranges`, `indices`, and `mask`, selecting the grid cells which should\n", - "be extracted. For convenience, one can set an additional parameter `coords_from_fdb=True` to make an additional\n", + "be extracted. For convenience, one can set an additional parameter `fetch_coords_from_fdb=True` to make an additional\n", "request directly to the fdb to retrieve latitudes and longitude information for the retrieved cells and include\n", "them in the retrieved cell's metadata." 
] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "cd0c1962", "metadata": {}, "outputs": [], @@ -107,7 +107,7 @@ " \"hdate\": [\"20200101\", \"20200102\"],\n", " },\n", " ranges=[(1234, 2345)],\n", - " coords_from_fdb=True,\n", + " fetch_coords_from_fdb=True,\n", ")" ] }, @@ -815,7 +815,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "60165c68", "metadata": {}, "outputs": [ diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index d45c341c2..8d8663f1c 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1238,7 +1238,7 @@ wekeocds gribjump -------- -.. py:function:: from_source("gribjump", request, *, ranges=None, mask=None, indices=None, coords_from_fdb=False, **kwargs) +.. py:function:: from_source("gribjump", request, *, ranges=None, mask=None, indices=None, fetch_coords_from_fdb=False, fdb_kwargs=None, **kwargs) :noindex: The ``gribjump`` source enables fast retrieval of subsets of GRIB messages from the `FDB (Fields DataBase)`_ using the `gribjump`_ library. @@ -1250,19 +1250,23 @@ gribjump [(start1, end1), (start2, end2), ...]. Ranges are exclusive, meaning that the end index is not included in the range :param numpy.array mask: a 1D boolean mask specifying which grid points to retrieve :param numpy.array indices: a 1D array of grid indices to retrieve - :param bool coords_from_fdb: if ``True``, loads the full first message from - the FDB to extract the coordinates at the specified indices. This is useful - when the coordinates are needed for the retrieved data. If ``False``, the + :param bool fetch_coords_from_fdb: if ``True``, loads the first field's metadata from + the FDB to extract the coordinates at the specified indices. If ``False``, the coordinates are not loaded, which can speed up the retrieval process. Default is ``False``. Please note that no validation is performed to - ensure that all retrieved fields share the same grid and therefore coordinates. 
- - .. note:: - - This source is experimental and may change in future versions. - There is no mechanism to verify that the accessed GRIB messages use the grid - expected by the user. The provided ranges might, therefore, correspond to unexpected - points on the grid. + ensure that all fields in the requests share the same grid. + :param dict fdb_kwargs: only used when ``fetch_coords_from_fdb=True``. A dict of + keyword arguments passed to the `pyfdb.FDB` constructor. This allows to + specify the FDB configuration, user configuration, etc. If not provided, the + default configuration is used. + + .. warning:: + + This source is **experimental** and may change in future versions without + warning. It performs **no validation** that the specified grid indices + correspond to the fields' actual underlying grids. The provided ranges might, + therefore, correspond to unexpected points on the grid. This source is also + currently **not thread-safe**. The following example retrieves a subset from a GRIB message in the FDB using a boolean mask: diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 4954121aa..0e983b918 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -99,7 +99,8 @@ class ExtractionRequest: """ Simple wrapper of pygribjump.ExtractionRequest and the original FDB request dict. - Can be removed once pygribjump.ExtractionRequest provides a reference to the request dictionary. + Can be removed once pygribjump.ExtractionRequest provides a reference to the request dictionary + with original MARS keyword types. Parameters ---------- @@ -206,23 +207,30 @@ def from_mars_requests( class FieldExtractList(SimpleFieldList): - """Lazily loaded representation of the points extracted from multiple fields using GribJump. - - For simplicity, this class currently inherits from SimpleFieldList and is - inspired by the FieldlistFromDicts and GribFieldListInMemory classes. 
- However, it is not a complete implementation and can break in unexpected - ways. The main reason for this is that although the arrays with the - extracted values are represented as ArrayFields, they are not truly proper - Field implementations. They are neither stored as 2D grids, nor do they - possess any geographical information or well-defined metadata. - - Known limitations: - * FieldExtractList.sel is quite brittle as any filter values must have the same type - as the metadata in the user's request dictionary. The actual type of the underlying - MARS keyword is not respected. So ".sel(step=0) would not work with a request - {"step": "0"} but only {"step": 0}. - * Efficient lazy loading of selections / slices only is not supported. - * Pickling / unpickling might not work. + """Lazily loaded representation of points extracted from multiple fields using GribJump. + + .. warning:: + This implementation is **not thread-safe**. Concurrent access from multiple threads + may result in race conditions during lazy loading. Use appropriate synchronization + if accessing from multiple threads. + + .. note:: + This class should not be instantiated directly. Use the ``gribjump`` source instead: + ``earthkit.data.from_source("gribjump", request, ranges=ranges)`` + + This class inherits from SimpleFieldList and provides lazy loading of grid point + extractions from GRIB fields via GribJump. FieldList operations like ``sel()``, + ``group_by()``, etc. might work but are not guaranteed to be fully functional. + + Known Limitations + ----------------- + * **No validation**: Grid indices are not validated against actual field grids. + Incorrect indices may return unexpected grid points. + * **Not thread-safe**: Concurrent access may cause race conditions during lazy loading. + * **Limited metadata**: Only metadata from the request dictionary is available, + except for latitude/longitude coordinates when ``fetch_coords_from_fdb=True`` is used. 
+ * **No efficient slicing**: Lazy loading of selections/slices is not supported. + * **Serialization issues**: Pickling/unpickling might not work reliably. """ def __init__( @@ -235,11 +243,12 @@ def __init__( self._requests = requests self._fdb_retriever = fdb_retriever + # These attributes are set lazily after loading the data. self._loaded = False self._grid_indices = None self._reference_metadata: Optional[GribMetadata] = None - super().__init__(fields=None) # The fields attribute is set lazily + super().__init__(fields=None) def __len__(self): self._load() @@ -340,6 +349,15 @@ def to_pandas(self, *args, **kwargs): class GribJumpSource(Source): + """Source for extracting grid points from GRIB messages in an FDB with GribJump. + + ⚠️ This source is experimental and may change in future versions without + warning. It performs no validation that the specified grid indices + correspond to the fields' actual underlying grids. The provided ranges + might, therefore, correspond to unexpected points on the grid. This source + is also currently not thread-safe. + """ + def __init__( self, request: dict, @@ -347,9 +365,30 @@ def __init__( ranges: Optional[list[tuple[int, int]]] = None, mask: Optional[np.ndarray] = None, indices: Optional[np.ndarray] = None, - coords_from_fdb: bool = False, + fetch_coords_from_fdb: bool = False, + fdb_kwargs: Optional[dict[str, Any]] = None, **kwargs, ): + """ + Parameters + ---------- + request : dict + The MARS request dictionary describing the fields to retrieve. + ranges : Optional[list[tuple[int, int]]], optional + The ranges of grid indices to retrieve, by default None. + mask : Optional[np.ndarray], optional + A 1D boolean mask specifying which grid points to retrieve, by default None. + indices : Optional[np.ndarray], optional + A 1D array of grid indices to retrieve, by default None. 
+ fetch_coords_from_fdb : bool, optional + If set to True, loads the first field's metadata from the FDB to extract the coordinates + at the specified indices. + fdb_kwargs : Optional[dict[str, Any]], optional + Only used when `fetch_coords_from_fdb=True`. A dict of + keyword arguments passed to the `pyfdb.FDB` constructor. These arguments are only passed + to the FDB when fetching coordinates and is not used by GribJump for the extraction itself. + """ + super().__init__(**kwargs) if sum(opt is not None for opt in (ranges, mask, indices)) != 1: @@ -364,7 +403,8 @@ def __init__( self._check_env() self._gj = pygj.GribJump() - self._coords_from_fdb = coords_from_fdb + self._coords_from_fdb = fetch_coords_from_fdb + self._fdb_kwargs = fdb_kwargs if fdb_kwargs is not None else {} self._mars_requests = split_mars_requests(request) def _check_env(self): @@ -395,16 +435,13 @@ def _check_env(self): ) def mutate(self): - # TODO: Allow proper configuration of the FDB retriever - fdb_retriever = FDBRetriever({}) if self._coords_from_fdb else None - extraction_requests = ExtractionRequestCollection.from_mars_requests( self._mars_requests, ranges=self._ranges, mask=self._mask, indices=self._indices, ) - + fdb_retriever = FDBRetriever(self._fdb_kwargs) if self._coords_from_fdb else None return FieldExtractList( self._gj, requests=extraction_requests, diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 5edde7888..3cfad51b2 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -323,7 +323,7 @@ def test_gribjump_to_xarray_with_coords(seed_fdb, ds_expected_with_coords, metho "type": "fc", } - source = from_source("gribjump", mars_request, coords_from_fdb=True, **kwargs) + source = from_source("gribjump", mars_request, fetch_coords_from_fdb=True, **kwargs) ds = source.to_xarray() xr.testing.assert_allclose(ds, ds_expected_with_coords) @@ -375,7 +375,7 @@ def 
test_gribjump_to_xarray_with_coords_does_not_fail_for_grids(seed_fdb): "type": "fc", } - source = from_source("gribjump", mars_request, coords_from_fdb=True, indices=[0]) + source = from_source("gribjump", mars_request, fetch_coords_from_fdb=True, indices=[0]) ds = source.to_xarray() assert set(ds.dims) == {"step", "index"} assert set(ds.coords) == {"step", "index", "latitude", "longitude"} From ab3a831b12eef77ad7edf70c1738d49f0a549808 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 11:11:22 +0000 Subject: [PATCH 30/49] tidy: comments --- docs/guide/sources.rst | 5 ----- src/earthkit/data/sources/gribjump.py | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index 8d8663f1c..a89b0a60c 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1262,11 +1262,6 @@ gribjump .. warning:: - This source is **experimental** and may change in future versions without - warning. It performs **no validation** that the specified grid indices - correspond to the fields' actual underlying grids. The provided ranges might, - therefore, correspond to unexpected points on the grid. This source is also - currently **not thread-safe**. The following example retrieves a subset from a GRIB message in the FDB using a boolean mask: diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 0e983b918..dc76f70a7 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -163,6 +163,7 @@ def build_extraction_request( if not np.issubdtype(mask.dtype, np.bool_): raise ValueError(f"Expected 'mask' to be a boolean array, got {mask.dtype}") if mask.ndim != 1: + # NOTE: We could relax this and just always call 'mask.ravel()' internally. 
raise ValueError(f"Expected 'mask' to be a 1D numpy array, got {mask.ndim}D") extraction_request = pygj.ExtractionRequest.from_mask(stringified_request_dict, mask) elif indices is not None: From 5f80b6dba7654e7c5eae20b6eb99074889d60cf2 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 11:37:10 +0000 Subject: [PATCH 31/49] fix: type hint and name --- src/earthkit/data/sources/gribjump.py | 6 +++--- tests/sources/test_gribjump.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index dc76f70a7..4b41e1177 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -29,7 +29,7 @@ from earthkit.data.utils.metadata.dict import UserMetadata -def split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: +def split_mars_requests(request: dict[str, Any]) -> list[dict[str, Any]]: """Splits a MARS request into individual single-field requests by expanding list values. 
Creates all possible combinations of list values in the request dictionary, @@ -62,8 +62,8 @@ def split_mars_requests(request: dict[str, Any]) -> list[dict[str, str]]: [{'param': '2t', 'date': '20230101'}, {'param': 'msl', 'date': '20230101'}] >>> split_mars_requests({"param": ["2t", "msl"], "step": [0, 6]}) - [{'param': '2t', 'step': '0'}, {'param': '2t', 'step': '6'}, - {'param': 'msl', 'step': '0'}, {'param': 'msl', 'step': '6'}] + [{'param': '2t', 'step': 0}, {'param': '2t', 'step': 6}, + {'param': 'msl', 'step': 0}, {'param': 'msl', 'step': 6}] """ request = request.copy() diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 3cfad51b2..e41124997 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -24,7 +24,7 @@ @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -def test_expand_dict_with_lists(): +def test_split_mars_requests(): from earthkit.data.sources.gribjump import split_mars_requests request = { From 741dbe51d462028e8d9d61053a5866b600c23ad1 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:02:56 +0000 Subject: [PATCH 32/49] feat: convert masks to ranges once for significant speedups --- src/earthkit/data/sources/gribjump.py | 42 ++++++++++++ tests/sources/test_gribjump.py | 97 ++++++++++++++++----------- 2 files changed, 101 insertions(+), 38 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index 4b41e1177..e22223488 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -94,6 +94,35 @@ def split_mars_requests(request: dict[str, Any]) -> list[dict[str, Any]]: return expanded_requests +def mask_to_ranges(mask: np.ndarray) -> list[tuple[int, int]]: + """Converts a boolean mask to ranges of indices where the mask is True. 
+ + Parameters + ---------- + mask : np.ndarray + A 1D boolean numpy array. + + Returns + ------- + list[tuple[int, int]] + A list of tuples representing the start and end indices of True segments in the mask. + """ + if not isinstance(mask, np.ndarray): + raise TypeError(f"Expected 'mask' to be a numpy array, got {type(mask)}") + if not np.issubdtype(mask.dtype, np.bool_): + raise ValueError(f"Expected 'mask' to be a boolean array, got {mask.dtype}") + if mask.ndim != 1: + raise ValueError(f"Expected 'mask' to be a 1D numpy array, got {mask.ndim}D") + + padded = np.concatenate(([False], mask, [False])) + d = np.diff(padded.astype(int)) + starts = np.where(d == 1)[0] + ends = np.where(d == -1)[0] + + ranges = list(zip(starts, ends)) + return ranges + + @dataclasses.dataclass class ExtractionRequest: """ @@ -203,6 +232,19 @@ def from_mars_requests( ExtractionRequestCollection A collection of ExtractionRequest objects created from the MARS requests. """ + + if sum(opt is not None for opt in (ranges, mask, indices)) != 1: + raise ValueError( + "Exactly one of 'ranges', 'mask' or 'indices' must be set. " + f"Got {ranges=}, {mask=}, {indices=}" + ) + + if mask is not None: + # Since PyGribJump converts the mask to ranges internally, + # we convert it to ranges here once to avoid doing this multiple times. 
+ ranges = mask_to_ranges(mask) + mask = None + extraction_requests = [build_extraction_request(req, ranges, mask, indices) for req in mars_requests] return cls(extraction_requests) diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index e41124997..080a97767 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -23,44 +23,6 @@ from earthkit.data.testing import earthkit_test_data_file -@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") -def test_split_mars_requests(): - from earthkit.data.sources.gribjump import split_mars_requests - - request = { - "step": [0, 6, 12], - "param": ["129", "130"], - "date": "20230101", - "time": "1200", - } - result = split_mars_requests(request) - assert result == [ - {"param": "129", "step": 0, "date": "20230101", "time": "1200"}, - {"param": "129", "step": 6, "date": "20230101", "time": "1200"}, - {"param": "129", "step": 12, "date": "20230101", "time": "1200"}, - {"param": "130", "step": 0, "date": "20230101", "time": "1200"}, - {"param": "130", "step": 6, "date": "20230101", "time": "1200"}, - {"param": "130", "step": 12, "date": "20230101", "time": "1200"}, - ] - - assert split_mars_requests({}) == [{}] - assert split_mars_requests({"a": 1}) == [{"a": 1}] - assert split_mars_requests({"a": 1, "b": 2}) == [{"a": 1, "b": 2}] - - # Error: empty list - with pytest.raises(ValueError, match="Cannot expand dictionary with empty list"): - split_mars_requests({"a": 1, "b": []}) - - # Error: unsupported "/" syntax - with pytest.raises(ValueError, match="Found unsupported list or range using '/'"): - split_mars_requests({"param": "129/130", "date": "20230101"}) - - # Error: mixed types in lists - request_mixed = {"param": [129, "130"], "date": "20230101"} - with pytest.raises(TypeError, match="All list values must share the same type"): - split_mars_requests(request_mixed) - - @pytest.fixture def setup_fdb_with_gribjump(): import pyfdb @@ -247,6 +209,65 @@ 
def ds_expected(ds_expected_with_coords): return ds +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +def test_split_mars_requests(): + from earthkit.data.sources.gribjump import split_mars_requests + + request = { + "step": [0, 6, 12], + "param": ["129", "130"], + "date": "20230101", + "time": "1200", + } + result = split_mars_requests(request) + assert result == [ + {"param": "129", "step": 0, "date": "20230101", "time": "1200"}, + {"param": "129", "step": 6, "date": "20230101", "time": "1200"}, + {"param": "129", "step": 12, "date": "20230101", "time": "1200"}, + {"param": "130", "step": 0, "date": "20230101", "time": "1200"}, + {"param": "130", "step": 6, "date": "20230101", "time": "1200"}, + {"param": "130", "step": 12, "date": "20230101", "time": "1200"}, + ] + + assert split_mars_requests({}) == [{}] + assert split_mars_requests({"a": 1}) == [{"a": 1}] + assert split_mars_requests({"a": 1, "b": 2}) == [{"a": 1, "b": 2}] + + # Error: empty list + with pytest.raises(ValueError, match="Cannot expand dictionary with empty list"): + split_mars_requests({"a": 1, "b": []}) + + # Error: unsupported "/" syntax + with pytest.raises(ValueError, match="Found unsupported list or range using '/'"): + split_mars_requests({"param": "129/130", "date": "20230101"}) + + # Error: mixed types in lists + request_mixed = {"param": [129, "130"], "date": "20230101"} + with pytest.raises(TypeError, match="All list values must share the same type"): + split_mars_requests(request_mixed) + + +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +@pytest.mark.parametrize( + "mask,expected_ranges", + [ + ([False, False, False], []), + ([True, True, False], [(0, 2)]), + ([False, False, True], [(2, 3)]), + ([False, False, True, True, False, True, False], [(2, 4), (5, 6)]), + ([True, False, True, True, True, False, True], [(0, 1), (2, 5), (6, 7)]), + ], +) +def test_mask_to_ranges(mask, expected_ranges): + import numpy as np + + from 
earthkit.data.sources.gribjump import mask_to_ranges + + mask = np.array(mask, dtype=bool) + result = mask_to_ranges(mask) + assert result == expected_ranges + + @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") @pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) def test_gribjump_to_numpy(seed_fdb, arr_expected, method, request): From 3e803de91b9216227a26513ab5c8e2ee38285ebc Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:07:28 +0000 Subject: [PATCH 33/49] test: add another test for mask_to_ranges --- tests/sources/test_gribjump.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/sources/test_gribjump.py b/tests/sources/test_gribjump.py index 080a97767..d48556d2a 100644 --- a/tests/sources/test_gribjump.py +++ b/tests/sources/test_gribjump.py @@ -268,6 +268,24 @@ def test_mask_to_ranges(mask, expected_ranges): assert result == expected_ranges +@pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") +@pytest.mark.parametrize( + "mask,error_type", + [ + ([], ValueError), + ([1, 0, 1], ValueError), + ([[False, True], [True, False]], ValueError), + ], +) +def test_mask_to_ranges_errors(mask, error_type): + import numpy as np + + from earthkit.data.sources.gribjump import mask_to_ranges + + with pytest.raises(error_type): + mask_to_ranges(np.array(mask)) + + @pytest.mark.skipif(NO_GRIBJUMP, reason="pygribjump or pyfdb not available") @pytest.mark.parametrize("method", ["ranges", "indices", "mask"]) def test_gribjump_to_numpy(seed_fdb, arr_expected, method, request): From 82b5577516d1f79574f8f086eea9f1bb4dc82279 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:23:02 +0000 Subject: [PATCH 34/49] tidy: small comment/docstring changes --- docs/guide/sources.rst | 13 +++++++++---- src/earthkit/data/sources/gribjump.py | 13 
+++---------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index a89b0a60c..14b3791d8 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1242,9 +1242,16 @@ gribjump :noindex: The ``gribjump`` source enables fast retrieval of subsets of GRIB messages from the `FDB (Fields DataBase)`_ using the `gribjump`_ library. - It requires both the `pygribjump`_ and `pyfdb`_ packages to be installed. + It requires both `pygribjump`_ and `pyfdb`_ to be installed. Exactly one of the parameters ``ranges``, ``mask`` or ``indices`` must be specified at a time. + .. warning:: + ⚠️ This source is **experimental** and may change in future versions without + warning. It performs **no validation** that the specified grid indices + correspond to the fields' actual underlying grids. The provided ranges + might, therefore, correspond to unexpected points on the grid. This source + is also currently **not thread-safe**. + :param dict request: the fdb request as a dict :param list ranges: a list of tuples specifying the ranges of 1D grid indices to retrieve in the form [(start1, end1), (start2, end2), ...]. Ranges are exclusive, meaning that the end index is not included in the range @@ -1252,7 +1259,7 @@ gribjump :param numpy.array indices: a 1D array of grid indices to retrieve :param bool fetch_coords_from_fdb: if ``True``, loads the first field's metadata from the FDB to extract the coordinates at the specified indices. If ``False``, the - coordinates are not loaded, which can speed up the retrieval process. + coordinates are not loaded and no separate FDB request is made. Default is ``False``. Please note that no validation is performed to ensure that all fields in the requests share the same grid. :param dict fdb_kwargs: only used when ``fetch_coords_from_fdb=True``. A dict of @@ -1260,8 +1267,6 @@ gribjump specify the FDB configuration, user configuration, etc. 
If not provided, the default configuration is used. - .. warning:: - The following example retrieves a subset from a GRIB message in the FDB using a boolean mask: diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index e22223488..ebfc7a549 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -45,9 +45,8 @@ def split_mars_requests(request: dict[str, Any]) -> list[dict[str, Any]]: Returns ------- - list[dict[str, str]] + list[dict[str, Any]] A list of individual request dictionaries, each representing a single field. - All values are converted to strings. Raises ------ @@ -112,6 +111,7 @@ def mask_to_ranges(mask: np.ndarray) -> list[tuple[int, int]]: if not np.issubdtype(mask.dtype, np.bool_): raise ValueError(f"Expected 'mask' to be a boolean array, got {mask.dtype}") if mask.ndim != 1: + # NOTE: We could relax this and allow 2D masks, which we flatten using .ravel(). raise ValueError(f"Expected 'mask' to be a 1D numpy array, got {mask.ndim}D") padded = np.concatenate(([False], mask, [False])) @@ -140,7 +140,7 @@ class ExtractionRequest: """ extraction_request: pygj.ExtractionRequest - request: dict[str, str] + request: dict[str, Any] @property def ranges(self) -> list[tuple[int, int]]: @@ -187,13 +187,6 @@ def build_extraction_request( if ranges is not None: extraction_request = pygj.ExtractionRequest(stringified_request_dict, ranges) elif mask is not None: - if not isinstance(mask, np.ndarray): - raise TypeError(f"Expected 'mask' to be a numpy array, got {type(mask)}") - if not np.issubdtype(mask.dtype, np.bool_): - raise ValueError(f"Expected 'mask' to be a boolean array, got {mask.dtype}") - if mask.ndim != 1: - # NOTE: We could relax this and just always call 'mask.ravel()' internally. 
- raise ValueError(f"Expected 'mask' to be a 1D numpy array, got {mask.ndim}D") extraction_request = pygj.ExtractionRequest.from_mask(stringified_request_dict, mask) elif indices is not None: extraction_request = pygj.ExtractionRequest.from_indices(stringified_request_dict, indices) From 7e398f40ae168fc8403b0df47a4e50f7458813b5 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:28:04 +0000 Subject: [PATCH 35/49] tidy: make warning about missing validation in docs more explicit --- docs/guide/sources.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index 14b3791d8..bf30eb5c2 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1246,11 +1246,12 @@ gribjump Exactly one of the parameters ``ranges``, ``mask`` or ``indices`` must be specified at a time. .. warning:: - ⚠️ This source is **experimental** and may change in future versions without - warning. It performs **no validation** that the specified grid indices - correspond to the fields' actual underlying grids. The provided ranges - might, therefore, correspond to unexpected points on the grid. This source - is also currently **not thread-safe**. + ⚠️ This source is **experimental** and may change in future versions without + warning. It performs **no validation** that the specified grid indices, + masks, or ranges correspond to the fields' actual underlying grids. + **Incorrect usage can lead to silent data corruption.** + The provided ranges or masks might correspond to unexpected points on the + grid. This source is also currently **not thread-safe**. 
:param dict request: the fdb request as a dict :param list ranges: a list of tuples specifying the ranges of 1D grid indices to retrieve in the form From 7eb1dacdc0a3eb89c62f296b0bc863116bf81a96 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:55:16 +0000 Subject: [PATCH 36/49] docs: improve wording of warning --- docs/guide/sources.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index bf30eb5c2..8cd7a67a2 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1249,7 +1249,7 @@ gribjump ⚠️ This source is **experimental** and may change in future versions without warning. It performs **no validation** that the specified grid indices, masks, or ranges correspond to the fields' actual underlying grids. - **Incorrect usage can lead to silent data corruption.** + **Incorrect usage may silently return wrong data points.** The provided ranges or masks might correspond to unexpected points on the grid. This source is also currently **not thread-safe**. 
From e373e143b0cc5f174ca81b3a253369143d5970e4 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 25 Jul 2025 11:42:04 +0000 Subject: [PATCH 37/49] fix: allow fdb and gribjump to be configured via FDB5_CONFIG and GRIBJUMP_HOME --- src/earthkit/data/sources/gribjump.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index ebfc7a549..cf4bf554b 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -444,22 +444,25 @@ def __init__( self._mars_requests = split_mars_requests(request) def _check_env(self): - fdb_conf = os.environ.get("FDB5_CONFIG", None) fdb_home = os.environ.get("FDB_HOME", None) + fdb_config = os.environ.get("FDB5_CONFIG", None) + fdb_config_file = os.environ.get("FDB5_CONFIG_FILE", None) + + gj_home = os.environ.get("GRIBJUMP_HOME", None) gj_config_file = os.environ.get("GRIBJUMP_CONFIG_FILE", None) gj_ignore_grid = os.environ.get("GRIBJUMP_IGNORE_GRID", None) - if fdb_home is None and fdb_conf is None: + if fdb_home is None and fdb_config is None and fdb_config_file is None: raise RuntimeError( - """Neither FDB_HOME nor FDB5_CONFIG environment variable - was set! Please define either one to access FDB. + """Neither FDB_HOME, FDB5_CONFIG, nor FDB5_CONFIG_FILE environment variable + was set! Please define at least one to access FDB. See: https://fields-database.readthedocs.io for details about FDB.""" ) - if gj_config_file is None: + if gj_home is None and gj_config_file is None: raise RuntimeError( - "Environment variable 'GRIBJUMP_CONFIG_FILE' is not set but " - "is required by GribJump. Please set it to the path of the GribJump " - "configuration file." + """Neither GRIBJUMP_HOME nor GRIBJUMP_CONFIG_FILE environment variable + was set! Please define at least one to access GribJump. 
+ See: https://github.com/ecmwf/gribjump for details about GribJump.""" ) if gj_ignore_grid is None: # We could consider setting this automatically but this would need From 90b1f95e5f96f132e6435ba2289ce9d388408685 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 25 Jul 2025 12:00:33 +0000 Subject: [PATCH 38/49] chore: update docstring --- src/earthkit/data/sources/gribjump.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index cf4bf554b..c7d828088 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -37,6 +37,10 @@ def split_mars_requests(request: dict[str, Any]) -> list[dict[str, Any]]: returns result arrays without metadata, so each field must be requested individually to map outputs correctly. + NOTE: Parsing of MARS requests should ideally not be handled here but in a dedicated + component like pymetkit. Consider updating this function once something appropriate + is available. 
+ Parameters ---------- request : dict[str, Any] @@ -66,7 +70,7 @@ def split_mars_requests(request: dict[str, Any]) -> list[dict[str, Any]]: """ request = request.copy() - # Validation + # Validate request values for k in request.keys(): v = request[k] if isinstance(v, str) and "/" in v: From d6b506a570dcb0a1f53d951492a3e6efda8ac217 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 25 Jul 2025 12:18:35 +0000 Subject: [PATCH 39/49] feat: pass log context to gribjump --- src/earthkit/data/sources/gribjump.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index c7d828088..be07f48c1 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -303,7 +303,9 @@ def _load(self): return extraction_requests = [req.extraction_request for req in self._requests] - extraction_results = self._gj.extract(extraction_requests) + + context = {"client": "earthkit-data", "source_type": "gribjump"} + extraction_results = self._gj.extract(extraction_requests, ctx=context) fields = [] indices = None From a806a711538583a0dd6d62221b1ac540dded461d Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 25 Jul 2025 12:19:57 +0000 Subject: [PATCH 40/49] refactor: simplify gribjump log context --- src/earthkit/data/sources/gribjump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/earthkit/data/sources/gribjump.py b/src/earthkit/data/sources/gribjump.py index be07f48c1..a510d4913 100644 --- a/src/earthkit/data/sources/gribjump.py +++ b/src/earthkit/data/sources/gribjump.py @@ -304,7 +304,7 @@ def _load(self): extraction_requests = [req.extraction_request for req in self._requests] - context = {"client": "earthkit-data", "source_type": "gribjump"} + context = {"origin": "earthkit-data"} extraction_results = 
self._gj.extract(extraction_requests, ctx=context) fields = [] From 23afbfc80ab3c12feb25108637f4d83e87d77590 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 25 Jul 2025 14:16:58 +0000 Subject: [PATCH 41/49] test: add t_gribjump.grib test data with expver xxxx --- tests/data/t_gribjump.grib | Bin 0 -> 2760 bytes tests/sources/test_gribjump.py | 20 ++++++++++---------- 2 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 tests/data/t_gribjump.grib diff --git a/tests/data/t_gribjump.grib b/tests/data/t_gribjump.grib new file mode 100644 index 0000000000000000000000000000000000000000..ced96dc070fa07e90fe9592231ddf2dc6d37407c GIT binary patch literal 2760 zcmb`Fc}$aM9DqMWP#kKlh+Lwns7%pRWP(@`kzu(8UfjriSfF=HFQ6Btk@Adxu|>{xxj9335l^*WT*`MkLY_yhokzy_VLYpdT(R@gN? z0|*IDg8z4oO`0X?S_K>P&e341w72I6~8F! z+bE~CtEG&x_BNLuI0lZlAl;!1Qx=mUtX`Z4)lOi@6eoteE`tNuugvjL=8o9~2RoGyul8W|}rOJi%UiUkTk3Xcfu!o1=a7R9jA9?1d(H|{Tk$W?H z_dJ!1$bO4`p_0>Tmxn96CfgG`%%n^A5(klecQ7ji+vFk2ekw0{h+~G2q&j9AF-I|e zZXdw`b+XopmYnD+il$m1+)v=RU*yyXa?l2(%0S}Yh`<{tpWxW@N#Sor4xBJSJyBKY z!#jiSO6SrQS%7G}um$CTdW88VAt&A<$}hSu!57^|n!?jDGlJm6b({|wf01lLCa-QJ zwPlX)%0MM(hgobPZnEMp@9vwV=W2Ml-XoBl}`mYJOYAJ zCzxP`W1EmR;D9<=>qJXVbQQOEu7jlegM&0@Gx?dgX(}T)fA`@)8JZmW(&toMT&_ok zQ;r@5QNZF=WBvI{sU=xXSu-Lhv@0Qxa5BL&=I@9KR2lv>{zsyBf-|N@pd@5xkokJ# zmm#A`A#^K2zT|BRgK_MXXzLME@6$@lRIgBSWNf)D8vqW+h7^0%fdlGftrG*SbrRw7 z)lP&%xMlYc*aY;7Ejmz)YjmuHCgN_Vbl1Wf9IBRk*}{-=zVwHBO|?x;d%I=x+!NWf z!w6)oS5u?GjyVpJ`-=M+&s;{Wn#|N69~Reck~=AfI*JrhdBAEwg&;=*!KZ@y-mOYb+Xn8 zTw9&IwYdnfTn7$!oJjUB1pw&lgO@JM@vJ=MTdl4*aZz>8TPtOf;R|=n8e1AII*lvK z9u#*#YoSV1GEjiIu$T}4eZYDS!8rC{{R#GDN zjK{&F;J*mkg(fbxx5MrxiC<;MvUZ6V^E@PY!LHn~-1pa)Zf;>Ri1Xy5_ Date: Thu, 28 Aug 2025 16:13:14 +0000 Subject: [PATCH 42/49] add pygribjump as an optional dependency At the time of this commit, there is only one available version for pygribjump available on pypi, which is a pre-release version (0.10.3.dev20250827). 
Before merging, it should be discussed whether this is acceptable or if we want to wait until a stable release of pygribjump is available. --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2b606a0ab..d4e91a08c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "xarray>=0.19", ] optional-dependencies.all = [ - "earthkit-data[cds,covjsonkit,ecmwf-opendata,fdb,geo,geopandas,mars,odb,polytope,projection,s3,wekeo]", + "earthkit-data[cds,covjsonkit,ecmwf-opendata,fdb,geo,geopandas,gribjump,mars,odb,polytope,projection,s3,wekeo]", ] optional-dependencies.cds = [ "cdsapi>=0.7.2" ] optional-dependencies.ci = [ "numpy" ] @@ -70,6 +70,7 @@ optional-dependencies.fdb = [ "pyfdb>=0.1" ] optional-dependencies.geo = [ "earthkit-geo>=0.2" ] optional-dependencies.geopandas = [ "geopandas" ] optional-dependencies.geotiff = [ "pyproj", "rasterio", "rioxarray" ] +optional-dependencies.gribjump = [ "pygribjump" ] optional-dependencies.mars = [ "ecmwf-api-client>=1.6.1" ] optional-dependencies.odb = [ "pyodc" ] optional-dependencies.polytope = [ "polytope-client>=0.7.6" ] From 85d9513eb1f5f86ac89fedccf8c9572dabc6d88e Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Tue, 16 Sep 2025 21:49:50 +0200 Subject: [PATCH 43/49] docs: clarify gribjump install instructions and dependency handling --- docs/guide/sources.rst | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index e9a38b57b..844ba27d1 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1241,8 +1241,11 @@ gribjump .. 
py:function:: from_source("gribjump", request, *, ranges=None, mask=None, indices=None, fetch_coords_from_fdb=False, fdb_kwargs=None, **kwargs) :noindex: - The ``gribjump`` source enables fast retrieval of subsets of GRIB messages from the `FDB (Fields DataBase)`_ using the `gribjump`_ library. - It requires both `pygribjump`_ and `pyfdb`_ to be installed. + The ``gribjump`` source enables fast retrieval of GRIB message subsets from the `FDB (Fields DataBase)`_ using the `gribjump `_ library. + Both `pygribjump `_ and `pyfdb`_ must be installed. The `pygribjump`_ package uses `findlibs `_ to locate an installation of the `gribjump`_ library. + If the library is not available on your system, you can install it via the `gribjumplib `_ wheel from PyPI. + Installing `gribjumplib` from PyPI will also automatically install `fdb5lib `_ and other dependencies, which may take priority over any existing installations on your system. + Exactly one of the parameters ``ranges``, ``mask`` or ``indices`` must be specified at a time. .. warning:: From e3a68d4a16efba81dd004a90e21177bbdc1d5eed Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:02:25 +0200 Subject: [PATCH 44/49] docs: move warning before parameters section --- docs/guide/sources.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index 844ba27d1..6df966c02 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1246,8 +1246,6 @@ gribjump If the library is not available on your system, you can install it via the `gribjumplib `_ wheel from PyPI. Installing `gribjumplib` from PyPI will also automatically install `fdb5lib `_ and other dependencies, which may take priority over any existing installations on your system. - Exactly one of the parameters ``ranges``, ``mask`` or ``indices`` must be specified at a time. - .. 
warning:: ⚠️ This source is **experimental** and may change in future versions without warning. It performs **no validation** that the specified grid indices, @@ -1256,6 +1254,8 @@ gribjump The provided ranges or masks might correspond to unexpected points on the grid. This source is also currently **not thread-safe**. + Exactly one of the parameters ``ranges``, ``mask`` or ``indices`` must be specified at a time. + :param dict request: the fdb request as a dict :param list ranges: a list of tuples specifying the ranges of 1D grid indices to retrieve in the form [(start1, end1), (start2, end2), ...]. Ranges are exclusive, meaning that the end index is not included in the range From c03c3ee35f6a33962814bade03e0d050b826735b Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 18 Sep 2025 13:39:58 +0000 Subject: [PATCH 45/49] docs: clarify parameter description and types --- docs/guide/sources.rst | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index 6df966c02..ed671fb68 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1256,20 +1256,28 @@ gribjump Exactly one of the parameters ``ranges``, ``mask`` or ``indices`` must be specified at a time. - :param dict request: the fdb request as a dict - :param list ranges: a list of tuples specifying the ranges of 1D grid indices to retrieve in the form - [(start1, end1), (start2, end2), ...]. 
Ranges are exclusive, meaning that the end index is not included in the range - :param numpy.array mask: a 1D boolean mask specifying which grid points to retrieve - :param numpy.array indices: a 1D array of grid indices to retrieve - :param bool fetch_coords_from_fdb: if ``True``, loads the first field's metadata from + :param request: the fdb request as a dict + :type request: dict + :param ranges: a list of tuples specifying the ranges of 1D grid indices to retrieve in the form + [(start1, end1), (start2, end2), ...]. Ranges are exclusive, meaning that the end index is not included in the range. + :type ranges: list[tuple[int, int]], optional + :param mask: a 1D boolean mask specifying which grid points to retrieve + :type mask: numpy.array, optional + :param indices: a 1D array of grid indices to retrieve + :type indices: numpy.array, optional + :param fetch_coords_from_fdb: if ``True``, loads the first field's metadata from the FDB to extract the coordinates at the specified indices. If ``False``, the coordinates are not loaded and no separate FDB request is made. Default is ``False``. Please note that no validation is performed to ensure that all fields in the requests share the same grid. - :param dict fdb_kwargs: only used when ``fetch_coords_from_fdb=True``. A dict of + :type fetch_coords_from_fdb: bool, optional + :param fdb_kwargs: only used when ``fetch_coords_from_fdb=True``. A dict of keyword arguments passed to the `pyfdb.FDB` constructor. This allows to - specify the FDB configuration, user configuration, etc. If not provided, the - default configuration is used. + specify the FDB configuration, user configuration, etc. If not provided, + the default configuration is used. These arguments are only passed to the + FDB when fetching coordinates and are not used by GribJump for the + extraction itself. 
+ :type fdb_kwargs: dict, optional The following example retrieves a subset from a GRIB message in the FDB using a boolean mask: From 972b97b27e1979b24a18db547b9b40191ce740b6 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 18 Sep 2025 14:55:51 +0000 Subject: [PATCH 46/49] add pyfdb as a gribjump group dependency and update docs --- docs/guide/sources.rst | 5 ++++- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index ed671fb68..bab949b65 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1256,7 +1256,8 @@ gribjump Exactly one of the parameters ``ranges``, ``mask`` or ``indices`` must be specified at a time. - :param request: the fdb request as a dict + :param request: the FDB request as a dictionary. GribJump requires strict value formatting + (e.g., hdates as "YYYYMMDD", not "YYYY-MM-DD"). Format errors may result in "DataNotFound" errors. :type request: dict :param ranges: a list of tuples specifying the ranges of 1D grid indices to retrieve in the form [(start1, end1), (start2, end2), ...]. Ranges are exclusive, meaning that the end index is not included in the range. @@ -1280,6 +1281,8 @@ gribjump :type fdb_kwargs: dict, optional + + The following example retrieves a subset from a GRIB message in the FDB using a boolean mask: .. 
code-block:: python diff --git a/pyproject.toml b/pyproject.toml index 05f162fc3..70e09b271 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,7 @@ optional-dependencies.fdb = [ "pyfdb>=0.1" ] optional-dependencies.geo = [ "earthkit-geo>=0.2" ] optional-dependencies.geopandas = [ "geopandas" ] optional-dependencies.geotiff = [ "pyproj", "rasterio", "rioxarray" ] -optional-dependencies.gribjump = [ "pygribjump" ] +optional-dependencies.gribjump = [ "pyfdb>=0.1", "pygribjump" ] optional-dependencies.mars = [ "ecmwf-api-client>=1.6.1" ] optional-dependencies.odb = [ "pyodc" ] optional-dependencies.polytope = [ "polytope-client>=0.7.6" ] From 8d031d381a8e477ea16b244e3b81276ccef05c65 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:09:45 +0000 Subject: [PATCH 47/49] last docs and typo fixes --- docs/examples/gribjump.ipynb | 16 ++++++++-------- docs/guide/sources.rst | 2 -- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/examples/gribjump.ipynb b/docs/examples/gribjump.ipynb index eedb5a3af..bfe1f1d88 100644 --- a/docs/examples/gribjump.ipynb +++ b/docs/examples/gribjump.ipynb @@ -72,13 +72,13 @@ "source": [ "### How To Use\n", "\n", - "The `gribjump` source works similar to the `fdb` source and receives a dictionary with an fdb/mars request.\n", + "The `gribjump` source works similar to the `fdb` source and receives a dictionary with an FDB request.\n", "Please note that the mars syntax for ranges and lists using \"/\" is not supported. Only scalar values and\n", "Python lists are supported.\n", "\n", - "The second required parameters is then one of `ranges`, `indices`, and `mask`, selecting the grid cells which should\n", + "The second required parameter is one of `ranges`, `indices`, or `mask`, selecting the grid cells which should\n", "be extracted. 
For convenience, one can set an additional parameter `fetch_coords_from_fdb=True` to make an additional\n", - "request directly to the fdb to retrieve latitudes and longitude information for the retrieved cells and include\n", + "request directly to the fdb to retrieve latitude and longitude information for the retrieved cells and include\n", "them in the retrieved cell's metadata." ] }, @@ -791,18 +791,18 @@ "* **Indices:** A 1D numpy array or list of specific grid point indices to extract\n", " from the flattened grid. This allows for non-contiguous extraction of\n", " individual grid points. For example, `np.array([5, 10, 15, 20])` would extract\n", - " exactly those four grid points.\n", + " exactly those four grid points. This array must be sorted in ascending order.\n", "\n", "* **Masks:** A numpy boolean array where `True` indicates grid points to extract\n", " and `False` indicates points to skip. The mask must have the same length as\n", " the total number of grid points in the field. However, no such validation is\n", - " performed, passing a mask with an invalid shape will silently return wrong\n", + " performed and passing a mask with an invalid shape will silently return wrong\n", " results.\n", "\n", "Only one of these methods can be used at a time. Please also note that GribJump\n", - "uses ranges internally despite of what the user specifies. Converting the\n", - "reperesentation chosen by the user to ranges can be expensive when multiple\n", - "fields are accessed at the same time." + "uses ranges internally regardless of what the user specifies. Converting the\n", + "user's chosen representation to ranges can be expensive when multiple\n", + "fields are accessed simultaneously." 
] }, { diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index bab949b65..3670dd0ea 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -1281,8 +1281,6 @@ gribjump :type fdb_kwargs: dict, optional - - The following example retrieves a subset from a GRIB message in the FDB using a boolean mask: .. code-block:: python From f2c5f0e2aa8360f9b958065104fe3b39b4b20522 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:33:39 +0000 Subject: [PATCH 48/49] docs: change notebook to also set FDB_HOME --- docs/examples/gribjump.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/examples/gribjump.ipynb b/docs/examples/gribjump.ipynb index bfe1f1d88..a145e0ce4 100644 --- a/docs/examples/gribjump.ipynb +++ b/docs/examples/gribjump.ipynb @@ -58,8 +58,7 @@ } ], "source": [ - "# Configure FDB either via FDB_HOME or FDB5_CONFIG environment variable.\n", - "# os.environ.setdefault(\"FDB_HOME\", \"\")\n", + "os.environ.setdefault(\"FDB_HOME\", \"\")\n", "os.environ.setdefault(\"FDB5_CONFIG_FILE\", \"\")\n", "os.environ.setdefault(\"GRIBJUMP_CONFIG_FILE\", \"\")\n", "os.environ.setdefault(\"GRIBJUMP_IGNORE_GRID\", \"1\")" From 88bca2609021e3fd898f5ff0b35bf6781681e464 Mon Sep 17 00:00:00 2001 From: Andreas Grafberger <18516896+andreas-grafberger@users.noreply.github.com> Date: Fri, 19 Sep 2025 15:08:52 +0200 Subject: [PATCH 49/49] docs: reference gribjump example notebook in missing locations Implements PR suggestions from https://github.com/ecmwf/earthkit-data/pull/689#discussion_r2362461306. 
--- docs/examples/index.rst | 1 + tests/documentation/test_notebooks.py | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/examples/index.rst b/docs/examples/index.rst index cfc8af796..02e9e551a 100644 --- a/docs/examples/index.rst +++ b/docs/examples/index.rst @@ -30,6 +30,7 @@ Data sources polytope_feature.ipynb s3.ipynb wekeo.ipynb + gribjump.ipynb GRIB ++++++ diff --git a/tests/documentation/test_notebooks.py b/tests/documentation/test_notebooks.py index df0d82cd3..88f6b6bb9 100644 --- a/tests/documentation/test_notebooks.py +++ b/tests/documentation/test_notebooks.py @@ -25,25 +25,26 @@ EXAMPLES = earthkit_file("docs", "examples") SKIP = [ + "ads.ipynb", + "cds.ipynb", + "demo_source_plugin.ipynb", + "ecmwf_open_data.ipynb", "fdb.ipynb", + "grib_fdb_write.ipynb", + "grib_to_fdb_target.ipynb", + "grib_to_xarray.ipynb", + "gribjump.ipynb", "mars.ipynb", - "cds.ipynb", - "ads.ipynb", - "wekeo.ipynb", + "netcdf_opendap.ipynb", "polytope.ipynb", "polytope_feature.ipynb", "polytope_polygon_coverage.ipynb", "polytope_time_series.ipynb", "polytope_vertical_profile.ipynb", - "grib_fdb_write.ipynb", - "demo_source_plugin.ipynb", - "ecmwf_open_data.ipynb", "shapefile.ipynb", - "grib_to_xarray.ipynb", - "grib_to_fdb_target.ipynb", - "xarray_engine_chunks_on_dask_cluster.ipynb", + "wekeo.ipynb", "xarray_cupy.ipynb", - "netcdf_opendap.ipynb", + "xarray_engine_chunks_on_dask_cluster.ipynb", ] if NO_TORCH: