diff --git a/pyproject.toml b/pyproject.toml index 803d32709..ae71d4b62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,7 @@ optional-dependencies.test = [ "pytest-timeout", ] optional-dependencies.wekeo = [ "hda>=2.22" ] +optional-dependencies.zarr = [ "zarr>=3" ] urls.Documentation = "https://earthkit-data.readthedocs.io/" urls.Homepage = "https://github.com/ecmwf/earthkit-data/" urls.Issues = "https://github.com/ecmwf/earthkit-data.issues" diff --git a/src/earthkit/data/readers/__init__.py b/src/earthkit/data/readers/__init__.py index 9faf10a99..35ba5b38f 100644 --- a/src/earthkit/data/readers/__init__.py +++ b/src/earthkit/data/readers/__init__.py @@ -204,27 +204,26 @@ def reader(source, path, **kwargs): raise TypeError("Provided reader must be a callable or a string, not %s" % type(reader)) - if os.path.isdir(path): - from .directory import DirectoryReader - - return DirectoryReader(source, path).mutate() - LOG.debug("Reader for %s", path) - if not os.path.exists(path): r = _non_existing(source, path, **kwargs) if r is not None: return r raise FileNotFoundError(f"No such file exists: '{path}'") - if os.path.getsize(path) == 0: - r = _empty(source, path, **kwargs) - if r is not None: - return r - raise Exception(f"File is empty: '{path}'") + LOG.debug("Reader for %s", path) - n_bytes = CONFIG.get("reader-type-check-bytes") - with open(path, "rb") as f: - magic = f.read(n_bytes) + if os.path.isdir(path): + magic = None + else: + if os.path.getsize(path) == 0: + r = _empty(source, path, **kwargs) + if r is not None: + return r + raise Exception(f"File is empty: '{path}'") + + n_bytes = CONFIG.get("reader-type-check-bytes") + with open(path, "rb") as f: + magic = f.read(n_bytes) LOG.debug("Looking for a reader for %s (%s)", path, magic) diff --git a/src/earthkit/data/readers/directory.py b/src/earthkit/data/readers/directory.py index 1a280bc08..a15af57be 100644 --- a/src/earthkit/data/readers/directory.py +++ b/src/earthkit/data/readers/directory.py @@ -61,10 +61,15 @@ def mutate(self): return self def mutate_source(self): - if os.path.exists(os.path.join(self.path, ".zattrs")): + if ( + os.path.exists(os.path.join(self.path, ".zarray")) + or os.path.exists(os.path.join(self.path, ".zgroup")) + or os.path.exists(os.path.join(self.path, ".zmetadata")) + or os.path.exists(os.path.join(self.path, ".zattrs")) + ): if self.stream: raise ValueError("Cannot stream zarr directories") - return from_source("zarr", self.path) + return from_source("xarray-zarr", self.path) if len(self._content) == 1: return from_source( @@ -103,5 +108,14 @@ def write(self, f, **kwargs): def reader(source, path, *, magic=None, deeper_check=False, **kwargs): - if magic is None and os.path.isdir(path): + if ( + magic is None + and os.path.isdir(path) + and not ( + os.path.exists(os.path.join(path, ".zarray")) + or os.path.exists(os.path.join(path, ".zgroup")) + or os.path.exists(os.path.join(path, ".zmetadata")) + or os.path.exists(os.path.join(path, ".zattrs")) + ) + ): return DirectoryReader(source, path) diff --git a/src/earthkit/data/readers/numpy.py b/src/earthkit/data/readers/numpy.py index 3cccc339b..326dbc1a6 100644 --- a/src/earthkit/data/readers/numpy.py +++ b/src/earthkit/data/readers/numpy.py @@ -34,12 +34,10 @@ def to_numpy(self, numpy_load_kwargs={}): def reader(source, path, *, magic=None, deeper_check=False, **kwargs): - if magic is None: # Bypass check and force - return NumpyReader(source, path) + if magic is not None: + if magic[:6] == b"\x93NUMPY": + return NumpyReader(source, path) - if magic[:6] == b"\x93NUMPY": - return NumpyReader(source, path) - - _, extension = os.path.splitext(path) - if magic[:4] == b"PK\x03\x04" and extension == ".npz": - return NumpyZipReader(source, path) + _, extension = os.path.splitext(path) + if magic[:4] == b"PK\x03\x04" and extension == ".npz": + return NumpyZipReader(source, path) diff --git a/src/earthkit/data/readers/tar.py b/src/earthkit/data/readers/tar.py index ce325e4e9..b4b95480f 100644 --- a/src/earthkit/data/readers/tar.py +++ b/src/earthkit/data/readers/tar.py @@ -34,5 +34,5 @@ def reader(source, path, *, magic=None, deeper_check=False, **kwargs): kind, compression = mimetypes.guess_type(path) - if magic is None or kind == "application/x-tar": + if kind == "application/x-tar": return TarReader(source, path, compression) diff --git a/src/earthkit/data/readers/text.py b/src/earthkit/data/readers/text.py index 7ea687c03..512d8a2ea 100644 --- a/src/earthkit/data/readers/text.py +++ b/src/earthkit/data/readers/text.py @@ -44,9 +44,6 @@ def mutate(self): def reader(source, path, *, magic=None, deeper_check=False, **kwargs): - if magic is None: # Bypass check and force - return TextReader(source, path) - if deeper_check: if is_text(path): return TextReader(source, path) diff --git a/src/earthkit/data/readers/zarr.py b/src/earthkit/data/readers/zarr.py new file mode 100644 index 000000000..97cb56183 --- /dev/null +++ b/src/earthkit/data/readers/zarr.py @@ -0,0 +1,45 @@ +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + +import os + +from . import Reader +from .netcdf.fieldlist import XArrayFieldList + + +class ZarrReader(XArrayFieldList, Reader): + + def __init__(self, source, path, **kwargs): + Reader.__init__(self, source, path, **kwargs) + XArrayFieldList.__init__(self, self._open_zarr(**kwargs)) + + def mutate_source(self): + return self + + def to_xarray(self, **kwargs): + return self._open_zarr(**kwargs) + + def _open_zarr(self, **kwargs): + import xarray as xr + + options = kwargs.get("xarray_open_zarr_kwargs", kwargs) + return xr.open_zarr(self.path, **options) + + def __repr__(self): + return f"ZarrReader({self.path})" + + +def reader(source, path, *, magic=None, deeper_check=False, **kwargs): + if ( + os.path.exists(os.path.join(path, ".zarray")) + or os.path.exists(os.path.join(path, ".zgroup")) + or os.path.exists(os.path.join(path, ".zmetadata")) + or os.path.exists(os.path.join(path, ".zattrs")) + ): + return ZarrReader(source, path) diff --git a/src/earthkit/data/sources/xarray_zarr.py b/src/earthkit/data/sources/xarray_zarr.py new file mode 100644 index 000000000..4b12c421d --- /dev/null +++ b/src/earthkit/data/sources/xarray_zarr.py @@ -0,0 +1,29 @@ +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + +from ..readers.zarr import ZarrReader +from . import Source + + +class ZarrSource(Source): + + def __init__(self, path, **kwargs): + super().__init__(**kwargs) + self._reader = ZarrReader(self, path, **kwargs) + + def mutate(self): + source = self._reader.mutate_source() + if source not in (None, self): + source._parent = self + return source + + return self + + +source = ZarrSource diff --git a/tests/data/test_zarr/.zattrs b/tests/data/test_zarr/.zattrs new file mode 100644 index 000000000..63e460446 --- /dev/null +++ b/tests/data/test_zarr/.zattrs @@ -0,0 +1,4 @@ +{ + "Conventions": "CF-1.6", + "history": "2024-04-24 15:40:12 GMT by grib_to_netcdf-2.36.0: /Users/cgr/install/eccodes/release/bin/grib_to_netcdf -o test4.nc test4.grib" +} diff --git a/tests/data/test_zarr/.zgroup b/tests/data/test_zarr/.zgroup new file mode 100644 index 000000000..ea023896e --- /dev/null +++ b/tests/data/test_zarr/.zgroup @@ -0,0 +1,3 @@ +{ + "zarr_format": 2 +} diff --git a/tests/data/test_zarr/.zmetadata b/tests/data/test_zarr/.zmetadata new file mode 100644 index 000000000..2366f8b4d --- /dev/null +++ b/tests/data/test_zarr/.zmetadata @@ -0,0 +1,213 @@ +{ + "metadata": { + ".zgroup": { + "zarr_format": 2 + }, + ".zattrs": { + "Conventions": "CF-1.6", + "history": "2024-04-24 15:40:12 GMT by grib_to_netcdf-2.36.0: /Users/cgr/install/eccodes/release/bin/grib_to_netcdf -o test4.nc test4.grib" + }, + "z/.zattrs": { + "units": "m**2 s**-2", + "long_name": "Geopotential", + "standard_name": "geopotential", + "add_offset": 33425.29356267787, + "scale_factor": 0.7605308942631956, + "missing_value": -32767, + "_ARRAY_DIMENSIONS": [ + "time", + "level", + "latitude", + "longitude" + ] + }, + "z/.zarray": { + "shape": [ + 1, + 2, + 181, + 360 + ], + "chunks": [ + 1, + 1, + 181, + 360 + ], + "fill_value": -32767, + "order": "C", + "filters": null, + "dimension_separator": ".", + "compressor": { + "id": "blosc", + "cname": "lz4", + "clevel": 5, + "shuffle": 1, + "blocksize": 0, + "typesize": null + }, + "zarr_format": 2, + "dtype": "