diff --git a/docs/examples/grib_to_zarr_target.ipynb b/docs/examples/grib_to_zarr_target.ipynb new file mode 100644 index 000000000..1df45a6fc --- /dev/null +++ b/docs/examples/grib_to_zarr_target.ipynb @@ -0,0 +1,822 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8e308cd3-7f5a-4b62-bd2d-027850282c00", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "## Writing GRIB data to Zarr" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "62b00621-67cd-46b0-81ef-16278a6eee18", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8d75e667605a4e4eb22967f6a3d6e9c6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "pl.grib: 0%| | 0.00/48.8k [00:00\n", + "#T_ca541 th {\n", + " text-align: left;\n", + "}\n", + "#T_ca541_row0_col0, #T_ca541_row0_col1, #T_ca541_row0_col2, #T_ca541_row0_col3, #T_ca541_row0_col4, #T_ca541_row0_col5, #T_ca541_row0_col6, #T_ca541_row0_col7, #T_ca541_row0_col8, #T_ca541_row1_col0, #T_ca541_row1_col1, #T_ca541_row1_col2, #T_ca541_row1_col3, #T_ca541_row1_col4, #T_ca541_row1_col5, #T_ca541_row1_col6, #T_ca541_row1_col7, #T_ca541_row1_col8 {\n", + " text-align: left;\n", + "}\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
  leveldatetimestepparamIdclassstreamtypeexperimentVersionNumber
shortNametypeOfLevel         
risobaricInhPa700,50020240603,202406040,12000,6157odoperfc0001
tisobaricInhPa700,50020240603,202406040,12000,6130odoperfc0001
\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.describe()" + ] + }, + { + "cell_type": "markdown", + "id": "2ab4d979-7c02-42e4-9b52-8ec12e76853b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "#### Using to_target() on the data object" + ] + }, + { + "cell_type": "raw", + "id": "60ee891e-f0cd-426c-8f26-8155e9c25381", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "We use :func:`to_target` to write the GRIB fieldlist/field into a zarr store. First, the data is converted to Xarray then :py:func:`xarray.Dataset.to_zarr` is called to generate the zarr store. We need to set the kwargs accordingly." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e3ff25d0-bce4-4cfc-bdd0-93ca0864d08d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniforge/base/envs/dev/lib/python3.11/site-packages/zarr/api/asynchronous.py:205: UserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# with these options each field will be a separate chunk\n", + "ds.to_target(\"zarr\", \n", + " earthkit_to_xarray_kwargs={\"chunks\": {\"forecast_reference_time\": 1, \n", + " \"step\": 1, \n", + " \"level\": 1}},\n", + " xarray_to_zarr_kwargs={\"store\": \"_pl.zarr\", \"mode\": \"w\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3ffafb60-c412-4560-9bea-089143bcf85d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
/\n",
+       "├── forecast_reference_time (4,) int64\n",
+       "├── latitude (19,) float64\n",
+       "├── level (2,) int64\n",
+       "├── longitude (36,) float64\n",
+       "├── r (4, 2, 2, 19, 36) float64\n",
+       "├── step (2,) int64\n",
+       "└── t (4, 2, 2, 19, 36) float64\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m/\u001b[0m\n", + "├── \u001b[1mforecast_reference_time\u001b[0m (4,) int64\n", + "├── \u001b[1mlatitude\u001b[0m (19,) float64\n", + "├── \u001b[1mlevel\u001b[0m (2,) int64\n", + "├── \u001b[1mlongitude\u001b[0m (36,) float64\n", + "├── \u001b[1mr\u001b[0m (4, 2, 2, 19, 36) float64\n", + "├── \u001b[1mstep\u001b[0m (2,) int64\n", + "└── \u001b[1mt\u001b[0m (4, 2, 2, 19, 36) float64\n" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import zarr \n", + "root = zarr.group(\"_pl.zarr\")\n", + "root.tree()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "515bc071-d45f-48aa-abea-0cc688f4eebc", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Type : Array\n", + "Zarr format : 3\n", + "Data type : DataType.float64\n", + "Shape : (4, 2, 2, 19, 36)\n", + "Chunk shape : (1, 1, 1, 19, 36)\n", + "Order : C\n", + "Read-only : False\n", + "Store type : LocalStore\n", + "Filters : ()\n", + "Serializer : BytesCodec(endian=)\n", + "Compressors : (ZstdCodec(level=0, checksum=False),)\n", + "No. bytes : 87552 (85.5K)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "root[\"t\"].info" + ] + }, + { + "cell_type": "markdown", + "id": "ef19bc33-fcc7-4b5a-83b0-ee59da4179f0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "The zarr store can be loaded to Xarray to check its content." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "706d4467-64b8-46bf-894d-346088208fa2", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/93/w0p869rx17q98wxk83gn9ys40000gn/T/ipykernel_45349/754541422.py:2: FutureWarning: In a future version of xarray decode_timedelta will default to False rather than None. To silence this warning, set decode_timedelta to True, False, or a 'CFTimedeltaCoder' instance.\n", + " xarray.open_dataset(\"_pl.zarr\")\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 176kB\n",
+       "Dimensions:                  (step: 2, longitude: 36,\n",
+       "                              forecast_reference_time: 4, latitude: 19, level: 2)\n",
+       "Coordinates:\n",
+       "  * step                     (step) timedelta64[ns] 16B 00:00:00 06:00:00\n",
+       "  * longitude                (longitude) float64 288B 0.0 10.0 ... 340.0 350.0\n",
+       "  * forecast_reference_time  (forecast_reference_time) datetime64[ns] 32B 202...\n",
+       "  * latitude                 (latitude) float64 152B 90.0 80.0 ... -80.0 -90.0\n",
+       "  * level                    (level) int64 16B 500 700\n",
+       "Data variables:\n",
+       "    r                        (forecast_reference_time, step, level, latitude, longitude) float64 88kB ...\n",
+       "    t                        (forecast_reference_time, step, level, latitude, longitude) float64 88kB ...\n",
+       "Attributes:\n",
+       "    class:        od\n",
+       "    stream:       oper\n",
+       "    levtype:      pl\n",
+       "    type:         fc\n",
+       "    expver:       0001\n",
+       "    date:         20240603\n",
+       "    time:         0\n",
+       "    domain:       g\n",
+       "    number:       0\n",
+       "    Conventions:  CF-1.8\n",
+       "    institution:  ECMWF
" + ], + "text/plain": [ + " Size: 176kB\n", + "Dimensions: (step: 2, longitude: 36,\n", + " forecast_reference_time: 4, latitude: 19, level: 2)\n", + "Coordinates:\n", + " * step (step) timedelta64[ns] 16B 00:00:00 06:00:00\n", + " * longitude (longitude) float64 288B 0.0 10.0 ... 340.0 350.0\n", + " * forecast_reference_time (forecast_reference_time) datetime64[ns] 32B 202...\n", + " * latitude (latitude) float64 152B 90.0 80.0 ... -80.0 -90.0\n", + " * level (level) int64 16B 500 700\n", + "Data variables:\n", + " r (forecast_reference_time, step, level, latitude, longitude) float64 88kB ...\n", + " t (forecast_reference_time, step, level, latitude, longitude) float64 88kB ...\n", + "Attributes:\n", + " class: od\n", + " stream: oper\n", + " levtype: pl\n", + " type: fc\n", + " expver: 0001\n", + " date: 20240603\n", + " time: 0\n", + " domain: g\n", + " number: 0\n", + " Conventions: CF-1.8\n", + " institution: ECMWF" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import xarray\n", + "xarray.open_dataset(\"_pl.zarr\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8479a12e-e907-43de-a3a6-aefb8cbfa754", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dev", + "language": "python", + "name": "dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/index.rst b/docs/examples/index.rst index c945dddd4..727becb00 100644 --- a/docs/examples/index.rst +++ b/docs/examples/index.rst @@ -181,6 +181,7 @@ Targets and encoders grib_to_file_pattern_target.ipynb grib_to_fdb_target.ipynb grib_to_geotiff.ipynb + grib_to_zarr_target.ipynb grib_encoder.ipynb Miscellaneous diff --git a/docs/guide/targets/index.rst b/docs/guide/targets/index.rst index 2611967ff..2efcb80f2 100644 --- a/docs/guide/targets/index.rst +++ b/docs/guide/targets/index.rst @@ -23,6 +23,7 @@ Examples - :ref:`/examples/grib_to_file_pattern_target.ipynb` - :ref:`/examples/grib_to_fdb_target.ipynb` - :ref:`/examples/grib_to_geotiff.ipynb` + - :ref:`/examples/grib_to_zarr_target.ipynb` Overview diff --git a/docs/guide/targets/to_target.rst b/docs/guide/targets/to_target.rst index f39d4d33d..8c943527f 100644 --- a/docs/guide/targets/to_target.rst +++ b/docs/guide/targets/to_target.rst @@ -39,6 +39,10 @@ Built in targets * - :ref:`targets-fdb` - add data to a `Fields DataBase `_ (FDB) - :py:class:`~data.targets.FDBTarget` + * - :ref:`targets-zarr` + - add data to a `zarr `_ store + - :py:class:`~data.targets.ZarrTarget` + .. _targets-file: @@ -170,6 +174,25 @@ fdb - :ref:`/examples/grib_to_fdb_target.ipynb` +.. _targets-zarr: + +zarr +---- + +.. py:function:: to_target("zarr", earthkit_to_xarray_kwargs=None, xarray_to_zarr_kwargs=None, data=None) + :noindex: + + The ``zarr`` target writes to a `zarr `_ store. + + :param dict earthkit_to_xarray_kwargs: the keyword arguments passed to the :func:`to_xarray` function. If not provided, the default values are used. + :param dict xarray_to_zarr_kwargs: the keyword arguments passed to the :py:func:`xarray.Dataset.to_zarr` function. As a bare minimum, the ``store`` keyword argument must be provided. + :param data: specify the data to write. Cannot be set when :func:`to_target` is called on a data object. + + This target converts the data to an :ref:`xarray.Dataset ` and then writes it to a zarr store using the :py:func:`xarray.Dataset.to_zarr` function. The conversion to an xarray dataset is done by the :func:`to_xarray` function. + + Notebook examples: + + - :ref:`/examples/grib_to_zarr_target.ipynb` .. .. _data-targets-multio: diff --git a/src/earthkit/data/encoders/zarr.py b/src/earthkit/data/encoders/zarr.py new file mode 100644 index 000000000..2e9b05085 --- /dev/null +++ b/src/earthkit/data/encoders/zarr.py @@ -0,0 +1,80 @@ +# (C) Copyright 2023 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + +import logging + +from . import EncodedData +from . import Encoder + +LOG = logging.getLogger(__name__) + + +class ZarrEncodedData(EncodedData): + def __init__(self, ds): + self.ds = ds + + def to_bytes(self): + return None + + def to_file(self, f): + return None + + def to_xarray(self): + return self.ds + + def metadata(self, key): + raise NotImplementedError + + +class ZarrEncoder(Encoder): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def encode( + self, + data=None, + **kwargs, + ): + if data is not None: + from earthkit.data.wrappers import get_wrapper + + data = get_wrapper(data) + return data._encode(self, **kwargs) + else: + raise ValueError("No data to encode") + + def _encode( + self, + data=None, + values=None, + min=None, + max=None, + check_nans=False, + metadata={}, + template=None, + # return_bytes=False, + missing_value=9999, + **kwargs, + ): + return ZarrEncodedData(data.to_xarray(add_earthkit_attrs=False)) + + def _encode_field(self, field, **kwargs): + raise NotImplementedError("ZarrEncoder does not support encoding individual fields.") + + def _encode_fieldlist(self, data, **kwargs): + earthkit_to_xarray_kwargs = kwargs.pop("earthkit_to_xarray_kwargs", {}) + # earthkit_to_xarray_kwargs.update(kwargs) + earthkit_to_xarray_kwargs["add_earthkit_attrs"] = False + kwargs = earthkit_to_xarray_kwargs + + ds = data.to_xarray(**kwargs) + return ZarrEncodedData(ds) + + +encoder = ZarrEncoder diff --git a/src/earthkit/data/targets/zarr.py b/src/earthkit/data/targets/zarr.py new file mode 100644 index 000000000..9bb4fdd6f --- /dev/null +++ b/src/earthkit/data/targets/zarr.py @@ -0,0 +1,51 @@ +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + +import logging + +from . import SimpleTarget + +LOG = logging.getLogger(__name__) + + +class ZarrTarget(SimpleTarget): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._zarr_kwargs = kwargs + self._ekd_kwargs = kwargs.pop("earthkit_to_xarray_kwargs", {}) + self._xr_kwargs = kwargs.pop("xarray_to_zarr_kwargs", {}) + self._encoder = "zarr" + + def close(self): + """Close the target and flush the fdb. + + The target will not be able to write anymore. + + Raises: + ------- + ValueError: If the target is already closed. + """ + pass + + def flush(self): + """Flush the fdb. + + Raises: + ------- + ValueError: If the target is already closed. + """ + pass + + def _write(self, data, **kwargs): + r = self._encode(data, earthkit_to_xarray_kwargs=self._ekd_kwargs) + ds = r.to_xarray() + ds.to_zarr(**self._xr_kwargs) + + +target = ZarrTarget diff --git a/src/earthkit/data/testing.py b/src/earthkit/data/testing.py index e9b08d28b..115a87ced 100644 --- a/src/earthkit/data/testing.py +++ b/src/earthkit/data/testing.py @@ -133,6 +133,16 @@ def modules_installed(*modules): NO_ECFS = True +NO_ZARR = True +try: + import zarr # noqa + + if int(zarr.__version__.split(".")[0]) >= 3: + NO_ZARR = False +except Exception: + pass + + def MISSING(*modules): return not modules_installed(*modules) diff --git a/tests/sources/test_zarr.py b/tests/sources/test_zarr.py index f84a58693..109cf2d82 100644 --- a/tests/sources/test_zarr.py +++ b/tests/sources/test_zarr.py @@ -1,16 +1,21 @@ -import importlib.util +#!/usr/bin/env python3 + +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# import pytest from earthkit.data import from_source from earthkit.data.readers.netcdf.field import XArrayField +from earthkit.data.testing import NO_ZARR from earthkit.data.testing import earthkit_test_data_file -if importlib.util.find_spec("zarr") is not None: - NO_ZARR = False -else: - NO_ZARR = True - @pytest.mark.skipif(NO_ZARR, reason="Zarr not installed") def test_zarr_source(): diff --git a/tests/targets/test_target_zarr.py b/tests/targets/test_target_zarr.py new file mode 100644 index 000000000..fed4cb6b2 --- /dev/null +++ b/tests/targets/test_target_zarr.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + + +import os + +import pytest + +from earthkit.data import from_source +from earthkit.data.core.temporary import temp_directory +from earthkit.data.targets import to_target +from earthkit.data.testing import NO_ZARR + + +@pytest.mark.skipif(NO_ZARR, reason="Zarr not installed") +@pytest.mark.cache +@pytest.mark.parametrize("direct_call", [True, False]) +def test_target_zarr_from_grib(direct_call): + ds = from_source("sample", "pl.grib") + + with temp_directory() as tmp: + path = os.path.join(tmp, "_res.zarr") + + if direct_call: + to_target( + "zarr", + earthkit_to_xarray_kwargs={"chunks": {"forecast_reference_time": 1, "step": 1, "level": 1}}, + xarray_to_zarr_kwargs={"store": path, "mode": "w"}, + data=ds, + ) + else: + ds.to_target( + "zarr", + earthkit_to_xarray_kwargs={"chunks": {"forecast_reference_time": 1, "step": 1, "level": 1}}, + xarray_to_zarr_kwargs={"store": path, "mode": "w"}, + ) + + import zarr + + root = zarr.group(path) + assert root + + shapes = { + "t": (4, 2, 2, 19, 36), + "r": (4, 2, 2, 19, 36), + "forecast_reference_time": (4,), + "step": (2,), + "level": (2,), + "latitude": (19,), + "longitude": (36,), + } + + for k in ["t", "r", "forecast_reference_time", "step", "level", "latitude", "longitude"]: + k in root, f"Key {k} not found in Zarr root" + assert ( + root[k].shape == shapes[k] + ), f"Shape mismatch for {k}: expected {shapes[k]}, got {root[k].shape}"