ecmwf · sandorkertesz · Apr 30, 2025 · Mar 17, 2025 · Mar 17, 2025 · Mar 17, 2025
diff --git a/docs/guide/misc/patterns.rst b/docs/guide/misc/patterns.rst
@@ -0,0 +1,194 @@
+.. _patterns:
+
+Using patterns for the file-pattern source
+==========================================
+
+The :ref:`data-sources-file-pattern` source works with an input path pattern. The pattern is a string containing parameters within ``{}`` brackets. The way these parameters are substituted depends on the ``hive_partitioning`` option.
+
+Pattern substitution
++++++++++++++++++++++
+
+hive_partitioning=False
+/////////////////////////////
+
+When ``hive_partitioning=False`` we have to specify all the possible values for all the parameters. E.g.:
+
+.. code-block:: python
+
+    from_source(
+        "file-pattern",
+        "mydir/{year}/myfile_{param}.grib",
+        year=[2023, 2024],
+        param=["t", "r"],
+    )
+
+When this code is executed the file paths are constructed from the Cartesian product of the substituted values. The example above will result in a :py:class:`Fieldlist` built from the following paths::
+
+    mydir/2023/myfile_t.grib
+    mydir/2023/myfile_r.grib
+    mydir/2024/myfile_t.grib
+    mydir/2024/myfile_r.grib
+
+
+hive_partitioning=True
+/////////////////////////////
+
+When ``hive_partitioning=True`` the behaviour is different. The pattern values can still be specified, but this is optional since they can be determined dynamically. See the :ref:`file-pattern source <data-sources-file-pattern>` documentation for details.
+
+Pattern item types
+++++++++++++++++++++
+
+Each pattern parameter can have an optional type specifier.
+
+The following pattern types are available:
+
+- ``int``: enforce the input values to be integers. An optional format can be specified.
+
+    .. code-block:: python
+
+        {name: int}
+        {name: int(format)}
+
+    .. list-table::
+        :header-rows: 1
+        :widths: auto
+
+        * - Pattern
+          - Value
+          - Substituted value/Error
+        * - {step:int}
+          - 5
+          - "5"
+        * - {step:int(%04d)}
+          - 5
+          - "0005"
+        * - {step:int}
+          - "5"
+          - ValueError
+        * - {step:int}
+          - 5.0
+          - ValueError
+
+- ``float``: enforce the input values to be floats or ints. An optional format can be specified, the default is ``%g``.
+
+    .. code-block:: python
+
+        {name: float}
+        {name: float(format)}
+
+    .. list-table::
+        :header-rows: 1
+        :widths: auto
+
+        * - Pattern
+          - Value
+          - Substituted value/Error
+        * - {val:float}
+          - 5.1
+          - "5.1"
+        * - {val:float}
+          - 5.0
+          - "5"
+        * - {val:float}
+          - 5
+          - "5"
+        * - {val:float(%.2f)}
+          - 5.1
+          - "5.10"
+        * - {step:float}
+          - "5.0"
+          - ValueError
+
+- ``enum``: enforce the input values to be one of the specified values.
+
+    .. code-block:: python
+
+        {name: enum(value1, value2, value3)}
+
+
+    .. list-table::
+        :header-rows: 1
+        :widths: auto
+
+        * - Pattern
+          - Value
+          - Substituted value/Error
+        * - {step:enum(0,6,12)}
+          - [0, 6]
+          - "0" and "6"
+        * - {step:enum(0,6,12)}
+          - [0,18]
+          - ValueError
+
+- ``date``: all values are cast to a datetime formatted with the ``datetime.strftime`` syntax. The formatting must be specified.
+
+    .. code-block:: python
+
+        {my_date: date(format)}
+
+    .. list-table::
+        :header-rows: 1
+        :widths: auto
+
+        * - Pattern
+          - Value
+          - Substituted value/Error
+        * - {my_date:date(%Y-%m-%d)}
+          - [datetime.datetime(2023, 1, 1), datetime.datetime(2023, 1, 2)]
+          - "2023-01-01" and "2023-01-02"
+        * - {my_date:date(%Y-%m-%d)}
+          - ["20230101", "20230102"]
+          - "2023-01-01" and "2023-01-02"
+
+- ``strftime``: alias to ``date``
+
+- ``strftimedelta``: all values are cast to a datetime by applying the specified timedelta. Datetime formatting must be specified.
+
+    .. code-block:: python
+
+        {my_date: strftimedelta(delta, format)}
+
+    where ``delta`` can be specified in seconds, minutes, hours (the default is hours), e.g.::
+
+        6
+        -6h
+        60m
+        7200s
+
+    .. list-table::
+        :header-rows: 1
+        :widths: auto
+
+        * - Pattern
+          - Value
+          - Substituted value/Error
+        * - {my_date:strftimedelta(-6,%Y-%m-%d_%H)}
+          - [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6) ]
+          - "2020-05-10_18" and "2020-05-11_00"
+        * - {my_date:strftimedelta(60m,%Y-%m-%d_%H)}
+          - [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6) ]
+          - "2020-05-11_01" and "2020-05-11_07"
+        * - {my_date:strftimedelta(7200s,%Y-%m-%d_%H)}
+          - [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6) ]
+          - "2020-05-11_02" and "2020-05-11_08"
+
+
+Built-in pattern item functions
++++++++++++++++++++++++++++++++
+
+The built-in pattern item functions are applied to the substituted values. The syntax is as follows::
+
+    {param|function1|function2|...|functionN}
+
+At the moment, the only built-in pattern function is ``lower``.
+
+   .. list-table::
+        :header-rows: 1
+        :widths: auto
+
+        * - Pattern
+          - Value
+          - Substituted value
+        * - {param|lower}
+          - ["T", "z", "Rhu" ]
+          - "t", "z" and "rhu"
diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst
@@ -153,56 +153,98 @@ file
 file-pattern
 --------------
 
-.. py:function:: from_source("file-pattern", pattern, *args, **kwargs)
+.. py:function:: from_source("file-pattern", pattern, *args, hive_partitioning=False, **kwargs)
   :noindex:
 
-  The ``file-pattern`` source will build paths from the pattern specified,
-  using the other arguments to fill the pattern. Each argument can be a list
-  to iterate and create the cartesian product of all lists.
-  Then each file is read in the same ways as with the :ref:`file source <data-sources-file>`.
+  The ``file-pattern`` source reads data from paths specified by a :ref:`pattern <patterns>`.
 
-  .. code-block:: python
+  :param pattern: input path pattern using ``{}`` brackets to define parameters that can be substituted. See :ref:`patterns <patterns>` for details.
+  :type pattern: str
+  :param tuple *args: specify the values to substitute into the ``pattern`` parameters. Each parameter can be a list/tuple or a single value.
+  :param hive_partitioning: control how the ``pattern`` is interpreted. See details below.
+  :type hive_partitioning: bool
+  :param dict **kwargs: other keyword arguments specifying the parameter values
 
-      import datetime
-      import earthkit.data as ekd
+  The actual behaviour and the type of the returned object depend on ``hive_partitioning``:
 
-      ds = ekd.from_source(
-          "file-pattern",
-          "path/to/data-{my_date:date(%Y-%m-%d)}-{run_time}-{param}.grib",
-          {
-              "my_date": datetime.datetime(2020, 5, 2),
-              "run_time": [12, 18],
-              "param": ["t2", "msl"],
-          },
-      )
+hive_partitioning=False
+////////////////////////////
 
+  When ``hive_partitioning`` is ``False``, first, the pattern parameters are substituted with the values specified by the ``*args`` and ``**kwargs``, see :ref:`patterns <patterns>` for details. For this, all the possible values must be specified for each pattern parameter. Next, the paths are constructed by taking the Cartesian product of the substituted values. Finally, the resulting paths are read and :ref:`from_source <data-sources-file-pattern>` returns a single object (for GRIB data it will be a :py:class:`Fieldlist`).
 
-  The code above will read the following files::
+    .. code-block:: python
 
-    path/to/data-2020-05-02-12-t2.grib
-    path/to/data-2020-05-02-12-msl.grib
-    path/to/data-2020-05-02-18-t2.grib
-    path/to/data-2020-05-02-18-msl.grib
+        import datetime
+        import earthkit.data as ekd
 
+        # ds is a fieldlist
+        ds = ekd.from_source(
+            "file-pattern",
+            "path/to/data-{my_date:date(%Y-%m-%d)}-{run_time}-{param}.grib",
+            {
+                "my_date": datetime.datetime(2020, 5, 2),
+                "run_time": [12, 18],
+                "param": ["t2", "msl"],
+            },
+        )
 
-  .. code-block:: python
 
-      import datetime
-      import earthkit.data as ekd
+    The code above substitutes "my_date", "run_time" and "param" into the ``pattern`` and constructs the following file paths, which are read into a single GRIB :py:class:`Fieldlist`::
 
-      ds = ekd.from_source(
-          "file-pattern",
-          "path/to/data-{my_date:strftime(-6;%Y%m%d%H)}-006-{param}.grib",
-          {
-              "my_date": datetime.datetime(2020, 5, 2, 0),
-              "param": ["t2", "msl"],
-          },
-      )
+        path/to/data-2020-05-02-12-t2.grib
+        path/to/data-2020-05-02-12-msl.grib
+        path/to/data-2020-05-02-18-t2.grib
+        path/to/data-2020-05-02-18-msl.grib
+
+
+hive_partitioning=True
+/////////////////////////////
+
+    When ``hive_partitioning`` is ``True``, the ``pattern`` defines a Hive partitioning with each pattern parameter interpreted as a metadata key. The returned object has a limited scope, only supporting the :meth:`sel` method. Calling this method will trigger a filesystem scan for all the matching files. During this scan, if the required metadata is present in the pattern, no files will be opened at all to extract their metadata, which can be an enormous optimisation. Another advantage is that during the scan entire file system branches can be skipped based simply on inspecting the actual file path.
+
+    Pattern values are optional, but can still be specified to restrict the search to a specific set of values.
+
+    For the hive partitioning example below let us suppose we have the following directory structure containing several years of GRIB data:
+
+    .. code-block:: text
+
+        mydir/
+            20230101/
+                myfile_t.grib
+                myfile_r.grib
+                myfile_u.grib
+                myfile_v.grib
+            20230102/
+                myfile_t.grib
+                myfile_r.grib
+                myfile_u.grib
+                myfile_v.grib
+            20230103/
+                myfile_t.grib
+                myfile_r.grib
+                myfile_u.grib
+                myfile_v.grib
+            20230104/
+            ...
+
+    .. code-block:: python
+
+        import datetime
+        import earthkit.data as ekd
+
+        # At this point nothing is scanned/read yet. ds only has the
+        # sel() method.
+        ds = ekd.from_source(
+            "file-pattern", "mydir/{date}/myfile_{param}.grib", hive_partitioning=True
+        )
 
-  The code above will read the following files::
+        # The following line will trigger a filesystem scan
+        # for all the matching files. The scan will be limited to the
+        # "mydir/20230101/" sub-directory and none of the GRIB files will be
+        # opened to extract their metadata. The returned object will
+        # be a Fieldlist.
+        ds1 = ds.sel(date="20230101", param=["t", "r"])
 
-    path/to/data-2020050118-006-t2.grib
-    path/to/data-2020050118-006-msl.grib
 
 Further examples:
 

diff --git a/environment.yml b/environment.yml
@@ -40,6 +40,7 @@ dependencies:
 - myst-parser
 - pre-commit
 - pydata-sphinx-theme
+- pyfakefs
 - pytest
 - pytest-cov
 - pytest-forked

diff --git a/pyproject.toml b/pyproject.toml
@@ -79,6 +79,7 @@ optional-dependencies.test = [
   "earthkit-data-demo-source",
   "nbconvert",
   "nbformat",
+  "pyfakefs",
   "pytest",
   "pytest-cov",
   "pytest-forked",