diff --git a/docs/guide/misc/patterns.rst b/docs/guide/misc/patterns.rst new file mode 100644 index 000000000..41e7f9596 --- /dev/null +++ b/docs/guide/misc/patterns.rst @@ -0,0 +1,194 @@ +.. _patterns: + +Using patterns for the file-pattern source +========================================== + +The :ref:`data-sources-file-pattern` source works with an input path pattern. The pattern is a string containing parameters within ``{}`` brackets. The way these parameters are substituted depends on the ``hive_partitioning`` option. + +Pattern substitution ++++++++++++++++++++++ + +hive_partitioning=False +///////////////////////////// + +When ``hive_partitioning=False`` we have to specify all the possible values for all the parameters. E.g.: + +.. code-block:: python + + from_source( + "file-pattern", + "mydir/{year}/myfile_{param}.grib", + year=[2023, 2024], + param=["t", "r"], + ) + +When this code is executed the file paths are constructed from the Cartesian product of the substituted values. The example above will result in a :py:class:`Fieldlist` built from the following paths:: + + mydir/2023/myfile_t.grib + mydir/2023/myfile_r.grib + mydir/2024/myfile_t.grib + mydir/2024/myfile_r.grib + + +hive_partitioning=True +///////////////////////////// + +When ``hive_partitioning=True`` the behaviour is different. The pattern values can still be specified, but this is optional since they can be determined dynamically. See :ref:`here ` for details. + +Pattern item types +++++++++++++++++++++ + +Each pattern parameter can have an optional type specifier. + +The following pattern types are available: + +- ``int``: enforce the input values to be integers. An optional format can be specified. + + .. code-block:: python + + {name: int} + {name: int(format)} + + .. 
list-table:: + :header-rows: 1 + :widths: auto + + * - Pattern + - Value + - Substituted value/Error + * - {step:int} + - 5 + - "5" + * - {step:int(%04d)} + - 5 + - "0005" + * - {step:int} + - "5" + - ValueError + * - {step:int} + - 5.0 + - ValueError + +- ``float``: enforce the input values to be floats or ints. An optional format can be specified, the default is ``%g``. + + .. code-block:: python + + {name: float} + {name: float(format)} + + .. list-table:: + :header-rows: 1 + :widths: auto + + * - Pattern + - Value + - Substituted value/Error + * - {val:float} + - 5.1 + - "5.1" + * - {val:float} + - 5.0 + - "5" + * - {val:float} + - 5 + - "5" + * - {val:float(%.2f)} + - 5.1 + - "5.10" + * - {step:float} + - "5.0" + - ValueError + +- ``enum``: enforce the input values to be one of the specified values + + .. code-block:: python + + {name: enum(value1, value2, value3)} + + + .. list-table:: + :header-rows: 1 + :widths: auto + + * - Pattern + - Value + - Substituted value/Error + * - {step:enum(0,6,12)} + - [0, 6] + - "0" and "6" + * - {step:enum(0,6,12)} + - [0,18] + - ValueError + +- ``date``: all values are cast to a datetime formatted with the ``datetime.strftime`` syntax. The formatting must be specified. + + .. code-block:: python + + {my_date: date(format)} + + .. list-table:: + :header-rows: 1 + :widths: auto + + * - Pattern + - Value + - Substituted value/Error + * - {my_date:date(%Y-%m-%d)} + - [datetime.datetime(2023, 1, 1), datetime.datetime(2023, 1, 2)] + - "2023-01-01" and "2023-01-02" + * - {my_date:date(%Y-%m-%d)} + - ["20230101", "20230102"] + - "2023-01-01" and "2023-01-02" + +- ``strftime``: alias of ``date`` + +- ``strftimedelta``: all values are cast to a datetime by applying the specified timedelta. Datetime formatting must be specified. + + .. code-block:: python + + {my_date: strftimedelta(delta; format)} + + where ``delta`` and ``format`` are separated by a semicolon and ``delta`` can be specified in seconds, minutes or hours (the default is hours), e.g.:: + + 6 + -6h + 60m + 7200s + + .. 
list-table:: + :header-rows: 1 + :widths: auto + + * - Pattern + - Value + - Substituted value/Error + * - {my_date:strftimedelta(-6;%Y-%m-%d_%H)} + - [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6) ] + - "2020-05-10_18" and "2020-05-11_00" + * - {my_date:strftimedelta(60m;%Y-%m-%d_%H)} + - [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6) ] + - "2020-05-11_01" and "2020-05-11_07" + * - {my_date:strftimedelta(7200s;%Y-%m-%d_%H)} + - [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6) ] + - "2020-05-11_02" and "2020-05-11_08" + + +Built-in pattern item functions ++++++++++++++++++++++++++++++++ + +The built-in pattern item functions are applied to the substituted values. The syntax is as follows:: + + {param|function1|function2|...|functionN} + +At the moment, the only built-in pattern function is ``lower``. + + .. list-table:: + :header-rows: 1 + :widths: auto + + * - Pattern + - Value + - Substituted value + * - {param|lower} + - ["T", "z", "Rhu" ] + - "t", "z" and "rhu" diff --git a/docs/guide/sources.rst b/docs/guide/sources.rst index fb6274e44..fa54d5e20 100644 --- a/docs/guide/sources.rst +++ b/docs/guide/sources.rst @@ -153,56 +153,98 @@ file file-pattern -------------- -.. py:function:: from_source("file-pattern", pattern, *args, **kwargs) +.. py:function:: from_source("file-pattern", pattern, *args, hive_partitioning=False, **kwargs) :noindex: - The ``file-pattern`` source will build paths from the pattern specified, - using the other arguments to fill the pattern. Each argument can be a list - to iterate and create the cartesian product of all lists. - Then each file is read in the same ways as with the :ref:`file source `. + The ``file-pattern`` source reads data from paths specified by a :ref:`pattern `. - .. code-block:: python + :param pattern: input path pattern using ``{}`` brackets to define parameters that can be substituted. See :ref:`patterns ` for details. 
+ :type pattern: str + :param tuple *args: specify the values to substitute into the parameters of ``pattern``. Each parameter can be a list/tuple or a single value. + :param hive_partitioning: control how the ``pattern`` is interpreted. See details below. + :type hive_partitioning: bool + :param dict **kwargs: other keyword arguments specifying the parameter values - import datetime - import earthkit.data as ekd + The actual behaviour and the type of the returned object depend on ``hive_partitioning``: - ds = ekd.from_source( - "file-pattern", - "path/to/data-{my_date:date(%Y-%m-%d)}-{run_time}-{param}.grib", - { - "my_date": datetime.datetime(2020, 5, 2), - "run_time": [12, 18], - "param": ["t2", "msl"], - }, - ) +hive_partitioning=False +//////////////////////////// + When ``hive_partitioning`` is ``False``, first, the pattern parameters are substituted with the values specified by the ``*args`` and ``**kwargs``, see :ref:`patterns ` for details. For this, all the possible values must be specified for each pattern parameter. Next, the paths are constructed by taking the Cartesian product of the substituted values. Finally, the resulting paths are read and :ref:`from_source ` returns a single object (for GRIB data it will be a :py:class:`Fieldlist`). - The code above will read the following files:: + .. code-block:: python - path/to/data-2020-05-02-12-t2.grib - path/to/data-2020-05-02-12-msl.grib - path/to/data-2020-05-02-18-t2.grib - path/to/data-2020-05-02-18-msl.grib + import datetime + import earthkit.data as ekd + # ds is a fieldlist + ds = ekd.from_source( + "file-pattern", + "path/to/data-{my_date:date(%Y-%m-%d)}-{run_time}-{param}.grib", + { + "my_date": datetime.datetime(2020, 5, 2), + "run_time": [12, 18], + "param": ["t2", "msl"], + }, + ) .. 
code-block:: python - import datetime - import earthkit.data as ekd + The code above substitutes "my_date", "run_time" and "param" into the ``pattern`` and constructs the following file paths, which are read into a single GRIB :py:class:`Fieldlist`:: - ds = ekd.from_source( - "file-pattern", - "path/to/data-{my_date:strftime(-6;%Y%m%d%H)}-006-{param}.grib", - { - "my_date": datetime.datetime(2020, 5, 2, 0), - "param": ["t2", "msl"], - }, - ) + path/to/data-2020-05-02-12-t2.grib + path/to/data-2020-05-02-12-msl.grib + path/to/data-2020-05-02-18-t2.grib + path/to/data-2020-05-02-18-msl.grib + + +hive_partitioning=True +///////////////////////////// + + When ``hive_partitioning`` is ``True``, the ``pattern`` defines a Hive partitioning with each pattern parameter interpreted as a metadata key. The returned object has a limited scope only supporting the :meth:`sel` method. Calling this method will trigger a filesystem scan for all the matching files. During this scan, if the required metadata is present in the pattern no files will be opened at all to extract their metadata, which can be an enormous optimisation. Another advantage is that during the scan entire file system branches can be skipped based simply on inspecting the actual file path. + + Pattern values are optional, but can still be specified to restrict the search to a specific set of values. + + For the hive partitioning example below let us suppose we have the following directory structure containing several years of GRIB data: + + .. code-block:: text + + mydir/ + 20230101/ + myfile_t.grib + myfile_r.grib + myfile_u.grib + myfile_v.grib + 20230102/ + myfile_t.grib + myfile_r.grib + myfile_u.grib + myfile_v.grib + 20230103/ + myfile_t.grib + myfile_r.grib + myfile_u.grib + myfile_v.grib + 20230104/ + ... + + .. code-block:: python + + import datetime + import earthkit.data as ekd + + # At this point nothing is scanned/read yet. ds only has the + # sel() method. 
+ ds = ekd.from_source( + "file-pattern", "mydir/{date}/myfile_{param}.grib", hive_partitioning=True + ) - The code above will read the following files:: + # The following line will trigger a filesystem scan + # for all the matching files. The scan will be limited to the + # "mydir/20230101/" sub-directory and none of the GRIB files will be + # opened to extract their metadata. The returned object will + # be a Fieldlist. + ds1 = ds.sel(date="20230101", param=["t", "r"]) - path/to/data-2020050118-006-t2.grib - path/to/data-2020050118-006-msl.grib Further examples: diff --git a/environment.yml b/environment.yml index b028f0fbe..94acc2537 100644 --- a/environment.yml +++ b/environment.yml @@ -40,6 +40,7 @@ dependencies: - myst-parser - pre-commit - pydata-sphinx-theme +- pyfakefs - pytest - pytest-cov - pytest-forked diff --git a/pyproject.toml b/pyproject.toml index 43d99c5df..803d32709 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ optional-dependencies.test = [ "earthkit-data-demo-source", "nbconvert", "nbformat", + "pyfakefs", "pytest", "pytest-cov", "pytest-forked", diff --git a/src/earthkit/data/sources/file_pattern.py b/src/earthkit/data/sources/file_pattern.py index ee263fb35..5c524cb24 100644 --- a/src/earthkit/data/sources/file_pattern.py +++ b/src/earthkit/data/sources/file_pattern.py @@ -7,19 +7,97 @@ # nor does it submit to any jurisdiction. 
# +from typing import Any as TypingAny +from typing import Dict +from typing import Optional +from typing import Tuple +from typing import Union + +from earthkit.data.sources import Source +from earthkit.data.sources import from_source +from earthkit.data.sources.empty import EmptySource from earthkit.data.sources.file import File from earthkit.data.sources.multi import MultiSource +from earthkit.data.utils.patterns import HivePattern from earthkit.data.utils.patterns import Pattern +class HiveFilePattern(Source): + def __init__(self, pattern: str, params: Dict[str, TypingAny], **kwargs: TypingAny) -> None: + self.scanner = HivePattern(pattern, params) + + def sel( + self, + *args: Tuple[Dict[str, TypingAny]], + _hive_diag: Optional[TypingAny] = None, + **kwargs: TypingAny, + ) -> Union[EmptySource, MultiSource]: + from earthkit.data.core.index import normalize_selection + + kwargs = normalize_selection(*args, **kwargs) + + rest = {k: v for k, v in kwargs.items() if k not in self.scanner.params} + for k in rest: + del kwargs[k] + + if rest: + out = EmptySource() + for f in self.scanner.scan(**kwargs): + ds = from_source("file", f) + out += ds.sel(**rest) + if _hive_diag: + _hive_diag.file(1) + _hive_diag.sel(1) + + return out + else: + sources = [File(f) for f in self.scanner.scan(**kwargs)] + + if _hive_diag: + _hive_diag.file(len(sources)) + + src = MultiSource(sources) + + prev = None + while src is not prev: + prev = src + src = src.mutate() + return src + + class FilePattern(MultiSource): - def __init__(self, pattern, *args, filter=None, merger=None, **kwargs): - files = Pattern(pattern).substitute(*args, **kwargs) - if not isinstance(files, list): - files = [files] + def __init__( + self, + pattern: str, + *args: Tuple[Dict[str, TypingAny]], + filter: Optional[TypingAny] = None, + merger: Optional[TypingAny] = None, + hive_partitioning: bool = False, + **kwargs: TypingAny, + ) -> None: + self.hive_partitioning = hive_partitioning + + if not 
self.hive_partitioning: + files = Pattern(pattern).substitute(*args, **kwargs) + if not isinstance(files, list): + files = [files] + + sources = [File(file) for file in sorted(files)] + super().__init__(sources, filter=filter, merger=merger) + + else: + self.pattern = pattern + params = {} + for a in args: + params.update(a) + params.update(kwargs) + self.params = params - sources = [File(file) for file in sorted(files)] - super().__init__(sources, filter=filter, merger=merger) + def mutate(self) -> Union["HiveFilePattern", "FilePattern"]: + if self.hive_partitioning: + return HiveFilePattern(self.pattern, self.params) + else: + return self source = FilePattern diff --git a/src/earthkit/data/utils/patterns.py b/src/earthkit/data/utils/patterns.py index 592d62d65..c3f92f64f 100644 --- a/src/earthkit/data/utils/patterns.py +++ b/src/earthkit/data/utils/patterns.py @@ -8,25 +8,80 @@ # import itertools +import logging +import os import re +from functools import cached_property +from pathlib import Path +from typing import Any as TypingAny +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple +from typing import Union from .dates import to_datetime from .dates import to_timedelta +LOG = logging.getLogger(__name__) + RE1 = re.compile(r"{([^}]*)}") RE2 = re.compile(r"\(([^}]*)\)") class Any: - def substitute(self, value, name): + """Represents a type that accepts any value.""" + + def substitute(self, value: TypingAny, name: str) -> TypingAny: + """Substitute the value without any validation. + + Parameters + ---------- + value : Any + The value to substitute. + name : str + The name of the parameter. + + Returns + ------- + Any + The substituted value. + """ return value class Enum: - def __init__(self, enum=""): - self.enum = set(enum.split(",")) - - def substitute(self, value, name): + """Represents a type that accepts a value from a predefined set of strings. 
+ + Parameters + ---------- + enum : str, optional + Comma-separated string of allowed values, by default "". + """ + + def __init__(self, enum: str = "") -> None: + self.enum: set[str] = set(enum.split(",")) + + def substitute(self, value: str, name: str) -> str: + """Substitute the value if it is in the predefined set. + + Parameters + ---------- + value : str + The value to substitute. + name : str + The name of the parameter. + + Returns + ------- + str + The substituted value. + + Raises + ------ + ValueError + If the value is not in the predefined set. + """ if self.enum and value not in self.enum: raise ValueError( "Invalid value '{}' for parameter '{}', expected one of {}".format(value, name, self.enum) @@ -35,20 +90,74 @@ def substitute(self, value, name): class Int: - def __init__(self, format="%d"): - self.format = format - - def substitute(self, value, name): + """Represents a type that accepts integer values. + + Parameters + ---------- + format : str, optional + Format string for integer substitution, by default "%d". + """ + + def __init__(self, format: str = "%d") -> None: + self.format: str = format + + def substitute(self, value: int, name: str) -> str: + """Substitute the value if it is an integer. + + Parameters + ---------- + value : int + The value to substitute. + name : str + The name of the parameter. + + Returns + ------- + str + The substituted value. + + Raises + ------ + ValueError + If the value is not an integer. + """ if not isinstance(value, int): raise ValueError("Invalid value '{}' for parameter '{}', expected an integer".format(value, name)) return self.format % value class Float: - def __init__(self, format="%g"): - self.format = format - - def substitute(self, value, name): + """Represents a type that accepts float values. + + Parameters + ---------- + format : str, optional + Format string for float substitution, by default "%g". 
+ """ + + def __init__(self, format: str = "%g") -> None: + self.format: str = format + + def substitute(self, value: Union[int, float], name: str) -> str: + """Substitute the value if it is a float or integer. + + Parameters + ---------- + value : int or float + The value to substitute. + name : str + The name of the parameter. + + Returns + ------- + str + The substituted value. + + Raises + ------ + ValueError + If the value is not a float or integer. + """ if not isinstance(value, (int, float)): raise ValueError("Invalid value '{}' for parameter '{}', expected a float".format(value, name)) @@ -56,15 +165,56 @@ def substitute(self, value, name): class Datetime: - def __init__(self, format): - self.format = format - - def substitute(self, value, name): + """Represents a type that accepts datetime values. + + Parameters + ---------- + format : str + Format string for datetime substitution. + """ + + def __init__(self, format: str) -> None: + self.format: str = format + + def substitute(self, value: TypingAny, name: str) -> str: + """Substitute the value as a formatted datetime string. + + Parameters + ---------- + value : Any + The value to substitute. + name : str + The name of the parameter. + + Returns + ------- + str + The substituted datetime string. + """ return to_datetime(value).strftime(self.format) class DatetimeDelta: - def __init__(self, params): + """Represents a type that accepts datetime deltas. + + Parameters + ---------- + params : str + Parameters for delta and format, separated by a semicolon. + + Attributes + ---------- + delta : str + Timedelta string for datetime substitution. Accepted formats use the following suffixes: + - "h" for hours (the default) + - "m" for minutes + - "s" for seconds + + format : str + Format string for datetime substitution. 
+ """ + + def __init__(self, params: str) -> None: params_list = params.split(";") if len(params_list) != 2: raise ValueError( @@ -73,7 +223,26 @@ def __init__(self, params): self.delta = params_list[0].strip() self.format = params_list[1].strip() - def substitute(self, value, name): + def substitute(self, value: TypingAny, name: str) -> str: + """Substitute the datetime value, add the delta and format the result. + + Parameters + ---------- + value : Any + The value to substitute. Must be convertible to a datetime object. + name : str + The name of the parameter. + + Returns + ------- + str + The substituted datetime value with the delta added to it and formatted. + + Raises + ------ + ValueError + If the delta format is invalid. + """ sign = -1 if self.delta[0] == "-" else 1 if re.fullmatch(r"[-\+]?\d+[hms]?", self.delta): delta = re.search(r"\d+[hms]?", self.delta).group(0) @@ -89,10 +258,37 @@ def substitute(self, value, name): class Str: - def __init__(self, format="%s"): - self.format = format - - def substitute(self, value, name): + """Represents a type that accepts string values. + + Parameters + ---------- + format : str, optional + Format string for string substitution, by default "%s". + """ + + def __init__(self, format: str = "%s") -> None: + self.format: str = format + + def substitute(self, value: str, name: str) -> str: + """Substitute the value if it is a string. + + Parameters + ---------- + value : str + The value to substitute. + name : str + The name of the parameter. + + Returns + ------- + str + The substituted value. + + Raises + ------ + ValueError + If the value is not a string. + """ if not isinstance(value, str): raise ValueError("Invalid value '{}' for parameter '{}', expected a string".format(value, name)) return self.format % value @@ -110,17 +306,54 @@ def substitute(self, value, name): class Constant: - name = None + """Represents a constant value in a pattern. 
- def __init__(self, value): - self.value = value + Parameters + ---------- + value : Any + The constant value. + """ + + name: Optional[str] = None + + def __init__(self, value: TypingAny) -> None: + self.value: TypingAny = value + + def substitute(self, params: Dict[str, TypingAny], **kwargs: TypingAny) -> TypingAny: + """Substitute the constant value. + + Parameters + ---------- + params : dict + Parameters to substitute (not used for constants). + + Returns + ------- + Any + The constant value. + """ + return self.value + + def substitute_many(self, params: Dict[str, TypingAny], **kwargs: TypingAny) -> TypingAny: + return [self.value] - def substitute(self, params): + def pattern(self) -> str: return self.value + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.value})" + class Variable: - def __init__(self, value): + """Represents a variable in a pattern. + + Parameters + ---------- + value : str + The variable definition string. + """ + + def __init__(self, value: str) -> None: bits = value.split(":") self.name = bits[0] kind = RE2.split(":".join(bits[1:])) @@ -129,38 +362,155 @@ def __init__(self, value): else: self.kind = TYPES[kind[0]](kind[1]) - def substitute(self, params): - if self.name not in params: + self.value = value + + def substitute(self, params: Dict[str, TypingAny]) -> TypingAny: + """Substitute value for a parameter. + + Parameters + ---------- + params : dict + The value belonging to ``self.name`` in ``params`` are + substituted into the Variable. The value to substitute must be a + single value. + + Returns + ------- + Substituted value. + + Raises + ------ + ValueError + If ``self.name`` is not in ``params``. 
+ + Example + ------- + >>> v = Variable("my_date:date(%Y-%m-%d)") + >>> v.substitute({"my_date": "2000-01-01"}) + '2000-01-01' + + >>> v = Variable("my_date:date(%Y-%m-%d)") + >>> v.substitute({"level": "500"}) + ValueError: Missing parameter 'my_date' + """ + if self.name in params: + return self.kind.substitute(params[self.name], self.name) + else: raise ValueError("Missing parameter '{}'".format(self.name)) - return self.kind.substitute(params[self.name], self.name) + + def substitute_many(self, params: Dict[str, TypingAny]) -> Optional[List[TypingAny]]: + """Substitute all values for a parameter. + + Parameters + ---------- + params : dict + The values belonging to ``self.name`` in ``params`` are + substituted into the Variable. + + Returns + ------- + list + List of substituted values. + Raises + ------ + ValueError + If ``self.name`` is not in ``params``. + + + Example + ------- + >>> v = Variable("my_date:date(%Y-%m-%d)") + >>> v.substitute_many({"my_date": ["2000-01-01", "2000-01-02"]}) + ['2000-01-01', '2000-01-02'] + + >>> v = Variable("my_date:date(%Y-%m-%d)") + >>> v.substitute_many({"my_date": "2000-01-01"}) + ['2000-01-01'] + + >>> v = Variable("my_date:date(%Y-%m-%d)") + >>> v.substitute_many({"level": "500"}) + ValueError: Missing parameter 'my_date' + """ + if self.name in params: + v = params[self.name] + if not isinstance(v, list): + v = [v] + return [self.kind.substitute(x, self.name) for x in v] + else: + raise ValueError("Missing parameter '{}'".format(self.name)) + + def pattern(self) -> str: + return "{" + self.value + "}" + + def __repr__(self) -> str: + return f"Variable({self.name},{self.value},{self.kind})" FUNCTIONS = dict(lower=lambda s: s.lower()) class Function: - def __init__(self, value): + """Represents a function applied to a variable in a pattern. + + Parameters + ---------- + value : str + The function definition string. 
+ """ + + def __init__(self, value: str) -> None: functions = value.split("|") self.name = functions[0] self.variable = Variable(functions[0]) self.functions = functions[1:] - - def substitute(self, params): - value = self.variable.substitute(params) + self._pattern = value + + def substitute(self, params: Dict[str, TypingAny], **kwargs: TypingAny) -> TypingAny: + """Substitute the variable and apply functions. + + Parameters + ---------- + params : dict + Parameters to substitute. + + Returns + ------- + Any + The substituted and transformed value. + """ + value = self.variable.substitute(params, **kwargs) for f in self.functions: value = FUNCTIONS[f](value) return value + def substitute_many(self, params: Dict[str, TypingAny], **kwargs: TypingAny) -> TypingAny: + """Substitute all the values and apply every function, in order, to each of them.""" + values = self.variable.substitute_many(params, **kwargs) + for f in self.functions: + func = FUNCTIONS[f] + values = [func(v) for v in values] + return values + + def pattern(self) -> str: + return "{" + self._pattern + "}" # keep the braces so the text can be re-parsed as a pattern + class Pattern: - def __init__(self, pattern, ignore_missing_keys=False): - self.ignore_missing_keys = ignore_missing_keys + """Represents a pattern with variables and constants. + + Parameters + ---------- + pattern : str + The pattern string. 
+ """ + def __init__(self, pattern: str) -> None: self.pattern = [] self.variables = [] for i, p in enumerate(RE1.split(pattern)): if i % 2 == 0: - self.pattern.append(Constant(p)) + if p != "": + self.pattern.append(Constant(p)) else: if "|" in p: v = Function(p) @@ -170,10 +520,39 @@ def __init__(self, pattern, ignore_missing_keys=False): self.pattern.append(v) @property - def names(self): + def names(self) -> List[str]: return sorted({v.name for v in self.variables}) - def substitute(self, *args, **kwargs): + def is_constant(self) -> bool: + return not self.variables and len(self.pattern) == 1 and isinstance(self.pattern[0], Constant) + + def substitute( + self, + *args: Tuple[Dict[str, TypingAny]], + allow_extra: bool = False, + **kwargs: TypingAny, + ) -> Union[str, List[str]]: + """Substitute values into the pattern. + + Parameters + ---------- + args : tuple of dict + Positional dictionaries of parameters to substitute. + allow_extra : bool, optional + Whether to allow using input with parameters not part of the Pattern. + kwargs : dict + Additional keyword arguments for substitution. + + Returns + ------- + str or list + The substituted pattern as a string or list of strings. + + Raises + ------ + ValueError + If there are unused parameters and `allow_extra` is False. 
+ """ params = {} for a in args: params.update(a) @@ -181,24 +560,30 @@ def substitute(self, *args, **kwargs): for k, v in params.items(): if isinstance(v, list): - return self._substitute_many(params) + return self._substitute_many(params, allow_extra=allow_extra) - return self._substitute_one(params) - # TODO: discuss if this should be: - # return [self._substitute_one(params)] + return self._substitute_one(params, allow_extra=allow_extra) - def _substitute_one(self, params): + def _substitute_one( + self, + params: Dict[str, TypingAny], + allow_extra: bool = False, + ) -> str: used = set(params.keys()) result = [] for p in self.pattern: used.discard(p.name) result.append(p.substitute(params)) - if used and not self.ignore_missing_keys: + if used and not allow_extra: raise ValueError("Unused parameter(s): {}".format(used)) return "".join(str(x) for x in result) - def _substitute_many(self, params): + def _substitute_many( + self, + params: Dict[str, TypingAny], + allow_extra: bool = False, + ) -> List[str]: for k, v in list(params.items()): if not isinstance(v, list): params[k] = [v] @@ -206,9 +591,282 @@ def _substitute_many(self, params): seen = set() result = [] for n in (dict(zip(params.keys(), x)) for x in itertools.product(*params.values())): - m = self.substitute(n) + m = self.substitute(n, allow_extra=allow_extra) if m not in seen: seen.add(m) result.append(m) return result + + def _subpattern(self, params: Dict[str, TypingAny]) -> str: + """Substitute the pattern with the given parameters. + + Parameters + ---------- + params : dict + The parameters to substitute. Each parameter must be a single value. + + Returns + ------- + str + The substituted pattern. + """ + result = [] + for p in self.pattern: + try: + result.append(p.substitute(params)) + except ValueError: + result.append(p.pattern()) + + return "".join(str(x) for x in result) + + def match(self, value: str) -> Optional[re.Match]: + """Match pattern regex against value. 
+ + Parameters + ---------- + value : str + Value to match + + Returns + ------- + re.Match + re.Match object if the value matches the pattern. None otherwise. + + Example + ------- + >>> p = Pattern("t_{my_date:date(%Y-%m-%d)}.grib") + >>> p.match("t_2000-01-01.grib") + + + >>> p = Pattern("t_{my_date:date(%Y-%m-%d)}.grib") + >>> p.match("2000-01-01.grib") + None + + >>> p = Pattern("{shortName}_{my_date:date(%Y-%m-%d)}.grib") + >>> p.match("t_2000-01-01.grib") + + + >>> p = Pattern("data/t/level") + >>> p.match("data/t/level") + + >>> p.match("data/t/level/500") + None + """ + rx = self.regex + return rx.match(value) + + def __repr__(self) -> str: + t = "pattern:" + for p in self.pattern: + t += f"\n {p}" + return t + + @cached_property + def regex(self) -> re.Pattern: + t = "" + for p in self.pattern: + if isinstance(p, Constant): + t += re.escape(p.value) # constant text must match literally, e.g. the "." in ".grib" + else: + t += rf"(?P<{p.name}>\S+)" # raw string: "\S" is a regex class, not a string escape + + t = rf"^{t}$" + return re.compile(t) + + +class HivePattern: + """Hive pattern. + + Parameters + ---------- + pattern : str + The hive pattern string. + values : dict, optional + Dictionary of values for substitution, by default None. + + Attributes + ---------- + pattern : str + The original pattern string. + root : str + The root directory of the pattern. This is the beginning of the path not containing + any pattern parameters. + rest : str + The remaining part of the pattern after the root. + params : list + List of parameters in the pattern. + dynamic_params : list + List of parameters without user specified values in the pattern. + fixed_single_params : dict + Dictionary of user specified single value parameters. + fixed_multi_params : dict + Dictionary of user specified multi value parameters. + parts : list + List of pattern parts. Each part is a Pattern object representing a part of the path. 
+ """ + + def __init__(self, pattern: str, values: Optional[Dict[str, TypingAny]] = None) -> None: + self.pattern = pattern + values = values or {} + values = dict(values) + + # substitute single values into the pattern + pattern = Pattern(pattern) + self.fixed_single_params = {} + for k in list(values.keys()): + v = values[k] + LOG.debug(f" {k=} {v=}") + if isinstance(v, (list, tuple)): + if len(v) == 1: + self.fixed_single_params[k] = values.pop(k)[0] + elif v: + self.fixed_single_params[k] = values.pop(k) + + pattern = pattern._subpattern(self.fixed_single_params) + + # analyze path structure and turn each file path part into a + # pattern + path = Path(pattern) + self.root = "" + self.rest = "" + path_parts = path.parts + LOG.debug(f"{pattern=} {path_parts=}") + + parts = [Pattern(x) for x in path_parts] + self.parts = [] + for i, part in enumerate(parts): + if part.is_constant(): + self.root = os.path.join(self.root, part.pattern[0].value) + else: + self.rest = os.path.join(*path_parts[i:]) + self.parts = parts[i:] + break + + self.params = list(self.fixed_single_params.keys()) + self.dynamic_params = [] + self.fixed_multi_params = {} + for p in self.parts: + for v in p.variables: + if v.name not in self.params: # register each parameter only once + try: + s = v.substitute_many(values) + self.fixed_multi_params[v.name] = s + except ValueError: + self.dynamic_params.append(v.name) + self.params.append(v.name) + + assert len(self.fixed_multi_params) == len(values), f"{len(self.fixed_multi_params)} != {len(values)}" + + LOG.debug(f"root={self.root}") + LOG.debug(f"rest={self.rest}") + LOG.debug(f"params={self.params}") + LOG.debug(f"dynamic_params={self.dynamic_params}") + LOG.debug(f"fixed_single_params={self.fixed_single_params}") + LOG.debug(f"fixed_multi_params={self.fixed_multi_params}") + for p in self.parts: + LOG.debug(f" {p=}") + LOG.debug(f" re={p.regex}") + + def scan(self, *args: Dict[str, TypingAny], **kwargs: TypingAny) -> List[str]: + """Scan the file system for files matching the 
pattern. + + Parameters + ---------- + args : tuple of dicts + Positional dictionaries of parameters for scanning. + kwargs : dict + Additional keyword arguments for scanning. + + Returns + ------- + list + List of file paths matching the pattern. + + Raises + ------ + ValueError + If a parameter is not valid for the pattern. + """ + params_in = {} + for a in args: + params_in.update(a) + params_in.update(kwargs) + + for k in list(params_in.keys()): + if isinstance(params_in[k], tuple): + params_in[k] = list(params_in[k]) + elif not isinstance(params_in[k], list): + params_in[k] = [params_in[k]] + + # determine param values to use + params = {} + for k, v in params_in.items(): + if k in self.fixed_single_params: + if self.fixed_single_params[k] not in v: + return [] + + elif k in self.fixed_multi_params: + v1 = [v1 for v1 in v if v1 in self.fixed_multi_params[k]] + if v1: + params[k] = v1 + else: + return [] + elif k in self.dynamic_params: + params[k] = v + else: + raise ValueError(f"Invalid key '{k}' not in pattern") + + for k in params: + params[k] = set([str(x) for x in params[k]]) + + root_num = len(Path(self.root).parts) + last = len(self.parts) - 1 + res = [] + + # walk the file system + for root, dirs, files in os.walk(self.root): + # LOG.debug(f"walk: {root=}") + index = len(Path(root).parts) - root_num + # LOG.debug(f"{index=} {last=}") + part = self.parts[index] + + # intermediate level + if index != last: + exclude = [] + for d in dirs: + g = self.collect(d, part, params) + if g is None: + exclude.append(d) + # LOG.debug(f" {exclude=}") + if exclude: + dirs[:] = [d for d in dirs if d not in exclude] + continue + + # last level (collection) + else: + for file in files: + # LOG.debug(f" {file=}") + d = self.collect(file, part, params) + if d: + res.append(os.path.join(root, file)) + # LOG.debug(" match") + + return res + + def collect(self, file: str, part: Pattern, params: Dict[str, set[str]]) -> Optional[Dict[str, str]]: + # LOG.debug(f" match={file}") 
+ m = part.regex.match(file) + if m: + if part.is_constant(): + return {} + + group = m.groupdict() + if len(group) == len(part.variables): + for k, v in group.items(): + if k in params: + # LOG.debug(f" {k=} {v=} {params[k]=}") + if v not in params[k]: + return None + return group + return None diff --git a/tests/conftest.py b/tests/conftest.py index f1f61d663..10158dda1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,6 +10,7 @@ # Import all fixtures from list of plugins pytest_plugins = [ "list_of_dicts.lod_fixtures", + "patterns.patterns_fixtures", ] diff --git a/tests/data/pattern/1/r_2020-09-22T12:00:00_0.grib b/tests/data/pattern/1/r_2020-09-22T12:00:00_0.grib new file mode 100644 index 000000000..e83c2086b Binary files /dev/null and b/tests/data/pattern/1/r_2020-09-22T12:00:00_0.grib differ diff --git a/tests/data/pattern/1/r_2020-09-22T12:00:00_12.grib b/tests/data/pattern/1/r_2020-09-22T12:00:00_12.grib new file mode 100644 index 000000000..233c06020 Binary files /dev/null and b/tests/data/pattern/1/r_2020-09-22T12:00:00_12.grib differ diff --git a/tests/data/pattern/1/r_2020-09-22T12:00:00_24.grib b/tests/data/pattern/1/r_2020-09-22T12:00:00_24.grib new file mode 100644 index 000000000..8269703c8 Binary files /dev/null and b/tests/data/pattern/1/r_2020-09-22T12:00:00_24.grib differ diff --git a/tests/data/pattern/1/r_2020-09-22T12:00:00_6.grib b/tests/data/pattern/1/r_2020-09-22T12:00:00_6.grib new file mode 100644 index 000000000..d268d1600 Binary files /dev/null and b/tests/data/pattern/1/r_2020-09-22T12:00:00_6.grib differ diff --git a/tests/data/pattern/1/t_2020-09-22T12:00:00_0.grib b/tests/data/pattern/1/t_2020-09-22T12:00:00_0.grib new file mode 100644 index 000000000..4eabb909d Binary files /dev/null and b/tests/data/pattern/1/t_2020-09-22T12:00:00_0.grib differ diff --git a/tests/data/pattern/1/t_2020-09-22T12:00:00_12.grib b/tests/data/pattern/1/t_2020-09-22T12:00:00_12.grib new file mode 100644 index 000000000..92d9f5ab9 Binary 
files /dev/null and b/tests/data/pattern/1/t_2020-09-22T12:00:00_12.grib differ diff --git a/tests/data/pattern/1/t_2020-09-22T12:00:00_24.grib b/tests/data/pattern/1/t_2020-09-22T12:00:00_24.grib new file mode 100644 index 000000000..e3c8bc8e1 Binary files /dev/null and b/tests/data/pattern/1/t_2020-09-22T12:00:00_24.grib differ diff --git a/tests/data/pattern/1/t_2020-09-22T12:00:00_6.grib b/tests/data/pattern/1/t_2020-09-22T12:00:00_6.grib new file mode 100644 index 000000000..630e3af91 Binary files /dev/null and b/tests/data/pattern/1/t_2020-09-22T12:00:00_6.grib differ diff --git a/tests/data/pattern/1/z_2020-09-22T12:00:00_0.grib b/tests/data/pattern/1/z_2020-09-22T12:00:00_0.grib new file mode 100644 index 000000000..1d5f57e0d Binary files /dev/null and b/tests/data/pattern/1/z_2020-09-22T12:00:00_0.grib differ diff --git a/tests/data/pattern/1/z_2020-09-22T12:00:00_12.grib b/tests/data/pattern/1/z_2020-09-22T12:00:00_12.grib new file mode 100644 index 000000000..5c317410d Binary files /dev/null and b/tests/data/pattern/1/z_2020-09-22T12:00:00_12.grib differ diff --git a/tests/data/pattern/1/z_2020-09-22T12:00:00_24.grib b/tests/data/pattern/1/z_2020-09-22T12:00:00_24.grib new file mode 100644 index 000000000..b23aada71 Binary files /dev/null and b/tests/data/pattern/1/z_2020-09-22T12:00:00_24.grib differ diff --git a/tests/data/pattern/1/z_2020-09-22T12:00:00_6.grib b/tests/data/pattern/1/z_2020-09-22T12:00:00_6.grib new file mode 100644 index 000000000..77abe8d92 Binary files /dev/null and b/tests/data/pattern/1/z_2020-09-22T12:00:00_6.grib differ diff --git a/tests/environment-unit-tests.yml b/tests/environment-unit-tests.yml index 6929db1a1..666d4664b 100644 --- a/tests/environment-unit-tests.yml +++ b/tests/environment-unit-tests.yml @@ -43,6 +43,7 @@ dependencies: - myst-parser - pre-commit - pydata-sphinx-theme +- pyfakefs - pytest - pytest-cov - pytest-forked diff --git a/tests/patterns/patterns_fixtures.py b/tests/patterns/patterns_fixtures.py new 
@pytest.fixture
def hive_fs_1(fs):
    """File tree where all three keys are encoded in a flat file name under ``/my_root``.

    Returns whatever ``build_hive_fs`` returns for this layout.
    """
    # presumably ``fs`` is the pyfakefs fixture - verify against the test environment
    return build_hive_fs(
        "/my_root/{shortName}_{date:date(%Y-%m-%d)}+{step}.grib",
        {
            "shortName": ["t", "z", "r"],
            "date": [datetime.datetime(2020, 5, day) for day in (11, 12)],
            "step": [0, 6, 12],
        },
        "/my_root/r_2020-05-11+6.grib",
        18,
        fs,
    )
def build_hive_fs(pattern, values, sample, file_num, fs):
    """Create a file in ``fs`` for every path generated from ``pattern`` and ``values``.

    Before any file is created the generated paths are checked: there must be
    exactly ``file_num`` of them and ``sample`` must be one of them.

    Returns the tuple ``(pattern, paths, values)``.
    """
    from earthkit.data.utils.patterns import Pattern

    paths = Pattern(pattern).substitute(values)
    assert len(paths) == file_num
    assert sample in paths

    for path in paths:
        fs.create_file(path)

    return pattern, paths, values
@pytest.mark.parametrize("fx", ["hive_fs_1", "hive_fs_2", "hive_fs_3", "hive_fs_4", "hive_fs_5"])
def test_hive_full_scan(request, fx):
    """An unfiltered scan must return exactly the files created by the fixture."""
    pattern, created, _ = request.getfixturevalue(fx)

    found = HivePattern(pattern, {}).scan()

    assert sorted(created) == sorted(found)
class HiveDiag:
    """Pair of running counters: ``file_count`` and ``sel_count``."""

    def __init__(self):
        # both counters start at zero
        self.file_count, self.sel_count = 0, 0

    def file(self, count):
        """Add ``count`` to ``file_count``."""
        self.file_count += count

    def sel(self, count):
        """Add ``count`` to ``sel_count``."""
        self.sel_count += count

    def reset(self):
        """Set both counters back to zero."""
        self.file_count, self.sel_count = 0, 0
def test_hive_init_1():
    """A single-component pattern without values: no root, every key dynamic."""
    pattern = "{shortName}_{date:date(%Y-%m-%dT%H:%M)}_{step}.grib"
    keys = ["shortName", "date", "step"]

    hive = HivePattern(pattern)

    assert hive.pattern == pattern
    assert hive.params == keys
    assert hive.dynamic_params == keys
    assert hive.fixed_single_params == {}
    assert hive.fixed_multi_params == {}
    assert hive.root == ""
    assert hive.rest == pattern

    assert len(hive.parts) == 1
    # the variables of the only part must follow the order of the keys
    names = [variable.name for variable in hive.parts[0].variables]
    assert names == keys[: len(names)]
def test_hive_init_3():
    """Single values are substituted into the pattern, multi values are kept as variables."""
    pattern = "root_d/{year}/fc/t_{level}b_/a{shortName}_{date:date(%Y-%m-%dT%H:%M)}_{step}.grib"
    hive = HivePattern(pattern, {"year": [2023, 2024], "level": "500", "shortName": "t"})

    assert hive.pattern == pattern
    assert set(hive.params) == {"year", "level", "shortName", "date", "step"}
    assert hive.dynamic_params == ["date", "step"]
    assert hive.fixed_single_params == {"level": "500", "shortName": "t"}
    assert hive.fixed_multi_params == {"year": [2023, 2024]}
    assert hive.root == "root_d"
    assert hive.rest == "{year}/fc/t_500b_/at_{date:date(%Y-%m-%dT%H:%M)}_{step}.grib"

    assert len(hive.parts) == 4
    year_part, fc_part, level_part, file_part = hive.parts

    # variable names per path component
    expected_names = [["year"], [], [], ["date", "step"]]
    assert [[v.name for v in part.variables] for part in hive.parts] == expected_names

    # only the two middle components are free of variables
    assert [part.is_constant() for part in hive.parts] == [False, True, True, False]

    for text in ("2023", "2024", "2025", "2_2_3"):
        assert year_part.match(text)

    assert fc_part.match("fc")
    assert not fc_part.match("an")

    assert level_part.match("t_500b_")
    for text in ("at_500b_", "t_500b__", "t_500_b", "500"):
        assert not level_part.match(text)

    matched = file_part.match("at_2023-01-21_345.grib")
    assert matched is not None
    assert matched.groupdict() == {"date": "2023-01-21", "step": "345"}

    for text in ("a_t_2023-01-21_345.grib", "at2023-01-21_345.grib"):
        assert file_part.match(text) is None
@pytest.mark.parametrize(
    "pattern,values,expected_value,error",
    [
        ("level", {"level": "500"}, "500", None),
        ("level:int", {"level": 500}, "500", None),
        ("level:int(%04d)", {"level": 500}, "0500", None),
        ("level:int", {"level": "500"}, None, ValueError),
        ("level:float", {"level": 500}, "500", None),
        ("level:float", {"level": 500.0}, "500", None),
        ("level:float", {"level": 500.1}, "500.1", None),
        ("level:float(%.02f)", {"level": 500.1}, "500.10", None),
        ("level:enum(925,500,700)", {"level": "500"}, "500", None),
        ("level:enum(925,500,700)", {"level": 500}, "500", ValueError),
        ("level:enum(925,500,700)", {"level": "1000"}, None, ValueError),
        ("my_date:date(%Y-%m-%d)", {"my_date": "20200513"}, "2020-05-13", None),
        ("my_date:date(%Y-%m-%d)", {"my_date": datetime.datetime(2020, 5, 13)}, "2020-05-13", None),
        ("my_date:strftime(%Y-%m-%d)", {"my_date": datetime.datetime(2020, 5, 13)}, "2020-05-13", None),
        (
            "my_date:date(%Y%m%d_%H:%M)",
            {"my_date": datetime.datetime(2020, 5, 13, 11, 23, 0)},
            "20200513_11:23",
            None,
        ),
    ],
)
def test_pattern_variable_substitute(pattern, values, expected_value, error):
    """Substituting one value yields the formatted string or raises ``error``."""
    variable = Variable(pattern)
    if error:
        # the expected value is irrelevant when an exception is expected
        with pytest.raises(error):
            variable.substitute(values)
    else:
        assert variable.substitute(values) == expected_value
@pytest.mark.parametrize(
    "pattern,values,expected_value,error",
    [
        ("param|lower", {"param": "TeST"}, "test", None),
    ],
)
def test_pattern_function_substitute(pattern, values, expected_value, error):
    """A ``name|function`` item substitutes to the transformed value or raises ``error``."""
    function = Function(pattern)
    if error:
        with pytest.raises(error):
            function.substitute(values)
    else:
        assert function.substitute(values) == expected_value
@pytest.mark.parametrize(
    "pattern,values,expected_value",
    [
        ("test.{format}", {"format": ["nc", "grib"]}, ["test.nc", "test.grib"]),
        ("test_{id}.grib", {"id": [2, 3, "AA"]}, ["test_2.grib", "test_3.grib", "test_AA.grib"]),
        (
            "test_{my_date:date(%Y-%m-%d)}_{name}.grib",
            {"my_date": datetime.datetime(2020, 5, 13), "name": ["t2", "msl"]},
            ["test_2020-05-13_t2.grib", "test_2020-05-13_msl.grib"],
        ),
        (
            "test_{date:strftimedelta(-6;%Y-%m-%d_%H)}.grib",
            {"date": [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6)]},
            ["test_2020-05-10_18.grib", "test_2020-05-11_00.grib"],
        ),
        (
            "test_{date:strftimedelta(60m;%Y-%m-%d_%H)}.grib",
            {"date": [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6)]},
            ["test_2020-05-11_01.grib", "test_2020-05-11_07.grib"],
        ),
        ("test_{param|lower}", {"param": ["T", "z"]}, ["test_t", "test_z"]),
    ],
)
def test_pattern_core(pattern, values, expected_value):
    """Substituting all values into a full pattern yields the expected list of paths."""
    generated = Pattern(pattern).substitute(values)
    assert generated == expected_value
-# - -import datetime - -import pytest - -from earthkit.data.utils.patterns import Pattern - - -@pytest.mark.parametrize( - "pattern,values,expected_value", - [ - ("test.{format}", {"format": ["nc", "grib"]}, ["test.nc", "test.grib"]), - ("test_{id}.grib", {"id": [2, 3, "AA"]}, ["test_2.grib", "test_3.grib", "test_AA.grib"]), - ( - "test_{my_date:date(%Y-%m-%d)}_{name}.grib", - {"my_date": datetime.datetime(2020, 5, 13), "name": ["t2", "msl"]}, - ["test_2020-05-13_t2.grib", "test_2020-05-13_msl.grib"], - ), - ( - "test_{date:strftimedelta(-6;%Y-%m-%d_%H)}.grib", - {"date": [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6)]}, - ["test_2020-05-10_18.grib", "test_2020-05-11_00.grib"], - ), - ( - "test_{date:strftimedelta(60m;%Y-%m-%d_%H)}.grib", - {"date": [datetime.datetime(2020, 5, 11), datetime.datetime(2020, 5, 11, 6)]}, - ["test_2020-05-11_01.grib", "test_2020-05-11_07.grib"], - ), - ], -) -def test_pattern_core(pattern, values, expected_value): - p = Pattern(pattern) - assert p.substitute(values) == expected_value