diff --git a/pyproject.toml b/pyproject.toml index f7f2c6d5..c313bf45 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "pandas", "pdbufr>=0.11", "pyyaml", + "requests", "tqdm>=4.63", "xarray>=0.19" ] diff --git a/src/earthkit/data/sources/zenodo.py b/src/earthkit/data/sources/zenodo.py new file mode 100644 index 00000000..902b0854 --- /dev/null +++ b/src/earthkit/data/sources/zenodo.py @@ -0,0 +1,142 @@ +# (C) Copyright 2026- ECMWF and individual contributors. + +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation nor +# does it submit to any jurisdiction. + +import fnmatch +import logging +import re + +import requests + +from earthkit.data.core.config import CONFIG +from earthkit.data.sources import Source, from_source_internal + +LOG = logging.getLogger(__name__) + +_DOI_PATTERN = re.compile(r"^(?:doi:\s*)?(10\.5281/zenodo\.\d+)$", flags=re.IGNORECASE) +_URL_PATTERN = re.compile(r"^(?:https?:\/\/)?zenodo\.org\/records?\/(\d+)\/?(?:\?.*)?$") + + +def _resolve_doi(doi): + timeout = CONFIG.get("url-download-timeout") + + LOG.debug("Resolving DOI %s", doi) + try: + r = requests.get(f"https://doi.org/{doi}", timeout=timeout) + r.raise_for_status() + except requests.ConnectionError as e: + raise RuntimeError("Could not connect to doi.org") from e + except requests.Timeout as e: + raise RuntimeError(f"request to doi.org timed out after {timeout}s") from e + except requests.HTTPError as e: + raise RuntimeError(f"doi.org returned HTTP {r.status_code}") from e + + resolved_url = r.url + LOG.debug(f"DOI {doi} resolved to {resolved_url}") + + match = _URL_PATTERN.match(resolved_url) + if not match: + raise ValueError(f"DOI '{doi}' resolved to an unexpected URL: {resolved_url}. Expected a Zenodo record URL.") + return int(match.group(1)) + + +def _get_record_files(record_id): + timeout = CONFIG.get("url-download-timeout") + + api_url = f"https://zenodo.org/api/records/{record_id}" + LOG.debug(f"Fetching file list for record {record_id} from {api_url}") + try: + r = requests.get(api_url, timeout=timeout) + r.raise_for_status() + except requests.ConnectionError as e: + raise RuntimeError("could not connect to zenodo.org") from e + except requests.Timeout as e: + raise RuntimeError(f"request to zenodo.org timed out after {timeout}s.") from e + except requests.HTTPError as e: + raise RuntimeError(f"Zenodo API returned HTTP {r.status_code}") from e + + try: + data = r.json() + except ValueError as e: + raise RuntimeError("failed to parse Zenodo API response") from e + + if "files" not in data or not data["files"]: + raise RuntimeError(f"Record {record_id} has no accessible files. The record may be restricted or embargoed.") + + files = [f["key"] for f in data["files"]] + LOG.debug(f"Record {record_id} contains {len(files)} file(s): {files!r}") + + # Map of file name to its download URL, sorted by file name + return {name: f"https://zenodo.org/records/{record_id}/files/{name}?download=1" for name in sorted(files)} + + +class Zenodo(Source): + """Source for downloading files from Zenodo records. + + Parameters + ---------- + identifier : int | str + Record ID, Zenodo URL or DOI. + filenames : str | Sequence[str] | None, optional + File selection with a glob string or an explicit list of file names. + By default, all files are selected. + **kwargs + Additional keyword arguments forwarded to the URL source. + """ + + def __init__(self, identifier, filenames=None, **kwargs): + super().__init__() + self._kwargs = kwargs + + if isinstance(identifier, str): + identifier = identifier.strip() + + # Resolve DOI to record ID + if isinstance(identifier, str) and (match := _DOI_PATTERN.match(identifier)): + doi = match.group(1) + self.record_id = _resolve_doi(doi) + # Treat everything else as a record ID + elif isinstance(identifier, int): + self.record_id = identifier + elif isinstance(identifier, str) and (match := _URL_PATTERN.match(identifier)): + self.record_id = int(match.group(1)) + elif isinstance(identifier, str) and identifier.isnumeric(): + self.record_id = int(identifier) + else: + raise ValueError(f"unable to determine record ID from identifier: {identifier!r}") + + LOG.info(f"Zenodo record ID: {self.record_id}") + + # Fetch file metadata from the Zenodo API + record_files = _get_record_files(self.record_id) + + # No filenames specified -> select all + if filenames is None: + self._file_urls = record_files + # Match filenames with provided pattern + elif isinstance(filenames, str): + matched = fnmatch.filter(record_files.keys(), filenames) + if not matched: + raise FileNotFoundError(f"no files in record {self.record_id} matched the pattern: {filenames!r}") + self._file_urls = {name: record_files[name] for name in matched} + # Select filenames based on provided list + else: + for name in filenames: + if name not in record_files: + raise FileNotFoundError(f"file {name!r} not found in record {self.record_id}") + self._file_urls = {name: record_files[name] for name in filenames} + + LOG.info( + f"Selected {len(self._file_urls)} file(s) from record {self.record_id}:, ".join(self._file_urls.keys()) + ) + + def mutate(self): + urls = list(self._file_urls.values()) + return from_source_internal("url", urls, **self._kwargs) + + +source = Zenodo