From 09869251d37c526597a44de175572778c44bc804 Mon Sep 17 00:00:00 2001
From: Christopher Polster <christopher.polster@ecmwf.int>
Date: Wed, 18 Mar 2026 13:58:54 +0100
Subject: [PATCH 1/2] Add source to fetch data from Zenodo based on record ID
 or DOI

---
 pyproject.toml                      |  1 +
 src/earthkit/data/sources/zenodo.py | 65 +++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 src/earthkit/data/sources/zenodo.py

diff --git a/pyproject.toml b/pyproject.toml
index f7f2c6d5c..c313bf456 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
   "pandas",
   "pdbufr>=0.11",
   "pyyaml",
+  "requests",
   "tqdm>=4.63",
   "xarray>=0.19"
 ]
diff --git a/src/earthkit/data/sources/zenodo.py b/src/earthkit/data/sources/zenodo.py
new file mode 100644
index 000000000..8ee81c2c6
--- /dev/null
+++ b/src/earthkit/data/sources/zenodo.py
@@ -0,0 +1,65 @@
+# (C) Copyright 2026- ECMWF and individual contributors.
+
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation nor
+# does it submit to any jurisdiction.
+
+import fnmatch
+import re
+
+import requests
+
+from earthkit.data.sources import Source
+from earthkit.data.sources import from_source_internal
+
+_DOI_PATTERN = re.compile(r"^(?:doi:\s*)?(10\.5281/zenodo\.\d+)$", flags=re.IGNORECASE)
+_URL_PATTERN = re.compile(r"^(?:https?:\/\/)?zenodo\.org\/records\/(\d+)\/?$")
+
+
+def _resolve_doi(doi):
+    r = requests.get(f"https://doi.org/{doi}")
+    r.raise_for_status()
+    return r.url
+
+
+def _get_file_list(record_id):
+    r = requests.get(f"https://zenodo.org/api/records/{record_id}")
+    r.raise_for_status()
+    return {f["key"] for f in r.json()["files"]}
+
+
+class Zenodo(Source):
+
+    def __init__(self, identifier, file="*", **kwargs):
+        super().__init__()
+        self.kwargs = kwargs
+
+        if match := _DOI_PATTERN.match(identifier):
+            identifier = _resolve_doi(match.group(1))
+
+        if isinstance(identifier, int):
+            self.record_id = identifier
+        elif match := _URL_PATTERN.match(identifier):
+            self.record_id = int(match.group(1))
+        elif identifier.isnumeric():
+            self.record_id = int(identifier)
+        else:
+            raise ValueError(f"unable to determine record ID from identifier {identifier}")
+
+        # Obtain the list of files in the record and match against provided pattern
+        record_files = _get_file_list(self.record_id)
+        if file is None:
+            self.files = record_files
+        else:
+            self.files = fnmatch.filter(record_files, file)
+            if not self.files:
+                raise FileNotFoundError(file)  # TODO
+
+    def mutate(self):
+        urls = [f"https://zenodo.org/records/{self.record_id}/files/{file}?download=1" for file in self.files]
+        return from_source_internal("url", urls, **self.kwargs)
+
+
+source = Zenodo

From a4225ddc32d5322f6405fed1dde1434999d746a9 Mon Sep 17 00:00:00 2001
From: Christopher Polster <christopher.polster@ecmwf.int>
Date: Wed, 29 Apr 2026 16:35:22 +0200
Subject: [PATCH 2/2] Add support for list of files and improve error handling

---
 src/earthkit/data/sources/zenodo.py | 139 +++++++++++++++++++++-------
 1 file changed, 108 insertions(+), 31 deletions(-)

diff --git a/src/earthkit/data/sources/zenodo.py b/src/earthkit/data/sources/zenodo.py
index 8ee81c2c6..902b0854e 100644
--- a/src/earthkit/data/sources/zenodo.py
+++ b/src/earthkit/data/sources/zenodo.py
@@ -7,59 +7,136 @@
 # does it submit to any jurisdiction.
 
 import fnmatch
+import logging
 import re
 
 import requests
 
-from earthkit.data.sources import Source
-from earthkit.data.sources import from_source_internal
+from earthkit.data.core.config import CONFIG
+from earthkit.data.sources import Source, from_source_internal
+
+LOG = logging.getLogger(__name__)
 
 _DOI_PATTERN = re.compile(r"^(?:doi:\s*)?(10\.5281/zenodo\.\d+)$", flags=re.IGNORECASE)
-_URL_PATTERN = re.compile(r"^(?:https?:\/\/)?zenodo\.org\/records\/(\d+)\/?$")
+_URL_PATTERN = re.compile(r"^(?:https?:\/\/)?zenodo\.org\/records?\/(\d+)\/?(?:\?.*)?$")
 
 
 def _resolve_doi(doi):
-    r = requests.get(f"https://doi.org/{doi}")
-    r.raise_for_status()
-    return r.url
-
-
-def _get_file_list(record_id):
-    r = requests.get(f"https://zenodo.org/api/records/{record_id}")
-    r.raise_for_status()
-    return {f["key"] for f in r.json()["files"]}
+    """Resolve a Zenodo DOI via doi.org and return the integer record ID parsed from the final URL."""
+    timeout = CONFIG.get("url-download-timeout")
+    LOG.debug("Resolving DOI %s", doi)
+    try:
+        r = requests.get(f"https://doi.org/{doi}", timeout=timeout)
+        r.raise_for_status()
+    # Timeout is tested first: requests.ConnectTimeout subclasses both Timeout and ConnectionError
+    except requests.Timeout as e:
+        raise RuntimeError(f"request to doi.org timed out after {timeout}s") from e
+    except requests.ConnectionError as e:
+        raise RuntimeError("Could not connect to doi.org") from e
+    except requests.HTTPError as e:
+        raise RuntimeError(f"doi.org returned HTTP {r.status_code}") from e
+
+    resolved_url = r.url
+    LOG.debug("DOI %s resolved to %s", doi, resolved_url)
+    match = _URL_PATTERN.match(resolved_url)
+    if not match:
+        raise ValueError(f"DOI '{doi}' resolved to an unexpected URL: {resolved_url}. Expected a Zenodo record URL.")
+    return int(match.group(1))
+
+
+def _get_record_files(record_id):
+    """Query the Zenodo records API and return a dict of file name -> download URL, ordered by file name."""
+    timeout = CONFIG.get("url-download-timeout")
+    api_url = f"https://zenodo.org/api/records/{record_id}"
+    LOG.debug("Fetching file list for record %s from %s", record_id, api_url)
+    try:
+        r = requests.get(api_url, timeout=timeout)
+        r.raise_for_status()
+    # Timeout is tested first: requests.ConnectTimeout subclasses both Timeout and ConnectionError
+    except requests.Timeout as e:
+        raise RuntimeError(f"request to zenodo.org timed out after {timeout}s.") from e
+    except requests.ConnectionError as e:
+        raise RuntimeError("could not connect to zenodo.org") from e
+    except requests.HTTPError as e:
+        raise RuntimeError(f"Zenodo API returned HTTP {r.status_code}") from e
+
+    try:
+        data = r.json()
+    except ValueError as e:
+        raise RuntimeError("failed to parse Zenodo API response") from e
+
+    if "files" not in data or not data["files"]:
+        raise RuntimeError(f"Record {record_id} has no accessible files. The record may be restricted or embargoed.")
+
+    files = [f["key"] for f in data["files"]]
+    LOG.debug("Record %s contains %d file(s): %r", record_id, len(files), files)
+    # Map of file name to its download URL, sorted by file name
+    return {name: f"https://zenodo.org/records/{record_id}/files/{name}?download=1" for name in sorted(files)}
 
 
 class Zenodo(Source):
-
-    def __init__(self, identifier, file="*", **kwargs):
+    """Source for downloading files from Zenodo records.
+
+    Parameters
+    ----------
+    identifier : int | str
+        Record ID, Zenodo URL or DOI.
+    filenames : str | Sequence[str] | None, optional
+        File selection with a glob string or an explicit list of file names.
+        By default, all files are selected.
+    **kwargs
+        Additional keyword arguments forwarded to the URL source.
+    """
+
+    def __init__(self, identifier, filenames=None, **kwargs):
         super().__init__()
-        self.kwargs = kwargs
+        self._kwargs = kwargs
 
-        if match := _DOI_PATTERN.match(identifier):
-            identifier = _resolve_doi(match.group(1))
+        if isinstance(identifier, str):
+            identifier = identifier.strip()
 
-        if isinstance(identifier, int):
+        # Resolve DOI to record ID
+        if isinstance(identifier, str) and (match := _DOI_PATTERN.match(identifier)):
+            doi = match.group(1)
+            self.record_id = _resolve_doi(doi)
+        # Treat everything else as a record ID
+        elif isinstance(identifier, int):
             self.record_id = identifier
-        elif match := _URL_PATTERN.match(identifier):
+        elif isinstance(identifier, str) and (match := _URL_PATTERN.match(identifier)):
             self.record_id = int(match.group(1))
-        elif identifier.isnumeric():
+        elif isinstance(identifier, str) and identifier.isdecimal():
             self.record_id = int(identifier)
         else:
-            raise ValueError(f"unable to determine record ID from identifier {identifier}")
-
-        # Obtain the list of files in the record and match against provided pattern
-        record_files = _get_file_list(self.record_id)
-        if file is None:
-            self.files = record_files
+            raise ValueError(f"unable to determine record ID from identifier: {identifier!r}")
+
+        LOG.info("Zenodo record ID: %s", self.record_id)
+
+        # Fetch file metadata from the Zenodo API
+        record_files = _get_record_files(self.record_id)
+
+        # No filenames specified -> select all
+        if filenames is None:
+            self._file_urls = record_files
+        # Match filenames with provided pattern
+        elif isinstance(filenames, str):
+            matched = fnmatch.filter(record_files.keys(), filenames)
+            if not matched:
+                raise FileNotFoundError(f"no files in record {self.record_id} matched the pattern: {filenames!r}")
+            self._file_urls = {name: record_files[name] for name in matched}
+        # Select filenames based on provided list
         else:
-            self.files = fnmatch.filter(record_files, file)
-            if not self.files:
-                raise FileNotFoundError(file)  # TODO
+            for name in filenames:
+                if name not in record_files:
+                    raise FileNotFoundError(f"file {name!r} not found in record {self.record_id}")
+            self._file_urls = {name: record_files[name] for name in filenames}
+
+        LOG.info(
+            "Selected %d file(s) from record %s: %s", len(self._file_urls), self.record_id, ", ".join(self._file_urls)
+        )
 
     def mutate(self):
-        urls = [f"https://zenodo.org/records/{self.record_id}/files/{file}?download=1" for file in self.files]
-        return from_source_internal("url", urls, **self.kwargs)
+        urls = list(self._file_urls.values())
+        return from_source_internal("url", urls, **self._kwargs)
 
 
 source = Zenodo