From d83e2f5eec4e44fb7c623c20c873c68eebd946ce Mon Sep 17 00:00:00 2001 From: Jordan Padams Date: Wed, 7 Jan 2026 12:50:10 -0800 Subject: [PATCH 1/6] Fix registry-based archive generation by using ref_lidvid_collection Workaround for #236 where the /products/{bundle-lidvid}/members endpoint has proven brittle. Instead, parse ref_lidvid_collection directly from the bundle properties to discover collections. This approach is equally accurate since it uses the exact collection LIDVIDs specified in the bundle metadata, but avoids the reliability issues with the /members endpoint at the bundle level. The /members endpoint is still used for discovering products within each collection. --- src/pds2/aipgen/registry.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/pds2/aipgen/registry.py b/src/pds2/aipgen/registry.py index a449752..c4735a5 100644 --- a/src/pds2/aipgen/registry.py +++ b/src/pds2/aipgen/registry.py @@ -228,10 +228,19 @@ def _comprehendregistry(url: str, bundlelidvid: str, allcollections=True) -> tup title = bundle.get("title", "«unknown»") _addfiles(bundle, bac) - # It turns out the PDS registry makes this *trivial* compared to the PDS filesystem version; - # Just understanding it all was there was the hard part! 😊 THANK YOU! 🙏 - for collection in _getproducts(url, bundlelidvid, allcollections): + # Get collection LIDVIDs from the bundle's properties instead of using the /members endpoint + # This avoids issues with the /members endpoint and uses the ref_lidvid_collection field + collection_lidvids = bundle.get("properties", {}).get("ref_lidvid_collection", []) + _logger.debug("📦 Found %d collections in bundle properties", len(collection_lidvids)) + + for collection_lidvid in collection_lidvids: + # Fetch the collection details directly + collection = _getbundle(url, collection_lidvid) + if collection is None: + _logger.warning("⚠️ Collection %s not found in registry", collection_lidvid) + continue _addfiles(collection, bac) + # Still use /members endpoint for getting products from each collection for product in _getproducts(url, collection["id"]): _addfiles(product, bac) From e1934d3548f30fb16839caea7c323ead9d931cf3 Mon Sep 17 00:00:00 2001 From: Jordan Padams Date: Wed, 7 Jan 2026 13:54:48 -0800 Subject: [PATCH 2/6] Add retry logic with exponential backoff for API calls Fixes #237 by implementing automatic retry with exponential backoff for all PDS API requests. This addresses transient API/database performance issues that result in 500 errors and JSON decode failures. Changes: - Added _get_session_with_retry() to create sessions with retry logic - Configured to retry on HTTP 500, 502, 503, 504 status codes - Uses exponential backoff (2s, 4s, 8s, 16s, 32s) over 5 attempts - Enhanced error handling to catch JSONDecodeError exceptions - Added detailed logging for debugging API failures The retry mechanism uses urllib3.Retry with requests.HTTPAdapter to automatically handle transient failures without code changes in calling functions. --- src/pds2/aipgen/registry.py | 58 ++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/src/pds2/aipgen/registry.py b/src/pds2/aipgen/registry.py index c4735a5..3eff0f6 100644 --- a/src/pds2/aipgen/registry.py +++ b/src/pds2/aipgen/registry.py @@ -44,6 +44,8 @@ from urllib.parse import urlparse import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry from . import VERSION from .aip import writelabel as writeaiplabel @@ -74,6 +76,13 @@ _defaultserver = "https://pds.nasa.gov/api/search/1/" # Where to find the PDS API _searchkey = "ops:Harvest_Info.ops:harvest_date_time" # How to sort products +# Retry Configuration +# ------------------- + +_retryattempts = 5 # Maximum number of retry attempts +_retrybackoff = 2 # Exponential backoff factor (seconds) +_retrystatus = [500, 502, 503, 504] # HTTP status codes to retry on + # PDS API property keys we're interested in # ----------------------------------------- @@ -117,6 +126,26 @@ def make(cls, url, md5): return cls(fixmultislashes(url), md5) +def _get_session_with_retry() -> requests.Session: + """Create a requests session configured with retry logic and exponential backoff. + + Returns a session that will automatically retry on transient failures (500, 502, 503, 504) + with exponential backoff to handle API performance issues. + """ + session = requests.Session() + retry_strategy = Retry( + total=_retryattempts, + backoff_factor=_retrybackoff, + status_forcelist=_retrystatus, + allowed_methods=["GET"], # Only retry GET requests + raise_on_status=False, # Don't raise on bad status, let us handle it + ) + adapter = HTTPAdapter(max_retries=retry_strategy) + session.mount("http://", adapter) + session.mount("https://", adapter) + return session + + def _deurnlidvid(lidvid: str) -> tuple[str, str]: """De-URN a LID VID. @@ -156,10 +185,21 @@ def _getbundle(server_url: str, lidvid: str) -> Union[dict[str, Any], None]: identifier ``lidvid`` and return a ``dict`` with its attributes. If it can't be found, return ``None``. """ - r = requests.get(f"{server_url}/products/{lidvid}") + session = _get_session_with_retry() + url = f"{server_url}/products/{lidvid}" + _logger.debug('Fetching bundle/product from %s', url) + r = session.get(url) if r.status_code == HTTPStatus.NOT_FOUND: return None - return r.json() + if not r.ok: + _logger.error("⚠️ Failed to fetch %s: HTTP %d", url, r.status_code) + r.raise_for_status() + try: + return r.json() + except requests.exceptions.JSONDecodeError as e: + _logger.error("⚠️ Failed to parse JSON response from %s: %s", url, e) + _logger.debug("Response content: %s", r.text[:500]) + raise ValueError(f"Invalid JSON response from {url}: {e}") from e def _getproducts(server_url: str, lidvid: str, allcollections=True) -> Iterator[dict[str, Any]]: @@ -170,14 +210,24 @@ def _getproducts(server_url: str, lidvid: str, allcollections=True) -> Iterator[ If ``allcollections`` is True, then return all collections for LID-only references; otherwise return just the latest collection for LID-only references (has no effect on full LIDVID-references). """ + session = _get_session_with_retry() # Commenting out `all` vs. `latest` functionality for now since the API does not support it at this time # url = f"{server_url}/products/{lidvid}/members/{'all' if allcollections else 'latest'}" url = f"{server_url}/products/{lidvid}/members" params = {"sort": _searchkey, "limit": _apiquerylimit} while True: _logger.debug('Making request to %s with params %r', url, params) - r = requests.get(url, params=params) # type: ignore - matches = r.json()["data"] + r = session.get(url, params=params) # type: ignore + if not r.ok: + _logger.error("⚠️ Failed to fetch products from %s: HTTP %d", url, r.status_code) + r.raise_for_status() + try: + data = r.json() + except requests.exceptions.JSONDecodeError as e: + _logger.error("⚠️ Failed to parse JSON response from %s: %s", url, e) + _logger.debug("Response content: %s", r.text[:500]) + raise ValueError(f"Invalid JSON response from {url}: {e}") from e + matches = data.get("data", []) num_matches = len(matches) for i in matches: yield i From 0b025ef01a938e0ca897746595cec11560fd1176 Mon Sep 17 00:00:00 2001 From: Jordan Padams Date: Wed, 7 Jan 2026 14:04:58 -0800 Subject: [PATCH 3/6] Optimize API payload size by requesting only required fields Fixes #238 by adding fields parameter to /members endpoint requests to minimize payload size and improve performance. Deep archive only needs 5 specific fields from product responses: - ops:Data_File_Info.ops:file_ref (data file URLs) - ops:Data_File_Info.ops:md5_checksum (data file checksums) - ops:Label_File_Info.ops:file_ref (label file URL) - ops:Label_File_Info.ops:md5_checksum (label file checksum) - ops:Harvest_Info.ops:harvest_date_time (for pagination) Changes: - Updated _fields constant to include harvest_date_time for pagination - Modified _getproducts() to pass fields parameter to API requests - Reduced API response payload by filtering unnecessary metadata Benefits: - Smaller JSON payloads reduce network bandwidth - Faster API responses with less data serialization - Better performance for collections with many products - Reduced load on API server and database --- src/pds2/aipgen/registry.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pds2/aipgen/registry.py b/src/pds2/aipgen/registry.py index 3eff0f6..e67dc67 100644 --- a/src/pds2/aipgen/registry.py +++ b/src/pds2/aipgen/registry.py @@ -91,7 +91,8 @@ _propdatamd5 = "ops:Data_File_Info.ops:md5_checksum" _proplabelurl = "ops:Label_File_Info.ops:file_ref" _proplabelmd5 = "ops:Label_File_Info.ops:md5_checksum" -_fields = [_propdataurl, _propdatamd5, _proplabelurl, _proplabelmd5] +# Fields to request from API to minimize payload size +_fields = [_propdataurl, _propdatamd5, _proplabelurl, _proplabelmd5, _searchkey] # Program/Module Metadata @@ -214,7 +215,8 @@ def _getproducts(server_url: str, lidvid: str, allcollections=True) -> Iterator[ # Commenting out `all` vs. `latest` functionality for now since the API does not support it at this time # url = f"{server_url}/products/{lidvid}/members/{'all' if allcollections else 'latest'}" url = f"{server_url}/products/{lidvid}/members" - params = {"sort": _searchkey, "limit": _apiquerylimit} + # Request only the fields we need to minimize payload size + params = {"sort": _searchkey, "limit": _apiquerylimit, "fields": ",".join(_fields)} while True: _logger.debug('Making request to %s with params %r', url, params) r = session.get(url, params=params) # type: ignore From f94e082743d3f31b80dec6ceb0c24cf59d3cc19d Mon Sep 17 00:00:00 2001 From: Jordan Padams <33492486+jordanpadams@users.noreply.github.com> Date: Wed, 7 Jan 2026 15:31:11 -0800 Subject: [PATCH 4/6] Add warning of no data is returned Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/pds2/aipgen/registry.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pds2/aipgen/registry.py b/src/pds2/aipgen/registry.py index e67dc67..60bb4ba 100644 --- a/src/pds2/aipgen/registry.py +++ b/src/pds2/aipgen/registry.py @@ -229,6 +229,8 @@ def _getproducts(server_url: str, lidvid: str, allcollections=True) -> Iterator[ _logger.error("⚠️ Failed to parse JSON response from %s: %s", url, e) _logger.debug("Response content: %s", r.text[:500]) raise ValueError(f"Invalid JSON response from {url}: {e}") from e + if "data" not in data: + _logger.warning('Response missing expected "data" key from %s', url) matches = data.get("data", []) num_matches = len(matches) for i in matches: From ca1aeccbb22949d21804a6eb5d1da10a6fba601b Mon Sep 17 00:00:00 2001 From: Jordan Padams Date: Wed, 7 Jan 2026 15:29:08 -0800 Subject: [PATCH 5/6] Use HTTPStatus constants instead of hardcoded status codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updated retry status codes to use HTTPStatus enum constants for better code readability and maintainability: - 500 → HTTPStatus.INTERNAL_SERVER_ERROR - 502 → HTTPStatus.BAD_GATEWAY - 503 → HTTPStatus.SERVICE_UNAVAILABLE - 504 → HTTPStatus.GATEWAY_TIMEOUT Addresses code review feedback. --- src/pds2/aipgen/registry.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/pds2/aipgen/registry.py b/src/pds2/aipgen/registry.py index 60bb4ba..575220b 100644 --- a/src/pds2/aipgen/registry.py +++ b/src/pds2/aipgen/registry.py @@ -81,7 +81,12 @@ _retryattempts = 5 # Maximum number of retry attempts _retrybackoff = 2 # Exponential backoff factor (seconds) -_retrystatus = [500, 502, 503, 504] # HTTP status codes to retry on +_retrystatus = [ + HTTPStatus.INTERNAL_SERVER_ERROR, + HTTPStatus.BAD_GATEWAY, + HTTPStatus.SERVICE_UNAVAILABLE, + HTTPStatus.GATEWAY_TIMEOUT, +] # HTTP status codes to retry on # PDS API property keys we're interested in From 89a9e5fc50b6f5d2bc09874e73785a08a1eb53a7 Mon Sep 17 00:00:00 2001 From: Jordan Padams Date: Wed, 7 Jan 2026 15:47:16 -0800 Subject: [PATCH 6/6] Add batch collection fetching and clarify field requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses PR feedback with two improvements: 1. Clarified _searchkey field requirement in _fields constant - Added comment explaining it must be included for pagination - Required by line 244 for search-after parameter access 2. Implemented batch collection fetching to reduce API calls - New _getcollections_batch() function fetches collections in batches - Uses query syntax: (lidvid eq "..." or lidvid eq "..." or ...) - Batches collections into groups of 50 (API limit) - Reduces API calls dramatically (e.g., 8 collections: 8→1 requests) - Updated _comprehendregistry() to use batch fetching Benefits: - Fewer API requests reduces server load - Better performance for bundles with many collections - Stays within API query limits - Maintains same functionality with better efficiency --- src/pds2/aipgen/registry.py | 51 ++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/src/pds2/aipgen/registry.py b/src/pds2/aipgen/registry.py index 575220b..7e88ca8 100644 --- a/src/pds2/aipgen/registry.py +++ b/src/pds2/aipgen/registry.py @@ -97,6 +97,7 @@ _proplabelurl = "ops:Label_File_Info.ops:file_ref" _proplabelmd5 = "ops:Label_File_Info.ops:md5_checksum" # Fields to request from API to minimize payload size +# _searchkey must be included because it's accessed in _getproducts() line 244 for pagination _fields = [_propdataurl, _propdatamd5, _proplabelurl, _proplabelmd5, _searchkey] @@ -245,6 +246,48 @@ def _getproducts(server_url: str, lidvid: str, allcollections=True) -> Iterator[ params["search-after"] = matches[-1]["properties"][_searchkey] +def _getcollections_batch(server_url: str, collection_lidvids: list[str]) -> Iterator[dict[str, Any]]: + """Get multiple collections efficiently by batching requests. + + Using the PDS API server at ``server_url``, fetch all collections matching the given + ``collection_lidvids`` by batching them into groups of _apiquerylimit (50) and making + one API call per batch using OR queries. + + Yields collection dictionaries. Collections not found will be omitted from results. + """ + if not collection_lidvids: + return + + session = _get_session_with_retry() + + # Process collections in batches of _apiquerylimit to avoid query string limits + for i in range(0, len(collection_lidvids), _apiquerylimit): + batch = collection_lidvids[i:i + _apiquerylimit] + # Build query: (lidvid eq "urn:..." or lidvid eq "urn:..." or ...) + or_conditions = " or ".join([f'lidvid eq "{lidvid}"' for lidvid in batch]) + query = f"({or_conditions})" + + url = f"{server_url}/products" + params = {"q": query, "limit": _apiquerylimit} + + _logger.debug('Batch fetching %d collections (batch %d-%d of %d)', + len(batch), i + 1, i + len(batch), len(collection_lidvids)) + r = session.get(url, params=params) # type: ignore + if not r.ok: + _logger.error("⚠️ Failed to batch fetch collections: HTTP %d", r.status_code) + r.raise_for_status() + try: + data = r.json() + except requests.exceptions.JSONDecodeError as e: + _logger.error("⚠️ Failed to parse JSON response from batch query: %s", e) + _logger.debug("Response content: %s", r.text[:500]) + raise ValueError(f"Invalid JSON response from batch query: {e}") from e + + matches = data.get("data", []) + for collection in matches: + yield collection + + def _addfiles(product: dict, bac: dict): """Add the PDS files described in the PDS ``product`` to the ``bac``.""" lidvid, props = product["id"], product["properties"] @@ -292,12 +335,8 @@ def _comprehendregistry(url: str, bundlelidvid: str, allcollections=True) -> tup collection_lidvids = bundle.get("properties", {}).get("ref_lidvid_collection", []) _logger.debug("📦 Found %d collections in bundle properties", len(collection_lidvids)) - for collection_lidvid in collection_lidvids: - # Fetch the collection details directly - collection = _getbundle(url, collection_lidvid) - if collection is None: - _logger.warning("⚠️ Collection %s not found in registry", collection_lidvid) - continue + # Batch fetch all collections in groups of _apiquerylimit to minimize API calls + for collection in _getcollections_batch(url, collection_lidvids): _addfiles(collection, bac) # Still use /members endpoint for getting products from each collection for product in _getproducts(url, collection["id"]):