From 8a996b892982cb2a37029bec46286c8665c76402 Mon Sep 17 00:00:00 2001 From: Vo Date: Fri, 13 Mar 2026 17:01:41 -0700 Subject: [PATCH 01/10] Refactor ingestion parser to use env_run and timing --- .../features/ingestion/parsers/case_docs.py | 180 ++++++++- .../features/ingestion/parsers/case_status.py | 169 --------- .../features/ingestion/parsers/e3sm_timing.py | 187 +++------- .../app/features/ingestion/parsers/parser.py | 53 +-- .../ingestion/parsers/test_case_docs.py | 255 ++++++++++++- .../ingestion/parsers/test_case_status.py | 262 ------------- .../ingestion/parsers/test_e3sm_timing.py | 210 +++++++---- .../features/ingestion/parsers/test_parser.py | 348 ++++++++---------- .../features/ingestion/parsers/test_utils.py | 24 ++ 9 files changed, 815 insertions(+), 873 deletions(-) delete mode 100644 backend/app/features/ingestion/parsers/case_status.py delete mode 100644 backend/tests/features/ingestion/parsers/test_case_status.py create mode 100644 backend/tests/features/ingestion/parsers/test_utils.py diff --git a/backend/app/features/ingestion/parsers/case_docs.py b/backend/app/features/ingestion/parsers/case_docs.py index 966ef9b9..238c67c7 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -1,7 +1,13 @@ +import re import xml.etree.ElementTree as ET +from datetime import datetime from pathlib import Path +from dateutil.relativedelta import relativedelta + from app.features.ingestion.parsers.utils import _open_text +from app.features.simulation.enums import SimulationStatus +from app.features.simulation.schemas import KNOWN_EXPERIMENT_TYPES def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: @@ -18,9 +24,25 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: Dictionary with key 'case_group' (str or None) """ env_case_path = Path(env_case_path) + + case_name = _extract_value_from_file(env_case_path, "CASE") case_group = 
_extract_value_from_file(env_case_path, "CASE_GROUP") + machine = _extract_value_from_file(env_case_path, "MACH") + user = _extract_value_from_file(env_case_path, "REALUSER") + compset_alias = _extract_value_from_file(env_case_path, "COMPSET") - return {"case_group": case_group} + # Extract metadata that requires special handling + campaign, experiment_type = _extract_campaign_and_experiment_type(case_name) + + return { + "case_name": case_name, + "case_group": case_group, + "machine": machine, + "user": user, + "campaign": campaign, + "experiment_type": experiment_type, + "compset_alias": compset_alias, + } def parse_env_build(env_build_path: str | Path) -> dict[str, str | None]: @@ -34,13 +56,66 @@ def parse_env_build(env_build_path: str | Path) -> dict[str, str | None]: Returns ------- dict - Dictionary with keys 'compiler', 'mpilib' (str or None) + Dictionary with keys 'grid_resolution', 'compiler', 'mpilib' (str or None) """ env_build_path = Path(env_build_path) + + grid_resolution = _extract_value_from_file(env_build_path, "GRID") compiler = _extract_value_from_file(env_build_path, "COMPILER") mpilib = _extract_value_from_file(env_build_path, "MPILIB") - return {"compiler": compiler, "mpilib": mpilib} + return {"grid_resolution": grid_resolution, "compiler": compiler, "mpilib": mpilib} + + +def parse_env_run(env_run_path: str | Path) -> dict[str, str | None]: + """Parse env_run.xml (plain or gzipped) to extract runtime settings. + + Parameters + ---------- + env_run_path : str or Path + Path to the env_run.xml file (plain or .gz) + + Returns + ------- + dict + Dictionary with runtime initialization and simulation date metadata. 
+ """ + env_run_path = Path(env_run_path) + initialization_type = _extract_value_from_file(env_run_path, "RUN_TYPE") + run_start_date = _extract_value_from_file(env_run_path, "RUN_STARTDATE") + run_ref_date = _extract_value_from_file(env_run_path, "RUN_REFDATE") + stop_option = _extract_value_from_file(env_run_path, "STOP_OPTION") + stop_n = _extract_value_from_file(env_run_path, "STOP_N") + stop_date = _extract_value_from_file(env_run_path, "STOP_DATE") + + simulation_start_date = ( + run_ref_date if initialization_type == "branch" else run_start_date + ) + simulation_end_date = _calculate_simulation_end_date( + simulation_start_date, + stop_option, + stop_n, + stop_date, + ) + + return { + "initialization_type": initialization_type, + "simulation_start_date": simulation_start_date, + "simulation_end_date": simulation_end_date, + } + + +def parse_run_artifacts(run_dir: str | Path) -> dict[str, str | None]: + """Inspect execution-root artifacts and derive status metadata.""" + run_dir = Path(run_dir) + + return { + "status": ( + SimulationStatus.COMPLETED.value + if _has_timing_file(run_dir) + else SimulationStatus.UNKNOWN.value + ) + } def _extract_value_from_file(path: Path, entry_id: str) -> str | None: @@ -58,7 +133,10 @@ def _extract_value_from_file(path: Path, entry_id: str) -> str | None: str | None The value of the entry, or None if not found """ - text = _open_text(path) + try: + text = _open_text(path) + except (OSError, UnicodeDecodeError): + return None try: root = ET.fromstring(text) @@ -95,3 +173,97 @@ def _find_entry_value(root, entry_id: str) -> str | None: return entry.text.strip() return None + + +def _extract_campaign_and_experiment_type( + case_name: str | None, +) -> tuple[str | None, str | None]: + """Extract campaign and experiment type from case name. + + Parameters + ---------- + case_name : str or None + The case name to parse. + + Returns + ------- + tuple of (str or None, str or None) + campaign and experiment_type values. 
+ """ + campaign = None + experiment_type = None + + # Example: v3.LR.historical + if case_name: + # Remove trailing instance suffix like _0121 + base = re.sub(r"_\d+$", "", case_name) + + # Only infer campaign for dot-delimited case names. + # Timing files sometimes use short case names (e.g., e3sm_v1_ne30) + # that do not encode campaign/experiment type. + if "." not in base: + return None, None + + # Campaign is the base case name without the trailing instance suffix + campaign = base + + # Candidate experiment type = last dot token + candidate = campaign.split(".")[-1] + + if candidate in KNOWN_EXPERIMENT_TYPES: + experiment_type = candidate + + return campaign, experiment_type + + +def _calculate_simulation_end_date( + simulation_start_date: str | None, + stop_option: str | None, + stop_n: str | None, + stop_date: str | None, +) -> str | None: + if not simulation_start_date or not stop_option: + return None + + if stop_option == "date": + return _parse_stop_date(stop_date) + + if stop_n is None: + return None + + try: + start_date = datetime.strptime(simulation_start_date, "%Y-%m-%d") + stop_n_int = int(stop_n) + except ValueError: + return None + + if stop_option == "ndays": + end_date = start_date + relativedelta(days=stop_n_int) + elif stop_option == "nmonths": + end_date = start_date + relativedelta(months=stop_n_int) + elif stop_option == "nyears": + end_date = start_date + relativedelta(years=stop_n_int) + else: + return None + + return end_date.strftime("%Y-%m-%d") + + +def _parse_stop_date(stop_date: str | None) -> str | None: + if not stop_date: + return None + + try: + return datetime.strptime(stop_date, "%Y%m%d").strftime("%Y-%m-%d") + except ValueError: + return None + + +def _has_timing_file(run_dir: Path) -> bool: + try: + return any( + child.is_file() and re.match(r"^e3sm_timing\..*", child.name) + for child in run_dir.iterdir() + ) + except OSError: + return False diff --git a/backend/app/features/ingestion/parsers/case_status.py 
b/backend/app/features/ingestion/parsers/case_status.py deleted file mode 100644 index 85e92952..00000000 --- a/backend/app/features/ingestion/parsers/case_status.py +++ /dev/null @@ -1,169 +0,0 @@ -import logging -import re -from datetime import datetime -from typing import Any - -from dateutil.relativedelta import relativedelta - -from app.features.ingestion.parsers.utils import _get_open_func -from app.features.simulation.enums import SimulationStatus - -logger = logging.getLogger(__name__) - -TIMESTAMP_PATTERN = r"(?P\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})" -CASE_RUN_START_PATTERN = re.compile( - rf"^{TIMESTAMP_PATTERN}:\s+case\.run\s+starting(?:\s+(?P\S+))?\s*$" -) -CASE_RUN_TERMINAL_PATTERN = re.compile( - rf"^{TIMESTAMP_PATTERN}:\s+case\.run\s+(?Psuccess|error)\b" -) -RUN_STARTDATE_PATTERN = re.compile(r"RUN_STARTDATE=(?P\d{4}-\d{2}-\d{2})") -STOP_OPTION_STOP_N_PATTERN = re.compile( - r"STOP_OPTION=(?P[^,\s]+),STOP_N=(?P\d+)" -) - - -def parse_case_status(file_path: str) -> dict[str, Any]: - """ - Parse the case status file and extract relevant information. - - Parameters - ---------- - file_path : str - Path to the case status file. - - Returns - ------- - dict[str, Any] - A dictionary containing simulation date metadata and latest case.run - attempt metadata. 
- """ - result: dict[str, Any] = { - "simulation_start_date": None, - "simulation_end_date": None, - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - open_func = _get_open_func(file_path) - try: - with open_func(file_path, "rt") as file: - lines = file.readlines() - except (OSError, UnicodeDecodeError) as exc: - logger.warning("Failed to read case status file %s (%s)", file_path, exc) - return result - - latest_start_idx, latest_start_match = _extract_simulation_dates( - lines, file_path, result - ) - - _extract_latest_run_metadata(lines, latest_start_idx, latest_start_match, result) - - return result - - -def _extract_simulation_dates( - lines: list[str], file_path: str, result: dict[str, Any] -) -> tuple[int | None, re.Match[str] | None]: - latest_start_idx: int | None = None - latest_start_match: re.Match[str] | None = None - - for index, line in enumerate(lines): - run_startdate_match = RUN_STARTDATE_PATTERN.search(line) - if "RUN_STARTDATE" in line and not run_startdate_match: - logger.warning( - f"Malformed RUN_STARTDATE line in {file_path}: {line.strip()}" - ) - elif run_startdate_match: - result["simulation_start_date"] = run_startdate_match.group("start_date") - - stop_match = STOP_OPTION_STOP_N_PATTERN.search(line) - if "STOP_OPTION" in line and "STOP_N" in line and not stop_match: - logger.warning( - f"Malformed STOP_OPTION/STOP_N line in {file_path}: {line.strip()}" - ) - elif stop_match: - _update_simulation_end_date(file_path, line, stop_match, result) - - start_match = CASE_RUN_START_PATTERN.match(line.strip()) - if start_match: - latest_start_idx = index - latest_start_match = start_match - - return latest_start_idx, latest_start_match - - -def _update_simulation_end_date( - file_path: str, line: str, stop_match: re.Match[str], result: dict[str, Any] -) -> None: - stop_option = stop_match.group("stop_option") - stop_n_str = stop_match.group("stop_n") - - try: - stop_n = int(stop_n_str) - except ValueError as exc: - 
logger.warning( - f"Malformed STOP_OPTION/STOP_N line in {file_path}: {line.strip()} ({exc})" - ) - return - - result["simulation_end_date"] = _calculate_simulation_end_date( - result["simulation_start_date"], stop_option, stop_n - ) - - -def _extract_latest_run_metadata( - lines: list[str], - latest_start_idx: int | None, - latest_start_match: re.Match[str] | None, - result: dict[str, Any], -) -> None: - if latest_start_idx is None or latest_start_match is None: - return - - result["run_start_time"] = latest_start_match.group("timestamp") - result["run_start_date"] = result["run_start_time"] - - for line in lines[latest_start_idx + 1 :]: - terminal_match = CASE_RUN_TERMINAL_PATTERN.match(line.strip()) - if not terminal_match: - continue - - result["run_end_time"] = terminal_match.group("timestamp") - result["run_end_date"] = result["run_end_time"] - state = terminal_match.group("state") - - if state == "success": - result["status"] = SimulationStatus.COMPLETED.value - elif state == "error": - result["status"] = SimulationStatus.FAILED.value - return - - result["status"] = SimulationStatus.RUNNING.value - - -def _calculate_simulation_end_date( - simulation_start_date: str | None, stop_option: str | None, stop_n: int | None -) -> str | None: - if not (simulation_start_date and stop_option and stop_n): - return None - - try: - start = datetime.strptime(simulation_start_date, "%Y-%m-%d") - except ValueError: - return None - - option = stop_option.lower() - n = stop_n - - if "days" in option: - end = start + relativedelta(days=n) - elif "months" in option: - end = start + relativedelta(months=n) - elif "years" in option: - end = start + relativedelta(years=n) - else: - return None - - return end.strftime("%Y-%m-%d") diff --git a/backend/app/features/ingestion/parsers/e3sm_timing.py b/backend/app/features/ingestion/parsers/e3sm_timing.py index 131c4d3f..63cb29e9 100644 --- a/backend/app/features/ingestion/parsers/e3sm_timing.py +++ 
b/backend/app/features/ingestion/parsers/e3sm_timing.py @@ -1,14 +1,13 @@ import re -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path -from typing import Any, Optional +from typing import Any from app.features.ingestion.parsers.utils import _open_text -from app.features.simulation.schemas import KNOWN_EXPERIMENT_TYPES def parse_e3sm_timing(path: str | Path) -> dict[str, Any]: - """Parse an E3SM timing file and extract metadata fields. + """Parse an E3SM timing file and extract run metadata. Parameters ---------- @@ -18,122 +17,57 @@ def parse_e3sm_timing(path: str | Path) -> dict[str, Any]: Returns ------- dict - Dictionary with all relevant fields, including nested run config. + Dictionary with execution and run timing metadata. """ path = Path(path) - text = _open_text(path) - lines = text.splitlines() - - metadata_fields = { - "case_name": r"Case\s*[:=]\s*(.+)", - "machine": r"Machine\s*[:=]\s*(.+)", - "user": r"User\s*[:=]\s*(.+)", - "lid": r"LID\s*[:=]\s*(.+)", - "simulation_start_date": r"Curr Date\s*[:=]\s*(.+)", - "grid_resolution": r"grid\s*[:=]\s*(.+)", - "compset_alias": r"compset\s*[:=]\s*(.+)", - "initialization_type": r"run type\s*[:=]\s*([^,]+)", - "run_length": r"run length\s*[:=]\s*(.+)", - } - - metadata = { - key: _extract(lines, pattern) for key, pattern in metadata_fields.items() + result: dict[str, str | None] = { + "execution_id": None, + "run_start_date": None, + "run_end_date": None, } - # Extract metadata that requires special handling - campaign, experiment_type = _extract_campaign_and_experiment_type( - metadata.get("case_name") - ) - simulation_start_date = _parse_simulation_start_date( - metadata["simulation_start_date"] - ) - stop_option, stop_n = _extract_stop_option_and_stop_n(lines) - - result = { - "case_name": metadata["case_name"], - "campaign": campaign, - "experiment_type": experiment_type, - "machine": metadata["machine"], - "user": metadata["user"], - "lid": metadata["lid"], - 
"simulation_start_date": simulation_start_date, - "grid_resolution": metadata["grid_resolution"], - "compset_alias": metadata["compset_alias"], - "initialization_type": metadata["initialization_type"], - "run_config": { - "stop_option": stop_option, - "stop_n": stop_n, - "run_length": metadata["run_length"], - }, - } - - return result - - -def _extract_campaign_and_experiment_type( - case_name: Optional[str], -) -> tuple[Optional[str], Optional[str]]: - """Extract campaign and experiment type from case name. - - Parameters - ---------- - case_name : str or None - The case name to parse. - - Returns - ------- - tuple of (str or None, str or None) - campaign and experiment_type values. - """ - campaign = None - experiment_type = None - - # Example: v3.LR.historical - if case_name: - # Remove trailing instance suffix like _0121 - base = re.sub(r"_\d+$", "", case_name) - - # Only infer campaign for dot-delimited case names. - # Timing files sometimes use short case names (e.g., e3sm_v1_ne30) - # that do not encode campaign/experiment type. - if "." 
not in base: - return None, None - - # Campaign is the base case name without the trailing instance suffix - campaign = base - - # Candidate experiment type = last dot token - candidate = campaign.split(".")[-1] - - if candidate in KNOWN_EXPERIMENT_TYPES: - experiment_type = candidate + try: + text = _open_text(path) + except (OSError, UnicodeDecodeError): + return result - return campaign, experiment_type + lines = text.splitlines() + execution_id = _extract(lines, r"LID\s*[:=]\s*(.+)") + curr_date = _parse_curr_date(_extract(lines, r"Curr Date\s*[:=]\s*(.+)")) + init_time = _parse_seconds(_extract(lines, r"Init Time\s*[:=]\s*(.+)")) + run_time = _parse_seconds(_extract(lines, r"Run Time\s*[:=]\s*(.+)")) + final_time = _parse_seconds(_extract(lines, r"Final Time\s*[:=]\s*(.+)")) + + result["execution_id"] = execution_id + if curr_date is not None: + result["run_end_date"] = curr_date.isoformat(timespec="seconds") + + if ( + curr_date is not None + and init_time is not None + and run_time is not None + and final_time is not None + ): + total_seconds = init_time + run_time + final_time + run_start_date = curr_date - timedelta(seconds=total_seconds) + result["run_start_date"] = run_start_date.replace(microsecond=0).isoformat() -def _parse_simulation_start_date(date_str: Optional[str]) -> Optional[str]: - """Parse simulation start date string to ISO format. + return result - Parameters - ---------- - date_str : str or None - The date string to parse (e.g., "Tue Jan 10 12:34:56 2023"). - Returns - ------- - str or None - ISO formatted date string, or the original string if parsing fails. 
- """ +def _parse_curr_date(date_str: str | None) -> datetime | None: + """Parse a timing-file date string.""" if not date_str: return None try: - return datetime.strptime(date_str, "%a %b %d %H:%M:%S %Y").isoformat() + return datetime.strptime(date_str, "%a %b %d %H:%M:%S %Y") except ValueError: - return date_str + return None -def _extract(lines: list[str], pattern: str, group: int = 1) -> Optional[str]: +def _extract(lines: list[str], pattern: str, group: int = 1) -> str | None: """Extract the first regex group matching a pattern from a list of lines. Parameters @@ -159,39 +93,16 @@ def _extract(lines: list[str], pattern: str, group: int = 1) -> Optional[str]: return None -def _extract_stop_option_and_stop_n( - lines: list[str], -) -> tuple[Optional[str], Optional[str]]: - """ - Extract stop_option and stop_n from lines, handling both same-line and - separate-line cases. - - Parameters - ---------- - lines : list of str - Lines to search. - - Returns - ------- - tuple of (str or None, str or None) - stop_option and stop_n values. 
- """ - stop_option: str | None = None - stop_n: str | None = None - - for line in lines: - m = re.match(r"stop option\s*[:=]\s*([^,]+)", line.strip()) - - if m: - stop_option = m.group(1).strip() - m2 = re.search(r"stop_n\s*[=:]\s*(\d+)", line) - - if m2: - stop_n = m2.group(1).strip() - - break +def _parse_seconds(value: str | None) -> float | None: + """Extract a floating-point seconds value from a timing line.""" + if not value: + return None - if stop_n is None: - stop_n = _extract(lines, r"stop_n\s*[=:]\s*(.+)") + match = re.search(r"(-?\d+(?:\.\d+)?)", value) + if not match: + return None - return stop_option, stop_n + try: + return float(match.group(1)) + except ValueError: + return None diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index def0dfa3..51a9d178 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -29,8 +29,12 @@ from typing import Callable, Iterable, TypedDict from app.core.logger import _setup_custom_logger -from app.features.ingestion.parsers.case_docs import parse_env_build, parse_env_case -from app.features.ingestion.parsers.case_status import parse_case_status +from app.features.ingestion.parsers.case_docs import ( + parse_env_build, + parse_env_case, + parse_env_run, + parse_run_artifacts, +) from app.features.ingestion.parsers.e3sm_timing import parse_e3sm_timing from app.features.ingestion.parsers.git_info import ( parse_git_config, @@ -57,24 +61,6 @@ class FileSpec(TypedDict, total=False): FILE_SPECS: dict[str, FileSpec] = { - "e3sm_timing": { - "pattern": r"e3sm_timing\..*\..*", - "location": "root", - "parser": parse_e3sm_timing, - "required": True, - }, - "readme_case": { - "pattern": r"README\.case\..*\.gz", - "location": "casedocs", - "parser": parse_readme_case, - "required": True, - }, - "case_status": { - "pattern": r"CaseStatus\..*\.gz", - "location": "root", - "parser": parse_case_status, - 
"required": True, - }, "case_docs_env_case": { "pattern": r"env_case\.xml\..*\.gz", "location": "casedocs", @@ -85,7 +71,25 @@ class FileSpec(TypedDict, total=False): "pattern": r"env_build\.xml\..*\.gz", "location": "casedocs", "parser": parse_env_build, - "required": False, + "required": True, + }, + "case_docs_env_run": { + "pattern": r"env_run\.xml\..*", + "location": "casedocs", + "parser": parse_env_run, + "required": True, + }, + "readme_case": { + "pattern": r"README\.case\..*\.gz", + "location": "casedocs", + "parser": parse_readme_case, + "required": True, + }, + "e3sm_timing": { + "pattern": r"e3sm_timing\..*", + "location": "root", + "parser": parse_e3sm_timing, + "required": True, }, "git_describe": { "pattern": r"GIT_DESCRIBE\..*\.gz", @@ -179,7 +183,7 @@ def main_parser( continue - results[exec_dir] = _parse_all_files(metadata_files) + results[exec_dir] = _parse_all_files(exec_dir, metadata_files) if skipped_count: logger.info( @@ -394,7 +398,7 @@ def _check_missing_files(files: SimulationFiles, exp_dir: str) -> None: ) -def _parse_all_files(files: dict[str, str | None]) -> SimulationMetadata: +def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationMetadata: """Pass discovered files to their respective parser functions. 
Parameters @@ -417,7 +421,10 @@ def _parse_all_files(files: dict[str, str | None]) -> SimulationMetadata: parser: Callable = spec["parser"] metadata.update(parser(path)) + metadata.update(parse_run_artifacts(exec_dir)) + populated_fields: SimulationMetadata = { + "execution_id": metadata.get("execution_id"), "case_name": metadata.get("case_name"), "compset": metadata.get("compset"), "compset_alias": metadata.get("compset_alias"), diff --git a/backend/tests/features/ingestion/parsers/test_case_docs.py b/backend/tests/features/ingestion/parsers/test_case_docs.py index b09e140f..ed68436b 100644 --- a/backend/tests/features/ingestion/parsers/test_case_docs.py +++ b/backend/tests/features/ingestion/parsers/test_case_docs.py @@ -1,4 +1,12 @@ -from app.features.ingestion.parsers.case_docs import parse_env_build, parse_env_case +from pathlib import Path +from unittest.mock import patch + +from app.features.ingestion.parsers.case_docs import ( + parse_env_build, + parse_env_case, + parse_env_run, + parse_run_artifacts, +) class TestParseEnvCase: @@ -48,6 +56,44 @@ def test_missing_entry_returns_none(self, tmp_path): assert result["case_group"] is None + def test_campaign_and_experiment_type_are_derived(self, tmp_path): + xml_case = """ + + + + """ + tmp_case = tmp_path / "env_case_campaign.xml" + tmp_case.write_text(xml_case) + + result = parse_env_case(tmp_case) + + assert result["campaign"] == "v3.LR.historical" + assert result["experiment_type"] == "historical" + + def test_non_dot_case_name_does_not_set_campaign(self, tmp_path): + xml_case = """ + + + + """ + tmp_case = tmp_path / "env_case_simple.xml" + tmp_case.write_text(xml_case) + + result = parse_env_case(tmp_case) + + assert result["campaign"] is None + assert result["experiment_type"] is None + + def test_read_error_returns_none(self): + with patch( + "app.features.ingestion.parsers.case_docs._open_text", + side_effect=OSError("boom"), + ): + result = parse_env_case(Path("/tmp/missing.xml")) + + assert 
result["case_name"] is None + assert result["case_group"] is None + class TestParseEnvBuild: def test_value(self, tmp_path): @@ -91,3 +137,210 @@ def test_missing_entry_returns_none(self, tmp_path): assert result["compiler"] == "intel" assert result["mpilib"] is None + + +class TestParseEnvRun: + def test_branch_uses_run_refdate(self, tmp_path): + xml_run = """ + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["initialization_type"] == "branch" + assert result["simulation_start_date"] == "1990-01-01" + assert result["simulation_end_date"] is None + + def test_non_branch_uses_run_startdate(self, tmp_path): + xml_run = """ + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_start_date"] == "2001-02-03" + + def test_stop_option_ndays(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] == "2020-01-11" + + def test_stop_option_nmonths(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] == "2020-03-01" + + def test_stop_option_nyears(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] == "2023-01-01" + + def test_stop_option_date(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] == "2024-12-31" + + def test_missing_stop_n_returns_none(self, tmp_path): + xml_run = """ + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + 
tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] is None + + def test_missing_stop_date_returns_none(self, tmp_path): + xml_run = """ + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] is None + + def test_invalid_xml_returns_none_dates(self, tmp_path): + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text("branch") + + result = parse_env_run(tmp_run) + + assert result["initialization_type"] is None + assert result["simulation_start_date"] is None + assert result["simulation_end_date"] is None + + def test_invalid_dates_return_none(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_start_date"] == "invalid-date" + assert result["simulation_end_date"] is None + + def test_unknown_stop_option_returns_none(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] is None + + def test_invalid_stop_date_returns_none(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] is None + + +class TestParseRunArtifacts: + def test_completed_status_when_timing_exists(self, tmp_path): + (tmp_path / "e3sm_timing.test").write_text("timing data") + + result = parse_run_artifacts(tmp_path) + + assert result["status"] == "completed" + + def test_unknown_status_when_timing_missing(self, tmp_path): + result = parse_run_artifacts(tmp_path) + + assert result["status"] == "unknown" + + def test_unknown_status_when_directory_read_fails(self): + with patch.object(Path, "iterdir", side_effect=OSError("boom")): + result = 
parse_run_artifacts("/tmp/missing") + + assert result["status"] == "unknown" diff --git a/backend/tests/features/ingestion/parsers/test_case_status.py b/backend/tests/features/ingestion/parsers/test_case_status.py deleted file mode 100644 index c4b2cbc6..00000000 --- a/backend/tests/features/ingestion/parsers/test_case_status.py +++ /dev/null @@ -1,262 +0,0 @@ -import logging -from unittest.mock import Mock, patch - -from app.features.ingestion.parsers.case_status import ( - _calculate_simulation_end_date, - _extract_latest_run_metadata, - _update_simulation_end_date, - parse_case_status, -) -from app.features.simulation.enums import SimulationStatus - - -class TestCaseStatusParser: - def test_returns_default_result_on_read_error(self): - with ( - patch( - "app.features.ingestion.parsers.case_status._get_open_func", - return_value=Mock(side_effect=OSError("boom")), - ), - patch( - "app.features.ingestion.parsers.case_status.logger.warning" - ) as mock_warning, - ): - result = parse_case_status("/tmp/missing/casestatus.txt") - - assert result == { - "simulation_start_date": None, - "simulation_end_date": None, - "run_start_date": None, - "run_end_date": None, - "status": None, - } - mock_warning.assert_called_once() - assert "Failed to read case status file" in mock_warning.call_args.args[0] - - def test_extracts_simulation_start_and_end_date_nmonths(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE=2015-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=nmonths,STOP_N=41 \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2015-01-01" - assert result["simulation_end_date"] == "2018-06-01" # 41 months after Jan 2015 - - def test_extracts_simulation_start_and_end_date_ndays(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - 
"RUN_STARTDATE=2020-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=ndays,STOP_N=10 \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2020-01-01" - assert result["simulation_end_date"] == "2020-01-11" - - def test_extracts_simulation_start_and_end_date_nyears(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE=2000-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=nyears,STOP_N=2 \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2000-01-01" - assert result["simulation_end_date"] == "2002-01-01" - - def test_missing_fields(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE=2015-01-01 \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2015-01-01" - assert result["simulation_end_date"] is None - - def test_malformed_lines_log_warning_and_continue(self, tmp_path, caplog): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE 2015-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=ndays,STOP_N=not-a-number \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - logger = logging.getLogger("app.features.ingestion.parsers.case_status") - old_propagate = logger.propagate - old_level = logger.level - logger.propagate = False - logger.disabled = False - logger.setLevel(logging.WARNING) - - logger.addHandler(caplog.handler) - try: - result = parse_case_status(str(file_path)) - finally: - logger.removeHandler(caplog.handler) - logger.propagate = 
old_propagate - logger.setLevel(old_level) - - assert result["simulation_start_date"] is None - assert result["simulation_end_date"] is None - assert any("Malformed RUN_STARTDATE" in message for message in caplog.messages) - assert any( - "Malformed STOP_OPTION/STOP_N" in message for message in caplog.messages - ) - - def test_stop_option_without_start_date_returns_none(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=ndays,STOP_N=10 \n" - ) - file_path = tmp_path / "casestatus_missing_start.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] is None - assert result["simulation_end_date"] is None - - def test_unknown_stop_option_returns_none(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE=2020-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=nhours,STOP_N=10 \n" - ) - file_path = tmp_path / "casestatus_unknown_stop.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2020-01-01" - assert result["simulation_end_date"] is None - - def test_extract_latest_run_metadata_sets_completed_status(self): - lines = [ - "2025-01-01 00:00:00: case.run starting 123\n", - "2025-01-01 01:00:00: case.run success\n", - ] - start_match = Mock() - start_match.group.return_value = "2025-01-01 00:00:00" - result = { - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - _extract_latest_run_metadata(lines, 0, start_match, result) - - assert result["run_start_date"] == "2025-01-01 00:00:00" - assert result["run_end_date"] == "2025-01-01 01:00:00" - assert result["status"] == SimulationStatus.COMPLETED.value - - def test_extract_latest_run_metadata_sets_failed_status(self): - lines = [ - "2025-01-01 00:00:00: case.run starting 123\n", - "2025-01-01 01:00:00: case.run error\n", - ] 
- start_match = Mock() - start_match.group.return_value = "2025-01-01 00:00:00" - result = { - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - _extract_latest_run_metadata(lines, 0, start_match, result) - - assert result["status"] == SimulationStatus.FAILED.value - - def test_extract_latest_run_metadata_sets_running_without_terminal_state(self): - lines = [ - "2025-01-01 00:00:00: case.run starting 123\n", - "noise line\n", - ] - start_match = Mock() - start_match.group.return_value = "2025-01-01 00:00:00" - result = { - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - _extract_latest_run_metadata(lines, 0, start_match, result) - - assert result["run_start_date"] == "2025-01-01 00:00:00" - assert result["run_end_date"] is None - assert result["status"] == SimulationStatus.RUNNING.value - - def test_extract_latest_run_metadata_no_start_match_no_changes(self): - result = { - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - _extract_latest_run_metadata([], None, None, result) - - assert result["run_start_date"] is None - assert result["run_end_date"] is None - assert result["status"] is None - - def test_update_simulation_end_date_logs_and_returns_on_invalid_stop_n(self): - stop_match = Mock() - stop_match.group.side_effect = lambda key: { - "stop_option": "ndays", - "stop_n": "not-an-int", - }[key] - result = {"simulation_start_date": "2020-01-01", "simulation_end_date": None} - - with patch( - "app.features.ingestion.parsers.case_status.logger.warning" - ) as mock_warning: - _update_simulation_end_date( - "/tmp/casestatus.txt", - "STOP_OPTION=ndays,STOP_N=not-an-int", - stop_match, - result, - ) - - assert result["simulation_end_date"] is None - mock_warning.assert_called_once() - warning_message = mock_warning.call_args.args[0] - assert "Malformed STOP_OPTION/STOP_N line" in warning_message - - def test_calculate_simulation_end_date_invalid_start_date_returns_none(self): - assert 
_calculate_simulation_end_date("invalid-date", "ndays", 3) is None - - def test_parse_case_status_tracks_latest_case_run_start(self, tmp_path): - content = ( - "2025-12-18 22:36:24: case.run starting 111\n" - "2025-12-18 23:36:24: case.run error\n" - "2025-12-19 00:36:24: case.run starting 222\n" - "2025-12-19 01:36:24: case.run success\n" - ) - file_path = tmp_path / "casestatus_with_retries.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["run_start_time"] == "2025-12-19 00:36:24" - assert result["run_end_time"] == "2025-12-19 01:36:24" - assert result["status"] == SimulationStatus.COMPLETED.value diff --git a/backend/tests/features/ingestion/parsers/test_e3sm_timing.py b/backend/tests/features/ingestion/parsers/test_e3sm_timing.py index b014031c..ed159449 100644 --- a/backend/tests/features/ingestion/parsers/test_e3sm_timing.py +++ b/backend/tests/features/ingestion/parsers/test_e3sm_timing.py @@ -1,28 +1,23 @@ import gzip +from pathlib import Path +from unittest.mock import patch import pytest -from app.features.ingestion.parsers.e3sm_timing import parse_e3sm_timing +from app.features.ingestion.parsers.e3sm_timing import _parse_seconds, parse_e3sm_timing CONTENT_FIXTURE = ( - "Case: e3sm_v1_ne30\n" - "Machine: cori-knl\n" - "User: test_user\n" - "LID: 123456\n" - "Curr Date: Tue Jan 10 12:34:56 2023\n" - "grid: ne30_oECv3\n" - "compset: A_WCYCL1850\n" - "run length: 42 days (42.0 for ocean)\n" - "run type: branch, continue_run = TRUE (inittype = FALSE)\n" - "stop option: ndays\n" - "stop_n: 5\n" + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" ) class TestE3SMTimingParser: @pytest.fixture def sample_timing_file(self, tmp_path): - """Create a sample E3SM timing file.""" file_path = tmp_path / "e3sm_timing.txt" file_path.write_text(CONTENT_FIXTURE) @@ -30,7 +25,6 @@ def 
sample_timing_file(self, tmp_path): @pytest.fixture def sample_gz_timing_file(self, tmp_path): - """Create a sample gzipped E3SM timing file.""" file_path = tmp_path / "e3sm_timing.txt.gz" with gzip.open(file_path, "wt", encoding="utf-8") as f: f.write(CONTENT_FIXTURE) @@ -40,87 +34,157 @@ def sample_gz_timing_file(self, tmp_path): def test_parse_plain(self, sample_timing_file): data = parse_e3sm_timing(sample_timing_file) - assert data["case_name"] == "e3sm_v1_ne30" - assert data["campaign"] is None - assert data["machine"] == "cori-knl" - assert data["user"] == "test_user" - assert data["lid"] == "123456" - assert data["simulation_start_date"] == "2023-01-10T12:34:56" - assert data["grid_resolution"] == "ne30_oECv3" - assert data["compset_alias"] == "A_WCYCL1850" - assert data["initialization_type"] == "branch" - assert data["run_config"]["stop_option"] == "ndays" - assert data["run_config"]["stop_n"] == "5" - assert data["run_config"]["run_length"] == "42 days (42.0 for ocean)" + assert data["execution_id"] == "1081156.251218-200923" + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] == "2025-12-18T20:09:33" + assert "simulation_start_date" not in data def test_parse_gz(self, sample_gz_timing_file): data = parse_e3sm_timing(sample_gz_timing_file) - assert data["case_name"] == "e3sm_v1_ne30" - assert data["campaign"] is None - assert data["machine"] == "cori-knl" - assert data["user"] == "test_user" - assert data["lid"] == "123456" - assert data["simulation_start_date"] == "2023-01-10T12:34:56" - assert data["grid_resolution"] == "ne30_oECv3" - assert data["compset_alias"] == "A_WCYCL1850" - assert data["initialization_type"] == "branch" - assert data["run_config"]["stop_option"] == "ndays" - assert data["run_config"]["stop_n"] == "5" - assert data["run_config"]["run_length"] == "42 days (42.0 for ocean)" - - def test_missing_fields(self, tmp_path): - content = "Case: e3sm_v1_ne30\nMachine: cori-knl\n" - file_path = tmp_path / 
"e3sm_timing_missing.txt" + assert data["execution_id"] == "1081156.251218-200923" + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] == "2025-12-18T20:09:33" + + def test_missing_init_time_leaves_run_start_none(self, tmp_path): + content = ( + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_missing_init.txt" + file_path.write_text(content) + + data = parse_e3sm_timing(file_path) + + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] is None + + def test_missing_run_time_leaves_run_start_none(self, tmp_path): + content = ( + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : 124.909 seconds\n" + "Final Time : 0.375 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_missing_run.txt" file_path.write_text(content) + + data = parse_e3sm_timing(file_path) + + assert data["run_start_date"] is None + + def test_missing_final_time_leaves_run_start_none(self, tmp_path): + content = ( + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_missing_final.txt" + file_path.write_text(content) + data = parse_e3sm_timing(file_path) - assert data["case_name"] == "e3sm_v1_ne30" - assert data["campaign"] is None - assert data["machine"] == "cori-knl" - assert data["user"] is None - assert data["lid"] is None - assert data["simulation_start_date"] is None - assert data["grid_resolution"] is None - assert data["compset_alias"] is None - assert data["initialization_type"] is None - assert data["run_config"]["stop_option"] is None - assert data["run_config"]["stop_n"] is None - assert data["run_config"]["run_length"] is None - - def test_invalid_date(self, tmp_path): + assert data["run_start_date"] is None + + def 
test_malformed_curr_date_leaves_dates_none(self, tmp_path): content = ( - "Case: e3sm_v1_ne30\nMachine: cori-knl\nCurr Date: Invalid Date Format\n" + "LID : 1081156.251218-200923\n" + "Curr Date : Invalid Date Format\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" ) file_path = tmp_path / "e3sm_timing_invalid_date.txt" file_path.write_text(content) + data = parse_e3sm_timing(file_path) - assert data["case_name"] == "e3sm_v1_ne30" - assert data["campaign"] is None - assert data["machine"] == "cori-knl" - assert data["simulation_start_date"] == "Invalid Date Format" + assert data["run_end_date"] is None + assert data["run_start_date"] is None - def test_campaign_and_experiment_type_from_case_name(self, tmp_path): + def test_malformed_numeric_timing_values_leave_run_start_none(self, tmp_path): content = ( - "Case: v3.LR.historical_0121\n" - "Machine: cori-knl\n" - "Curr Date: Tue Jan 10 12:34:56 2023\n" + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : not-a-number\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" ) - file_path = tmp_path / "e3sm_timing_campaign.txt" + file_path = tmp_path / "e3sm_timing_invalid_numeric.txt" file_path.write_text(content) + data = parse_e3sm_timing(file_path) - assert data["campaign"] == "v3.LR.historical" - assert data["experiment_type"] == "historical" + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] is None + + def test_read_error_returns_empty_result(self): + with patch( + "app.features.ingestion.parsers.e3sm_timing._open_text", + side_effect=OSError("boom"), + ): + data = parse_e3sm_timing(Path("/tmp/missing.txt")) + + assert data == { + "execution_id": None, + "run_start_date": None, + "run_end_date": None, + } - def test_stop_option_and_stop_n_same_line(self, tmp_path): + def test_missing_curr_date_keeps_run_dates_none(self, tmp_path): content = ( - "Case: 
e3sm_v1_ne30\nMachine: cori-knl\nstop option: ndays, stop_n=7\n" + "LID : 1081156.251218-200923\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" ) - file_path = tmp_path / "e3sm_timing_stop_line.txt" + file_path = tmp_path / "e3sm_timing_missing_curr_date.txt" file_path.write_text(content) + data = parse_e3sm_timing(file_path) - assert data["run_config"]["stop_option"] == "ndays" - assert data["run_config"]["stop_n"] == "7" + assert data["execution_id"] == "1081156.251218-200923" + assert data["run_end_date"] is None + assert data["run_start_date"] is None + + def test_missing_lid_returns_none_execution_id(self, tmp_path): + content = ( + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_missing_lid.txt" + file_path.write_text(content) + + data = parse_e3sm_timing(file_path) + + assert data["execution_id"] is None + assert data["run_end_date"] == "2025-12-18T20:54:58" + + def test_empty_numeric_field_leaves_run_start_none(self, tmp_path): + content = ( + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : \n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_empty_numeric.txt" + file_path.write_text(content) + + data = parse_e3sm_timing(file_path) + + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] is None + + def test_parse_seconds_returns_none_on_float_value_error(self): + with patch( + "app.features.ingestion.parsers.e3sm_timing.float", + side_effect=ValueError("boom"), + create=True, + ): + assert _parse_seconds("12.5 seconds") is None diff --git a/backend/tests/features/ingestion/parsers/test_parser.py b/backend/tests/features/ingestion/parsers/test_parser.py index 3266dc19..9bb707e3 100644 --- 
a/backend/tests/features/ingestion/parsers/test_parser.py +++ b/backend/tests/features/ingestion/parsers/test_parser.py @@ -17,53 +17,39 @@ class TestMainParser: @staticmethod - def _create_execution_metadata_files(execution_dir: Path, version: str) -> None: - """Create standard execution files for testing. - - Parameters - ---------- - execution_dir : Path - Directory where execution files will be created. - version : str - Version string for file naming (e.g., "001.001"). - """ - # Create required e3sm_timing file - timing_file = execution_dir / f"e3sm_timing.{version}" - timing_file.write_text("timing data") - - # Create required CaseStatus file - with gzip.open( - execution_dir / f"CaseStatus.{version.split('.')[0]}.gz", "wt" - ) as f: - f.write("case status") - - # Create required CaseDocs/README and env_case.xml + def _create_execution_metadata_files( + execution_dir: Path, + version: str, + *, + include_env_run: bool = True, + include_timing: bool = True, + ) -> None: + """Create standard execution files for testing.""" + version_base = version.split(".")[0] + + if include_timing: + timing_file = execution_dir / f"e3sm_timing.{version}" + timing_file.write_text("timing data") + casedocs = execution_dir / "CaseDocs" casedocs.mkdir(exist_ok=True) - with gzip.open(casedocs / f"README.case.{version.split('.')[0]}.gz", "wt") as f: + + with gzip.open(casedocs / f"README.case.{version_base}.gz", "wt") as f: f.write("readme content") - with gzip.open( - casedocs / f"env_case.xml.{version.split('.')[0]}.gz", "wt" - ) as f: - f.write('') - - # Create required GIT_DESCRIBE file - with gzip.open( - execution_dir / f"GIT_DESCRIBE.{version.split('.')[0]}.gz", "wt" - ) as f: + with gzip.open(casedocs / f"env_case.xml.{version_base}.gz", "wt") as f: + f.write('') + with gzip.open(casedocs / f"env_build.xml.{version_base}.gz", "wt") as f: + f.write('') + + if include_env_run: + with gzip.open(casedocs / f"env_run.xml.{version_base}.gz", "wt") as f: + f.write('') + + with 
gzip.open(execution_dir / f"GIT_DESCRIBE.{version_base}.gz", "wt") as f: f.write("describe content") @staticmethod def _create_optional_files(execution_dir: Path, version: str) -> None: - """Create optional git configuration files for testing. - - Parameters - ---------- - execution_dir : Path - Directory where git files will be created. - version : str - Version string for file naming (e.g., "001"). - """ version_base = version.split(".")[0] if "." in version else version with gzip.open(execution_dir / f"GIT_CONFIG.{version_base}.gz", "wt") as f: @@ -73,73 +59,45 @@ def _create_optional_files(execution_dir: Path, version: str) -> None: @staticmethod def _create_zip_archive(base_dir: Path, archive_path: Path) -> None: - """Create a ZIP archive from directory contents. - - Parameters - ---------- - base_dir : Path - Directory relative to which paths are calculated. - archive_path : Path - Path where ZIP archive will be created. - """ zip_file = zipfile.ZipFile(archive_path, "w") for root, _dirs, files_list in os.walk(str(base_dir)): for file in files_list: file_path = Path(root) / file arcname = str(file_path.relative_to(str(base_dir))) - zip_file.write(file_path, arcname) zip_file.close() @staticmethod def _create_tar_gz_archive(base_dir: Path, archive_path: Path) -> None: - """Create a TAR.GZ archive from directory contents. - - Parameters - ---------- - base_dir : Path - Directory relative to which paths are calculated. - archive_path : Path - Path where TAR.GZ archive will be created. - """ tar_file = tarfile.open(archive_path, "w:gz") for root, _dirs, files_list in os.walk(str(base_dir)): for file in files_list: file_path = Path(root) / file arcname = str(file_path.relative_to(str(base_dir))) - tar_file.add(file_path, arcname=arcname) tar_file.close() @contextmanager def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: - """Context manager to mock all parser functions with sensible defaults. 
- - Parameters - ---------- - **kwargs : dict - Override default return values for specific parsers: - parse_e3sm_timing, parse_readme_case, parse_case_status, - parse_git_describe, parse_git_config, parse_git_status. - - Yields - ------ - None - Context manager available for use in with statement. - """ - # Set up default return values defaults = { - "parse_e3sm_timing": { + "parse_env_case": { "case_name": "test_case", "campaign": "test", "machine": "test", }, + "parse_env_build": {"compiler": "gnu"}, + "parse_env_run": {"simulation_start_date": "2020-01-01"}, "parse_readme_case": {}, - "parse_case_status": {}, + "parse_e3sm_timing": { + "execution_id": "1081156.251218-200923", + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": "2025-12-18T20:54:58", + }, + "parse_run_artifacts": {"status": "completed"}, "parse_git_describe": {}, "parse_git_config": None, "parse_git_status": None, @@ -147,182 +105,160 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: defaults.update(kwargs) with ( - patch("app.features.ingestion.parsers.parser.parse_e3sm_timing") as m1, - patch("app.features.ingestion.parsers.parser.parse_readme_case") as m2, - patch("app.features.ingestion.parsers.parser.parse_case_status") as m3, - patch("app.features.ingestion.parsers.parser.parse_git_describe") as m4, - patch("app.features.ingestion.parsers.parser.parse_git_config") as m5, - patch("app.features.ingestion.parsers.parser.parse_git_status") as m6, + patch("app.features.ingestion.parsers.parser.parse_env_case") as m1, + patch("app.features.ingestion.parsers.parser.parse_env_build") as m2, + patch("app.features.ingestion.parsers.parser.parse_env_run") as m3, + patch("app.features.ingestion.parsers.parser.parse_readme_case") as m4, + patch("app.features.ingestion.parsers.parser.parse_e3sm_timing") as m5, + patch("app.features.ingestion.parsers.parser.parse_run_artifacts") as m6, + patch("app.features.ingestion.parsers.parser.parse_git_describe") as m7, + 
patch("app.features.ingestion.parsers.parser.parse_git_config") as m8, + patch("app.features.ingestion.parsers.parser.parse_git_status") as m9, ): - m1.return_value = defaults["parse_e3sm_timing"] - m2.return_value = defaults["parse_readme_case"] - m3.return_value = defaults["parse_case_status"] - m4.return_value = defaults["parse_git_describe"] - m5.return_value = defaults["parse_git_config"] - m6.return_value = defaults["parse_git_status"] + m1.return_value = defaults["parse_env_case"] + m2.return_value = defaults["parse_env_build"] + m3.return_value = defaults["parse_env_run"] + m4.return_value = defaults["parse_readme_case"] + m5.return_value = defaults["parse_e3sm_timing"] + m6.return_value = defaults["parse_run_artifacts"] + m7.return_value = defaults["parse_git_describe"] + m8.return_value = defaults["parse_git_config"] + m9.return_value = defaults["parse_git_status"] yield + def test_file_specs_remove_case_status_and_add_env_run(self) -> None: + assert "case_status" not in parser.FILE_SPECS + assert "case_docs_env_run" in parser.FILE_SPECS + assert "e3sm_timing" in parser.FILE_SPECS + assert parser.FILE_SPECS["case_docs_env_run"]["required"] is True + assert parser.FILE_SPECS["e3sm_timing"]["required"] is True + def test_with_valid_zip_archive(self, tmp_path: Path) -> None: - """Test processing a valid ZIP archive with executions.""" - # Create execution directory structure archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "1.0-0" execution_dir.mkdir(parents=True) - - # Create standard required files self._create_execution_metadata_files(execution_dir, "001.001") - # Create ZIP archive archive_path = tmp_path / "archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parser functions and verify execution was found and parsed with self._mock_all_parsers(): result, skipped = parser.main_parser(archive_path, extract_dir) - assert len(result) > 0 - assert 
skipped == 0 - assert any("1.0-0" in key for key in result.keys()) + + assert len(result) > 0 + assert skipped == 0 + assert any("1.0-0" in key for key in result.keys()) def test_with_tar_gz_archive(self, tmp_path: Path) -> None: - """Test processing a TAR.GZ archive.""" - # Create execution directory archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "2.5-10" execution_dir.mkdir(parents=True) - - # Create standard required files self._create_execution_metadata_files(execution_dir, "002.002") - # Create TAR.GZ archive archive_path = tmp_path / "archive.tar.gz" self._create_tar_gz_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parser functions and verify execution was found and parsed - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "tar_test"}): + with self._mock_all_parsers(): result, skipped = parser.main_parser(archive_path, extract_dir) - assert len(result) > 0 - assert skipped == 0 - assert any("2.5-10" in key for key in result.keys()) + + assert len(result) > 0 + assert skipped == 0 + assert any("2.5-10" in key for key in result.keys()) def test_with_multiple_executions(self, tmp_path: Path) -> None: - """Test processing archive with multiple executions.""" archive_base = tmp_path / "archive_extract" archive_base.mkdir() - # Create first execution exec_dir1 = archive_base / "1.0-0" exec_dir1.mkdir(parents=True) self._create_execution_metadata_files(exec_dir1, "001.001") - # Create second execution exec_dir2 = archive_base / "2.0-0" exec_dir2.mkdir(parents=True) self._create_execution_metadata_files(exec_dir2, "002.002") - # Create ZIP archive archive_path = tmp_path / "multi_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parser functions and verify both executions were found - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "test"}): + with self._mock_all_parsers(): result, 
skipped = parser.main_parser(archive_path, extract_dir) - assert len(result) == 2 - assert skipped == 0 - assert any("1.0-0" in key for key in result.keys()) - assert any("2.0-0" in key for key in result.keys()) - def test_with_nested_executions(self, tmp_path: Path) -> None: - """Test finding executions in nested directories. + assert len(result) == 2 + assert skipped == 0 - Parameters - ---------- - tmp_path : Path - Temporary directory provided by pytest. - """ + def test_with_nested_executions(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "parent" / "1.0-0" execution_dir.mkdir(parents=True) - - # Create standard required files self._create_execution_metadata_files(execution_dir, "001.001") - # Create ZIP archive archive_path = tmp_path / "nested_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parser functions and verify nested execution was found - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "nested_test"}): + with self._mock_all_parsers(): result, skipped = parser.main_parser(archive_path, extract_dir) - assert len(result) > 0 - assert skipped == 0 - assert any("1.0-0" in key for key in result.keys()) + + assert len(result) > 0 + assert skipped == 0 def test_missing_required_files_skips_incomplete_run(self, tmp_path: Path) -> None: - """Test that incomplete runs (missing required files) are skipped.""" - # Create execution directory WITHOUT required files archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "1.0-0" execution_dir.mkdir(parents=True) (execution_dir / "dummy.txt").write_text("dummy") - # Create ZIP archive archive_path = tmp_path / "bad_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Incomplete runs are skipped; result is empty rather than an error result, skipped = 
parser.main_parser(archive_path, extract_dir) + assert result == {} assert skipped == 1 - def test_multiple_matching_files_raises_error(self, tmp_path: Path) -> None: - """Test error when multiple files match a pattern.""" - # Create execution with duplicate files + def test_multiple_matching_timing_files_raises_error(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "1.0-0" execution_dir.mkdir(parents=True) - # Create duplicate e3sm_timing files (violates single-file requirement) (execution_dir / "e3sm_timing.001.001").write_text("timing1") (execution_dir / "e3sm_timing.002.002").write_text("timing2") - # Create other required files - with gzip.open(execution_dir / "CaseStatus.001.gz", "wt") as f: - f.write("status") casedocs = execution_dir / "CaseDocs" casedocs.mkdir() with gzip.open(casedocs / "README.case.001.gz", "wt") as f: f.write("readme") + with gzip.open(casedocs / "env_case.xml.001.gz", "wt") as f: + f.write('') + with gzip.open(casedocs / "env_build.xml.001.gz", "wt") as f: + f.write('') with gzip.open(execution_dir / "GIT_DESCRIBE.001.gz", "wt") as f: f.write("describe") - # Create ZIP archive archive_path = tmp_path / "duplicate_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Should raise ValueError for multiple matching files with pytest.raises(ValueError, match="Multiple files matching pattern"): parser.main_parser(archive_path, extract_dir) def test_unsupported_archive_format_raises_error(self, tmp_path: Path) -> None: - """Test error for unsupported archive formats.""" archive_path = tmp_path / "archive.rar" archive_path.write_text("not a real archive") @@ -333,66 +269,91 @@ def test_unsupported_archive_format_raises_error(self, tmp_path: Path) -> None: parser.main_parser(str(archive_path), extract_dir) def test_no_case_directories_raises_error(self, tmp_path: Path) -> None: - """Test error when no case 
directories are found. - - Parameters - ---------- - tmp_path : Path - Temporary directory provided by pytest. - - Raises - ------ - FileNotFoundError - Expected when no valid case directories are discovered. - """ - # Create a ZIP archive with no case directories archive_base = tmp_path / "archive_extract" archive_base.mkdir() (archive_base / "some_other_dir").mkdir() (archive_base / "some_other_dir" / "file.txt").write_text("data") - # Create ZIP archive archive_path = tmp_path / "empty_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Should raise FileNotFoundError when no case directories found with pytest.raises( FileNotFoundError, match="No cases or execution directories found" ): parser.main_parser(archive_path, extract_dir) - def test_with_optional_files(self, tmp_path): - """Test handling optional files properly.""" - # Create execution with optional files + def test_with_optional_files(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "1.0-0" execution_dir.mkdir(parents=True) - - # Create required files self._create_execution_metadata_files(execution_dir, "001.001") - - # Create optional git files self._create_optional_files(execution_dir, "001") - # Create ZIP archive archive_path = tmp_path / "with_optional.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parsers with optional file returns with self._mock_all_parsers( - parse_git_config="https://github.com/test/repo", parse_git_status="main" + parse_git_config="https://github.com/test/repo", + parse_git_status="main", ): - result = parser.main_parser(archive_path, extract_dir) - assert len(result) > 0 + result, skipped = parser.main_parser(archive_path, extract_dir) + + assert len(result) > 0 + assert skipped == 0 + + def test_missing_env_run_skips_incomplete_run(self, tmp_path: Path) -> None: + 
archive_base = tmp_path / "archive_extract" + execution_dir = archive_base / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files( + execution_dir, + "001.001", + include_env_run=False, + ) + + archive_path = tmp_path / "missing_env_run.zip" + self._create_zip_archive(archive_base, archive_path) + + extract_dir = tmp_path / "extracted" + extract_dir.mkdir() + + result, skipped = parser.main_parser(archive_path, extract_dir) + + assert result == {} + assert skipped == 1 + + def test_completed_status_is_merged(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(execution_dir, "001.001") + + with self._mock_all_parsers(parse_run_artifacts={"status": "completed"}): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert next(iter(result.values()))["status"] == "completed" + + def test_missing_timing_skips_incomplete_run(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files( + execution_dir, + "001.001", + include_timing=False, + ) + + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert result == {} + assert skipped == 1 def test_zip_path_traversal_rejected(self, tmp_path: Path) -> None: - """Test that ZIP extraction rejects path traversal entries.""" archive_path = tmp_path / "traversal.zip" with zipfile.ZipFile(archive_path, "w") as zip_ref: zip_ref.writestr("../evil.txt", "data") @@ -404,7 +365,6 @@ def test_zip_path_traversal_rejected(self, tmp_path: Path) -> None: parser._extract_zip(str(archive_path), str(extract_dir)) def test_tar_path_traversal_rejected(self, tmp_path: Path) -> None: - """Test that TAR.GZ extraction rejects path traversal entries.""" archive_path = tmp_path / "traversal.tar.gz" with tarfile.open(archive_path, "w:gz") as tar_ref: payload = 
io.BytesIO(b"data") @@ -419,7 +379,6 @@ def test_tar_path_traversal_rejected(self, tmp_path: Path) -> None: parser._extract_tar_gz(str(archive_path), str(extract_dir)) def test_tar_symlink_rejected(self, tmp_path: Path) -> None: - """Test that TAR.GZ extraction rejects symlink entries.""" archive_path = tmp_path / "symlink.tar.gz" with tarfile.open(archive_path, "w:gz") as tar_ref: tar_info = tarfile.TarInfo(name="link") @@ -436,7 +395,6 @@ def test_tar_symlink_rejected(self, tmp_path: Path) -> None: def test_main_parser_treats_directory_as_already_extracted( self, tmp_path: Path ) -> None: - """Test non-archive directory input path branch in main_parser.""" execution_dir = tmp_path / "1.0-0" execution_dir.mkdir(parents=True) self._create_execution_metadata_files(execution_dir, "001.001") @@ -445,23 +403,19 @@ def test_main_parser_treats_directory_as_already_extracted( result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") assert len(result) == 1 - assert any("1.0-0" in key for key in result) + assert skipped == 0 def test_extract_archive_unsupported_format_raises_error(self) -> None: - """Test _extract_archive direct unsupported-extension branch.""" with pytest.raises(ValueError, match="Unsupported archive format"): parser._extract_archive("/tmp/archive.7z", "/tmp/output") def test_incomplete_run_among_valid_runs(self, tmp_path: Path) -> None: - """Test that an incomplete run is skipped while valid runs are parsed.""" archive_base = tmp_path / "archive_extract" - # Create a valid execution directory execution_dir_valid = archive_base / "1.0-0" execution_dir_valid.mkdir(parents=True) self._create_execution_metadata_files(execution_dir_valid, "001.001") - # Create an incomplete execution directory (missing required files) execution_dir_incomplete = archive_base / "2.0-0" execution_dir_incomplete.mkdir(parents=True) (execution_dir_incomplete / "dummy.txt").write_text("no required files") @@ -472,21 +426,13 @@ def 
test_incomplete_run_among_valid_runs(self, tmp_path: Path) -> None: extract_dir = tmp_path / "extracted" extract_dir.mkdir() - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "mixed_test"}): + with self._mock_all_parsers(): result, skipped = parser.main_parser(archive_path, extract_dir) - # Only the valid execution should be in the result - assert len(result) == 1 - assert any("1.0-0" in key for key in result.keys()) - assert not any("2.0-0" in key for key in result.keys()) - def test_multiple_runs_under_same_casename(self, tmp_path: Path) -> None: - """Test multiple . dirs under the same casename. + assert len(result) == 1 + assert skipped == 1 - Simulates the performance_archive structure where a casename - directory contains multiple runs, each represented by a - subdirectory matching the . pattern. - """ - # Simulate: casename/1081156.251218-200923, casename/1081290.251218-211543 + def test_multiple_runs_under_same_casename(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" casename_dir = archive_base / "v3.LR.historical_0121" casename_dir.mkdir(parents=True) @@ -499,20 +445,17 @@ def test_multiple_runs_under_same_casename(self, tmp_path: Path) -> None: run2.mkdir() self._create_execution_metadata_files(run2, "002.002") - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "v3_hist"}): + with self._mock_all_parsers(): result, skipped = parser.main_parser(casename_dir, tmp_path / "out") - # Both runs should be parsed successfully - assert len(result) == 2 - assert any("1081156" in key for key in result.keys()) - assert any("1081290" in key for key in result.keys()) + + assert len(result) == 2 + assert skipped == 0 def test_deterministic_sort_order(self, tmp_path: Path) -> None: - """Test that execution dirs are sorted for deterministic processing.""" archive_base = tmp_path / "archive_extract" casename_dir = archive_base / "case1" casename_dir.mkdir(parents=True) - # Create dirs in reverse order for name in ["3.0-0", 
"1.0-0", "2.0-0"]: d = casename_dir / name d.mkdir() @@ -534,5 +477,4 @@ def tracking_locate(execution_dir: str) -> Any: ): parser.main_parser(casename_dir, tmp_path / "out") - # Should be processed in sorted order assert call_order == sorted(call_order) diff --git a/backend/tests/features/ingestion/parsers/test_utils.py b/backend/tests/features/ingestion/parsers/test_utils.py new file mode 100644 index 00000000..9a7f2d47 --- /dev/null +++ b/backend/tests/features/ingestion/parsers/test_utils.py @@ -0,0 +1,24 @@ +import gzip + +from app.features.ingestion.parsers.utils import _get_open_func, _open_text + + +class TestParserUtils: + def test_open_text_reads_plain_file(self, tmp_path): + file_path = tmp_path / "plain.txt" + file_path.write_text("plain text") + + assert _open_text(file_path) == "plain text" + + def test_open_text_reads_gz_file(self, tmp_path): + file_path = tmp_path / "plain.txt.gz" + with gzip.open(file_path, "wt", encoding="utf-8") as f: + f.write("gz text") + + assert _open_text(file_path) == "gz text" + + def test_get_open_func_returns_gzip_open_for_gz(self): + assert _get_open_func("file.txt.gz") is gzip.open + + def test_get_open_func_returns_open_for_plain_file(self): + assert _get_open_func("file.txt") is open From 84774992a67bcf770746a90724895451dbe403cd Mon Sep 17 00:00:00 2001 From: Vo Date: Fri, 13 Mar 2026 17:15:28 -0700 Subject: [PATCH 02/10] Add NERSC archive ingestor coverage tests --- .../app/features/ingestion/parsers/parser.py | 11 +- .../ingestion/test_nersc_archive_ingestor.py | 679 ++++++++++++++++++ 2 files changed, 682 insertions(+), 8 deletions(-) diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index 51a9d178..db871ee4 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -423,7 +423,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM 
metadata.update(parse_run_artifacts(exec_dir)) - populated_fields: SimulationMetadata = { + simulation: SimulationMetadata = { "execution_id": metadata.get("execution_id"), "case_name": metadata.get("case_name"), "compset": metadata.get("compset"), @@ -439,16 +439,13 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM "run_start_date": metadata.get("run_start_date"), "run_end_date": metadata.get("run_end_date"), "compiler": metadata.get("compiler"), + "machine": metadata.get("machine"), + "hpc_username": metadata.get("user"), "git_repository_url": metadata.get("git_repository_url"), "git_branch": metadata.get("git_branch"), "git_tag": metadata.get("git_tag"), "git_commit_hash": metadata.get("git_commit_hash"), - "machine": metadata.get("machine"), - "hpc_username": metadata.get("user"), "status": metadata.get("status", SimulationStatus.UNKNOWN.value), - } - - placeholder_fields: SimulationMetadata = { # TODO: Skip this for MVP, not required. We can add it later if we find # a way to determine it from the parsed files. 
"parent_simulation_id": None, @@ -461,6 +458,4 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM "links": None, } - simulation: SimulationMetadata = {**populated_fields, **placeholder_fields} - return simulation diff --git a/backend/tests/features/ingestion/test_nersc_archive_ingestor.py b/backend/tests/features/ingestion/test_nersc_archive_ingestor.py index cb0772d1..0a9385c0 100644 --- a/backend/tests/features/ingestion/test_nersc_archive_ingestor.py +++ b/backend/tests/features/ingestion/test_nersc_archive_ingestor.py @@ -1,8 +1,17 @@ """Tests for the NERSC archive ingestion runner script.""" +import json +import logging +import runpy +import urllib.error +import urllib.request +from email.message import Message from pathlib import Path from typing import Any +import pytest + +from app.api.version import API_BASE from app.scripts.ingestion import nersc_archive_ingestor as ingestor_module from app.scripts.ingestion.nersc_archive_ingestor import ( CaseScanResult, @@ -11,12 +20,26 @@ IngestionRequestResponse, IngestorConfig, _build_case_scan_results, + _build_config_from_env, _build_ingestion_candidates, + _case_state_processed_ids, _discover_case_executions, _fresh_state, _ingest_case_with_retries, + _is_transient_status, + _load_state, + _log_execution_skip_detail, + _log_startup_configuration, + _log_summary_table, + _normalized_api_base_url, + _parse_bool, + _parse_optional_int, + _post_ingestion_request, _record_successful_case, + _render_log_value, _run_ingestor, + _save_state, + _validate_execution_dir, ) @@ -70,6 +93,40 @@ def fake_locator(execution_dir: str) -> dict[str, str]: assert stats["skipped_invalid"] == 1 +def test_discover_case_executions_logs_suppressed_skips( + tmp_path: Path, + monkeypatch, +) -> None: + archive_root = tmp_path / "archive" + for index in range(ingestor_module.MAX_SKIP_DETAIL_LOGS + 2): + (archive_root / "case_a" / f"{100 + index}.1-1").mkdir(parents=True) + + logged_events: list[tuple[str, 
dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + grouped = _discover_case_executions( + archive_root, + metadata_locator=lambda *_: (_ for _ in ()).throw(FileNotFoundError("missing")), + ) + + assert grouped == {} + suppression = [ + fields + for event, fields in logged_events + if event == "execution_skip_logs_suppressed" + ] + assert suppression == [ + { + "suppressed_count": 2, + "detail_log_limit": ingestor_module.MAX_SKIP_DETAIL_LOGS, + } + ] + + def test_build_ingestion_candidates_is_idempotent() -> None: scan_results = [ CaseScanResult( @@ -114,6 +171,104 @@ def test_build_ingestion_candidates_is_idempotent() -> None: assert third_candidates[0].new_execution_ids == ["101.1-1"] +def test_build_ingestion_candidates_handles_non_dict_case_state_and_limit() -> None: + scan_results = [ + CaseScanResult( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + fingerprint="fp-1", + ), + CaseScanResult( + case_path="/performance_archive/case_b", + execution_ids=["200.1-1"], + fingerprint="fp-2", + ), + ] + state = { + "cases": { + "/performance_archive/case_a": "invalid", + "/performance_archive/case_b": { + "processed_execution_ids": ["200.1-1"], + }, + } + } + + candidates = _build_ingestion_candidates( + scan_results, + state, + max_cases_per_run=1, + ) + + assert len(candidates) == 1 + assert candidates[0].case_path == "/performance_archive/case_a" + + +def test_build_ingestion_candidates_handles_non_dict_cases_root() -> None: + scan_results = [ + CaseScanResult( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + ] + + candidates = _build_ingestion_candidates( + scan_results, + state={"cases": "invalid"}, + max_cases_per_run=None, + ) + + assert len(candidates) == 1 + assert 
candidates[0].new_execution_ids == ["100.1-1"] + + +def test_validate_execution_dir_counts_incomplete_with_stats(tmp_path: Path) -> None: + case_dir = tmp_path / "case_a" + (case_dir / "100.1-1").mkdir(parents=True) + stats = ingestor_module._new_discovery_stats() + skip_log_state = {"logged": 0, "suppressed": 0} + + valid = _validate_execution_dir( + case_dir, + "100.1-1", + metadata_locator=lambda *_: (_ for _ in ()).throw(FileNotFoundError("missing")), + stats=stats, + skip_log_state=skip_log_state, + ) + + assert valid is False + assert stats["skipped_incomplete"] == 1 + + +def test_validate_execution_dir_counts_value_error_with_stats(tmp_path: Path) -> None: + case_dir = tmp_path / "case_a" + (case_dir / "100.1-1").mkdir(parents=True) + stats = ingestor_module._new_discovery_stats() + skip_log_state = {"logged": 0, "suppressed": 0} + + valid = _validate_execution_dir( + case_dir, + "100.1-1", + metadata_locator=lambda *_: (_ for _ in ()).throw(ValueError("invalid")), + stats=stats, + skip_log_state=skip_log_state, + ) + + assert valid is False + assert stats["skipped_invalid"] == 1 + + +def test_build_case_scan_results_skips_empty_execution_lists() -> None: + grouped = { + "/performance_archive/case_a": [], + "/performance_archive/case_b": ["200.1-1"], + } + + results = _build_case_scan_results(grouped) + + assert [result.case_path for result in results] == ["/performance_archive/case_b"] + + def test_ingest_case_with_retries_retries_transient_errors() -> None: candidate = IngestionCandidate( case_path="/performance_archive/case_a", @@ -259,6 +414,46 @@ def fake_post_request( assert str(case_dir.resolve()) in reloaded_state["cases"] +def test_handle_ingest_run_returns_failure_when_case_ingestion_fails( + tmp_path: Path, + monkeypatch, +) -> None: + archive_root = tmp_path / "archive" + case_dir = archive_root / "case_a" + (case_dir / "100.1-1").mkdir(parents=True) + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: 
dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + config = IngestorConfig( + api_base_url="http://backend:8000", + api_token="token", + archive_root=archive_root, + machine_name="perlmutter", + state_path=tmp_path / "state.json", + dry_run=False, + max_cases_per_run=None, + max_attempts=1, + request_timeout_seconds=30, + ) + + def fake_post_request(*args: Any, **kwargs: Any) -> IngestionRequestResponse: + raise IngestionRequestError("boom", status_code=503, transient=False) + + exit_code = _run_ingestor( + config, + metadata_locator=lambda *_: {}, + sleep_fn=lambda *_: None, + post_request_fn=fake_post_request, + ) + + assert exit_code == 1 + assert any(event == "case_ingestion_failed" for event, _ in logged_events) + + def test_build_case_scan_results_is_deterministic() -> None: grouped = { "/performance_archive/case_b": ["200.1-1"], @@ -490,3 +685,487 @@ def fake_ingest_post_request(*args: Any, **kwargs: Any) -> IngestionRequestRespo assert isinstance(payload["execution_dirs_accepted"], int) assert isinstance(payload["skipped_incomplete"], int) assert isinstance(payload["skipped_invalid"], int) + + +def test_build_config_from_env_parses_valid_values(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("SIMBOARD_API_BASE_URL", "http://example") + monkeypatch.setenv("SIMBOARD_API_TOKEN", "token") + monkeypatch.setenv("PERF_ARCHIVE_ROOT", str(tmp_path / "archive")) + monkeypatch.setenv("MACHINE_NAME", "pm") + monkeypatch.setenv("STATE_PATH", str(tmp_path / "state.json")) + monkeypatch.setenv("DRY_RUN", "true") + monkeypatch.setenv("MAX_CASES_PER_RUN", "5") + monkeypatch.setenv("MAX_ATTEMPTS", "4") + monkeypatch.setenv("REQUEST_TIMEOUT_SECONDS", "90") + + config = _build_config_from_env() + + assert config.api_base_url == "http://example" + assert config.api_token == "token" + assert config.archive_root == (tmp_path / 
"archive").resolve() + assert config.machine_name == "pm" + assert config.state_path == (tmp_path / "state.json").resolve() + assert config.dry_run is True + assert config.max_cases_per_run == 5 + assert config.max_attempts == 4 + assert config.request_timeout_seconds == 90 + + +@pytest.mark.parametrize( + ("env_name", "env_value", "message"), + [ + ("MAX_CASES_PER_RUN", "0", "MAX_CASES_PER_RUN must be greater than 0"), + ("MAX_ATTEMPTS", "0", "MAX_ATTEMPTS must be greater than 0"), + ( + "REQUEST_TIMEOUT_SECONDS", + "0", + "REQUEST_TIMEOUT_SECONDS must be greater than 0", + ), + ], +) +def test_build_config_from_env_rejects_invalid_positive_values( + monkeypatch, + env_name: str, + env_value: str, + message: str, +) -> None: + monkeypatch.setenv(env_name, env_value) + + with pytest.raises(ValueError, match=message): + _build_config_from_env() + + +def test_main_returns_configuration_error_when_config_build_fails(monkeypatch) -> None: + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr( + ingestor_module, + "_build_config_from_env", + lambda: (_ for _ in ()).throw(ValueError("bad config")), + ) + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + exit_code = ingestor_module.main() + + assert exit_code == 1 + assert logged_events == [("configuration_error", {"error": "bad config"})] + + +def test_main_logs_run_started_and_finished(monkeypatch, tmp_path: Path) -> None: + config = IngestorConfig( + api_base_url="http://backend:8000", + api_token="token", + archive_root=tmp_path, + machine_name="perlmutter", + state_path=tmp_path / "state.json", + dry_run=False, + max_cases_per_run=None, + max_attempts=1, + request_timeout_seconds=30, + ) + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + 
logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_build_config_from_env", lambda: config) + monkeypatch.setattr(ingestor_module, "_run_ingestor", lambda cfg: 0) + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + monkeypatch.setattr( + ingestor_module.time, "monotonic", lambda: 10.0 if not logged_events else 12.5 + ) + + exit_code = ingestor_module.main() + + assert exit_code == 0 + assert logged_events[0][0] == "run_started" + assert logged_events[-1][0] == "run_finished" + + +def test_module_main_guard_exits_via_system_exit_on_configuration_error( + monkeypatch, +) -> None: + script_path = ( + Path(__file__).resolve().parents[3] + / "app/scripts/ingestion/nersc_archive_ingestor.py" + ) + monkeypatch.setenv("MAX_ATTEMPTS", "0") + monkeypatch.setattr(logging.Logger, "info", lambda *args, **kwargs: None) + + with pytest.raises(SystemExit) as exc_info: + runpy.run_path(str(script_path), run_name="__main__") + + assert exc_info.value.code == 1 + + +@pytest.mark.parametrize( + ("value", "default", "expected"), + [ + (None, True, True), + ("yes", False, True), + ("off", True, False), + ("maybe", True, True), + ], +) +def test_parse_bool(value: str | None, default: bool, expected: bool) -> None: + assert _parse_bool(value, default=default) is expected + + +def test_parse_optional_int_handles_none_and_blank() -> None: + assert _parse_optional_int(None) is None + assert _parse_optional_int(" ") is None + assert _parse_optional_int("7") == 7 + + +def test_ingest_case_with_retries_uses_default_post_request_fn(monkeypatch) -> None: + candidate = IngestionCandidate( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + new_execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + captured: list[tuple[str, str, str, str, int]] = [] + + def fake_post( + endpoint_url: str, + api_token: str, + archive_path: str, + machine_name: str, + *, + timeout_seconds: int, + ) -> 
IngestionRequestResponse: + captured.append( + (endpoint_url, api_token, archive_path, machine_name, timeout_seconds) + ) + return {"status_code": 201, "body": {"created_count": 1}} + + monkeypatch.setattr(ingestor_module, "_post_ingestion_request", fake_post) + + result = _ingest_case_with_retries( + candidate, + endpoint_url="http://backend:8000/api/v1/ingestions/from-path", + api_token="token", + machine_name="pm", + max_attempts=1, + timeout_seconds=5, + sleep_fn=lambda *_: None, + ) + + assert result["ok"] is True + assert captured == [ + ( + "http://backend:8000/api/v1/ingestions/from-path", + "token", + "/performance_archive/case_a", + "pm", + 5, + ) + ] + + +def test_ingest_case_with_retries_normalizes_non_dict_body() -> None: + candidate = IngestionCandidate( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + new_execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + + def fake_post_request(*args: Any, **kwargs: Any) -> IngestionRequestResponse: + return {"status_code": 201, "body": "bad"} # type: ignore[typeddict-item] + + result = _ingest_case_with_retries( + candidate, + endpoint_url="http://backend:8000/api/v1/ingestions/from-path", + api_token="token", + machine_name="pm", + max_attempts=1, + timeout_seconds=5, + sleep_fn=lambda *_: None, + post_request_fn=fake_post_request, + ) + + assert result["ok"] is True + assert result["body"] == {} + + +def test_ingest_case_with_retries_returns_exhausted_retries_when_zero_attempts() -> ( + None +): + candidate = IngestionCandidate( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + new_execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + + def fake_post_request(*args: Any, **kwargs: Any) -> IngestionRequestResponse: + return {"status_code": 201, "body": {}} + + result = _ingest_case_with_retries( + candidate, + endpoint_url="http://backend:8000/api/v1/ingestions/from-path", + api_token="token", + machine_name="pm", + max_attempts=0, + timeout_seconds=5, + 
sleep_fn=lambda *_: None, + post_request_fn=fake_post_request, + ) + + assert result == { + "ok": False, + "attempts": 0, + "status_code": None, + "body": None, + "error": "Exhausted retries", + } + + +class _FakeHttpResponse: + def __init__(self, status: int, body: str) -> None: + self.status = status + self._body = body.encode("utf-8") + + def read(self) -> bytes: + return self._body + + def __enter__(self) -> "_FakeHttpResponse": + return self + + def __exit__(self, exc_type, exc, tb) -> None: + return None + + +class _FakeHttpError(urllib.error.HTTPError): + def __init__(self, url: str, code: int, msg: str, body: bytes) -> None: + super().__init__(url, code, msg, hdrs=Message(), fp=None) + self._body = body + + def read(self, amt: int = -1) -> bytes: + return self._body if amt == -1 else self._body[:amt] + + +def test_post_ingestion_request_success(monkeypatch) -> None: + captured_request: list[urllib.request.Request] = [] + + def fake_urlopen(request: urllib.request.Request, timeout: int): + captured_request.append(request) + assert timeout == 12 + return _FakeHttpResponse(201, json.dumps({"created_count": 1})) + + monkeypatch.setattr(ingestor_module.urllib.request, "urlopen", fake_urlopen) + + response = _post_ingestion_request( + "http://backend:8000/api/v1/ingestions/from-path", + "token", + "/archive/case_a", + "pm", + timeout_seconds=12, + ) + + assert response == {"status_code": 201, "body": {"created_count": 1}} + assert captured_request[0].headers["Authorization"] == "Bearer token" + + +def test_post_ingestion_request_handles_http_error(monkeypatch) -> None: + request = urllib.request.Request("http://example.com") + error = _FakeHttpError( + request.full_url, + 503, + "Service Unavailable", + b"retry later", + ) + + monkeypatch.setattr( + ingestor_module.urllib.request, + "urlopen", + lambda *args, **kwargs: (_ for _ in ()).throw(error), + ) + + with pytest.raises(IngestionRequestError) as exc_info: + _post_ingestion_request( + 
"http://backend:8000/api/v1/ingestions/from-path", + "token", + "/archive/case_a", + "pm", + timeout_seconds=12, + ) + + assert exc_info.value.status_code == 503 + assert exc_info.value.transient is True + + +def test_post_ingestion_request_handles_url_error(monkeypatch) -> None: + monkeypatch.setattr( + ingestor_module.urllib.request, + "urlopen", + lambda *args, **kwargs: (_ for _ in ()).throw( + urllib.error.URLError("network down") + ), + ) + + with pytest.raises(IngestionRequestError, match="URL error: network down"): + _post_ingestion_request( + "http://backend:8000/api/v1/ingestions/from-path", + "token", + "/archive/case_a", + "pm", + timeout_seconds=12, + ) + + +def test_post_ingestion_request_handles_timeout(monkeypatch) -> None: + monkeypatch.setattr( + ingestor_module.urllib.request, + "urlopen", + lambda *args, **kwargs: (_ for _ in ()).throw(TimeoutError()), + ) + + with pytest.raises(IngestionRequestError, match="Request timed out"): + _post_ingestion_request( + "http://backend:8000/api/v1/ingestions/from-path", + "token", + "/archive/case_a", + "pm", + timeout_seconds=12, + ) + + +def test_is_transient_status() -> None: + assert _is_transient_status(503) is True + assert _is_transient_status(400) is False + + +def test_load_state_handles_invalid_json_and_logs(monkeypatch, tmp_path: Path) -> None: + state_path = tmp_path / "state.json" + state_path.write_text("{invalid", encoding="utf-8") + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + state = _load_state(state_path) + + assert state["version"] == ingestor_module.STATE_VERSION + assert logged_events[0][0] == "state_load_failed" + + +def test_load_state_handles_non_dict_root_and_non_dict_cases(tmp_path: Path) -> None: + non_dict_path = tmp_path / "not_dict.json" + 
non_dict_path.write_text("[]", encoding="utf-8") + assert _load_state(non_dict_path)["cases"] == {} + + bad_cases_path = tmp_path / "bad_cases.json" + bad_cases_path.write_text( + json.dumps( + {"version": 9, "cases": [], "updated_at": "2024-01-01T00:00:00+00:00"} + ), + encoding="utf-8", + ) + loaded = _load_state(bad_cases_path) + assert loaded["cases"] == {} + assert loaded["version"] == 9 + + +def test_save_state_round_trips(tmp_path: Path) -> None: + state_path = tmp_path / "nested" / "state.json" + state = {"version": 1, "cases": {}} + + _save_state(state_path, state) + + assert state_path.exists() + saved = json.loads(state_path.read_text(encoding="utf-8")) + assert saved["version"] == 1 + assert "updated_at" in saved + + +def test_record_successful_case_replaces_non_dict_cases() -> None: + state: dict[str, Any] = {"cases": []} + candidate = IngestionCandidate( + case_path="/archive/case_a", + execution_ids=["100.1-1"], + new_execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + + _record_successful_case(state, candidate) + + assert state["cases"]["/archive/case_a"]["fingerprint"] == "fp-1" + + +def test_case_state_processed_ids_ignores_non_list() -> None: + assert _case_state_processed_ids({"processed_execution_ids": "bad"}) == set() + + +def test_normalized_api_base_url_handles_existing_api_base() -> None: + base_url = f"http://backend:8000{API_BASE}" + assert _normalized_api_base_url(base_url) == base_url + assert _normalized_api_base_url("http://backend:8000/") == ( + f"http://backend:8000{API_BASE}" + ) + + +def test_render_log_value_formats_values() -> None: + assert _render_log_value("plain-value") == "plain-value" + assert _render_log_value("value with space") == '"value with space"' + assert _render_log_value({"b": 1, "a": 2}) == '{"a": 2, "b": 1}' + + +def test_log_execution_skip_detail_suppresses_after_limit(monkeypatch) -> None: + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | 
None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + skip_log_state = {"logged": ingestor_module.MAX_SKIP_DETAIL_LOGS, "suppressed": 0} + + _log_execution_skip_detail( + "execution_skipped_incomplete", + "/archive/case_a", + "100.1-1", + "missing", + skip_log_state, + ) + + assert logged_events == [] + assert skip_log_state["suppressed"] == 1 + + +def test_log_summary_table_and_startup_configuration( + monkeypatch, tmp_path: Path +) -> None: + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + _log_summary_table("summary", [("a", 1), ("b", "two words")]) + + config = IngestorConfig( + api_base_url="http://backend:8000", + api_token="token", + archive_root=tmp_path, + machine_name="pm", + state_path=tmp_path / "state.json", + dry_run=True, + max_cases_per_run=5, + max_attempts=2, + request_timeout_seconds=60, + ) + _log_startup_configuration( + config, endpoint_url="http://backend:8000/api/v1/ingestions/from-path" + ) + + assert logged_events[0][0] == "summary_table" + assert logged_events[1][0] == "startup_configuration_begin" + assert logged_events[2][0] == "summary_table" + assert logged_events[3][0] == "startup_configuration_end" From 0c66fc1028b1e6e54cfada8581a2c697875d24b8 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Wed, 18 Mar 2026 13:47:44 -0700 Subject: [PATCH 03/10] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- backend/app/features/ingestion/parsers/case_docs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/app/features/ingestion/parsers/case_docs.py 
b/backend/app/features/ingestion/parsers/case_docs.py index 238c67c7..6a16b52c 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -11,7 +11,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: - """Parse env_case.xml (plain or gzipped) to extract case_group. + """Parse env_case.xml (plain or gzipped) to extract case metadata. Parameters ---------- @@ -21,7 +21,16 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: Returns ------- dict - Dictionary with key 'case_group' (str or None) + Dictionary with case metadata (values are str or None), including: + + - ``case_name``: Case name (``CASE``) + - ``case_group``: Case group (``CASE_GROUP``) + - ``machine``: Machine name (``MACH``) + - ``user``: Real user (``REALUSER``) + - ``campaign``: Derived campaign identifier from the case name + - ``experiment_type``: Derived experiment type, constrained to + KNOWN_EXPERIMENT_TYPES when possible + - ``compset_alias``: Compset alias (``COMPSET``) """ env_case_path = Path(env_case_path) From fc357bdde7f66ffa5f5483f864c1366c66869584 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 13:50:05 -0700 Subject: [PATCH 04/10] Add Copilot review suggestions --- backend/app/features/ingestion/parsers/case_docs.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/app/features/ingestion/parsers/case_docs.py b/backend/app/features/ingestion/parsers/case_docs.py index 6a16b52c..8da7589a 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -11,7 +11,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: - """Parse env_case.xml (plain or gzipped) to extract case metadata. + """Parse env_case.xml (plain or gzipped). 
Parameters ---------- @@ -55,7 +55,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: def parse_env_build(env_build_path: str | Path) -> dict[str, str | None]: - """Parse env_build.xml (plain or gzipped) to extract compiler and mpilib. + """Parse env_build.xml (plain or gzipped). Parameters ---------- @@ -231,12 +231,15 @@ def _calculate_simulation_end_date( stop_n: str | None, stop_date: str | None, ) -> str | None: - if not simulation_start_date or not stop_option: + if not stop_option: return None if stop_option == "date": return _parse_stop_date(stop_date) + if not simulation_start_date: + return None + if stop_n is None: return None From 27b194dcc2e66f8ca9c1c5ed54312d86bf89e7fc Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 14:26:23 -0700 Subject: [PATCH 05/10] Refactor ingestion parser boundary --- backend/app/features/ingestion/ingest.py | 296 +++++++------ .../app/features/ingestion/parsers/parser.py | 121 +++--- .../app/features/ingestion/parsers/types.py | 30 ++ backend/app/features/simulation/schemas.py | 8 +- .../features/ingestion/parsers/test_parser.py | 129 ++++-- .../tests/features/ingestion/test_ingest.py | 393 ++++++++++++++++-- 6 files changed, 724 insertions(+), 253 deletions(-) create mode 100644 backend/app/features/ingestion/parsers/types.py diff --git a/backend/app/features/ingestion/ingest.py b/backend/app/features/ingestion/ingest.py index 39e8f1ae..3e5f65db 100644 --- a/backend/app/features/ingestion/ingest.py +++ b/backend/app/features/ingestion/ingest.py @@ -18,7 +18,6 @@ keyed by case.id, to avoid repeated DB queries. 
""" -import os from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path @@ -29,13 +28,15 @@ from sqlalchemy.orm import Session from app.core.logger import _setup_custom_logger -from app.features.ingestion.parsers.parser import SimulationMetadata, main_parser +from app.features.ingestion.parsers.parser import main_parser +from app.features.ingestion.parsers.types import ParsedSimulation from app.features.machine.models import Machine from app.features.simulation.enums import SimulationStatus, SimulationType from app.features.simulation.models import Case, Simulation from app.features.simulation.schemas import SimulationCreate logger = _setup_custom_logger(__name__) +ConfigProjection = dict[str, str | None] @dataclass @@ -114,11 +115,11 @@ def ingest_archive( Path(output_dir) if isinstance(output_dir, str) else output_dir ) - all_simulations, skipped_count = main_parser( + parsed_simulations, skipped_count = main_parser( archive_path_resolved, output_dir_resolved ) - if not all_simulations: + if not parsed_simulations: logger.warning(f"No simulations found in archive: {archive_path_resolved}") return IngestArchiveResult( @@ -131,14 +132,13 @@ def ingest_archive( simulations: list[SimulationCreate] = [] duplicate_count = 0 errors: list[dict[str, str]] = [] - canonical_cache: dict[str, SimulationMetadata] = {} - persisted_canonical_cache: dict[UUID, SimulationMetadata | None] = {} + canonical_cache: dict[str, ConfigProjection] = {} + persisted_canonical_cache: dict[UUID, ConfigProjection | None] = {} - for execution_dir, metadata in all_simulations.items(): + for parsed_simulation in parsed_simulations: try: simulation, is_duplicate = _process_simulation_for_ingest( - execution_dir=execution_dir, - metadata=metadata, + parsed_simulation=parsed_simulation, db=db, canonical_cache=canonical_cache, persisted_canonical_cache=persisted_canonical_cache, @@ -152,11 +152,15 @@ def ingest_archive( simulations.append(simulation) 
except (ValueError, LookupError, ValidationError) as e: - logger.error(f"Failed to process simulation from {execution_dir}: {e}") + logger.error( + "Failed to process simulation from %s: %s", + parsed_simulation.execution_dir, + e, + ) errors.append( { - "execution_dir": str(execution_dir), + "execution_dir": parsed_simulation.execution_dir, "error_type": type(e).__name__, "error": str(e), } @@ -175,25 +179,22 @@ def ingest_archive( def _process_simulation_for_ingest( - execution_dir: str, - metadata: SimulationMetadata, + parsed_simulation: ParsedSimulation, db: Session, - canonical_cache: dict[str, SimulationMetadata], - persisted_canonical_cache: dict[UUID, SimulationMetadata | None], + canonical_cache: dict[str, ConfigProjection], + persisted_canonical_cache: dict[UUID, ConfigProjection | None], ) -> tuple[SimulationCreate | None, bool]: """Process one parsed simulation entry. Parameters ---------- - execution_dir : str - Execution directory name (used to derive execution_id). - metadata : SimulationMetadata - Parsed metadata dictionary for the simulation. + parsed_simulation : ParsedSimulation + Parsed archive-derived metadata for the simulation. db : Session Active database session for lookups and case resolution. - canonical_cache : dict[str, SimulationMetadata] - In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, SimulationMetadata | None] + canonical_cache : dict[str, ConfigProjection] + In-memory cache of canonical config values per case_name for the current batch. + persisted_canonical_cache : dict[UUID, ConfigProjection | None] Cache of canonical metadata loaded from the database by case_id. Returns @@ -203,19 +204,19 @@ def _process_simulation_for_ingest( only for new records and ``is_duplicate`` is True when an existing ``execution_id`` was found. 
""" - execution_id = _derive_execution_id(execution_dir) - case_name = _require_case_name(metadata, execution_dir) - machine_id = _resolve_machine_id(metadata, db) - case = _resolve_case(metadata, case_name, db) - - if _is_duplicate_simulation(execution_id, execution_dir, db): - _seed_canonical_cache_from_duplicate(case_name, metadata, canonical_cache) + execution_id = parsed_simulation.execution_id + case_name = _require_case_name(parsed_simulation) + machine_id = _resolve_machine_id(parsed_simulation, db) + case = _resolve_case(parsed_simulation, case_name, db) + + if _is_duplicate_simulation(execution_id, parsed_simulation.execution_dir, db): + _seed_canonical_cache_from_duplicate( + case_name, parsed_simulation, canonical_cache + ) return None, True simulation = _build_simulation_create( - execution_dir=execution_dir, - metadata=metadata, - execution_id=execution_id, + parsed_simulation=parsed_simulation, machine_id=machine_id, case=case, canonical_cache=canonical_cache, @@ -226,22 +227,24 @@ def _process_simulation_for_ingest( return simulation, False -def _require_case_name(metadata: SimulationMetadata, execution_dir: str) -> str: +def _require_case_name(parsed_simulation: ParsedSimulation) -> str: """Return case_name from metadata or raise a descriptive error.""" - case_name = metadata.get("case_name") + case_name = parsed_simulation.case_name if not case_name: raise ValueError( - f"case_name is required but missing from '{execution_dir}'. " + f"case_name is required but missing from '{parsed_simulation.execution_dir}'. " "Cannot determine Case identity." 
) return case_name -def _resolve_case(metadata: SimulationMetadata, case_name: str, db: Session) -> Case: +def _resolve_case( + parsed_simulation: ParsedSimulation, case_name: str, db: Session +) -> Case: """Resolve or create the Case for the current metadata row.""" - case_group = metadata.get("case_group") + case_group = parsed_simulation.case_group result = _get_or_create_case(db, name=case_name, case_group=case_group) @@ -266,41 +269,37 @@ def _is_duplicate_simulation( def _seed_canonical_cache_from_duplicate( case_name: str, - metadata: SimulationMetadata, - canonical_cache: dict[str, SimulationMetadata], + parsed_simulation: ParsedSimulation, + canonical_cache: dict[str, ConfigProjection], ) -> None: """Seed per-case canonical cache using duplicate metadata when needed.""" if case_name not in canonical_cache: - canonical_cache[case_name] = metadata + canonical_cache[case_name] = _parsed_simulation_to_config_projection( + parsed_simulation + ) def _build_simulation_create( - execution_dir: str, - metadata: SimulationMetadata, - execution_id: str, + parsed_simulation: ParsedSimulation, machine_id: UUID, case: Case, - canonical_cache: dict[str, SimulationMetadata], - persisted_canonical_cache: dict[UUID, SimulationMetadata | None], + canonical_cache: dict[str, ConfigProjection], + persisted_canonical_cache: dict[UUID, ConfigProjection | None], db: Session, ) -> SimulationCreate: """Create a SimulationCreate using canonical baseline semantics. Parameters ---------- - execution_dir : str - Execution directory name (used for logging context). - metadata : SimulationMetadata - Parsed metadata dictionary for the simulation. - execution_id : str - Unique execution identifier derived from the execution directory. + parsed_simulation : ParsedSimulation + Parsed archive-derived metadata for the simulation. machine_id : UUID Resolved machine ID from the database. case : Case Resolved Case object for this simulation. 
- canonical_cache : dict[str, SimulationMetadata] + canonical_cache : dict[str, ConfigProjection] In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, SimulationMetadata | None] + persisted_canonical_cache : dict[UUID, ConfigProjection | None] Cache of canonical metadata loaded from the database by case_id. db : Session Active database session for lookups and case resolution. @@ -315,30 +314,44 @@ def _build_simulation_create( ) if canonical_metadata is None: - canonical_cache[case_name] = metadata + canonical_cache[case_name] = _parsed_simulation_to_config_projection( + parsed_simulation + ) - simulation = _map_metadata_to_schema( - metadata, machine_id, case.id, execution_id + simulation = _map_parsed_simulation_to_schema( + parsed_simulation, machine_id, case.id + ) + logger.info( + "Mapped canonical simulation from %s: %s", + parsed_simulation.execution_dir, + case_name, ) - logger.info(f"Mapped canonical simulation from {execution_dir}: {case_name}") return simulation - delta = _compute_config_delta(canonical_metadata, metadata) + delta = _compute_config_delta( + canonical_metadata, + _parsed_simulation_to_config_projection(parsed_simulation), + ) run_config_deltas = delta if delta else None - simulation = _map_metadata_to_schema( - metadata, machine_id, case.id, execution_id, run_config_deltas=run_config_deltas + simulation = _map_parsed_simulation_to_schema( + parsed_simulation, + machine_id, + case.id, + run_config_deltas=run_config_deltas, ) if delta: logger.info( - f"Non-canonical run in '{execution_dir}' has config differences from " - f"canonical: {list(delta.keys())}" + "Non-canonical run in '%s' has config differences from canonical: %s", + parsed_simulation.execution_dir, + list(delta.keys()), ) else: logger.info( - f"Non-canonical run in '{execution_dir}' has identical configuration to canonical." 
+ "Non-canonical run in '%s' has identical configuration to canonical.", + parsed_simulation.execution_dir, ) return simulation @@ -347,10 +360,10 @@ def _build_simulation_create( def _get_canonical_metadata_for_case( case: Case, case_name: str, - canonical_cache: dict[str, SimulationMetadata], - persisted_canonical_cache: dict[UUID, SimulationMetadata | None], + canonical_cache: dict[str, ConfigProjection], + persisted_canonical_cache: dict[UUID, ConfigProjection | None], db: Session, -) -> SimulationMetadata | None: +) -> ConfigProjection | None: """Resolve canonical metadata from persisted canonical or batch cache. This function is useful for ensuring that all simulations of the same case @@ -362,14 +375,14 @@ def _get_canonical_metadata_for_case( The Case object for which to retrieve canonical metadata. case_name : str The name of the case, used for in-memory cache lookup. - canonical_cache : dict[str, SimulationMetadata] + canonical_cache : dict[str, ConfigProjection] In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, SimulationMetadata | None] + persisted_canonical_cache : dict[UUID, ConfigProjection | None] Cache of canonical metadata loaded from the database by case_id. Returns ------- - SimulationMetadata | None + ConfigProjection | None The canonical metadata for the case, or None if no canonical run exists. """ if case.canonical_simulation_id is not None: @@ -383,7 +396,7 @@ def _get_canonical_metadata_for_case( ) if canonical_sim: - canonical_metadata = _sim_to_metadata(canonical_sim) + canonical_metadata = _simulation_to_config_projection(canonical_sim) persisted_canonical_cache[case.id] = canonical_metadata return canonical_metadata @@ -394,32 +407,6 @@ def _get_canonical_metadata_for_case( return canonical_cache.get(case_name) -def _derive_execution_id(execution_dir: str) -> str: - """Extract execution_id from the execution directory path. 
- - The execution_id is the basename of the execution directory - (e.g. ``1125772.260116-181605``). Absolute filesystem paths are - never stored. - - Parameters - ---------- - execution_dir : str - Execution directory name from the parser output (e.g. from timing files). - - Raises - ------ - ValueError - If the derived execution_id is empty. - """ - execution_id = os.path.basename(execution_dir) - if not execution_id: - raise ValueError( - f"Cannot derive execution_id from execution directory: '{execution_dir}'" - ) - - return execution_id - - def _get_or_create_case(db: Session, name: str, case_group: str | None = None) -> Case: """Get or create a Case record by case name. @@ -462,10 +449,30 @@ def _get_or_create_case(db: Session, name: str, case_group: str | None = None) - return case -def _sim_to_metadata(sim: Simulation) -> SimulationMetadata: - """Build a parser-style metadata dict from a persisted Simulation.""" +def _parsed_simulation_to_config_projection( + parsed_simulation: ParsedSimulation, +) -> ConfigProjection: + """Return config-comparison fields from a parsed simulation.""" + return { + "compset": parsed_simulation.compset, + "compset_alias": parsed_simulation.compset_alias, + "grid_name": parsed_simulation.grid_name, + "grid_resolution": parsed_simulation.grid_resolution, + "initialization_type": parsed_simulation.initialization_type, + "compiler": parsed_simulation.compiler, + "git_tag": parsed_simulation.git_tag, + "git_commit_hash": parsed_simulation.git_commit_hash, + "git_branch": parsed_simulation.git_branch, + "git_repository_url": _normalize_git_url(parsed_simulation.git_repository_url), + "campaign": parsed_simulation.campaign, + "experiment_type": parsed_simulation.experiment_type, + "simulation_type": SimulationType.UNKNOWN.value, + } + + +def _simulation_to_config_projection(sim: Simulation) -> ConfigProjection: + """Return config-comparison fields from a persisted Simulation.""" return { - "case_name": sim.case.name if sim.case else 
None, "compset": sim.compset, "compset_alias": sim.compset_alias, "grid_name": sim.grid_name, @@ -475,15 +482,31 @@ def _sim_to_metadata(sim: Simulation) -> SimulationMetadata: "git_tag": sim.git_tag, "git_commit_hash": sim.git_commit_hash, "git_branch": sim.git_branch, - "git_repository_url": sim.git_repository_url, + "git_repository_url": _stringify_config_value(sim.git_repository_url), "campaign": sim.campaign, - "experiment_type": sim.experiment_type, + "experiment_type": _stringify_config_value(sim.experiment_type), + "simulation_type": _stringify_config_value(sim.simulation_type), } +def _stringify_config_value(value: object) -> str | None: + """Convert enum-like config values to strings for delta comparison.""" + if value is None: + return None + + enum_value = getattr(value, "value", None) + if isinstance(enum_value, str): + return enum_value + + if isinstance(value, str): + return value + + return str(value) + + def _compute_config_delta( - canonical: SimulationMetadata, - other: SimulationMetadata, + canonical: ConfigProjection, + other: ConfigProjection, ) -> dict[str, dict[str, str | None]]: """Compare two run metadata dicts and return configuration differences. @@ -491,9 +514,9 @@ def _compute_config_delta( Parameters ---------- - canonical : SimulationMetadata + canonical : ConfigProjection The canonical run metadata to compare against. - other : SimulationMetadata + other : ConfigProjection The other run metadata to compare. Returns @@ -514,13 +537,13 @@ def _compute_config_delta( return delta -def _resolve_machine_id(metadata: SimulationMetadata, db: Session) -> UUID: +def _resolve_machine_id(metadata: ParsedSimulation, db: Session) -> UUID: """Resolve machine name to machine ID from the database. Parameters ---------- - metadata : SimulationMetadata - Parsed metadata dictionary for the simulation, expected to contain a + metadata : ParsedSimulation + Parsed metadata for the simulation, expected to contain a "machine" key with the machine name. 
db : Session Active database session for querying the Machine table. @@ -532,7 +555,7 @@ def _resolve_machine_id(metadata: SimulationMetadata, db: Session) -> UUID: LookupError If machine name cannot be found in database. """ - machine_name = metadata.get("machine") + machine_name = metadata.machine if not machine_name: raise ValueError("Machine name is required but not found in metadata") @@ -553,7 +576,7 @@ def _find_existing_simulation(db: Session, execution_id: str) -> Simulation | No db : Session Active database session for querying the Simulation table. execution_id : str - Unique execution identifier derived from the execution directory. + Unique execution identifier derived from the timing-file LID. Returns ------- @@ -615,25 +638,22 @@ def _normalize_git_url(url: str | None) -> str | None: return url -def _map_metadata_to_schema( - metadata: SimulationMetadata, +def _map_parsed_simulation_to_schema( + parsed_simulation: ParsedSimulation, machine_id: UUID, case_id: UUID, - execution_id: str, run_config_deltas: dict[str, dict[str, str | None]] | None = None, ) -> SimulationCreate: """Map parser metadata to SimulationCreate schema with type conversions. Parameters ---------- - metadata : SimulationMetadata - Dictionary of parsed simulation metadata with string values. + parsed_simulation : ParsedSimulation + Parsed archive-derived metadata with string values. machine_id : UUID Pre-extracted machine ID. case_id : UUID ID of the Case this simulation belongs to. - execution_id : str - Unique execution identifier derived from the archive directory. run_config_deltas : dict | None Configuration differences vs canonical baseline, or None. 
@@ -644,15 +664,17 @@ def _map_metadata_to_schema( """ # Parse datetime fields using the shared utility function # Note: simulation_start_date is already validated in _extract_simulation_key() - simulation_start_date = _parse_datetime_field(metadata.get("simulation_start_date")) - simulation_end_date = _parse_datetime_field(metadata.get("simulation_end_date")) + simulation_start_date = _parse_datetime_field( + parsed_simulation.simulation_start_date + ) + simulation_end_date = _parse_datetime_field(parsed_simulation.simulation_end_date) - run_start_date = _parse_datetime_field(metadata.get("run_start_date")) - run_end_date = _parse_datetime_field(metadata.get("run_end_date")) + run_start_date = _parse_datetime_field(parsed_simulation.run_start_date) + run_end_date = _parse_datetime_field(parsed_simulation.run_end_date) - git_repository_url = _normalize_git_url(metadata.get("git_repository_url")) - simulation_type = _normalize_simulation_type(metadata.get("simulation_type")) - status = _normalize_simulation_status(metadata.get("status")) + git_repository_url = _normalize_git_url(parsed_simulation.git_repository_url) + simulation_type = _normalize_simulation_type(None) + status = _normalize_simulation_status(parsed_simulation.status) # Map metadata to schema; Pydantic will validate required fields # Note: SimulationCreate uses CamelInBaseModel which expects camelCase field names @@ -660,35 +682,35 @@ def _map_metadata_to_schema( { # Required identification fields "caseId": case_id, - "executionId": execution_id, + "executionId": parsed_simulation.execution_id, # Required configuration fields - "compset": metadata.get("compset"), - "compsetAlias": metadata.get("compset_alias"), - "gridName": metadata.get("grid_name"), - "gridResolution": metadata.get("grid_resolution"), + "compset": parsed_simulation.compset, + "compsetAlias": parsed_simulation.compset_alias, + "gridName": parsed_simulation.grid_name, + "gridResolution": parsed_simulation.grid_resolution, # Required 
status fields with sensible defaults "simulationType": simulation_type, "status": status, - "initializationType": metadata.get("initialization_type"), + "initializationType": parsed_simulation.initialization_type, "machineId": machine_id, "simulationStartDate": simulation_start_date, "simulationEndDate": simulation_end_date, - "experimentType": metadata.get("experiment_type"), - "campaign": metadata.get("campaign"), + "experimentType": parsed_simulation.experiment_type, + "campaign": parsed_simulation.campaign, "runStartDate": run_start_date, "runEndDate": run_end_date, - "compiler": metadata.get("compiler"), + "compiler": parsed_simulation.compiler, "gitRepositoryUrl": git_repository_url, - "gitBranch": metadata.get("git_branch"), - "gitTag": metadata.get("git_tag"), - "gitCommitHash": metadata.get("git_commit_hash"), + "gitBranch": parsed_simulation.git_branch, + "gitTag": parsed_simulation.git_tag, + "gitCommitHash": parsed_simulation.git_commit_hash, # Note: created_by and last_updated_by are set to None since archive # metadata contains local usernames that cannot be reliably mapped to # database user UUIDs. The API endpoint will set these values based on # the authenticated user who uploaded the archive. 
"createdBy": None, "lastUpdatedBy": None, - "hpcUsername": metadata.get("hpc_username"), + "hpcUsername": parsed_simulation.hpc_username, # Canonical run semantics "runConfigDeltas": run_config_deltas, } diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index db871ee4..72a3463b 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -42,11 +42,10 @@ parse_git_status, ) from app.features.ingestion.parsers.readme_case import parse_readme_case -from app.features.simulation.enums import SimulationStatus, SimulationType +from app.features.ingestion.parsers.types import ParsedSimulation +from app.features.simulation.enums import SimulationStatus SimulationFiles = dict[str, str | None] -SimulationMetadata = dict[str, str | None] -AllSimulations = dict[str, SimulationMetadata] logger = _setup_custom_logger(__name__) @@ -114,7 +113,7 @@ class FileSpec(TypedDict, total=False): def main_parser( archive_path: str | Path, output_dir: str | Path -) -> tuple[AllSimulations, int]: +) -> tuple[list[ParsedSimulation], int]: """Main entrypoint for parser workflow. Parses case directories from a performance archive, handling incomplete or @@ -134,10 +133,10 @@ def main_parser( Returns ------- - tuple[AllSimulations, int] - Dictionary mapping execution directory paths to their parsed simulations - and the count of skipped incomplete runs. Only directories that contain - all required metadata files are included. + tuple[list[ParsedSimulation], int] + Parsed simulations in deterministic execution-directory order and the + count of skipped incomplete runs. Only directories that contain all + required metadata files and a timing-file LID are included. 
""" archive_path = str(archive_path) output_dir = str(output_dir) @@ -164,7 +163,7 @@ def main_parser( "directories matching pattern: .-" ) - results: AllSimulations = {} + results: list[ParsedSimulation] = [] skipped_count = 0 for case_dir, exec_dirs in case_to_executions_dirs.items(): @@ -177,14 +176,13 @@ def main_parser( for exec_dir in sorted_exec_dirs: try: metadata_files = _locate_metadata_files(exec_dir) + results.append(_parse_all_files(exec_dir, metadata_files)) except FileNotFoundError as exc: logger.warning(f"Skipping incomplete run in '{exec_dir}': {exc}") skipped_count += 1 continue - results[exec_dir] = _parse_all_files(exec_dir, metadata_files) - if skipped_count: logger.info( f"Skipped {skipped_count} incomplete run(s) missing required files." @@ -398,7 +396,7 @@ def _check_missing_files(files: SimulationFiles, exp_dir: str) -> None: ) -def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationMetadata: +def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimulation: """Pass discovered files to their respective parser functions. Parameters @@ -408,10 +406,10 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM Returns ------- - SimulationMetadata - Dictionary with parsed results from each file type. + ParsedSimulation + Typed archive-derived metadata for one execution directory. 
""" - metadata: SimulationMetadata = {} + metadata: dict[str, str | None] = {} for key, spec in FILE_SPECS.items(): path = files.get(key) @@ -423,39 +421,60 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM metadata.update(parse_run_artifacts(exec_dir)) - simulation: SimulationMetadata = { - "execution_id": metadata.get("execution_id"), - "case_name": metadata.get("case_name"), - "compset": metadata.get("compset"), - "compset_alias": metadata.get("compset_alias"), - "grid_name": metadata.get("grid_name"), - "grid_resolution": metadata.get("grid_resolution"), - "campaign": metadata.get("campaign"), - "experiment_type": metadata.get("experiment_type"), - "initialization_type": metadata.get("initialization_type"), - "case_group": metadata.get("case_group"), - "simulation_start_date": metadata.get("simulation_start_date"), - "simulation_end_date": metadata.get("simulation_end_date"), - "run_start_date": metadata.get("run_start_date"), - "run_end_date": metadata.get("run_end_date"), - "compiler": metadata.get("compiler"), - "machine": metadata.get("machine"), - "hpc_username": metadata.get("user"), - "git_repository_url": metadata.get("git_repository_url"), - "git_branch": metadata.get("git_branch"), - "git_tag": metadata.get("git_tag"), - "git_commit_hash": metadata.get("git_commit_hash"), - "status": metadata.get("status", SimulationStatus.UNKNOWN.value), - # TODO: Skip this for MVP, not required. We can add it later if we find - # a way to determine it from the parsed files. - "parent_simulation_id": None, - # TODO: This is a required field, but we don't have a way to determine - # the simulation type from the parsed files yet. Default to UNKNOWN - # for now and manually update on the UI if needed. 
-        "simulation_type": SimulationType.UNKNOWN.value,
-        "extra": None,
-        "artifacts": None,
-        "links": None,
-    }
-
-    return simulation
+    execution_id = _require_execution_id(metadata.get("execution_id"), exec_dir)
+    _warn_on_execution_id_mismatch(exec_dir, execution_id)
+
+    return ParsedSimulation(
+        execution_dir=exec_dir,
+        execution_id=execution_id,
+        case_name=metadata.get("case_name"),
+        case_group=metadata.get("case_group"),
+        machine=metadata.get("machine"),
+        hpc_username=metadata.get("user"),
+        compset=metadata.get("compset"),
+        compset_alias=metadata.get("compset_alias"),
+        grid_name=metadata.get("grid_name"),
+        grid_resolution=metadata.get("grid_resolution"),
+        campaign=metadata.get("campaign"),
+        experiment_type=metadata.get("experiment_type"),
+        initialization_type=metadata.get("initialization_type"),
+        simulation_start_date=metadata.get("simulation_start_date"),
+        simulation_end_date=metadata.get("simulation_end_date"),
+        run_start_date=metadata.get("run_start_date"),
+        run_end_date=metadata.get("run_end_date"),
+        compiler=metadata.get("compiler"),
+        git_repository_url=metadata.get("git_repository_url"),
+        git_branch=metadata.get("git_branch"),
+        git_tag=metadata.get("git_tag"),
+        git_commit_hash=metadata.get("git_commit_hash"),
+        status=metadata.get("status", SimulationStatus.UNKNOWN.value),
+    )
+
+
+def _require_execution_id(execution_id: str | None, exec_dir: str) -> str:
+    """Return the stripped timing-file LID; raise FileNotFoundError if missing or blank."""
+    if execution_id is None:
+        raise FileNotFoundError(
+            f"Required timing-file LID missing for execution directory '{exec_dir}'"
+        )
+
+    normalized = execution_id.strip()
+    if not normalized:
+        raise FileNotFoundError(
+            f"Required timing-file LID missing for execution directory '{exec_dir}'"
+        )
+
+    return normalized
+
+
+def _warn_on_execution_id_mismatch(exec_dir: str, execution_id: str) -> None:
+    """Log when timing-file LID disagrees with the execution directory basename."""
+    exec_dir_basename = 
os.path.basename(exec_dir) + + if exec_dir_basename and exec_dir_basename != execution_id: + logger.warning( + "Timing-file LID '%s' does not match execution directory '%s'. " + "Using timing-file LID as execution_id.", + execution_id, + exec_dir_basename, + ) diff --git a/backend/app/features/ingestion/parsers/types.py b/backend/app/features/ingestion/parsers/types.py new file mode 100644 index 00000000..cc7d43b9 --- /dev/null +++ b/backend/app/features/ingestion/parsers/types.py @@ -0,0 +1,30 @@ +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ParsedSimulation: + """Archive-derived metadata for one parsed execution run.""" + + execution_dir: str + execution_id: str + case_name: str | None + case_group: str | None + machine: str | None + hpc_username: str | None + compset: str | None + compset_alias: str | None + grid_name: str | None + grid_resolution: str | None + campaign: str | None + experiment_type: str | None + initialization_type: str | None + simulation_start_date: str | None + simulation_end_date: str | None + run_start_date: str | None + run_end_date: str | None + compiler: str | None + git_repository_url: str | None + git_branch: str | None + git_tag: str | None + git_commit_hash: str | None + status: str | None diff --git a/backend/app/features/simulation/schemas.py b/backend/app/features/simulation/schemas.py index 15641502..f4772cd7 100644 --- a/backend/app/features/simulation/schemas.py +++ b/backend/app/features/simulation/schemas.py @@ -110,7 +110,7 @@ class SimulationCreate(CamelInBaseModel): ..., description=( "Unique identifier for this execution, derived from the " - "archive directory name (e.g. 1125772.260116-181605)" + "timing-file LID (e.g. 
1125772.260116-181605)" ), ), ] @@ -293,8 +293,7 @@ class SimulationSummaryOut(CamelOutBaseModel): Field( ..., description=( - "Unique identifier for this execution, derived from the " - "archive directory name" + "Unique identifier for this execution, derived from the timing-file LID" ), ), ] @@ -390,8 +389,7 @@ class SimulationOut(CamelOutBaseModel): Field( ..., description=( - "Unique identifier for this execution, derived from the " - "archive directory name" + "Unique identifier for this execution, derived from the timing-file LID" ), ), ] diff --git a/backend/tests/features/ingestion/parsers/test_parser.py b/backend/tests/features/ingestion/parsers/test_parser.py index 9bb707e3..64371a5b 100644 --- a/backend/tests/features/ingestion/parsers/test_parser.py +++ b/backend/tests/features/ingestion/parsers/test_parser.py @@ -6,6 +6,7 @@ import tarfile import zipfile from contextlib import contextmanager +from copy import deepcopy from pathlib import Path from typing import Any, Generator from unittest.mock import patch @@ -13,6 +14,7 @@ import pytest from app.features.ingestion.parsers import parser +from app.features.ingestion.parsers.types import ParsedSimulation class TestMainParser: @@ -104,27 +106,54 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: } defaults.update(kwargs) - with ( - patch("app.features.ingestion.parsers.parser.parse_env_case") as m1, - patch("app.features.ingestion.parsers.parser.parse_env_build") as m2, - patch("app.features.ingestion.parsers.parser.parse_env_run") as m3, - patch("app.features.ingestion.parsers.parser.parse_readme_case") as m4, - patch("app.features.ingestion.parsers.parser.parse_e3sm_timing") as m5, - patch("app.features.ingestion.parsers.parser.parse_run_artifacts") as m6, - patch("app.features.ingestion.parsers.parser.parse_git_describe") as m7, - patch("app.features.ingestion.parsers.parser.parse_git_config") as m8, - patch("app.features.ingestion.parsers.parser.parse_git_status") as m9, - 
): - m1.return_value = defaults["parse_env_case"] - m2.return_value = defaults["parse_env_build"] - m3.return_value = defaults["parse_env_run"] - m4.return_value = defaults["parse_readme_case"] - m5.return_value = defaults["parse_e3sm_timing"] - m6.return_value = defaults["parse_run_artifacts"] - m7.return_value = defaults["parse_git_describe"] - m8.return_value = defaults["parse_git_config"] - m9.return_value = defaults["parse_git_status"] - yield + original_file_specs = deepcopy(parser.FILE_SPECS) + + try: + parser.FILE_SPECS["case_docs_env_case"]["parser"] = lambda _path: defaults[ + "parse_env_case" + ] + parser.FILE_SPECS["case_docs_env_build"]["parser"] = lambda _path: defaults[ + "parse_env_build" + ] + parser.FILE_SPECS["case_docs_env_run"]["parser"] = lambda _path: defaults[ + "parse_env_run" + ] + parser.FILE_SPECS["readme_case"]["parser"] = lambda _path: defaults[ + "parse_readme_case" + ] + parser.FILE_SPECS["e3sm_timing"]["parser"] = lambda _path: defaults[ + "parse_e3sm_timing" + ] + parser.FILE_SPECS["git_describe"]["parser"] = lambda _path: defaults[ + "parse_git_describe" + ] + parser.FILE_SPECS["git_config"]["parser"] = lambda _path: ( + defaults["parse_git_config"] + if isinstance(defaults["parse_git_config"], dict) + else ( + {"git_repository_url": defaults["parse_git_config"]} + if defaults["parse_git_config"] + else {} + ) + ) + parser.FILE_SPECS["git_status"]["parser"] = lambda _path: ( + defaults["parse_git_status"] + if isinstance(defaults["parse_git_status"], dict) + else ( + {"git_branch": defaults["parse_git_status"]} + if defaults["parse_git_status"] + else {} + ) + ) + + with patch( + "app.features.ingestion.parsers.parser.parse_run_artifacts", + return_value=defaults["parse_run_artifacts"], + ): + yield + finally: + parser.FILE_SPECS.clear() + parser.FILE_SPECS.update(original_file_specs) def test_file_specs_remove_case_status_and_add_env_run(self) -> None: assert "case_status" not in parser.FILE_SPECS @@ -150,7 +179,8 @@ def 
test_with_valid_zip_archive(self, tmp_path: Path) -> None: assert len(result) > 0 assert skipped == 0 - assert any("1.0-0" in key for key in result.keys()) + assert isinstance(result[0], ParsedSimulation) + assert any("1.0-0" in parsed.execution_dir for parsed in result) def test_with_tar_gz_archive(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" @@ -169,7 +199,7 @@ def test_with_tar_gz_archive(self, tmp_path: Path) -> None: assert len(result) > 0 assert skipped == 0 - assert any("2.5-10" in key for key in result.keys()) + assert any("2.5-10" in parsed.execution_dir for parsed in result) def test_with_multiple_executions(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" @@ -227,7 +257,7 @@ def test_missing_required_files_skips_incomplete_run(self, tmp_path: Path) -> No result, skipped = parser.main_parser(archive_path, extract_dir) - assert result == {} + assert result == [] assert skipped == 1 def test_multiple_matching_timing_files_raises_error(self, tmp_path: Path) -> None: @@ -325,7 +355,7 @@ def test_missing_env_run_skips_incomplete_run(self, tmp_path: Path) -> None: result, skipped = parser.main_parser(archive_path, extract_dir) - assert result == {} + assert result == [] assert skipped == 1 def test_completed_status_is_merged(self, tmp_path: Path) -> None: @@ -337,7 +367,52 @@ def test_completed_status_is_merged(self, tmp_path: Path) -> None: result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") assert skipped == 0 - assert next(iter(result.values()))["status"] == "completed" + assert result[0].status == "completed" + + def test_missing_timing_lid_skips_incomplete_run(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(execution_dir, "001.001") + + with self._mock_all_parsers( + parse_e3sm_timing={ + "execution_id": None, + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": 
"2025-12-18T20:54:58", + } + ): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert result == [] + assert skipped == 1 + + def test_mismatched_timing_lid_logs_warning(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(execution_dir, "001.001") + + with ( + self._mock_all_parsers( + parse_e3sm_timing={ + "execution_id": "different-lid", + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": "2025-12-18T20:54:58", + } + ), + patch( + "app.features.ingestion.parsers.parser.logger.warning" + ) as mock_warning, + ): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert result[0].execution_id == "different-lid" + mock_warning.assert_any_call( + "Timing-file LID '%s' does not match execution directory '%s'. " + "Using timing-file LID as execution_id.", + "different-lid", + "1.0-0", + ) def test_missing_timing_skips_incomplete_run(self, tmp_path: Path) -> None: execution_dir = tmp_path / "1.0-0" @@ -350,7 +425,7 @@ def test_missing_timing_skips_incomplete_run(self, tmp_path: Path) -> None: result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") - assert result == {} + assert result == [] assert skipped == 1 def test_zip_path_traversal_rejected(self, tmp_path: Path) -> None: diff --git a/backend/tests/features/ingestion/test_ingest.py b/backend/tests/features/ingestion/test_ingest.py index ebc207e2..a84d94a2 100644 --- a/backend/tests/features/ingestion/test_ingest.py +++ b/backend/tests/features/ingestion/test_ingest.py @@ -3,17 +3,17 @@ from unittest.mock import MagicMock, patch from uuid import UUID, uuid4 -import pytest from dateutil import parser as real_dateutil_parser from sqlalchemy.orm import Session from app.features.ingestion.ingest import ( - _derive_execution_id, _get_canonical_metadata_for_case, _get_or_create_case, _normalize_git_url, 
_normalize_simulation_status, _normalize_simulation_type, + _parsed_simulation_to_config_projection, + _simulation_to_config_projection, ingest_archive, ) from app.features.ingestion.models import ( @@ -21,6 +21,7 @@ IngestionSourceType, IngestionStatus, ) +from app.features.ingestion.parsers.types import ParsedSimulation from app.features.machine.models import Machine from app.features.simulation.enums import SimulationStatus, SimulationType from app.features.simulation.models import Case, Simulation @@ -28,6 +29,53 @@ from app.features.user.models import User +def _parsed_simulations_from_mapping( + simulations_by_dir: dict[str, dict[str, str | None]], +) -> list[ParsedSimulation]: + parsed_simulations: list[ParsedSimulation] = [] + + for execution_dir, metadata in simulations_by_dir.items(): + parsed_simulations.append( + ParsedSimulation( + execution_dir=execution_dir, + execution_id=_require_execution_id(metadata, execution_dir), + case_name=metadata.get("case_name"), + case_group=metadata.get("case_group"), + machine=metadata.get("machine"), + hpc_username=metadata.get("hpc_username") or metadata.get("user"), + compset=metadata.get("compset"), + compset_alias=metadata.get("compset_alias"), + grid_name=metadata.get("grid_name"), + grid_resolution=metadata.get("grid_resolution"), + campaign=metadata.get("campaign"), + experiment_type=metadata.get("experiment_type"), + initialization_type=metadata.get("initialization_type"), + simulation_start_date=metadata.get("simulation_start_date"), + simulation_end_date=metadata.get("simulation_end_date"), + run_start_date=metadata.get("run_start_date"), + run_end_date=metadata.get("run_end_date"), + compiler=metadata.get("compiler"), + git_repository_url=metadata.get("git_repository_url"), + git_branch=metadata.get("git_branch"), + git_tag=metadata.get("git_tag"), + git_commit_hash=metadata.get("git_commit_hash"), + status=metadata.get("status"), + ) + ) + + return parsed_simulations + + +def 
_require_execution_id(metadata: dict[str, str | None], execution_dir: str) -> str: + execution_id = metadata.get("execution_id") + if not execution_id: + raise AssertionError( + f"Mocked ParsedSimulation for '{execution_dir}' must define execution_id" + ) + + return execution_id + + class TestIngestArchive: """Tests for the ingest_archive public API. @@ -61,6 +109,7 @@ def test_returns_list_of_simulation_create(self, db: Session) -> None: mock_simulations = { "/path/to/1081156.251218-200923": { + "execution_id": "1081156.251218-200923", "case_name": "case1", "compset": "FHIST", "compset_alias": "test_alias", @@ -87,7 +136,7 @@ def test_returns_list_of_simulation_create(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -108,6 +157,7 @@ def test_handles_multiple_simulations(self, db: Session) -> None: mock_simulations = { "/path/to/1081157.251218-200924": { + "execution_id": "1081157.251218-200924", "case_name": "case1", "compset": "FHIST", "compset_alias": "test_alias", @@ -131,6 +181,7 @@ def test_handles_multiple_simulations(self, db: Session) -> None: "last_updated_by": None, }, "/path/to/1081158.251218-200925": { + "execution_id": "1081158.251218-200925", "case_name": "case2", "compset": "FHIST", "compset_alias": "test_alias", @@ -157,7 +208,7 @@ def test_handles_multiple_simulations(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -169,7 +220,7 @@ def test_handles_multiple_simulations(self, db: Session) -> None: def test_returns_empty_list_for_empty_archive(self, db: Session) -> None: """Test that empty 
archive returns empty list.""" - with patch("app.features.ingestion.ingest.main_parser", return_value=({}, 0)): + with patch("app.features.ingestion.ingest.main_parser", return_value=([], 0)): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db ) @@ -183,6 +234,7 @@ def test_accepts_string_paths(self, db: Session) -> None: mock_simulations = { "/path/to/1081159.251218-200926": { + "execution_id": "1081159.251218-200926", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -209,7 +261,7 @@ def test_accepts_string_paths(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ) as mock_main_parser: ingest_result = ingest_archive("/tmp/archive.zip", "/tmp/out", db) @@ -224,6 +276,7 @@ def test_propagates_mapping_errors(self, db: Session) -> None: """Test that mapping errors are collected and ingestion continues.""" mock_simulations = { "/path/to/1081160.251218-200927": { + "execution_id": "1081160.251218-200927", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -250,7 +303,7 @@ def test_propagates_mapping_errors(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -282,6 +335,7 @@ def test_parses_various_datetime_formats_through_public_api( for idx, date_str in enumerate(test_cases): mock_simulations = { f"/path/to/108200{idx}.251218-200900": { + "execution_id": f"108200{idx}.251218-200900", "case_name": f"case1_{date_str}", "compset": "test", "compset_alias": "test_alias", @@ -308,7 +362,7 @@ def test_parses_various_datetime_formats_through_public_api( with patch( "app.features.ingestion.ingest.main_parser", - 
return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -324,6 +378,7 @@ def test_missing_required_fields_raise_validation_error(self, db: Session) -> No mock_simulations = { "/path/to/1081170.251218-200930": { + "execution_id": "1081170.251218-200930", "case_name": None, "compset": None, "compset_alias": "test_alias", @@ -350,7 +405,7 @@ def test_missing_required_fields_raise_validation_error(self, db: Session) -> No with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -374,6 +429,7 @@ def test_machine_lookup_and_validation_through_public_api( # Create valid simulation valid_mock = { "/path/to/1081171.251218-200931": { + "execution_id": "1081171.251218-200931", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -399,7 +455,8 @@ def test_machine_lookup_and_validation_through_public_api( } with patch( - "app.features.ingestion.ingest.main_parser", return_value=(valid_mock, 0) + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(valid_mock), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -411,6 +468,7 @@ def test_machine_lookup_and_validation_through_public_api( # Test with missing machine invalid_mock = { "/path/to/1081172.251218-200932": { + "execution_id": "1081172.251218-200932", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -436,7 +494,8 @@ def test_machine_lookup_and_validation_through_public_api( } with patch( - "app.features.ingestion.ingest.main_parser", return_value=(invalid_mock, 0) + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(invalid_mock), 
0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -478,6 +537,7 @@ def test_timezone_aware_datetime_parsing_through_public_api( mock_simulations = { "/path/to/1081173.251218-200933": { + "execution_id": "1081173.251218-200933", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -504,7 +564,7 @@ def test_timezone_aware_datetime_parsing_through_public_api( with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -528,6 +588,7 @@ def test_handles_optional_fields_through_public_api(self, db: Session) -> None: mock_simulations = { "/path/to/1081174.251218-200934": { + "execution_id": "1081174.251218-200934", "case_name": "case1", "compset": "FHIST", "compset_alias": "FHIST_f09_fe", @@ -555,7 +616,7 @@ def test_handles_optional_fields_through_public_api(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -635,6 +696,7 @@ def test_skips_duplicate_simulations(self, db: Session) -> None: # Try to ingest a simulation with the same execution_id mock_simulations = { "/path/to/1081175.251218-200935": { + "execution_id": "1081175.251218-200935", "case_name": "existing_case", "compset": "FHIST", "compset_alias": "test_alias", @@ -661,7 +723,7 @@ def test_skips_duplicate_simulations(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -718,6 +780,7 @@ def 
test_ingest_archive_counts(self, db: Session) -> None: mock_simulations = { "/path/to/1081176.251218-200936": { + "execution_id": "1081176.251218-200936", "case_name": "existing_case", "compset": "FHIST", "compset_alias": "test_alias", @@ -741,6 +804,7 @@ def test_ingest_archive_counts(self, db: Session) -> None: "last_updated_by": None, }, "/path/to/1081177.251218-200937": { + "execution_id": "1081177.251218-200937", "case_name": "new_case", "compset": "FHIST", "compset_alias": "test_alias", @@ -767,7 +831,7 @@ def test_ingest_archive_counts(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -780,7 +844,7 @@ def test_ingest_archive_counts(self, db: Session) -> None: def test_ingest_archive_empty_archive(self, db: Session) -> None: """Test summary counts when the archive contains no simulations.""" - with patch("app.features.ingestion.ingest.main_parser", return_value=({}, 0)): + with patch("app.features.ingestion.ingest.main_parser", return_value=([], 0)): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db ) @@ -801,6 +865,7 @@ def test_handles_invalid_datetime_gracefully(self, db: Session) -> None: # This will be parsed but not raise an error mock_simulations = { "/path/to/1081178.251218-200938": { + "execution_id": "1081178.251218-200938", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -827,7 +892,7 @@ def test_handles_invalid_datetime_gracefully(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -850,6 +915,7 @@ def 
test_parse_datetime_field_exception_handling(self, db: Session) -> None: # This ensures we exercise the except block in _parse_datetime_field mock_simulations = { "/path/to/1081179.251218-200939": { + "execution_id": "1081179.251218-200939", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -885,7 +951,7 @@ def mock_parse_wrapper(date_str, *args, **kwargs): with ( patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ), patch( "app.features.ingestion.ingest.dateutil_parser.parse", @@ -904,6 +970,7 @@ def test_missing_machine_name_in_metadata(self, db: Session) -> None: """Test error handling when machine name is missing from metadata.""" mock_simulations = { "/path/to/1081180.251218-200940": { + "execution_id": "1081180.251218-200940", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -930,7 +997,7 @@ def test_missing_machine_name_in_metadata(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -947,6 +1014,7 @@ def test_missing_simulation_start_date(self, db: Session) -> None: mock_simulations = { "/path/to/1081181.251218-200941": { + "execution_id": "1081181.251218-200941", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -973,7 +1041,7 @@ def test_missing_simulation_start_date(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -1055,6 +1123,7 @@ def test_ssh_url_conversion_integrated_in_ingest(self, db: Session) -> None: # SSH URL in 
metadata mock_simulations = { "/path/to/1081182.251218-200942": { + "execution_id": "1081182.251218-200942", "case_name": "case1", "compset": "FHIST", "compset_alias": "test_alias", @@ -1081,7 +1150,7 @@ def test_ssh_url_conversion_integrated_in_ingest(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -1137,6 +1206,7 @@ def _create_machine(db: Session, name: str) -> Machine: @staticmethod def _make_metadata( + execution_id: str, case_name: str = "case1", machine: str = "test-machine", simulation_start_date: str = "2020-01-01", @@ -1144,6 +1214,7 @@ def _make_metadata( ) -> dict[str, str | None]: """Build a complete simulation metadata dict with sensible defaults.""" base: dict[str, str | None] = { + "execution_id": execution_id, "case_name": case_name, "compset": "FHIST", "compset_alias": "test_alias", @@ -1177,10 +1248,12 @@ def test_canonical_run_selected_from_multiple_runs(self, db: Session) -> None: # Two runs with the same case_name but different compilers mock_simulations = { "/path/to/1081183.251218-200943": self._make_metadata( + execution_id="1081183.251218-200943", simulation_start_date="2020-01-01", compiler="gcc-11", ), "/path/to/1081184.251218-200944": self._make_metadata( + execution_id="1081184.251218-200944", simulation_start_date="2020-06-01", compiler="gcc-12", ), @@ -1188,7 +1261,7 @@ def test_canonical_run_selected_from_multiple_runs(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1209,10 +1282,12 @@ def test_config_delta_stored_for_non_canonical_run(self, db: Session) -> None: mock_simulations = { 
"/path/to/1081185.251218-200945": self._make_metadata( + execution_id="1081185.251218-200945", simulation_start_date="2020-01-01", compiler="gcc-11", ), "/path/to/1081186.251218-200946": self._make_metadata( + execution_id="1081186.251218-200946", simulation_start_date="2020-06-01", compiler="gcc-12", ), @@ -1220,7 +1295,7 @@ def test_config_delta_stored_for_non_canonical_run(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1245,16 +1320,18 @@ def test_no_delta_when_configs_identical(self, db: Session) -> None: mock_simulations = { "/path/to/1081187.251218-200947": self._make_metadata( + execution_id="1081187.251218-200947", simulation_start_date="2020-01-01", ), "/path/to/1081188.251218-200948": self._make_metadata( + execution_id="1081188.251218-200948", simulation_start_date="2020-06-01", ), } with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1271,16 +1348,18 @@ def test_different_case_names_create_separate_simulations( mock_simulations = { "/path/to/1081189.251218-200949": self._make_metadata( + execution_id="1081189.251218-200949", case_name="case_alpha", ), "/path/to/1081190.251218-200950": self._make_metadata( + execution_id="1081190.251218-200950", case_name="case_beta", ), } with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1344,13 +1423,14 @@ def test_idempotent_reingestion(self, db: Session) -> None: # Now re-ingest with the same execution_id mock_simulations = { 
"/path/to/1081191.251218-200951": self._make_metadata( + execution_id="1081191.251218-200951", simulation_start_date="2020-01-01", ), } with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1414,10 +1494,12 @@ def test_incremental_ingestion_new_run_adds_delta(self, db: Session) -> None: # Ingest archive containing the existing run plus a new one mock_simulations = { "/path/to/1081192.251218-200952": self._make_metadata( + execution_id="1081192.251218-200952", simulation_start_date="2020-01-01", compiler="gcc-11", ), "/path/to/1081193.251218-200953": self._make_metadata( + execution_id="1081193.251218-200953", simulation_start_date="2020-06-01", compiler="gcc-12", ), @@ -1425,7 +1507,7 @@ def test_incremental_ingestion_new_run_adds_delta(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1440,15 +1522,165 @@ def test_incremental_ingestion_new_run_adds_delta(self, db: Session) -> None: assert new_sim.run_config_deltas["compiler"]["canonical"] == "gcc-11" assert new_sim.run_config_deltas["compiler"]["current"] == "gcc-12" + def test_incremental_ingestion_normalizes_git_url_for_delta( + self, db: Session + ) -> None: + """Equivalent SSH/HTTPS git URLs should not produce a config delta.""" + machine = self._create_machine(db, "test-machine") + + user = User( + email="test@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + 
case = Case(name="case1") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1081194.251218-200953", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.UNKNOWN, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + git_repository_url="https://github.com/E3SM-Project/E3SM.git", + ) + db.add(sim) + db.flush() + + assert sim.id is not None + case.canonical_simulation_id = sim.id + db.commit() + + mock_simulations = { + "/path/to/1081194.251218-200956": self._make_metadata( + execution_id="1081194.251218-200956", + simulation_start_date="2020-06-01", + git_repository_url="git@github.com:E3SM-Project/E3SM.git", + ), + } + + with patch( + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), + ): + result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) + + assert result.created_count == 1 + assert len(result.simulations) == 1 + assert result.simulations[0].run_config_deltas is None + + def test_incremental_ingestion_persisted_simulation_type_adds_delta( + self, db: Session + ) -> None: + """Persisted canonical simulation_type differences are reflected in deltas.""" + machine = self._create_machine(db, "test-machine") + + user = User( + email="test@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + case = Case(name="case1") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1081194.251218-200954", + compset="FHIST", + 
compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.PRODUCTION, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + ) + db.add(sim) + db.flush() + + assert sim.id is not None + case.canonical_simulation_id = sim.id + db.commit() + + mock_simulations = { + "/path/to/1081194.251218-200954": self._make_metadata( + execution_id="1081194.251218-200954", + simulation_start_date="2020-01-01", + ), + "/path/to/1081194.251218-200955": self._make_metadata( + execution_id="1081194.251218-200955", + simulation_start_date="2020-06-01", + ), + } + + with patch( + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), + ): + result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) + + assert result.duplicate_count == 1 + assert result.created_count == 1 + assert len(result.simulations) == 1 + assert result.simulations[0].run_config_deltas is not None + assert "simulation_type" in result.simulations[0].run_config_deltas + assert result.simulations[0].run_config_deltas["simulation_type"] == { + "canonical": SimulationType.PRODUCTION.value, + "current": SimulationType.UNKNOWN.value, + } + def test_same_case_name_groups_to_same_case(self, db: Session) -> None: """Runs with the same case_name belong to the same Case.""" self._create_machine(db, "test-machine") mock_simulations = { "/path/to/1081195.251218-200955": self._make_metadata( + execution_id="1081195.251218-200955", case_name="case1", ), "/path/to/1081196.251218-200956": self._make_metadata( + execution_id="1081196.251218-200956", case_name="case1", simulation_start_date="2020-06-01", ), @@ -1456,7 +1688,7 @@ def test_same_case_name_groups_to_same_case(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - 
return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1474,16 +1706,18 @@ def test_different_case_name_creates_separate_cases(self, db: Session) -> None: mock_simulations = { "/path/to/1081197.251218-200957": self._make_metadata( + execution_id="1081197.251218-200957", case_name="case_X", ), "/path/to/1081198.251218-200958": self._make_metadata( + execution_id="1081198.251218-200958", case_name="case_Y", ), } with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1493,10 +1727,6 @@ def test_different_case_name_creates_separate_cases(self, db: Session) -> None: class TestIngestHelpers: - def test_derive_execution_id_raises_on_empty_path(self) -> None: - with pytest.raises(ValueError, match="Cannot derive execution_id"): - _derive_execution_id("") - def test_get_or_create_case_sets_missing_case_group(self, db: Session) -> None: case = Case(name="case_group_test", case_group=None) db.add(case) @@ -1626,3 +1856,100 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( db=db, ) assert second == first + + def test_parsed_projection_keys_match_config_delta_fields(self) -> None: + parsed = ParsedSimulation( + execution_dir="/path/to/1082001.260305-120001", + execution_id="1082001.260305-120001", + case_name="case1", + case_group=None, + machine="machine", + hpc_username=None, + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign="campaign", + experiment_type="historical", + initialization_type="test", + simulation_start_date="2020-01-01", + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler="gcc", + git_repository_url="https://example.com/repo.git", + 
git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + status="completed", + ) + + projection = _parsed_simulation_to_config_projection(parsed) + + assert set(projection) == Simulation.CONFIG_DELTA_FIELDS + assert projection["simulation_type"] == SimulationType.UNKNOWN.value + + def test_simulation_projection_keys_match_config_delta_fields( + self, db: Session + ) -> None: + machine = Machine( + name="projection-machine", + site="Test Site", + architecture="x86_64", + scheduler="SLURM", + gpu=False, + ) + db.add(machine) + + user = User( + email="projection@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + case = Case(name="projection_case") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1082002.260305-120002", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.TEST, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + campaign="campaign", + experiment_type="historical", + compiler="gcc", + git_repository_url="https://example.com/repo.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + ) + db.add(sim) + db.commit() + + projection = _simulation_to_config_projection(sim) + + assert set(projection) == Simulation.CONFIG_DELTA_FIELDS + assert projection["simulation_type"] == SimulationType.TEST.value From 977f1db48e77a46062399585788c0f7ea9295c81 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 14:41:04 -0700 Subject: [PATCH 06/10] Refactor ingest config snapshots --- 
backend/app/features/ingestion/ingest.py | 293 ++++++++---------- .../app/features/simulation/config_delta.py | 44 +++ backend/app/features/simulation/models.py | 19 +- .../tests/features/ingestion/test_ingest.py | 243 ++++++++++++++- 4 files changed, 408 insertions(+), 191 deletions(-) create mode 100644 backend/app/features/simulation/config_delta.py diff --git a/backend/app/features/ingestion/ingest.py b/backend/app/features/ingestion/ingest.py index 3e5f65db..31704064 100644 --- a/backend/app/features/ingestion/ingest.py +++ b/backend/app/features/ingestion/ingest.py @@ -31,12 +31,12 @@ from app.features.ingestion.parsers.parser import main_parser from app.features.ingestion.parsers.types import ParsedSimulation from app.features.machine.models import Machine +from app.features.simulation.config_delta import SimulationConfigSnapshot from app.features.simulation.enums import SimulationStatus, SimulationType from app.features.simulation.models import Case, Simulation from app.features.simulation.schemas import SimulationCreate logger = _setup_custom_logger(__name__) -ConfigProjection = dict[str, str | None] @dataclass @@ -66,6 +66,37 @@ class IngestArchiveResult: errors: list[dict[str, str]] = field(default_factory=list) +@dataclass(frozen=True) +class SimulationCreateDraft: + """Normalized internal payload validated into ``SimulationCreate``.""" + + case_id: UUID + execution_id: str + compset: str | None + compset_alias: str | None + grid_name: str | None + grid_resolution: str | None + simulation_type: SimulationType + status: SimulationStatus + campaign: str | None + experiment_type: str | None + initialization_type: str | None + machine_id: UUID + simulation_start_date: datetime | None + simulation_end_date: datetime | None + run_start_date: datetime | None + run_end_date: datetime | None + compiler: str | None + git_repository_url: str | None + git_branch: str | None + git_tag: str | None + git_commit_hash: str | None + created_by: UUID | None + 
last_updated_by: UUID | None + hpc_username: str | None + run_config_deltas: dict[str, dict[str, str | None]] | None = None + + def ingest_archive( archive_path: Path | str, output_dir: Path | str, @@ -132,8 +163,8 @@ def ingest_archive( simulations: list[SimulationCreate] = [] duplicate_count = 0 errors: list[dict[str, str]] = [] - canonical_cache: dict[str, ConfigProjection] = {} - persisted_canonical_cache: dict[UUID, ConfigProjection | None] = {} + canonical_cache: dict[str, SimulationConfigSnapshot] = {} + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None] = {} for parsed_simulation in parsed_simulations: try: @@ -181,8 +212,8 @@ def ingest_archive( def _process_simulation_for_ingest( parsed_simulation: ParsedSimulation, db: Session, - canonical_cache: dict[str, ConfigProjection], - persisted_canonical_cache: dict[UUID, ConfigProjection | None], + canonical_cache: dict[str, SimulationConfigSnapshot], + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None], ) -> tuple[SimulationCreate | None, bool]: """Process one parsed simulation entry. @@ -192,9 +223,9 @@ def _process_simulation_for_ingest( Parsed archive-derived metadata for the simulation. db : Session Active database session for lookups and case resolution. - canonical_cache : dict[str, ConfigProjection] + canonical_cache : dict[str, SimulationConfigSnapshot] In-memory cache of canonical config values per case_name for the current batch. - persisted_canonical_cache : dict[UUID, ConfigProjection | None] + persisted_canonical_cache : dict[UUID, SimulationConfigSnapshot | None] Cache of canonical metadata loaded from the database by case_id. 
Returns @@ -270,21 +301,19 @@ def _is_duplicate_simulation( def _seed_canonical_cache_from_duplicate( case_name: str, parsed_simulation: ParsedSimulation, - canonical_cache: dict[str, ConfigProjection], + canonical_cache: dict[str, SimulationConfigSnapshot], ) -> None: """Seed per-case canonical cache using duplicate metadata when needed.""" if case_name not in canonical_cache: - canonical_cache[case_name] = _parsed_simulation_to_config_projection( - parsed_simulation - ) + canonical_cache[case_name] = _build_config_snapshot(parsed_simulation) def _build_simulation_create( parsed_simulation: ParsedSimulation, machine_id: UUID, case: Case, - canonical_cache: dict[str, ConfigProjection], - persisted_canonical_cache: dict[UUID, ConfigProjection | None], + canonical_cache: dict[str, SimulationConfigSnapshot], + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None], db: Session, ) -> SimulationCreate: """Create a SimulationCreate using canonical baseline semantics. @@ -297,15 +326,15 @@ def _build_simulation_create( Resolved machine ID from the database. case : Case Resolved Case object for this simulation. - canonical_cache : dict[str, ConfigProjection] + canonical_cache : dict[str, SimulationConfigSnapshot] In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, ConfigProjection | None] + persisted_canonical_cache : dict[UUID, SimulationConfigSnapshot | None] Cache of canonical metadata loaded from the database by case_id. db : Session Active database session for lookups and case resolution. 
""" case_name = case.name - canonical_metadata = _get_canonical_metadata_for_case( + canonical_snapshot = _get_canonical_metadata_for_case( case=case, case_name=case_name, canonical_cache=canonical_cache, @@ -313,13 +342,15 @@ def _build_simulation_create( db=db, ) - if canonical_metadata is None: - canonical_cache[case_name] = _parsed_simulation_to_config_projection( - parsed_simulation - ) + if canonical_snapshot is None: + canonical_cache[case_name] = _build_config_snapshot(parsed_simulation) - simulation = _map_parsed_simulation_to_schema( - parsed_simulation, machine_id, case.id + simulation = _validate_simulation_create( + _build_simulation_create_draft( + parsed_simulation=parsed_simulation, + machine_id=machine_id, + case_id=case.id, + ) ) logger.info( "Mapped canonical simulation from %s: %s", @@ -329,18 +360,15 @@ def _build_simulation_create( return simulation - delta = _compute_config_delta( - canonical_metadata, - _parsed_simulation_to_config_projection(parsed_simulation), - ) + delta = canonical_snapshot.diff(_build_config_snapshot(parsed_simulation)) run_config_deltas = delta if delta else None - - simulation = _map_parsed_simulation_to_schema( - parsed_simulation, - machine_id, - case.id, + simulation_draft = _build_simulation_create_draft( + parsed_simulation=parsed_simulation, + machine_id=machine_id, + case_id=case.id, run_config_deltas=run_config_deltas, ) + simulation = _validate_simulation_create(simulation_draft) if delta: logger.info( @@ -360,10 +388,10 @@ def _build_simulation_create( def _get_canonical_metadata_for_case( case: Case, case_name: str, - canonical_cache: dict[str, ConfigProjection], - persisted_canonical_cache: dict[UUID, ConfigProjection | None], + canonical_cache: dict[str, SimulationConfigSnapshot], + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None], db: Session, -) -> ConfigProjection | None: +) -> SimulationConfigSnapshot | None: """Resolve canonical metadata from persisted canonical or batch cache. 
This function is useful for ensuring that all simulations of the same case @@ -375,15 +403,15 @@ def _get_canonical_metadata_for_case( The Case object for which to retrieve canonical metadata. case_name : str The name of the case, used for in-memory cache lookup. - canonical_cache : dict[str, ConfigProjection] + canonical_cache : dict[str, SimulationConfigSnapshot] In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, ConfigProjection | None] + persisted_canonical_cache : dict[UUID, SimulationConfigSnapshot | None] Cache of canonical metadata loaded from the database by case_id. Returns ------- - ConfigProjection | None - The canonical metadata for the case, or None if no canonical run exists. + SimulationConfigSnapshot | None + The canonical config snapshot for the case, or None if no canonical run exists. """ if case.canonical_simulation_id is not None: if case.id in persisted_canonical_cache: @@ -396,10 +424,10 @@ def _get_canonical_metadata_for_case( ) if canonical_sim: - canonical_metadata = _simulation_to_config_projection(canonical_sim) - persisted_canonical_cache[case.id] = canonical_metadata + canonical_snapshot = _build_config_snapshot(canonical_sim) + persisted_canonical_cache[case.id] = canonical_snapshot - return canonical_metadata + return canonical_snapshot persisted_canonical_cache[case.id] = None return None @@ -449,44 +477,30 @@ def _get_or_create_case(db: Session, name: str, case_group: str | None = None) - return case -def _parsed_simulation_to_config_projection( - parsed_simulation: ParsedSimulation, -) -> ConfigProjection: - """Return config-comparison fields from a parsed simulation.""" - return { - "compset": parsed_simulation.compset, - "compset_alias": parsed_simulation.compset_alias, - "grid_name": parsed_simulation.grid_name, - "grid_resolution": parsed_simulation.grid_resolution, - "initialization_type": parsed_simulation.initialization_type, - "compiler": 
parsed_simulation.compiler, - "git_tag": parsed_simulation.git_tag, - "git_commit_hash": parsed_simulation.git_commit_hash, - "git_branch": parsed_simulation.git_branch, - "git_repository_url": _normalize_git_url(parsed_simulation.git_repository_url), - "campaign": parsed_simulation.campaign, - "experiment_type": parsed_simulation.experiment_type, - "simulation_type": SimulationType.UNKNOWN.value, - } - - -def _simulation_to_config_projection(sim: Simulation) -> ConfigProjection: - """Return config-comparison fields from a persisted Simulation.""" - return { - "compset": sim.compset, - "compset_alias": sim.compset_alias, - "grid_name": sim.grid_name, - "grid_resolution": sim.grid_resolution, - "initialization_type": sim.initialization_type, - "compiler": sim.compiler, - "git_tag": sim.git_tag, - "git_commit_hash": sim.git_commit_hash, - "git_branch": sim.git_branch, - "git_repository_url": _stringify_config_value(sim.git_repository_url), - "campaign": sim.campaign, - "experiment_type": _stringify_config_value(sim.experiment_type), - "simulation_type": _stringify_config_value(sim.simulation_type), - } +def _build_config_snapshot( + source: ParsedSimulation | Simulation, +) -> SimulationConfigSnapshot: + """Return a normalized config snapshot for canonical delta comparison.""" + snapshot_values: dict[str, str | None] = {} + + for field_name in SimulationConfigSnapshot.field_names(): + if field_name == "simulation_type" and isinstance(source, ParsedSimulation): + snapshot_values[field_name] = SimulationType.UNKNOWN.value + continue + + value = getattr(source, field_name) + + if isinstance(source, ParsedSimulation): + normalized_value = value + else: + normalized_value = _stringify_config_value(value) + + if field_name == "git_repository_url": + normalized_value = _normalize_git_url(normalized_value) + + snapshot_values[field_name] = normalized_value + + return SimulationConfigSnapshot(**snapshot_values) def _stringify_config_value(value: object) -> str | None: @@ 
-504,39 +518,6 @@ def _stringify_config_value(value: object) -> str | None: return str(value) -def _compute_config_delta( - canonical: ConfigProjection, - other: ConfigProjection, -) -> dict[str, dict[str, str | None]]: - """Compare two run metadata dicts and return configuration differences. - - Only the fields in :data:`_CONFIG_DELTA_FIELDS` are compared. - - Parameters - ---------- - canonical : ConfigProjection - The canonical run metadata to compare against. - other : ConfigProjection - The other run metadata to compare. - - Returns - ------- - dict[str, dict[str, str | None]] - Mapping of field name → ``{"canonical": ..., "current": ...}`` - for every field that differs. Empty dict when runs are - identical. - """ - delta: dict[str, dict[str, str | None]] = {} - - for key in Simulation.CONFIG_DELTA_FIELDS: - canonical_val = canonical.get(key) - other_val = other.get(key) - if canonical_val != other_val: - delta[key] = {"canonical": canonical_val, "current": other_val} - - return delta - - def _resolve_machine_id(metadata: ParsedSimulation, db: Session) -> UUID: """Resolve machine name to machine ID from the database. @@ -638,13 +619,13 @@ def _normalize_git_url(url: str | None) -> str | None: return url -def _map_parsed_simulation_to_schema( +def _build_simulation_create_draft( parsed_simulation: ParsedSimulation, machine_id: UUID, case_id: UUID, run_config_deltas: dict[str, dict[str, str | None]] | None = None, -) -> SimulationCreate: - """Map parser metadata to SimulationCreate schema with type conversions. +) -> SimulationCreateDraft: + """Build a normalized internal draft for ``SimulationCreate`` validation. Parameters ---------- @@ -659,11 +640,10 @@ def _map_parsed_simulation_to_schema( Returns ------- - SimulationCreate - Schema object ready for database insertion. + SimulationCreateDraft + Typed ingest draft ready for schema validation. 
""" - # Parse datetime fields using the shared utility function - # Note: simulation_start_date is already validated in _extract_simulation_key() + # Parse datetime fields using the shared utility function. simulation_start_date = _parse_datetime_field( parsed_simulation.simulation_start_date ) @@ -676,47 +656,44 @@ def _map_parsed_simulation_to_schema( simulation_type = _normalize_simulation_type(None) status = _normalize_simulation_status(parsed_simulation.status) - # Map metadata to schema; Pydantic will validate required fields - # Note: SimulationCreate uses CamelInBaseModel which expects camelCase field names - result = SimulationCreate.model_validate( - { - # Required identification fields - "caseId": case_id, - "executionId": parsed_simulation.execution_id, - # Required configuration fields - "compset": parsed_simulation.compset, - "compsetAlias": parsed_simulation.compset_alias, - "gridName": parsed_simulation.grid_name, - "gridResolution": parsed_simulation.grid_resolution, - # Required status fields with sensible defaults - "simulationType": simulation_type, - "status": status, - "initializationType": parsed_simulation.initialization_type, - "machineId": machine_id, - "simulationStartDate": simulation_start_date, - "simulationEndDate": simulation_end_date, - "experimentType": parsed_simulation.experiment_type, - "campaign": parsed_simulation.campaign, - "runStartDate": run_start_date, - "runEndDate": run_end_date, - "compiler": parsed_simulation.compiler, - "gitRepositoryUrl": git_repository_url, - "gitBranch": parsed_simulation.git_branch, - "gitTag": parsed_simulation.git_tag, - "gitCommitHash": parsed_simulation.git_commit_hash, - # Note: created_by and last_updated_by are set to None since archive - # metadata contains local usernames that cannot be reliably mapped to - # database user UUIDs. The API endpoint will set these values based on - # the authenticated user who uploaded the archive. 
- "createdBy": None, - "lastUpdatedBy": None, - "hpcUsername": parsed_simulation.hpc_username, - # Canonical run semantics - "runConfigDeltas": run_config_deltas, - } + simulation_draft = SimulationCreateDraft( + case_id=case_id, + execution_id=parsed_simulation.execution_id, + compset=parsed_simulation.compset, + compset_alias=parsed_simulation.compset_alias, + grid_name=parsed_simulation.grid_name, + grid_resolution=parsed_simulation.grid_resolution, + simulation_type=simulation_type, + status=status, + campaign=parsed_simulation.campaign, + experiment_type=parsed_simulation.experiment_type, + initialization_type=parsed_simulation.initialization_type, + machine_id=machine_id, + simulation_start_date=simulation_start_date, + simulation_end_date=simulation_end_date, + run_start_date=run_start_date, + run_end_date=run_end_date, + compiler=parsed_simulation.compiler, + git_repository_url=git_repository_url, + git_branch=parsed_simulation.git_branch, + git_tag=parsed_simulation.git_tag, + git_commit_hash=parsed_simulation.git_commit_hash, + created_by=None, + last_updated_by=None, + hpc_username=parsed_simulation.hpc_username, + run_config_deltas=run_config_deltas, ) - return result + return simulation_draft + + +def _validate_simulation_create(draft: SimulationCreateDraft) -> SimulationCreate: + """Validate a typed ingest draft into ``SimulationCreate``.""" + return SimulationCreate.model_validate( + draft, + by_name=True, + from_attributes=True, + ) def _normalize_simulation_type(value: str | None) -> SimulationType: diff --git a/backend/app/features/simulation/config_delta.py b/backend/app/features/simulation/config_delta.py new file mode 100644 index 00000000..7db24763 --- /dev/null +++ b/backend/app/features/simulation/config_delta.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass +from dataclasses import fields as dataclass_fields + + +@dataclass(frozen=True) +class SimulationConfigSnapshot: + """Normalized configuration fields used for canonical delta 
comparison.""" + + compset: str | None + compset_alias: str | None + grid_name: str | None + grid_resolution: str | None + initialization_type: str | None + compiler: str | None + git_tag: str | None + git_commit_hash: str | None + git_branch: str | None + git_repository_url: str | None + campaign: str | None + experiment_type: str | None + simulation_type: str | None + + @classmethod + def field_names(cls) -> frozenset[str]: + """Return the config field set tracked for canonical comparisons.""" + return frozenset(field.name for field in dataclass_fields(cls)) + + def diff( + self, other: "SimulationConfigSnapshot" + ) -> dict[str, dict[str, str | None]]: + """Return changed config fields relative to another snapshot.""" + delta: dict[str, dict[str, str | None]] = {} + + for field in dataclass_fields(self): + field_name = field.name + canonical_value = getattr(self, field_name) + current_value = getattr(other, field_name) + if canonical_value != current_value: + delta[field_name] = { + "canonical": canonical_value, + "current": current_value, + } + + return delta diff --git a/backend/app/features/simulation/models.py b/backend/app/features/simulation/models.py index b969ebea..2f73a503 100644 --- a/backend/app/features/simulation/models.py +++ b/backend/app/features/simulation/models.py @@ -14,6 +14,7 @@ from app.common.models.base import Base from app.common.models.mixins import IDMixin, TimestampMixin +from app.features.simulation.config_delta import SimulationConfigSnapshot from app.features.simulation.enums import ( ArtifactKind, ExternalLinkKind, @@ -177,22 +178,8 @@ class Simulation(Base, IDMixin, TimestampMixin): # Fields used to compute configuration deltas relative to canonical execution. # Excludes timeline, status, and provenance fields that are expected to vary. 
- CONFIG_DELTA_FIELDS: ClassVar[frozenset[str]] = frozenset( - { - "compset", - "compset_alias", - "grid_name", - "grid_resolution", - "initialization_type", - "compiler", - "git_tag", - "git_commit_hash", - "git_branch", - "git_repository_url", - "campaign", - "experiment_type", - "simulation_type", - } + CONFIG_DELTA_FIELDS: ClassVar[frozenset[str]] = ( + SimulationConfigSnapshot.field_names() ) diff --git a/backend/tests/features/ingestion/test_ingest.py b/backend/tests/features/ingestion/test_ingest.py index a84d94a2..0679c5b2 100644 --- a/backend/tests/features/ingestion/test_ingest.py +++ b/backend/tests/features/ingestion/test_ingest.py @@ -7,13 +7,15 @@ from sqlalchemy.orm import Session from app.features.ingestion.ingest import ( + SimulationCreateDraft, + _build_config_snapshot, + _build_simulation_create_draft, _get_canonical_metadata_for_case, _get_or_create_case, _normalize_git_url, _normalize_simulation_status, _normalize_simulation_type, - _parsed_simulation_to_config_projection, - _simulation_to_config_projection, + _validate_simulation_create, ingest_archive, ) from app.features.ingestion.models import ( @@ -23,6 +25,7 @@ ) from app.features.ingestion.parsers.types import ParsedSimulation from app.features.machine.models import Machine +from app.features.simulation.config_delta import SimulationConfigSnapshot from app.features.simulation.enums import SimulationStatus, SimulationType from app.features.simulation.models import Case, Simulation from app.features.simulation.schemas import SimulationCreate @@ -1761,8 +1764,8 @@ def test_get_canonical_metadata_caches_missing_persisted_canonical(self) -> None db = MagicMock(spec=Session) db.query.return_value.filter.return_value.first.return_value = None - canonical_cache: dict[str, dict[str, str | None]] = {} - persisted_canonical_cache: dict[UUID, dict[str, str | None] | None] = {} + canonical_cache: dict[str, SimulationConfigSnapshot] = {} + persisted_canonical_cache: dict[UUID, 
SimulationConfigSnapshot | None] = {} result = _get_canonical_metadata_for_case( case=case, @@ -1834,8 +1837,8 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( case.canonical_simulation_id = sim.id db.commit() - canonical_cache: dict[str, dict[str, str | None]] = {} - persisted_canonical_cache: dict[UUID, dict[str, str | None] | None] = {} + canonical_cache: dict[str, SimulationConfigSnapshot] = {} + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None] = {} first = _get_canonical_metadata_for_case( case=case, @@ -1845,7 +1848,7 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( db=db, ) assert first is not None - assert first["compiler"] == "gcc-11" + assert first.compiler == "gcc-11" with patch.object(db, "query", side_effect=AssertionError): second = _get_canonical_metadata_for_case( @@ -1857,7 +1860,10 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( ) assert second == first - def test_parsed_projection_keys_match_config_delta_fields(self) -> None: + def test_snapshot_field_names_match_config_delta_fields(self) -> None: + assert SimulationConfigSnapshot.field_names() == Simulation.CONFIG_DELTA_FIELDS + + def test_parsed_snapshot_defaults_simulation_type_to_unknown(self) -> None: parsed = ParsedSimulation( execution_dir="/path/to/1082001.260305-120001", execution_id="1082001.260305-120001", @@ -1884,14 +1890,11 @@ def test_parsed_projection_keys_match_config_delta_fields(self) -> None: status="completed", ) - projection = _parsed_simulation_to_config_projection(parsed) + snapshot = _build_config_snapshot(parsed) - assert set(projection) == Simulation.CONFIG_DELTA_FIELDS - assert projection["simulation_type"] == SimulationType.UNKNOWN.value + assert snapshot.simulation_type == SimulationType.UNKNOWN.value - def test_simulation_projection_keys_match_config_delta_fields( - self, db: Session - ) -> None: + def test_persisted_snapshot_matches_config_delta_fields(self, db: 
Session) -> None: machine = Machine( name="projection-machine", site="Test Site", @@ -1949,7 +1952,213 @@ def test_simulation_projection_keys_match_config_delta_fields( db.add(sim) db.commit() - projection = _simulation_to_config_projection(sim) + snapshot = _build_config_snapshot(sim) + + assert snapshot.simulation_type == SimulationType.TEST.value + assert set(snapshot.field_names()) == Simulation.CONFIG_DELTA_FIELDS + + def test_snapshot_normalizes_ssh_git_url_for_equality(self, db: Session) -> None: + parsed = ParsedSimulation( + execution_dir="/path/to/1082003.260305-120003", + execution_id="1082003.260305-120003", + case_name="case1", + case_group=None, + machine="machine", + hpc_username=None, + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign="campaign", + experiment_type="historical", + initialization_type="test", + simulation_start_date="2020-01-01", + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler="gcc", + git_repository_url="git@github.com:E3SM-Project/E3SM.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + status="completed", + ) + + machine = Machine( + name="snapshot-machine", + site="Test Site", + architecture="x86_64", + scheduler="SLURM", + gpu=False, + ) + db.add(machine) + + user = User( + email="snapshot@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + case = Case(name="snapshot_case") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1082004.260305-120004", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + 
initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.UNKNOWN, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + campaign="campaign", + experiment_type="historical", + compiler="gcc", + git_repository_url="https://github.com/E3SM-Project/E3SM.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + ) + db.add(sim) + db.commit() + + parsed_snapshot = _build_config_snapshot(parsed) + persisted_snapshot = _build_config_snapshot(sim) + + assert parsed_snapshot == persisted_snapshot + assert parsed_snapshot.diff(persisted_snapshot) == {} + + def test_snapshot_diff_returns_only_changed_fields(self) -> None: + canonical = SimulationConfigSnapshot( + compset="FHIST", + compset_alias="alias1", + grid_name="grid1", + grid_resolution="0.9x1.25", + initialization_type="initial", + compiler="gcc-11", + git_tag="v1.0.0", + git_commit_hash="abc123", + git_branch="main", + git_repository_url="https://example.com/repo.git", + campaign="campaign1", + experiment_type="historical", + simulation_type=SimulationType.UNKNOWN.value, + ) + current = SimulationConfigSnapshot( + compset="FHIST", + compset_alias="alias1", + grid_name="grid1", + grid_resolution="0.9x1.25", + initialization_type="initial", + compiler="gcc-12", + git_tag="v1.0.0", + git_commit_hash="abc123", + git_branch="feature", + git_repository_url="https://example.com/repo.git", + campaign="campaign1", + experiment_type="historical", + simulation_type=SimulationType.UNKNOWN.value, + ) + + delta = canonical.diff(current) + + assert delta == { + "compiler": {"canonical": "gcc-11", "current": "gcc-12"}, + "git_branch": {"canonical": "main", "current": "feature"}, + } + + def test_simulation_create_draft_validates_by_field_name(self) -> None: + draft = SimulationCreateDraft( + case_id=uuid4(), + execution_id="1082005.260305-120005", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + 
simulation_type=SimulationType.UNKNOWN, + status=SimulationStatus.CREATED, + campaign="campaign", + experiment_type="historical", + initialization_type="test", + machine_id=uuid4(), + simulation_start_date=datetime(2020, 1, 1), + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler="gcc", + git_repository_url="https://example.com/repo.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + created_by=None, + last_updated_by=None, + hpc_username="test-user", + run_config_deltas=None, + ) + + schema = _validate_simulation_create(draft) + + assert isinstance(schema, SimulationCreate) + assert schema.execution_id == draft.execution_id + assert schema.extra == {} + assert schema.artifacts == [] + assert schema.links == [] + assert schema.run_config_deltas is None + + def test_build_simulation_create_draft_normalizes_values(self) -> None: + parsed = ParsedSimulation( + execution_dir="/path/to/1082006.260305-120006", + execution_id="1082006.260305-120006", + case_name="case1", + case_group=None, + machine="machine", + hpc_username="test-user", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign="campaign", + experiment_type="historical", + initialization_type="test", + simulation_start_date="2020-01-01", + simulation_end_date=None, + run_start_date="2020-01-02T03:04:05Z", + run_end_date=None, + compiler="gcc", + git_repository_url="git@github.com:E3SM-Project/E3SM.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + status="completed", + ) + + draft = _build_simulation_create_draft( + parsed_simulation=parsed, + machine_id=uuid4(), + case_id=uuid4(), + ) - assert set(projection) == Simulation.CONFIG_DELTA_FIELDS - assert projection["simulation_type"] == SimulationType.TEST.value + assert draft.simulation_type == SimulationType.UNKNOWN + assert draft.status == SimulationStatus.COMPLETED + assert draft.git_repository_url == 
"https://github.com/E3SM-Project/E3SM.git" + assert draft.simulation_start_date is not None + assert draft.run_start_date is not None From fb245caec2f5cffdc08eaaaeb8efbbd422c5c436 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 14:48:10 -0700 Subject: [PATCH 07/10] Fix simulation provenance displays --- backend/app/features/simulation/schemas.py | 7 +++++ frontend/src/features/browse/BrowsePage.tsx | 22 ++++++++++++++++ .../components/BrowseFiltersSidePanel.tsx | 26 ++++++++++++++++--- .../components/SimulationDetailsView.tsx | 6 +++++ frontend/src/types/simulation.ts | 10 +++++++ 5 files changed, 67 insertions(+), 4 deletions(-) diff --git a/backend/app/features/simulation/schemas.py b/backend/app/features/simulation/schemas.py index f4772cd7..cf9be94f 100644 --- a/backend/app/features/simulation/schemas.py +++ b/backend/app/features/simulation/schemas.py @@ -535,6 +535,13 @@ class SimulationOut(CamelOutBaseModel): UserPreview | None, Field(description="Full user info of who last updated the simulation."), ] + hpc_username: Annotated[ + str | None, + Field( + None, + description="HPC username for provenance (trusted, informational only)", + ), + ] # Miscellaneous # ----------------- diff --git a/frontend/src/features/browse/BrowsePage.tsx b/frontend/src/features/browse/BrowsePage.tsx index f3286438..c46f1e14 100644 --- a/frontend/src/features/browse/BrowsePage.tsx +++ b/frontend/src/features/browse/BrowsePage.tsx @@ -44,6 +44,7 @@ export interface FilterState { // Metadata & Provenance gitTag: string[]; createdBy: string[]; + hpcUsername: string[]; } interface BrowsePageProps { @@ -75,6 +76,7 @@ const createEmptyFilters = (): FilterState => ({ // Metadata & Provenance gitTag: [], createdBy: [], + hpcUsername: [], }); const parseViewMode = (params: URLSearchParams): 'grid' | 'table' => @@ -190,6 +192,20 @@ export const BrowsePage = ({ return sortedMachines; }, [simulations]); + const creatorOptions = useMemo(() => { + const creators = new Map(); + + for 
(const simulation of simulations) { + if (!simulation.createdBy) continue; + + creators.set(simulation.createdBy, simulation.createdByUser?.email ?? simulation.createdBy); + } + + return Array.from(creators, ([value, label]) => ({ value, label })).sort((a, b) => + a.label.localeCompare(b.label, undefined, { sensitivity: 'base' }), + ); + }, [simulations]); + const filteredData = useMemo(() => { const arrayFilterGetters: Partial< Record< @@ -209,6 +225,7 @@ export const BrowsePage = ({ status: (rec) => rec.status ?? [], gitTag: (rec) => rec.gitTag ?? [], createdBy: (rec) => rec.createdBy ?? [], + hpcUsername: (rec) => rec.hpcUsername ?? [], }; return simulations.filter((record) => { @@ -441,6 +458,7 @@ export const BrowsePage = ({ availableFilters={availableFilters} onChange={setAppliedFilters} machineOptions={machineOptions} + creatorOptions={creatorOptions} caseOptions={caseOptions} selectedCaseName={selectedCaseName} onCaseNameChange={handleCaseNameChange} @@ -510,6 +528,8 @@ export const BrowsePage = ({ const display = key === 'machineId' ? (machineOptions.find((opt) => opt.value === value)?.label ?? value) + : key === 'createdBy' + ? (creatorOptions.find((opt) => opt.value === value)?.label ?? value) : value; return ( opt.value === values)?.label ?? values) + : key === 'createdBy' + ? (creatorOptions.find((opt) => opt.value === values)?.label ?? values) : values; return ( void; machineOptions: { value: string; label: string }[]; + creatorOptions: { value: string; label: string }[]; caseOptions: { value: string; label: string }[]; selectedCaseName: string; onCaseNameChange: (caseName: string) => void; @@ -20,6 +21,7 @@ export const BrowseFiltersSidePanel = ({ availableFilters, onChange, machineOptions, + creatorOptions, caseOptions, selectedCaseName, onCaseNameChange, @@ -222,15 +224,31 @@ export const BrowseFiltersSidePanel = ({ ({ - value: id, - label: id, - }))} + options={ + creatorOptions.length > 0 + ? 
creatorOptions + : (availableFilters.createdBy || []).map((id) => ({ + value: id, + label: id, + })) + } defaultValue={appliedFilters.createdBy || []} onValueChange={(next) => handleChange('createdBy', next as string[])} placeholder="Select creators" resetOnDefaultValueChange={true} /> + + + ({ + value: id, + label: id, + }))} + defaultValue={appliedFilters.hpcUsername || []} + onValueChange={(next) => handleChange('hpcUsername', next as string[])} + placeholder="Select HPC usernames" + resetOnDefaultValueChange={true} + /> ); diff --git a/frontend/src/features/simulations/components/SimulationDetailsView.tsx b/frontend/src/features/simulations/components/SimulationDetailsView.tsx index abdf0bdb..bc5bb571 100644 --- a/frontend/src/features/simulations/components/SimulationDetailsView.tsx +++ b/frontend/src/features/simulations/components/SimulationDetailsView.tsx @@ -366,6 +366,12 @@ export const SimulationDetailsView = ({

{simulation.gitCommitHash ?? '—'}

+
+ +

{simulation.hpcUsername ?? '—'}

+
diff --git a/frontend/src/types/simulation.ts b/frontend/src/types/simulation.ts index b0061096..c70a9a21 100644 --- a/frontend/src/types/simulation.ts +++ b/frontend/src/types/simulation.ts @@ -2,6 +2,13 @@ import type { ArtifactIn, ArtifactOut } from '@/types/artifact'; import type { ExternalLinkIn, ExternalLinkOut } from '@/types/link'; import type { Machine } from '@/types/machine'; +export interface SimulationUserPreview { + id: string; + email: string; + role: string; + full_name?: string | null; +} + /** * API response model for a Case with nested simulation summaries. */ @@ -79,6 +86,7 @@ export interface SimulationCreate { // ~~~~~~~~~~~~~~~~~~~~~~~ createdBy?: string | null; lastUpdatedBy?: string | null; + hpcUsername?: string | null; // Miscellaneous // ~~~~~~~~~~~~~~~~~ @@ -115,6 +123,8 @@ export interface SimulationOut extends SimulationCreate { // ~~~~~~~~~~~~~~~~~~~~~~~ createdAt: string; // Server-managed field updatedAt: string; // Server-managed field + createdByUser?: SimulationUserPreview | null; + lastUpdatedByUser?: SimulationUserPreview | null; // Relationships // ~~~~~~~~~~~~~~ From 540649e309fc8c2e2e03ad4094b961e994eafb41 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 15:04:59 -0700 Subject: [PATCH 08/10] Remove unused config delta field alias --- backend/app/features/simulation/models.py | 9 +-------- backend/tests/features/ingestion/test_ingest.py | 4 ---- backend/tests/features/simulation/test_models.py | 7 ------- 3 files changed, 1 insertion(+), 19 deletions(-) delete mode 100644 backend/tests/features/simulation/test_models.py diff --git a/backend/app/features/simulation/models.py b/backend/app/features/simulation/models.py index 2f73a503..abe10090 100644 --- a/backend/app/features/simulation/models.py +++ b/backend/app/features/simulation/models.py @@ -3,7 +3,7 @@ from __future__ import annotations from datetime import datetime -from typing import TYPE_CHECKING, Any, ClassVar, Optional +from typing import TYPE_CHECKING, 
Any, Optional from uuid import UUID from sqlalchemy import DateTime, ForeignKey, Integer, String, Text @@ -14,7 +14,6 @@ from app.common.models.base import Base from app.common.models.mixins import IDMixin, TimestampMixin -from app.features.simulation.config_delta import SimulationConfigSnapshot from app.features.simulation.enums import ( ArtifactKind, ExternalLinkKind, @@ -176,12 +175,6 @@ class Simulation(Base, IDMixin, TimestampMixin): back_populates="simulation", cascade="all, delete-orphan" ) - # Fields used to compute configuration deltas relative to canonical execution. - # Excludes timeline, status, and provenance fields that are expected to vary. - CONFIG_DELTA_FIELDS: ClassVar[frozenset[str]] = ( - SimulationConfigSnapshot.field_names() - ) - class Artifact(Base, IDMixin, TimestampMixin): __tablename__ = "artifacts" diff --git a/backend/tests/features/ingestion/test_ingest.py b/backend/tests/features/ingestion/test_ingest.py index 0679c5b2..e7dae0c1 100644 --- a/backend/tests/features/ingestion/test_ingest.py +++ b/backend/tests/features/ingestion/test_ingest.py @@ -1860,9 +1860,6 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( ) assert second == first - def test_snapshot_field_names_match_config_delta_fields(self) -> None: - assert SimulationConfigSnapshot.field_names() == Simulation.CONFIG_DELTA_FIELDS - def test_parsed_snapshot_defaults_simulation_type_to_unknown(self) -> None: parsed = ParsedSimulation( execution_dir="/path/to/1082001.260305-120001", @@ -1955,7 +1952,6 @@ def test_persisted_snapshot_matches_config_delta_fields(self, db: Session) -> No snapshot = _build_config_snapshot(sim) assert snapshot.simulation_type == SimulationType.TEST.value - assert set(snapshot.field_names()) == Simulation.CONFIG_DELTA_FIELDS def test_snapshot_normalizes_ssh_git_url_for_equality(self, db: Session) -> None: parsed = ParsedSimulation( diff --git a/backend/tests/features/simulation/test_models.py 
b/backend/tests/features/simulation/test_models.py deleted file mode 100644 index 77b46013..00000000 --- a/backend/tests/features/simulation/test_models.py +++ /dev/null @@ -1,7 +0,0 @@ -from app.features.simulation.models import Simulation - - -class TestSimulationModel: - def test_config_delta_fields_are_valid(self): - model_columns = set(Simulation.__table__.columns.keys()) - assert Simulation.CONFIG_DELTA_FIELDS <= model_columns From 491871c080336341709d0851f6e14f62286654d9 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 15:12:32 -0700 Subject: [PATCH 09/10] Restore archive status parsing from CaseStatus --- .../features/ingestion/parsers/case_docs.py | 24 ----- .../features/ingestion/parsers/case_status.py | 67 ++++++++++++++ .../app/features/ingestion/parsers/parser.py | 24 ++++- .../ingestion/parsers/test_case_docs.py | 21 ----- .../ingestion/parsers/test_case_status.py | 87 +++++++++++++++++++ .../features/ingestion/parsers/test_parser.py | 72 ++++++++++++--- 6 files changed, 235 insertions(+), 60 deletions(-) create mode 100644 backend/app/features/ingestion/parsers/case_status.py create mode 100644 backend/tests/features/ingestion/parsers/test_case_status.py diff --git a/backend/app/features/ingestion/parsers/case_docs.py b/backend/app/features/ingestion/parsers/case_docs.py index 8da7589a..9e5ec964 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -6,7 +6,6 @@ from dateutil.relativedelta import relativedelta from app.features.ingestion.parsers.utils import _open_text -from app.features.simulation.enums import SimulationStatus from app.features.simulation.schemas import KNOWN_EXPERIMENT_TYPES @@ -114,19 +113,6 @@ def parse_env_run(env_run_path: str | Path) -> dict[str, str | None]: } -def parse_run_artifacts(run_dir: str | Path) -> dict[str, str | None]: - """Inspect execution-root artifacts and derive status metadata.""" - run_dir = Path(run_dir) - - return { - "status": 
( - SimulationStatus.COMPLETED.value - if _has_timing_file(run_dir) - else SimulationStatus.UNKNOWN.value - ) - } - - def _extract_value_from_file(path: Path, entry_id: str) -> str | None: """Extract the value of a specific entry from an XML file. @@ -269,13 +255,3 @@ def _parse_stop_date(stop_date: str | None) -> str | None: return datetime.strptime(stop_date, "%Y%m%d").strftime("%Y-%m-%d") except ValueError: return None - - -def _has_timing_file(run_dir: Path) -> bool: - try: - return any( - child.is_file() and re.match(r"^e3sm_timing\..*", child.name) - for child in run_dir.iterdir() - ) - except OSError: - return False diff --git a/backend/app/features/ingestion/parsers/case_status.py b/backend/app/features/ingestion/parsers/case_status.py new file mode 100644 index 00000000..5efc4753 --- /dev/null +++ b/backend/app/features/ingestion/parsers/case_status.py @@ -0,0 +1,67 @@ +import re +from pathlib import Path + +from app.core.logger import _setup_custom_logger +from app.features.ingestion.parsers.utils import _open_text +from app.features.simulation.enums import SimulationStatus + +logger = _setup_custom_logger(__name__) + +TIMESTAMP_PATTERN = r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})" +CASE_RUN_START_PATTERN = re.compile( + rf"^{TIMESTAMP_PATTERN}:\s+case\.run\s+starting(?:\s+\S+)?\s*$" +) +CASE_RUN_TERMINAL_PATTERN = re.compile( + rf"^{TIMESTAMP_PATTERN}:\s+case\.run\s+(?P<state>success|error)\b" +) + + +def parse_case_status(file_path: str | Path) -> dict[str, str | None]: + """Parse the latest ``case.run`` attempt from ``CaseStatus``. + + ``CaseStatus`` can record multiple attempts for the same execution. The + latest ``case.run starting`` entry is treated as authoritative and the first + terminal entry after it determines the run status.
+ """ + file_path = Path(file_path) + result: dict[str, str | None] = { + "run_start_date": None, + "run_end_date": None, + "status": SimulationStatus.UNKNOWN.value, + } + + try: + lines = _open_text(file_path).splitlines() + except (OSError, UnicodeDecodeError) as exc: + logger.warning("Failed to read case status file %s (%s)", file_path, exc) + return result + + latest_start_index: int | None = None + latest_start_timestamp: str | None = None + + for index, line in enumerate(lines): + start_match = CASE_RUN_START_PATTERN.match(line.strip()) + if start_match: + latest_start_index = index + latest_start_timestamp = start_match.group("timestamp") + + if latest_start_index is None or latest_start_timestamp is None: + return result + + result["run_start_date"] = latest_start_timestamp + + for line in lines[latest_start_index + 1 :]: + terminal_match = CASE_RUN_TERMINAL_PATTERN.match(line.strip()) + if not terminal_match: + continue + + result["run_end_date"] = terminal_match.group("timestamp") + result["status"] = ( + SimulationStatus.COMPLETED.value + if terminal_match.group("state") == "success" + else SimulationStatus.FAILED.value + ) + return result + + result["status"] = SimulationStatus.RUNNING.value + return result diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index 72a3463b..e8240d68 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -33,8 +33,8 @@ parse_env_build, parse_env_case, parse_env_run, - parse_run_artifacts, ) +from app.features.ingestion.parsers.case_status import parse_case_status from app.features.ingestion.parsers.e3sm_timing import parse_e3sm_timing from app.features.ingestion.parsers.git_info import ( parse_git_config, @@ -84,6 +84,12 @@ class FileSpec(TypedDict, total=False): "parser": parse_readme_case, "required": True, }, + "case_status": { + "pattern": r"CaseStatus\..*\.gz", + "location": "root", + 
"parser": parse_case_status, + "required": False, + }, "e3sm_timing": { "pattern": r"e3sm_timing\..*", "location": "root", @@ -410,6 +416,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul Typed archive-derived metadata for one execution directory. """ metadata: dict[str, str | None] = {} + case_status_metadata: dict[str, str | None] | None = None for key, spec in FILE_SPECS.items(): path = files.get(key) @@ -417,9 +424,18 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul continue parser: Callable = spec["parser"] - metadata.update(parser(path)) + parsed_metadata = parser(path) + + if key == "case_status": + case_status_metadata = parsed_metadata + continue + + metadata.update(parsed_metadata) - metadata.update(parse_run_artifacts(exec_dir)) + # CaseStatus reflects the latest case.run attempt, so its status and run + # timestamps should override timing-file values when the artifact exists. + if case_status_metadata is not None: + metadata.update(case_status_metadata) execution_id = _require_execution_id(metadata.get("execution_id"), exec_dir) _warn_on_execution_id_mismatch(exec_dir, execution_id) @@ -447,7 +463,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul git_branch=metadata.get("git_branch"), git_tag=metadata.get("git_tag"), git_commit_hash=metadata.get("git_commit_hash"), - status=metadata.get("status", SimulationStatus.UNKNOWN.value), + status=metadata.get("status") or SimulationStatus.UNKNOWN.value, ) diff --git a/backend/tests/features/ingestion/parsers/test_case_docs.py b/backend/tests/features/ingestion/parsers/test_case_docs.py index ed68436b..5b692276 100644 --- a/backend/tests/features/ingestion/parsers/test_case_docs.py +++ b/backend/tests/features/ingestion/parsers/test_case_docs.py @@ -5,7 +5,6 @@ parse_env_build, parse_env_case, parse_env_run, - parse_run_artifacts, ) @@ -324,23 +323,3 @@ def test_invalid_stop_date_returns_none(self, 
tmp_path): result = parse_env_run(tmp_run) assert result["simulation_end_date"] is None - - -class TestParseRunArtifacts: - def test_completed_status_when_timing_exists(self, tmp_path): - (tmp_path / "e3sm_timing.test").write_text("timing data") - - result = parse_run_artifacts(tmp_path) - - assert result["status"] == "completed" - - def test_unknown_status_when_timing_missing(self, tmp_path): - result = parse_run_artifacts(tmp_path) - - assert result["status"] == "unknown" - - def test_unknown_status_when_directory_read_fails(self): - with patch.object(Path, "iterdir", side_effect=OSError("boom")): - result = parse_run_artifacts("/tmp/missing") - - assert result["status"] == "unknown" diff --git a/backend/tests/features/ingestion/parsers/test_case_status.py b/backend/tests/features/ingestion/parsers/test_case_status.py new file mode 100644 index 00000000..ea8d35af --- /dev/null +++ b/backend/tests/features/ingestion/parsers/test_case_status.py @@ -0,0 +1,87 @@ +from pathlib import Path +from unittest.mock import patch + +from app.features.ingestion.parsers.case_status import parse_case_status + + +class TestCaseStatusParser: + def test_returns_unknown_status_on_read_error(self) -> None: + with ( + patch( + "app.features.ingestion.parsers.case_status._open_text", + side_effect=OSError("boom"), + ), + patch( + "app.features.ingestion.parsers.case_status.logger.warning" + ) as mock_warning, + ): + result = parse_case_status(Path("/tmp/missing/CaseStatus.001.gz")) + + assert result == { + "run_start_date": None, + "run_end_date": None, + "status": "unknown", + } + mock_warning.assert_called_once() + + def test_extracts_completed_status_from_latest_attempt(self, tmp_path) -> None: + case_status = tmp_path / "CaseStatus.001" + case_status.write_text( + "\n".join( + [ + "2025-01-01 00:00:00: case.run starting 111", + "2025-01-01 01:00:00: case.run error", + "2025-01-01 02:00:00: case.run starting 222", + "2025-01-01 03:00:00: case.run success", + ] + ) + ) + + result = 
parse_case_status(case_status) + + assert result["run_start_date"] == "2025-01-01 02:00:00" + assert result["run_end_date"] == "2025-01-01 03:00:00" + assert result["status"] == "completed" + + def test_extracts_failed_status(self, tmp_path) -> None: + case_status = tmp_path / "CaseStatus.001" + case_status.write_text( + "\n".join( + [ + "2025-01-01 00:00:00: case.run starting 111", + "2025-01-01 01:00:00: case.run error", + ] + ) + ) + + result = parse_case_status(case_status) + + assert result["run_end_date"] == "2025-01-01 01:00:00" + assert result["status"] == "failed" + + def test_extracts_running_status_without_terminal_entry(self, tmp_path) -> None: + case_status = tmp_path / "CaseStatus.001" + case_status.write_text( + "\n".join( + [ + "2025-01-01 00:00:00: case.run starting 111", + "2025-01-01 00:10:00: unrelated log line", + ] + ) + ) + + result = parse_case_status(case_status) + + assert result["run_start_date"] == "2025-01-01 00:00:00" + assert result["run_end_date"] is None + assert result["status"] == "running" + + def test_returns_unknown_status_without_case_run_entries(self, tmp_path) -> None: + case_status = tmp_path / "CaseStatus.001" + case_status.write_text("2025-01-01 00:00:00: xmlchange success") + + result = parse_case_status(case_status) + + assert result["run_start_date"] is None + assert result["run_end_date"] is None + assert result["status"] == "unknown" diff --git a/backend/tests/features/ingestion/parsers/test_parser.py b/backend/tests/features/ingestion/parsers/test_parser.py index 64371a5b..e90dddf0 100644 --- a/backend/tests/features/ingestion/parsers/test_parser.py +++ b/backend/tests/features/ingestion/parsers/test_parser.py @@ -25,6 +25,7 @@ def _create_execution_metadata_files( *, include_env_run: bool = True, include_timing: bool = True, + case_status_content: str | None = None, ) -> None: """Create standard execution files for testing.""" version_base = version.split(".")[0] @@ -33,6 +34,10 @@ def 
_create_execution_metadata_files( timing_file = execution_dir / f"e3sm_timing.{version}" timing_file.write_text("timing data") + if case_status_content is not None: + with gzip.open(execution_dir / f"CaseStatus.{version_base}.gz", "wt") as f: + f.write(case_status_content) + casedocs = execution_dir / "CaseDocs" casedocs.mkdir(exist_ok=True) @@ -94,12 +99,12 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: "parse_env_build": {"compiler": "gnu"}, "parse_env_run": {"simulation_start_date": "2020-01-01"}, "parse_readme_case": {}, + "parse_case_status": {"status": "completed"}, "parse_e3sm_timing": { "execution_id": "1081156.251218-200923", "run_start_date": "2025-12-18T20:09:33", "run_end_date": "2025-12-18T20:54:58", }, - "parse_run_artifacts": {"status": "completed"}, "parse_git_describe": {}, "parse_git_config": None, "parse_git_status": None, @@ -121,6 +126,9 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: parser.FILE_SPECS["readme_case"]["parser"] = lambda _path: defaults[ "parse_readme_case" ] + parser.FILE_SPECS["case_status"]["parser"] = lambda _path: defaults[ + "parse_case_status" + ] parser.FILE_SPECS["e3sm_timing"]["parser"] = lambda _path: defaults[ "parse_e3sm_timing" ] @@ -146,19 +154,16 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: ) ) - with patch( - "app.features.ingestion.parsers.parser.parse_run_artifacts", - return_value=defaults["parse_run_artifacts"], - ): - yield + yield finally: parser.FILE_SPECS.clear() parser.FILE_SPECS.update(original_file_specs) - def test_file_specs_remove_case_status_and_add_env_run(self) -> None: - assert "case_status" not in parser.FILE_SPECS + def test_file_specs_include_case_status_and_env_run(self) -> None: + assert "case_status" in parser.FILE_SPECS assert "case_docs_env_run" in parser.FILE_SPECS assert "e3sm_timing" in parser.FILE_SPECS + assert parser.FILE_SPECS["case_status"]["required"] is False assert 
parser.FILE_SPECS["case_docs_env_run"]["required"] is True assert parser.FILE_SPECS["e3sm_timing"]["required"] is True @@ -358,16 +363,61 @@ def test_missing_env_run_skips_incomplete_run(self, tmp_path: Path) -> None: assert result == [] assert skipped == 1 - def test_completed_status_is_merged(self, tmp_path: Path) -> None: + def test_case_status_is_merged(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files( + execution_dir, + "001.001", + case_status_content="2025-01-01 00:00:00: case.run error", + ) + + with self._mock_all_parsers(parse_case_status={"status": "failed"}): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert result[0].status == "failed" + + def test_case_status_run_dates_override_timing_dates(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files( + execution_dir, + "001.001", + case_status_content="2025-01-01 00:00:00: case.run error", + ) + + with self._mock_all_parsers( + parse_case_status={ + "run_start_date": "2025-01-01 00:00:00", + "run_end_date": None, + "status": "running", + }, + parse_e3sm_timing={ + "execution_id": "1081156.251218-200923", + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": "2025-12-18T20:54:58", + }, + ): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert result[0].status == "running" + assert result[0].run_start_date == "2025-01-01 00:00:00" + assert result[0].run_end_date is None + + def test_missing_case_status_defaults_status_to_unknown( + self, tmp_path: Path + ) -> None: execution_dir = tmp_path / "1.0-0" execution_dir.mkdir(parents=True) self._create_execution_metadata_files(execution_dir, "001.001") - with self._mock_all_parsers(parse_run_artifacts={"status": "completed"}): + with self._mock_all_parsers(): 
result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") assert skipped == 0 - assert result[0].status == "completed" + assert result[0].status == "unknown" def test_missing_timing_lid_skips_incomplete_run(self, tmp_path: Path) -> None: execution_dir = tmp_path / "1.0-0" From 4048e4e7b37f2782a1e2aab97270c38e37007c55 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 15:17:05 -0700 Subject: [PATCH 10/10] Preserve execution directory IDs on timing mismatch --- .../app/features/ingestion/parsers/parser.py | 23 ++++++-------- .../features/ingestion/parsers/test_parser.py | 31 +++++++++++++++++-- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index e8240d68..dc9542d4 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -437,8 +437,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul if case_status_metadata is not None: metadata.update(case_status_metadata) - execution_id = _require_execution_id(metadata.get("execution_id"), exec_dir) - _warn_on_execution_id_mismatch(exec_dir, execution_id) + execution_id = _resolve_execution_id(metadata.get("execution_id"), exec_dir) return ParsedSimulation( execution_dir=exec_dir, @@ -467,8 +466,8 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul ) -def _require_execution_id(execution_id: str | None, exec_dir: str) -> str: - """Return timing-file LID or treat the run as incomplete.""" +def _resolve_execution_id(execution_id: str | None, exec_dir: str) -> str: + """Return a stable execution_id or treat the run as incomplete.""" if execution_id is None: raise FileNotFoundError( f"Required timing-file LID missing for execution directory '{exec_dir}'" @@ -480,17 +479,15 @@ def _require_execution_id(execution_id: str | None, exec_dir: str) -> str: f"Required timing-file 
LID missing for execution directory '{exec_dir}'" ) - return normalized - - -def _warn_on_execution_id_mismatch(exec_dir: str, execution_id: str) -> None: - """Log when timing-file LID disagrees with the execution directory basename.""" exec_dir_basename = os.path.basename(exec_dir) - - if exec_dir_basename and exec_dir_basename != execution_id: + if exec_dir_basename and exec_dir_basename != normalized: logger.warning( "Timing-file LID '%s' does not match execution directory '%s'. " - "Using timing-file LID as execution_id.", - execution_id, + "Using execution directory basename as execution_id.", + normalized, exec_dir_basename, ) + + return exec_dir_basename + + return normalized diff --git a/backend/tests/features/ingestion/parsers/test_parser.py b/backend/tests/features/ingestion/parsers/test_parser.py index e90dddf0..cf7bc009 100644 --- a/backend/tests/features/ingestion/parsers/test_parser.py +++ b/backend/tests/features/ingestion/parsers/test_parser.py @@ -436,7 +436,9 @@ def test_missing_timing_lid_skips_incomplete_run(self, tmp_path: Path) -> None: assert result == [] assert skipped == 1 - def test_mismatched_timing_lid_logs_warning(self, tmp_path: Path) -> None: + def test_mismatched_timing_lid_falls_back_to_directory_basename( + self, tmp_path: Path + ) -> None: execution_dir = tmp_path / "1.0-0" execution_dir.mkdir(parents=True) self._create_execution_metadata_files(execution_dir, "001.001") @@ -456,14 +458,37 @@ def test_mismatched_timing_lid_logs_warning(self, tmp_path: Path) -> None: result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") assert skipped == 0 - assert result[0].execution_id == "different-lid" + assert result[0].execution_id == "1.0-0" mock_warning.assert_any_call( "Timing-file LID '%s' does not match execution directory '%s'. 
" - "Using timing-file LID as execution_id.", + "Using execution directory basename as execution_id.", "different-lid", "1.0-0", ) + def test_mismatched_timing_lid_preserves_distinct_execution_directories( + self, tmp_path: Path + ) -> None: + first_execution_dir = tmp_path / "1.0-0" + first_execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(first_execution_dir, "001.001") + + second_execution_dir = tmp_path / "1.0-1" + second_execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(second_execution_dir, "002.002") + + with self._mock_all_parsers( + parse_e3sm_timing={ + "execution_id": "stale-lid", + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": "2025-12-18T20:54:58", + } + ): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert [parsed.execution_id for parsed in result] == ["1.0-0", "1.0-1"] + def test_missing_timing_skips_incomplete_run(self, tmp_path: Path) -> None: execution_dir = tmp_path / "1.0-0" execution_dir.mkdir(parents=True)