From 8a996b892982cb2a37029bec46286c8665c76402 Mon Sep 17 00:00:00 2001 From: Vo Date: Fri, 13 Mar 2026 17:01:41 -0700 Subject: [PATCH 01/10] Refactor ingestion parser to use env_run and timing --- .../features/ingestion/parsers/case_docs.py | 180 ++++++++- .../features/ingestion/parsers/case_status.py | 169 --------- .../features/ingestion/parsers/e3sm_timing.py | 187 +++------- .../app/features/ingestion/parsers/parser.py | 53 +-- .../ingestion/parsers/test_case_docs.py | 255 ++++++++++++- .../ingestion/parsers/test_case_status.py | 262 ------------- .../ingestion/parsers/test_e3sm_timing.py | 210 +++++++---- .../features/ingestion/parsers/test_parser.py | 348 ++++++++---------- .../features/ingestion/parsers/test_utils.py | 24 ++ 9 files changed, 815 insertions(+), 873 deletions(-) delete mode 100644 backend/app/features/ingestion/parsers/case_status.py delete mode 100644 backend/tests/features/ingestion/parsers/test_case_status.py create mode 100644 backend/tests/features/ingestion/parsers/test_utils.py diff --git a/backend/app/features/ingestion/parsers/case_docs.py b/backend/app/features/ingestion/parsers/case_docs.py index 966ef9b9..238c67c7 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -1,7 +1,13 @@ +import re import xml.etree.ElementTree as ET +from datetime import datetime from pathlib import Path +from dateutil.relativedelta import relativedelta + from app.features.ingestion.parsers.utils import _open_text +from app.features.simulation.enums import SimulationStatus +from app.features.simulation.schemas import KNOWN_EXPERIMENT_TYPES def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: @@ -18,9 +24,25 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: Dictionary with key 'case_group' (str or None) """ env_case_path = Path(env_case_path) + + case_name = _extract_value_from_file(env_case_path, "CASE") case_group = 
_extract_value_from_file(env_case_path, "CASE_GROUP") + machine = _extract_value_from_file(env_case_path, "MACH") + user = _extract_value_from_file(env_case_path, "REALUSER") + compset_alias = _extract_value_from_file(env_case_path, "COMPSET") - return {"case_group": case_group} + # Extract metadata that requires special handling + campaign, experiment_type = _extract_campaign_and_experiment_type(case_name) + + return { + "case_name": case_name, + "case_group": case_group, + "machine": machine, + "user": user, + "campaign": campaign, + "experiment_type": experiment_type, + "compset_alias": compset_alias, + } def parse_env_build(env_build_path: str | Path) -> dict[str, str | None]: @@ -34,13 +56,66 @@ def parse_env_build(env_build_path: str | Path) -> dict[str, str | None]: Returns ------- dict - Dictionary with keys 'compiler', 'mpilib' (str or None) + Dictionary with keys 'grid_resolution', 'compiler', 'mpilib' (str or None) """ env_build_path = Path(env_build_path) + + grid_resolution = _extract_value_from_file(env_build_path, "GRID") compiler = _extract_value_from_file(env_build_path, "COMPILER") mpilib = _extract_value_from_file(env_build_path, "MPILIB") - return {"compiler": compiler, "mpilib": mpilib} + return {"grid_resolution": grid_resolution, "compiler": compiler, "mpilib": mpilib} + + +def parse_env_run(env_run_path: str | Path) -> dict[str, str | None]: + """Parse env_run.xml (plain or gzipped) to extract runtime settings. + + Parameters + ---------- + env_run_path : str or Path + Path to the env_run.xml file (plain or .gz) + + Returns + ------- + dict + Dictionary with runtime initialization and simulation date metadata. 
+ """ + env_run_path = Path(env_run_path) + initialization_type = _extract_value_from_file(env_run_path, "RUN_TYPE") + run_start_date = _extract_value_from_file(env_run_path, "RUN_STARTDATE") + run_ref_date = _extract_value_from_file(env_run_path, "RUN_REFDATE") + stop_option = _extract_value_from_file(env_run_path, "STOP_OPTION") + stop_n = _extract_value_from_file(env_run_path, "STOP_N") + stop_date = _extract_value_from_file(env_run_path, "STOP_DATE") + + simulation_start_date = ( + run_ref_date if initialization_type == "branch" else run_start_date + ) + simulation_end_date = _calculate_simulation_end_date( + simulation_start_date, + stop_option, + stop_n, + stop_date, + ) + + return { + "initialization_type": initialization_type, + "simulation_start_date": simulation_start_date, + "simulation_end_date": simulation_end_date, + } + + +def parse_run_artifacts(run_dir: str | Path) -> dict[str, str | None]: + """Inspect execution-root artifacts and derive status metadata.""" + run_dir = Path(run_dir) + + return { + "status": ( + SimulationStatus.COMPLETED.value + if _has_timing_file(run_dir) + else SimulationStatus.UNKNOWN.value + ) + } def _extract_value_from_file(path: Path, entry_id: str) -> str | None: @@ -58,7 +133,10 @@ def _extract_value_from_file(path: Path, entry_id: str) -> str | None: str | None The value of the entry, or None if not found """ - text = _open_text(path) + try: + text = _open_text(path) + except (OSError, UnicodeDecodeError): + return None try: root = ET.fromstring(text) @@ -95,3 +173,97 @@ def _find_entry_value(root, entry_id: str) -> str | None: return entry.text.strip() return None + + +def _extract_campaign_and_experiment_type( + case_name: str | None, +) -> tuple[str | None, str | None]: + """Extract campaign and experiment type from case name. + + Parameters + ---------- + case_name : str or None + The case name to parse. + + Returns + ------- + tuple of (str or None, str or None) + campaign and experiment_type values. 
+ """ + campaign = None + experiment_type = None + + # Example: v3.LR.historical + if case_name: + # Remove trailing instance suffix like _0121 + base = re.sub(r"_\d+$", "", case_name) + + # Only infer campaign for dot-delimited case names. + # Timing files sometimes use short case names (e.g., e3sm_v1_ne30) + # that do not encode campaign/experiment type. + if "." not in base: + return None, None + + # Campaign is the base case name without the trailing instance suffix + campaign = base + + # Candidate experiment type = last dot token + candidate = campaign.split(".")[-1] + + if candidate in KNOWN_EXPERIMENT_TYPES: + experiment_type = candidate + + return campaign, experiment_type + + +def _calculate_simulation_end_date( + simulation_start_date: str | None, + stop_option: str | None, + stop_n: str | None, + stop_date: str | None, +) -> str | None: + if not simulation_start_date or not stop_option: + return None + + if stop_option == "date": + return _parse_stop_date(stop_date) + + if stop_n is None: + return None + + try: + start_date = datetime.strptime(simulation_start_date, "%Y-%m-%d") + stop_n_int = int(stop_n) + except ValueError: + return None + + if stop_option == "ndays": + end_date = start_date + relativedelta(days=stop_n_int) + elif stop_option == "nmonths": + end_date = start_date + relativedelta(months=stop_n_int) + elif stop_option == "nyears": + end_date = start_date + relativedelta(years=stop_n_int) + else: + return None + + return end_date.strftime("%Y-%m-%d") + + +def _parse_stop_date(stop_date: str | None) -> str | None: + if not stop_date: + return None + + try: + return datetime.strptime(stop_date, "%Y%m%d").strftime("%Y-%m-%d") + except ValueError: + return None + + +def _has_timing_file(run_dir: Path) -> bool: + try: + return any( + child.is_file() and re.match(r"^e3sm_timing\..*", child.name) + for child in run_dir.iterdir() + ) + except OSError: + return False diff --git a/backend/app/features/ingestion/parsers/case_status.py 
b/backend/app/features/ingestion/parsers/case_status.py deleted file mode 100644 index 85e92952..00000000 --- a/backend/app/features/ingestion/parsers/case_status.py +++ /dev/null @@ -1,169 +0,0 @@ -import logging -import re -from datetime import datetime -from typing import Any - -from dateutil.relativedelta import relativedelta - -from app.features.ingestion.parsers.utils import _get_open_func -from app.features.simulation.enums import SimulationStatus - -logger = logging.getLogger(__name__) - -TIMESTAMP_PATTERN = r"(?P\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})" -CASE_RUN_START_PATTERN = re.compile( - rf"^{TIMESTAMP_PATTERN}:\s+case\.run\s+starting(?:\s+(?P\S+))?\s*$" -) -CASE_RUN_TERMINAL_PATTERN = re.compile( - rf"^{TIMESTAMP_PATTERN}:\s+case\.run\s+(?Psuccess|error)\b" -) -RUN_STARTDATE_PATTERN = re.compile(r"RUN_STARTDATE=(?P\d{4}-\d{2}-\d{2})") -STOP_OPTION_STOP_N_PATTERN = re.compile( - r"STOP_OPTION=(?P[^,\s]+),STOP_N=(?P\d+)" -) - - -def parse_case_status(file_path: str) -> dict[str, Any]: - """ - Parse the case status file and extract relevant information. - - Parameters - ---------- - file_path : str - Path to the case status file. - - Returns - ------- - dict[str, Any] - A dictionary containing simulation date metadata and latest case.run - attempt metadata. 
- """ - result: dict[str, Any] = { - "simulation_start_date": None, - "simulation_end_date": None, - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - open_func = _get_open_func(file_path) - try: - with open_func(file_path, "rt") as file: - lines = file.readlines() - except (OSError, UnicodeDecodeError) as exc: - logger.warning("Failed to read case status file %s (%s)", file_path, exc) - return result - - latest_start_idx, latest_start_match = _extract_simulation_dates( - lines, file_path, result - ) - - _extract_latest_run_metadata(lines, latest_start_idx, latest_start_match, result) - - return result - - -def _extract_simulation_dates( - lines: list[str], file_path: str, result: dict[str, Any] -) -> tuple[int | None, re.Match[str] | None]: - latest_start_idx: int | None = None - latest_start_match: re.Match[str] | None = None - - for index, line in enumerate(lines): - run_startdate_match = RUN_STARTDATE_PATTERN.search(line) - if "RUN_STARTDATE" in line and not run_startdate_match: - logger.warning( - f"Malformed RUN_STARTDATE line in {file_path}: {line.strip()}" - ) - elif run_startdate_match: - result["simulation_start_date"] = run_startdate_match.group("start_date") - - stop_match = STOP_OPTION_STOP_N_PATTERN.search(line) - if "STOP_OPTION" in line and "STOP_N" in line and not stop_match: - logger.warning( - f"Malformed STOP_OPTION/STOP_N line in {file_path}: {line.strip()}" - ) - elif stop_match: - _update_simulation_end_date(file_path, line, stop_match, result) - - start_match = CASE_RUN_START_PATTERN.match(line.strip()) - if start_match: - latest_start_idx = index - latest_start_match = start_match - - return latest_start_idx, latest_start_match - - -def _update_simulation_end_date( - file_path: str, line: str, stop_match: re.Match[str], result: dict[str, Any] -) -> None: - stop_option = stop_match.group("stop_option") - stop_n_str = stop_match.group("stop_n") - - try: - stop_n = int(stop_n_str) - except ValueError as exc: - 
logger.warning( - f"Malformed STOP_OPTION/STOP_N line in {file_path}: {line.strip()} ({exc})" - ) - return - - result["simulation_end_date"] = _calculate_simulation_end_date( - result["simulation_start_date"], stop_option, stop_n - ) - - -def _extract_latest_run_metadata( - lines: list[str], - latest_start_idx: int | None, - latest_start_match: re.Match[str] | None, - result: dict[str, Any], -) -> None: - if latest_start_idx is None or latest_start_match is None: - return - - result["run_start_time"] = latest_start_match.group("timestamp") - result["run_start_date"] = result["run_start_time"] - - for line in lines[latest_start_idx + 1 :]: - terminal_match = CASE_RUN_TERMINAL_PATTERN.match(line.strip()) - if not terminal_match: - continue - - result["run_end_time"] = terminal_match.group("timestamp") - result["run_end_date"] = result["run_end_time"] - state = terminal_match.group("state") - - if state == "success": - result["status"] = SimulationStatus.COMPLETED.value - elif state == "error": - result["status"] = SimulationStatus.FAILED.value - return - - result["status"] = SimulationStatus.RUNNING.value - - -def _calculate_simulation_end_date( - simulation_start_date: str | None, stop_option: str | None, stop_n: int | None -) -> str | None: - if not (simulation_start_date and stop_option and stop_n): - return None - - try: - start = datetime.strptime(simulation_start_date, "%Y-%m-%d") - except ValueError: - return None - - option = stop_option.lower() - n = stop_n - - if "days" in option: - end = start + relativedelta(days=n) - elif "months" in option: - end = start + relativedelta(months=n) - elif "years" in option: - end = start + relativedelta(years=n) - else: - return None - - return end.strftime("%Y-%m-%d") diff --git a/backend/app/features/ingestion/parsers/e3sm_timing.py b/backend/app/features/ingestion/parsers/e3sm_timing.py index 131c4d3f..63cb29e9 100644 --- a/backend/app/features/ingestion/parsers/e3sm_timing.py +++ 
b/backend/app/features/ingestion/parsers/e3sm_timing.py @@ -1,14 +1,13 @@ import re -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path -from typing import Any, Optional +from typing import Any from app.features.ingestion.parsers.utils import _open_text -from app.features.simulation.schemas import KNOWN_EXPERIMENT_TYPES def parse_e3sm_timing(path: str | Path) -> dict[str, Any]: - """Parse an E3SM timing file and extract metadata fields. + """Parse an E3SM timing file and extract run metadata. Parameters ---------- @@ -18,122 +17,57 @@ def parse_e3sm_timing(path: str | Path) -> dict[str, Any]: Returns ------- dict - Dictionary with all relevant fields, including nested run config. + Dictionary with execution and run timing metadata. """ path = Path(path) - text = _open_text(path) - lines = text.splitlines() - - metadata_fields = { - "case_name": r"Case\s*[:=]\s*(.+)", - "machine": r"Machine\s*[:=]\s*(.+)", - "user": r"User\s*[:=]\s*(.+)", - "lid": r"LID\s*[:=]\s*(.+)", - "simulation_start_date": r"Curr Date\s*[:=]\s*(.+)", - "grid_resolution": r"grid\s*[:=]\s*(.+)", - "compset_alias": r"compset\s*[:=]\s*(.+)", - "initialization_type": r"run type\s*[:=]\s*([^,]+)", - "run_length": r"run length\s*[:=]\s*(.+)", - } - - metadata = { - key: _extract(lines, pattern) for key, pattern in metadata_fields.items() + result: dict[str, str | None] = { + "execution_id": None, + "run_start_date": None, + "run_end_date": None, } - # Extract metadata that requires special handling - campaign, experiment_type = _extract_campaign_and_experiment_type( - metadata.get("case_name") - ) - simulation_start_date = _parse_simulation_start_date( - metadata["simulation_start_date"] - ) - stop_option, stop_n = _extract_stop_option_and_stop_n(lines) - - result = { - "case_name": metadata["case_name"], - "campaign": campaign, - "experiment_type": experiment_type, - "machine": metadata["machine"], - "user": metadata["user"], - "lid": metadata["lid"], - 
"simulation_start_date": simulation_start_date, - "grid_resolution": metadata["grid_resolution"], - "compset_alias": metadata["compset_alias"], - "initialization_type": metadata["initialization_type"], - "run_config": { - "stop_option": stop_option, - "stop_n": stop_n, - "run_length": metadata["run_length"], - }, - } - - return result - - -def _extract_campaign_and_experiment_type( - case_name: Optional[str], -) -> tuple[Optional[str], Optional[str]]: - """Extract campaign and experiment type from case name. - - Parameters - ---------- - case_name : str or None - The case name to parse. - - Returns - ------- - tuple of (str or None, str or None) - campaign and experiment_type values. - """ - campaign = None - experiment_type = None - - # Example: v3.LR.historical - if case_name: - # Remove trailing instance suffix like _0121 - base = re.sub(r"_\d+$", "", case_name) - - # Only infer campaign for dot-delimited case names. - # Timing files sometimes use short case names (e.g., e3sm_v1_ne30) - # that do not encode campaign/experiment type. - if "." 
not in base: - return None, None - - # Campaign is the base case name without the trailing instance suffix - campaign = base - - # Candidate experiment type = last dot token - candidate = campaign.split(".")[-1] - - if candidate in KNOWN_EXPERIMENT_TYPES: - experiment_type = candidate + try: + text = _open_text(path) + except (OSError, UnicodeDecodeError): + return result - return campaign, experiment_type + lines = text.splitlines() + execution_id = _extract(lines, r"LID\s*[:=]\s*(.+)") + curr_date = _parse_curr_date(_extract(lines, r"Curr Date\s*[:=]\s*(.+)")) + init_time = _parse_seconds(_extract(lines, r"Init Time\s*[:=]\s*(.+)")) + run_time = _parse_seconds(_extract(lines, r"Run Time\s*[:=]\s*(.+)")) + final_time = _parse_seconds(_extract(lines, r"Final Time\s*[:=]\s*(.+)")) + + result["execution_id"] = execution_id + if curr_date is not None: + result["run_end_date"] = curr_date.isoformat(timespec="seconds") + + if ( + curr_date is not None + and init_time is not None + and run_time is not None + and final_time is not None + ): + total_seconds = init_time + run_time + final_time + run_start_date = curr_date - timedelta(seconds=total_seconds) + result["run_start_date"] = run_start_date.replace(microsecond=0).isoformat() -def _parse_simulation_start_date(date_str: Optional[str]) -> Optional[str]: - """Parse simulation start date string to ISO format. + return result - Parameters - ---------- - date_str : str or None - The date string to parse (e.g., "Tue Jan 10 12:34:56 2023"). - Returns - ------- - str or None - ISO formatted date string, or the original string if parsing fails. 
- """ +def _parse_curr_date(date_str: str | None) -> datetime | None: + """Parse a timing-file date string.""" if not date_str: return None try: - return datetime.strptime(date_str, "%a %b %d %H:%M:%S %Y").isoformat() + return datetime.strptime(date_str, "%a %b %d %H:%M:%S %Y") except ValueError: - return date_str + return None -def _extract(lines: list[str], pattern: str, group: int = 1) -> Optional[str]: +def _extract(lines: list[str], pattern: str, group: int = 1) -> str | None: """Extract the first regex group matching a pattern from a list of lines. Parameters @@ -159,39 +93,16 @@ def _extract(lines: list[str], pattern: str, group: int = 1) -> Optional[str]: return None -def _extract_stop_option_and_stop_n( - lines: list[str], -) -> tuple[Optional[str], Optional[str]]: - """ - Extract stop_option and stop_n from lines, handling both same-line and - separate-line cases. - - Parameters - ---------- - lines : list of str - Lines to search. - - Returns - ------- - tuple of (str or None, str or None) - stop_option and stop_n values. 
- """ - stop_option: str | None = None - stop_n: str | None = None - - for line in lines: - m = re.match(r"stop option\s*[:=]\s*([^,]+)", line.strip()) - - if m: - stop_option = m.group(1).strip() - m2 = re.search(r"stop_n\s*[=:]\s*(\d+)", line) - - if m2: - stop_n = m2.group(1).strip() - - break +def _parse_seconds(value: str | None) -> float | None: + """Extract a floating-point seconds value from a timing line.""" + if not value: + return None - if stop_n is None: - stop_n = _extract(lines, r"stop_n\s*[=:]\s*(.+)") + match = re.search(r"(-?\d+(?:\.\d+)?)", value) + if not match: + return None - return stop_option, stop_n + try: + return float(match.group(1)) + except ValueError: + return None diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index def0dfa3..51a9d178 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -29,8 +29,12 @@ from typing import Callable, Iterable, TypedDict from app.core.logger import _setup_custom_logger -from app.features.ingestion.parsers.case_docs import parse_env_build, parse_env_case -from app.features.ingestion.parsers.case_status import parse_case_status +from app.features.ingestion.parsers.case_docs import ( + parse_env_build, + parse_env_case, + parse_env_run, + parse_run_artifacts, +) from app.features.ingestion.parsers.e3sm_timing import parse_e3sm_timing from app.features.ingestion.parsers.git_info import ( parse_git_config, @@ -57,24 +61,6 @@ class FileSpec(TypedDict, total=False): FILE_SPECS: dict[str, FileSpec] = { - "e3sm_timing": { - "pattern": r"e3sm_timing\..*\..*", - "location": "root", - "parser": parse_e3sm_timing, - "required": True, - }, - "readme_case": { - "pattern": r"README\.case\..*\.gz", - "location": "casedocs", - "parser": parse_readme_case, - "required": True, - }, - "case_status": { - "pattern": r"CaseStatus\..*\.gz", - "location": "root", - "parser": parse_case_status, - 
"required": True, - }, "case_docs_env_case": { "pattern": r"env_case\.xml\..*\.gz", "location": "casedocs", @@ -85,7 +71,25 @@ class FileSpec(TypedDict, total=False): "pattern": r"env_build\.xml\..*\.gz", "location": "casedocs", "parser": parse_env_build, - "required": False, + "required": True, + }, + "case_docs_env_run": { + "pattern": r"env_run\.xml\..*", + "location": "casedocs", + "parser": parse_env_run, + "required": True, + }, + "readme_case": { + "pattern": r"README\.case\..*\.gz", + "location": "casedocs", + "parser": parse_readme_case, + "required": True, + }, + "e3sm_timing": { + "pattern": r"e3sm_timing\..*", + "location": "root", + "parser": parse_e3sm_timing, + "required": True, }, "git_describe": { "pattern": r"GIT_DESCRIBE\..*\.gz", @@ -179,7 +183,7 @@ def main_parser( continue - results[exec_dir] = _parse_all_files(metadata_files) + results[exec_dir] = _parse_all_files(exec_dir, metadata_files) if skipped_count: logger.info( @@ -394,7 +398,7 @@ def _check_missing_files(files: SimulationFiles, exp_dir: str) -> None: ) -def _parse_all_files(files: dict[str, str | None]) -> SimulationMetadata: +def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationMetadata: """Pass discovered files to their respective parser functions. 
Parameters @@ -417,7 +421,10 @@ def _parse_all_files(files: dict[str, str | None]) -> SimulationMetadata: parser: Callable = spec["parser"] metadata.update(parser(path)) + metadata.update(parse_run_artifacts(exec_dir)) + populated_fields: SimulationMetadata = { + "execution_id": metadata.get("execution_id"), "case_name": metadata.get("case_name"), "compset": metadata.get("compset"), "compset_alias": metadata.get("compset_alias"), diff --git a/backend/tests/features/ingestion/parsers/test_case_docs.py b/backend/tests/features/ingestion/parsers/test_case_docs.py index b09e140f..ed68436b 100644 --- a/backend/tests/features/ingestion/parsers/test_case_docs.py +++ b/backend/tests/features/ingestion/parsers/test_case_docs.py @@ -1,4 +1,12 @@ -from app.features.ingestion.parsers.case_docs import parse_env_build, parse_env_case +from pathlib import Path +from unittest.mock import patch + +from app.features.ingestion.parsers.case_docs import ( + parse_env_build, + parse_env_case, + parse_env_run, + parse_run_artifacts, +) class TestParseEnvCase: @@ -48,6 +56,44 @@ def test_missing_entry_returns_none(self, tmp_path): assert result["case_group"] is None + def test_campaign_and_experiment_type_are_derived(self, tmp_path): + xml_case = """ + + + + """ + tmp_case = tmp_path / "env_case_campaign.xml" + tmp_case.write_text(xml_case) + + result = parse_env_case(tmp_case) + + assert result["campaign"] == "v3.LR.historical" + assert result["experiment_type"] == "historical" + + def test_non_dot_case_name_does_not_set_campaign(self, tmp_path): + xml_case = """ + + + + """ + tmp_case = tmp_path / "env_case_simple.xml" + tmp_case.write_text(xml_case) + + result = parse_env_case(tmp_case) + + assert result["campaign"] is None + assert result["experiment_type"] is None + + def test_read_error_returns_none(self): + with patch( + "app.features.ingestion.parsers.case_docs._open_text", + side_effect=OSError("boom"), + ): + result = parse_env_case(Path("/tmp/missing.xml")) + + assert 
result["case_name"] is None + assert result["case_group"] is None + class TestParseEnvBuild: def test_value(self, tmp_path): @@ -91,3 +137,210 @@ def test_missing_entry_returns_none(self, tmp_path): assert result["compiler"] == "intel" assert result["mpilib"] is None + + +class TestParseEnvRun: + def test_branch_uses_run_refdate(self, tmp_path): + xml_run = """ + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["initialization_type"] == "branch" + assert result["simulation_start_date"] == "1990-01-01" + assert result["simulation_end_date"] is None + + def test_non_branch_uses_run_startdate(self, tmp_path): + xml_run = """ + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_start_date"] == "2001-02-03" + + def test_stop_option_ndays(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] == "2020-01-11" + + def test_stop_option_nmonths(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] == "2020-03-01" + + def test_stop_option_nyears(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] == "2023-01-01" + + def test_stop_option_date(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] == "2024-12-31" + + def test_missing_stop_n_returns_none(self, tmp_path): + xml_run = """ + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + 
tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] is None + + def test_missing_stop_date_returns_none(self, tmp_path): + xml_run = """ + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] is None + + def test_invalid_xml_returns_none_dates(self, tmp_path): + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text("branch") + + result = parse_env_run(tmp_run) + + assert result["initialization_type"] is None + assert result["simulation_start_date"] is None + assert result["simulation_end_date"] is None + + def test_invalid_dates_return_none(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_start_date"] == "invalid-date" + assert result["simulation_end_date"] is None + + def test_unknown_stop_option_returns_none(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] is None + + def test_invalid_stop_date_returns_none(self, tmp_path): + xml_run = """ + + + + + + + """ + tmp_run = tmp_path / "env_run.xml" + tmp_run.write_text(xml_run) + + result = parse_env_run(tmp_run) + + assert result["simulation_end_date"] is None + + +class TestParseRunArtifacts: + def test_completed_status_when_timing_exists(self, tmp_path): + (tmp_path / "e3sm_timing.test").write_text("timing data") + + result = parse_run_artifacts(tmp_path) + + assert result["status"] == "completed" + + def test_unknown_status_when_timing_missing(self, tmp_path): + result = parse_run_artifacts(tmp_path) + + assert result["status"] == "unknown" + + def test_unknown_status_when_directory_read_fails(self): + with patch.object(Path, "iterdir", side_effect=OSError("boom")): + result = 
parse_run_artifacts("/tmp/missing") + + assert result["status"] == "unknown" diff --git a/backend/tests/features/ingestion/parsers/test_case_status.py b/backend/tests/features/ingestion/parsers/test_case_status.py deleted file mode 100644 index c4b2cbc6..00000000 --- a/backend/tests/features/ingestion/parsers/test_case_status.py +++ /dev/null @@ -1,262 +0,0 @@ -import logging -from unittest.mock import Mock, patch - -from app.features.ingestion.parsers.case_status import ( - _calculate_simulation_end_date, - _extract_latest_run_metadata, - _update_simulation_end_date, - parse_case_status, -) -from app.features.simulation.enums import SimulationStatus - - -class TestCaseStatusParser: - def test_returns_default_result_on_read_error(self): - with ( - patch( - "app.features.ingestion.parsers.case_status._get_open_func", - return_value=Mock(side_effect=OSError("boom")), - ), - patch( - "app.features.ingestion.parsers.case_status.logger.warning" - ) as mock_warning, - ): - result = parse_case_status("/tmp/missing/casestatus.txt") - - assert result == { - "simulation_start_date": None, - "simulation_end_date": None, - "run_start_date": None, - "run_end_date": None, - "status": None, - } - mock_warning.assert_called_once() - assert "Failed to read case status file" in mock_warning.call_args.args[0] - - def test_extracts_simulation_start_and_end_date_nmonths(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE=2015-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=nmonths,STOP_N=41 \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2015-01-01" - assert result["simulation_end_date"] == "2018-06-01" # 41 months after Jan 2015 - - def test_extracts_simulation_start_and_end_date_ndays(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - 
"RUN_STARTDATE=2020-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=ndays,STOP_N=10 \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2020-01-01" - assert result["simulation_end_date"] == "2020-01-11" - - def test_extracts_simulation_start_and_end_date_nyears(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE=2000-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=nyears,STOP_N=2 \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2000-01-01" - assert result["simulation_end_date"] == "2002-01-01" - - def test_missing_fields(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE=2015-01-01 \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2015-01-01" - assert result["simulation_end_date"] is None - - def test_malformed_lines_log_warning_and_continue(self, tmp_path, caplog): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE 2015-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=ndays,STOP_N=not-a-number \n" - ) - file_path = tmp_path / "casestatus.txt" - file_path.write_text(content) - - logger = logging.getLogger("app.features.ingestion.parsers.case_status") - old_propagate = logger.propagate - old_level = logger.level - logger.propagate = False - logger.disabled = False - logger.setLevel(logging.WARNING) - - logger.addHandler(caplog.handler) - try: - result = parse_case_status(str(file_path)) - finally: - logger.removeHandler(caplog.handler) - logger.propagate = 
old_propagate - logger.setLevel(old_level) - - assert result["simulation_start_date"] is None - assert result["simulation_end_date"] is None - assert any("Malformed RUN_STARTDATE" in message for message in caplog.messages) - assert any( - "Malformed STOP_OPTION/STOP_N" in message for message in caplog.messages - ) - - def test_stop_option_without_start_date_returns_none(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=ndays,STOP_N=10 \n" - ) - file_path = tmp_path / "casestatus_missing_start.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] is None - assert result["simulation_end_date"] is None - - def test_unknown_stop_option_returns_none(self, tmp_path): - content = ( - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "RUN_STARTDATE=2020-01-01 \n" - "2025-12-18 22:36:24: xmlchange success ./xmlchange " - "STOP_OPTION=nhours,STOP_N=10 \n" - ) - file_path = tmp_path / "casestatus_unknown_stop.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["simulation_start_date"] == "2020-01-01" - assert result["simulation_end_date"] is None - - def test_extract_latest_run_metadata_sets_completed_status(self): - lines = [ - "2025-01-01 00:00:00: case.run starting 123\n", - "2025-01-01 01:00:00: case.run success\n", - ] - start_match = Mock() - start_match.group.return_value = "2025-01-01 00:00:00" - result = { - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - _extract_latest_run_metadata(lines, 0, start_match, result) - - assert result["run_start_date"] == "2025-01-01 00:00:00" - assert result["run_end_date"] == "2025-01-01 01:00:00" - assert result["status"] == SimulationStatus.COMPLETED.value - - def test_extract_latest_run_metadata_sets_failed_status(self): - lines = [ - "2025-01-01 00:00:00: case.run starting 123\n", - "2025-01-01 01:00:00: case.run error\n", - ] 
- start_match = Mock() - start_match.group.return_value = "2025-01-01 00:00:00" - result = { - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - _extract_latest_run_metadata(lines, 0, start_match, result) - - assert result["status"] == SimulationStatus.FAILED.value - - def test_extract_latest_run_metadata_sets_running_without_terminal_state(self): - lines = [ - "2025-01-01 00:00:00: case.run starting 123\n", - "noise line\n", - ] - start_match = Mock() - start_match.group.return_value = "2025-01-01 00:00:00" - result = { - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - _extract_latest_run_metadata(lines, 0, start_match, result) - - assert result["run_start_date"] == "2025-01-01 00:00:00" - assert result["run_end_date"] is None - assert result["status"] == SimulationStatus.RUNNING.value - - def test_extract_latest_run_metadata_no_start_match_no_changes(self): - result = { - "run_start_date": None, - "run_end_date": None, - "status": None, - } - - _extract_latest_run_metadata([], None, None, result) - - assert result["run_start_date"] is None - assert result["run_end_date"] is None - assert result["status"] is None - - def test_update_simulation_end_date_logs_and_returns_on_invalid_stop_n(self): - stop_match = Mock() - stop_match.group.side_effect = lambda key: { - "stop_option": "ndays", - "stop_n": "not-an-int", - }[key] - result = {"simulation_start_date": "2020-01-01", "simulation_end_date": None} - - with patch( - "app.features.ingestion.parsers.case_status.logger.warning" - ) as mock_warning: - _update_simulation_end_date( - "/tmp/casestatus.txt", - "STOP_OPTION=ndays,STOP_N=not-an-int", - stop_match, - result, - ) - - assert result["simulation_end_date"] is None - mock_warning.assert_called_once() - warning_message = mock_warning.call_args.args[0] - assert "Malformed STOP_OPTION/STOP_N line" in warning_message - - def test_calculate_simulation_end_date_invalid_start_date_returns_none(self): - assert 
_calculate_simulation_end_date("invalid-date", "ndays", 3) is None - - def test_parse_case_status_tracks_latest_case_run_start(self, tmp_path): - content = ( - "2025-12-18 22:36:24: case.run starting 111\n" - "2025-12-18 23:36:24: case.run error\n" - "2025-12-19 00:36:24: case.run starting 222\n" - "2025-12-19 01:36:24: case.run success\n" - ) - file_path = tmp_path / "casestatus_with_retries.txt" - file_path.write_text(content) - - result = parse_case_status(str(file_path)) - - assert result["run_start_time"] == "2025-12-19 00:36:24" - assert result["run_end_time"] == "2025-12-19 01:36:24" - assert result["status"] == SimulationStatus.COMPLETED.value diff --git a/backend/tests/features/ingestion/parsers/test_e3sm_timing.py b/backend/tests/features/ingestion/parsers/test_e3sm_timing.py index b014031c..ed159449 100644 --- a/backend/tests/features/ingestion/parsers/test_e3sm_timing.py +++ b/backend/tests/features/ingestion/parsers/test_e3sm_timing.py @@ -1,28 +1,23 @@ import gzip +from pathlib import Path +from unittest.mock import patch import pytest -from app.features.ingestion.parsers.e3sm_timing import parse_e3sm_timing +from app.features.ingestion.parsers.e3sm_timing import _parse_seconds, parse_e3sm_timing CONTENT_FIXTURE = ( - "Case: e3sm_v1_ne30\n" - "Machine: cori-knl\n" - "User: test_user\n" - "LID: 123456\n" - "Curr Date: Tue Jan 10 12:34:56 2023\n" - "grid: ne30_oECv3\n" - "compset: A_WCYCL1850\n" - "run length: 42 days (42.0 for ocean)\n" - "run type: branch, continue_run = TRUE (inittype = FALSE)\n" - "stop option: ndays\n" - "stop_n: 5\n" + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" ) class TestE3SMTimingParser: @pytest.fixture def sample_timing_file(self, tmp_path): - """Create a sample E3SM timing file.""" file_path = tmp_path / "e3sm_timing.txt" file_path.write_text(CONTENT_FIXTURE) @@ -30,7 +25,6 @@ def 
sample_timing_file(self, tmp_path): @pytest.fixture def sample_gz_timing_file(self, tmp_path): - """Create a sample gzipped E3SM timing file.""" file_path = tmp_path / "e3sm_timing.txt.gz" with gzip.open(file_path, "wt", encoding="utf-8") as f: f.write(CONTENT_FIXTURE) @@ -40,87 +34,157 @@ def sample_gz_timing_file(self, tmp_path): def test_parse_plain(self, sample_timing_file): data = parse_e3sm_timing(sample_timing_file) - assert data["case_name"] == "e3sm_v1_ne30" - assert data["campaign"] is None - assert data["machine"] == "cori-knl" - assert data["user"] == "test_user" - assert data["lid"] == "123456" - assert data["simulation_start_date"] == "2023-01-10T12:34:56" - assert data["grid_resolution"] == "ne30_oECv3" - assert data["compset_alias"] == "A_WCYCL1850" - assert data["initialization_type"] == "branch" - assert data["run_config"]["stop_option"] == "ndays" - assert data["run_config"]["stop_n"] == "5" - assert data["run_config"]["run_length"] == "42 days (42.0 for ocean)" + assert data["execution_id"] == "1081156.251218-200923" + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] == "2025-12-18T20:09:33" + assert "simulation_start_date" not in data def test_parse_gz(self, sample_gz_timing_file): data = parse_e3sm_timing(sample_gz_timing_file) - assert data["case_name"] == "e3sm_v1_ne30" - assert data["campaign"] is None - assert data["machine"] == "cori-knl" - assert data["user"] == "test_user" - assert data["lid"] == "123456" - assert data["simulation_start_date"] == "2023-01-10T12:34:56" - assert data["grid_resolution"] == "ne30_oECv3" - assert data["compset_alias"] == "A_WCYCL1850" - assert data["initialization_type"] == "branch" - assert data["run_config"]["stop_option"] == "ndays" - assert data["run_config"]["stop_n"] == "5" - assert data["run_config"]["run_length"] == "42 days (42.0 for ocean)" - - def test_missing_fields(self, tmp_path): - content = "Case: e3sm_v1_ne30\nMachine: cori-knl\n" - file_path = tmp_path / 
"e3sm_timing_missing.txt" + assert data["execution_id"] == "1081156.251218-200923" + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] == "2025-12-18T20:09:33" + + def test_missing_init_time_leaves_run_start_none(self, tmp_path): + content = ( + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_missing_init.txt" + file_path.write_text(content) + + data = parse_e3sm_timing(file_path) + + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] is None + + def test_missing_run_time_leaves_run_start_none(self, tmp_path): + content = ( + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : 124.909 seconds\n" + "Final Time : 0.375 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_missing_run.txt" file_path.write_text(content) + + data = parse_e3sm_timing(file_path) + + assert data["run_start_date"] is None + + def test_missing_final_time_leaves_run_start_none(self, tmp_path): + content = ( + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_missing_final.txt" + file_path.write_text(content) + data = parse_e3sm_timing(file_path) - assert data["case_name"] == "e3sm_v1_ne30" - assert data["campaign"] is None - assert data["machine"] == "cori-knl" - assert data["user"] is None - assert data["lid"] is None - assert data["simulation_start_date"] is None - assert data["grid_resolution"] is None - assert data["compset_alias"] is None - assert data["initialization_type"] is None - assert data["run_config"]["stop_option"] is None - assert data["run_config"]["stop_n"] is None - assert data["run_config"]["run_length"] is None - - def test_invalid_date(self, tmp_path): + assert data["run_start_date"] is None + + def 
test_malformed_curr_date_leaves_dates_none(self, tmp_path): content = ( - "Case: e3sm_v1_ne30\nMachine: cori-knl\nCurr Date: Invalid Date Format\n" + "LID : 1081156.251218-200923\n" + "Curr Date : Invalid Date Format\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" ) file_path = tmp_path / "e3sm_timing_invalid_date.txt" file_path.write_text(content) + data = parse_e3sm_timing(file_path) - assert data["case_name"] == "e3sm_v1_ne30" - assert data["campaign"] is None - assert data["machine"] == "cori-knl" - assert data["simulation_start_date"] == "Invalid Date Format" + assert data["run_end_date"] is None + assert data["run_start_date"] is None - def test_campaign_and_experiment_type_from_case_name(self, tmp_path): + def test_malformed_numeric_timing_values_leave_run_start_none(self, tmp_path): content = ( - "Case: v3.LR.historical_0121\n" - "Machine: cori-knl\n" - "Curr Date: Tue Jan 10 12:34:56 2023\n" + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : not-a-number\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" ) - file_path = tmp_path / "e3sm_timing_campaign.txt" + file_path = tmp_path / "e3sm_timing_invalid_numeric.txt" file_path.write_text(content) + data = parse_e3sm_timing(file_path) - assert data["campaign"] == "v3.LR.historical" - assert data["experiment_type"] == "historical" + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] is None + + def test_read_error_returns_empty_result(self): + with patch( + "app.features.ingestion.parsers.e3sm_timing._open_text", + side_effect=OSError("boom"), + ): + data = parse_e3sm_timing(Path("/tmp/missing.txt")) + + assert data == { + "execution_id": None, + "run_start_date": None, + "run_end_date": None, + } - def test_stop_option_and_stop_n_same_line(self, tmp_path): + def test_missing_curr_date_keeps_run_dates_none(self, tmp_path): content = ( - "Case: 
e3sm_v1_ne30\nMachine: cori-knl\nstop option: ndays, stop_n=7\n" + "LID : 1081156.251218-200923\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" ) - file_path = tmp_path / "e3sm_timing_stop_line.txt" + file_path = tmp_path / "e3sm_timing_missing_curr_date.txt" file_path.write_text(content) + data = parse_e3sm_timing(file_path) - assert data["run_config"]["stop_option"] == "ndays" - assert data["run_config"]["stop_n"] == "7" + assert data["execution_id"] == "1081156.251218-200923" + assert data["run_end_date"] is None + assert data["run_start_date"] is None + + def test_missing_lid_returns_none_execution_id(self, tmp_path): + content = ( + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : 124.909 seconds\n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_missing_lid.txt" + file_path.write_text(content) + + data = parse_e3sm_timing(file_path) + + assert data["execution_id"] is None + assert data["run_end_date"] == "2025-12-18T20:54:58" + + def test_empty_numeric_field_leaves_run_start_none(self, tmp_path): + content = ( + "LID : 1081156.251218-200923\n" + "Curr Date : Thu Dec 18 20:54:58 2025\n" + "Init Time : \n" + "Run Time : 2599.194 seconds\n" + "Final Time : 0.375 seconds\n" + ) + file_path = tmp_path / "e3sm_timing_empty_numeric.txt" + file_path.write_text(content) + + data = parse_e3sm_timing(file_path) + + assert data["run_end_date"] == "2025-12-18T20:54:58" + assert data["run_start_date"] is None + + def test_parse_seconds_returns_none_on_float_value_error(self): + with patch( + "app.features.ingestion.parsers.e3sm_timing.float", + side_effect=ValueError("boom"), + create=True, + ): + assert _parse_seconds("12.5 seconds") is None diff --git a/backend/tests/features/ingestion/parsers/test_parser.py b/backend/tests/features/ingestion/parsers/test_parser.py index 3266dc19..9bb707e3 100644 --- 
a/backend/tests/features/ingestion/parsers/test_parser.py +++ b/backend/tests/features/ingestion/parsers/test_parser.py @@ -17,53 +17,39 @@ class TestMainParser: @staticmethod - def _create_execution_metadata_files(execution_dir: Path, version: str) -> None: - """Create standard execution files for testing. - - Parameters - ---------- - execution_dir : Path - Directory where execution files will be created. - version : str - Version string for file naming (e.g., "001.001"). - """ - # Create required e3sm_timing file - timing_file = execution_dir / f"e3sm_timing.{version}" - timing_file.write_text("timing data") - - # Create required CaseStatus file - with gzip.open( - execution_dir / f"CaseStatus.{version.split('.')[0]}.gz", "wt" - ) as f: - f.write("case status") - - # Create required CaseDocs/README and env_case.xml + def _create_execution_metadata_files( + execution_dir: Path, + version: str, + *, + include_env_run: bool = True, + include_timing: bool = True, + ) -> None: + """Create standard execution files for testing.""" + version_base = version.split(".")[0] + + if include_timing: + timing_file = execution_dir / f"e3sm_timing.{version}" + timing_file.write_text("timing data") + casedocs = execution_dir / "CaseDocs" casedocs.mkdir(exist_ok=True) - with gzip.open(casedocs / f"README.case.{version.split('.')[0]}.gz", "wt") as f: + + with gzip.open(casedocs / f"README.case.{version_base}.gz", "wt") as f: f.write("readme content") - with gzip.open( - casedocs / f"env_case.xml.{version.split('.')[0]}.gz", "wt" - ) as f: - f.write('') - - # Create required GIT_DESCRIBE file - with gzip.open( - execution_dir / f"GIT_DESCRIBE.{version.split('.')[0]}.gz", "wt" - ) as f: + with gzip.open(casedocs / f"env_case.xml.{version_base}.gz", "wt") as f: + f.write('') + with gzip.open(casedocs / f"env_build.xml.{version_base}.gz", "wt") as f: + f.write('') + + if include_env_run: + with gzip.open(casedocs / f"env_run.xml.{version_base}.gz", "wt") as f: + f.write('') + + with 
gzip.open(execution_dir / f"GIT_DESCRIBE.{version_base}.gz", "wt") as f: f.write("describe content") @staticmethod def _create_optional_files(execution_dir: Path, version: str) -> None: - """Create optional git configuration files for testing. - - Parameters - ---------- - execution_dir : Path - Directory where git files will be created. - version : str - Version string for file naming (e.g., "001"). - """ version_base = version.split(".")[0] if "." in version else version with gzip.open(execution_dir / f"GIT_CONFIG.{version_base}.gz", "wt") as f: @@ -73,73 +59,45 @@ def _create_optional_files(execution_dir: Path, version: str) -> None: @staticmethod def _create_zip_archive(base_dir: Path, archive_path: Path) -> None: - """Create a ZIP archive from directory contents. - - Parameters - ---------- - base_dir : Path - Directory relative to which paths are calculated. - archive_path : Path - Path where ZIP archive will be created. - """ zip_file = zipfile.ZipFile(archive_path, "w") for root, _dirs, files_list in os.walk(str(base_dir)): for file in files_list: file_path = Path(root) / file arcname = str(file_path.relative_to(str(base_dir))) - zip_file.write(file_path, arcname) zip_file.close() @staticmethod def _create_tar_gz_archive(base_dir: Path, archive_path: Path) -> None: - """Create a TAR.GZ archive from directory contents. - - Parameters - ---------- - base_dir : Path - Directory relative to which paths are calculated. - archive_path : Path - Path where TAR.GZ archive will be created. - """ tar_file = tarfile.open(archive_path, "w:gz") for root, _dirs, files_list in os.walk(str(base_dir)): for file in files_list: file_path = Path(root) / file arcname = str(file_path.relative_to(str(base_dir))) - tar_file.add(file_path, arcname=arcname) tar_file.close() @contextmanager def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: - """Context manager to mock all parser functions with sensible defaults. 
- - Parameters - ---------- - **kwargs : dict - Override default return values for specific parsers: - parse_e3sm_timing, parse_readme_case, parse_case_status, - parse_git_describe, parse_git_config, parse_git_status. - - Yields - ------ - None - Context manager available for use in with statement. - """ - # Set up default return values defaults = { - "parse_e3sm_timing": { + "parse_env_case": { "case_name": "test_case", "campaign": "test", "machine": "test", }, + "parse_env_build": {"compiler": "gnu"}, + "parse_env_run": {"simulation_start_date": "2020-01-01"}, "parse_readme_case": {}, - "parse_case_status": {}, + "parse_e3sm_timing": { + "execution_id": "1081156.251218-200923", + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": "2025-12-18T20:54:58", + }, + "parse_run_artifacts": {"status": "completed"}, "parse_git_describe": {}, "parse_git_config": None, "parse_git_status": None, @@ -147,182 +105,160 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: defaults.update(kwargs) with ( - patch("app.features.ingestion.parsers.parser.parse_e3sm_timing") as m1, - patch("app.features.ingestion.parsers.parser.parse_readme_case") as m2, - patch("app.features.ingestion.parsers.parser.parse_case_status") as m3, - patch("app.features.ingestion.parsers.parser.parse_git_describe") as m4, - patch("app.features.ingestion.parsers.parser.parse_git_config") as m5, - patch("app.features.ingestion.parsers.parser.parse_git_status") as m6, + patch("app.features.ingestion.parsers.parser.parse_env_case") as m1, + patch("app.features.ingestion.parsers.parser.parse_env_build") as m2, + patch("app.features.ingestion.parsers.parser.parse_env_run") as m3, + patch("app.features.ingestion.parsers.parser.parse_readme_case") as m4, + patch("app.features.ingestion.parsers.parser.parse_e3sm_timing") as m5, + patch("app.features.ingestion.parsers.parser.parse_run_artifacts") as m6, + patch("app.features.ingestion.parsers.parser.parse_git_describe") as m7, + 
patch("app.features.ingestion.parsers.parser.parse_git_config") as m8, + patch("app.features.ingestion.parsers.parser.parse_git_status") as m9, ): - m1.return_value = defaults["parse_e3sm_timing"] - m2.return_value = defaults["parse_readme_case"] - m3.return_value = defaults["parse_case_status"] - m4.return_value = defaults["parse_git_describe"] - m5.return_value = defaults["parse_git_config"] - m6.return_value = defaults["parse_git_status"] + m1.return_value = defaults["parse_env_case"] + m2.return_value = defaults["parse_env_build"] + m3.return_value = defaults["parse_env_run"] + m4.return_value = defaults["parse_readme_case"] + m5.return_value = defaults["parse_e3sm_timing"] + m6.return_value = defaults["parse_run_artifacts"] + m7.return_value = defaults["parse_git_describe"] + m8.return_value = defaults["parse_git_config"] + m9.return_value = defaults["parse_git_status"] yield + def test_file_specs_remove_case_status_and_add_env_run(self) -> None: + assert "case_status" not in parser.FILE_SPECS + assert "case_docs_env_run" in parser.FILE_SPECS + assert "e3sm_timing" in parser.FILE_SPECS + assert parser.FILE_SPECS["case_docs_env_run"]["required"] is True + assert parser.FILE_SPECS["e3sm_timing"]["required"] is True + def test_with_valid_zip_archive(self, tmp_path: Path) -> None: - """Test processing a valid ZIP archive with executions.""" - # Create execution directory structure archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "1.0-0" execution_dir.mkdir(parents=True) - - # Create standard required files self._create_execution_metadata_files(execution_dir, "001.001") - # Create ZIP archive archive_path = tmp_path / "archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parser functions and verify execution was found and parsed with self._mock_all_parsers(): result, skipped = parser.main_parser(archive_path, extract_dir) - assert len(result) > 0 - assert 
skipped == 0 - assert any("1.0-0" in key for key in result.keys()) + + assert len(result) > 0 + assert skipped == 0 + assert any("1.0-0" in key for key in result.keys()) def test_with_tar_gz_archive(self, tmp_path: Path) -> None: - """Test processing a TAR.GZ archive.""" - # Create execution directory archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "2.5-10" execution_dir.mkdir(parents=True) - - # Create standard required files self._create_execution_metadata_files(execution_dir, "002.002") - # Create TAR.GZ archive archive_path = tmp_path / "archive.tar.gz" self._create_tar_gz_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parser functions and verify execution was found and parsed - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "tar_test"}): + with self._mock_all_parsers(): result, skipped = parser.main_parser(archive_path, extract_dir) - assert len(result) > 0 - assert skipped == 0 - assert any("2.5-10" in key for key in result.keys()) + + assert len(result) > 0 + assert skipped == 0 + assert any("2.5-10" in key for key in result.keys()) def test_with_multiple_executions(self, tmp_path: Path) -> None: - """Test processing archive with multiple executions.""" archive_base = tmp_path / "archive_extract" archive_base.mkdir() - # Create first execution exec_dir1 = archive_base / "1.0-0" exec_dir1.mkdir(parents=True) self._create_execution_metadata_files(exec_dir1, "001.001") - # Create second execution exec_dir2 = archive_base / "2.0-0" exec_dir2.mkdir(parents=True) self._create_execution_metadata_files(exec_dir2, "002.002") - # Create ZIP archive archive_path = tmp_path / "multi_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parser functions and verify both executions were found - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "test"}): + with self._mock_all_parsers(): result, 
skipped = parser.main_parser(archive_path, extract_dir) - assert len(result) == 2 - assert skipped == 0 - assert any("1.0-0" in key for key in result.keys()) - assert any("2.0-0" in key for key in result.keys()) - def test_with_nested_executions(self, tmp_path: Path) -> None: - """Test finding executions in nested directories. + assert len(result) == 2 + assert skipped == 0 - Parameters - ---------- - tmp_path : Path - Temporary directory provided by pytest. - """ + def test_with_nested_executions(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "parent" / "1.0-0" execution_dir.mkdir(parents=True) - - # Create standard required files self._create_execution_metadata_files(execution_dir, "001.001") - # Create ZIP archive archive_path = tmp_path / "nested_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parser functions and verify nested execution was found - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "nested_test"}): + with self._mock_all_parsers(): result, skipped = parser.main_parser(archive_path, extract_dir) - assert len(result) > 0 - assert skipped == 0 - assert any("1.0-0" in key for key in result.keys()) + + assert len(result) > 0 + assert skipped == 0 def test_missing_required_files_skips_incomplete_run(self, tmp_path: Path) -> None: - """Test that incomplete runs (missing required files) are skipped.""" - # Create execution directory WITHOUT required files archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "1.0-0" execution_dir.mkdir(parents=True) (execution_dir / "dummy.txt").write_text("dummy") - # Create ZIP archive archive_path = tmp_path / "bad_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Incomplete runs are skipped; result is empty rather than an error result, skipped = 
parser.main_parser(archive_path, extract_dir) + assert result == {} assert skipped == 1 - def test_multiple_matching_files_raises_error(self, tmp_path: Path) -> None: - """Test error when multiple files match a pattern.""" - # Create execution with duplicate files + def test_multiple_matching_timing_files_raises_error(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "1.0-0" execution_dir.mkdir(parents=True) - # Create duplicate e3sm_timing files (violates single-file requirement) (execution_dir / "e3sm_timing.001.001").write_text("timing1") (execution_dir / "e3sm_timing.002.002").write_text("timing2") - # Create other required files - with gzip.open(execution_dir / "CaseStatus.001.gz", "wt") as f: - f.write("status") casedocs = execution_dir / "CaseDocs" casedocs.mkdir() with gzip.open(casedocs / "README.case.001.gz", "wt") as f: f.write("readme") + with gzip.open(casedocs / "env_case.xml.001.gz", "wt") as f: + f.write('') + with gzip.open(casedocs / "env_build.xml.001.gz", "wt") as f: + f.write('') with gzip.open(execution_dir / "GIT_DESCRIBE.001.gz", "wt") as f: f.write("describe") - # Create ZIP archive archive_path = tmp_path / "duplicate_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Should raise ValueError for multiple matching files with pytest.raises(ValueError, match="Multiple files matching pattern"): parser.main_parser(archive_path, extract_dir) def test_unsupported_archive_format_raises_error(self, tmp_path: Path) -> None: - """Test error for unsupported archive formats.""" archive_path = tmp_path / "archive.rar" archive_path.write_text("not a real archive") @@ -333,66 +269,91 @@ def test_unsupported_archive_format_raises_error(self, tmp_path: Path) -> None: parser.main_parser(str(archive_path), extract_dir) def test_no_case_directories_raises_error(self, tmp_path: Path) -> None: - """Test error when no case 
directories are found. - - Parameters - ---------- - tmp_path : Path - Temporary directory provided by pytest. - - Raises - ------ - FileNotFoundError - Expected when no valid case directories are discovered. - """ - # Create a ZIP archive with no case directories archive_base = tmp_path / "archive_extract" archive_base.mkdir() (archive_base / "some_other_dir").mkdir() (archive_base / "some_other_dir" / "file.txt").write_text("data") - # Create ZIP archive archive_path = tmp_path / "empty_archive.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Should raise FileNotFoundError when no case directories found with pytest.raises( FileNotFoundError, match="No cases or execution directories found" ): parser.main_parser(archive_path, extract_dir) - def test_with_optional_files(self, tmp_path): - """Test handling optional files properly.""" - # Create execution with optional files + def test_with_optional_files(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" execution_dir = archive_base / "1.0-0" execution_dir.mkdir(parents=True) - - # Create required files self._create_execution_metadata_files(execution_dir, "001.001") - - # Create optional git files self._create_optional_files(execution_dir, "001") - # Create ZIP archive archive_path = tmp_path / "with_optional.zip" self._create_zip_archive(archive_base, archive_path) extract_dir = tmp_path / "extracted" extract_dir.mkdir() - # Mock parsers with optional file returns with self._mock_all_parsers( - parse_git_config="https://github.com/test/repo", parse_git_status="main" + parse_git_config="https://github.com/test/repo", + parse_git_status="main", ): - result = parser.main_parser(archive_path, extract_dir) - assert len(result) > 0 + result, skipped = parser.main_parser(archive_path, extract_dir) + + assert len(result) > 0 + assert skipped == 0 + + def test_missing_env_run_skips_incomplete_run(self, tmp_path: Path) -> None: + 
archive_base = tmp_path / "archive_extract" + execution_dir = archive_base / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files( + execution_dir, + "001.001", + include_env_run=False, + ) + + archive_path = tmp_path / "missing_env_run.zip" + self._create_zip_archive(archive_base, archive_path) + + extract_dir = tmp_path / "extracted" + extract_dir.mkdir() + + result, skipped = parser.main_parser(archive_path, extract_dir) + + assert result == {} + assert skipped == 1 + + def test_completed_status_is_merged(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(execution_dir, "001.001") + + with self._mock_all_parsers(parse_run_artifacts={"status": "completed"}): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert next(iter(result.values()))["status"] == "completed" + + def test_missing_timing_skips_incomplete_run(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files( + execution_dir, + "001.001", + include_timing=False, + ) + + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert result == {} + assert skipped == 1 def test_zip_path_traversal_rejected(self, tmp_path: Path) -> None: - """Test that ZIP extraction rejects path traversal entries.""" archive_path = tmp_path / "traversal.zip" with zipfile.ZipFile(archive_path, "w") as zip_ref: zip_ref.writestr("../evil.txt", "data") @@ -404,7 +365,6 @@ def test_zip_path_traversal_rejected(self, tmp_path: Path) -> None: parser._extract_zip(str(archive_path), str(extract_dir)) def test_tar_path_traversal_rejected(self, tmp_path: Path) -> None: - """Test that TAR.GZ extraction rejects path traversal entries.""" archive_path = tmp_path / "traversal.tar.gz" with tarfile.open(archive_path, "w:gz") as tar_ref: payload = 
io.BytesIO(b"data") @@ -419,7 +379,6 @@ def test_tar_path_traversal_rejected(self, tmp_path: Path) -> None: parser._extract_tar_gz(str(archive_path), str(extract_dir)) def test_tar_symlink_rejected(self, tmp_path: Path) -> None: - """Test that TAR.GZ extraction rejects symlink entries.""" archive_path = tmp_path / "symlink.tar.gz" with tarfile.open(archive_path, "w:gz") as tar_ref: tar_info = tarfile.TarInfo(name="link") @@ -436,7 +395,6 @@ def test_tar_symlink_rejected(self, tmp_path: Path) -> None: def test_main_parser_treats_directory_as_already_extracted( self, tmp_path: Path ) -> None: - """Test non-archive directory input path branch in main_parser.""" execution_dir = tmp_path / "1.0-0" execution_dir.mkdir(parents=True) self._create_execution_metadata_files(execution_dir, "001.001") @@ -445,23 +403,19 @@ def test_main_parser_treats_directory_as_already_extracted( result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") assert len(result) == 1 - assert any("1.0-0" in key for key in result) + assert skipped == 0 def test_extract_archive_unsupported_format_raises_error(self) -> None: - """Test _extract_archive direct unsupported-extension branch.""" with pytest.raises(ValueError, match="Unsupported archive format"): parser._extract_archive("/tmp/archive.7z", "/tmp/output") def test_incomplete_run_among_valid_runs(self, tmp_path: Path) -> None: - """Test that an incomplete run is skipped while valid runs are parsed.""" archive_base = tmp_path / "archive_extract" - # Create a valid execution directory execution_dir_valid = archive_base / "1.0-0" execution_dir_valid.mkdir(parents=True) self._create_execution_metadata_files(execution_dir_valid, "001.001") - # Create an incomplete execution directory (missing required files) execution_dir_incomplete = archive_base / "2.0-0" execution_dir_incomplete.mkdir(parents=True) (execution_dir_incomplete / "dummy.txt").write_text("no required files") @@ -472,21 +426,13 @@ def 
test_incomplete_run_among_valid_runs(self, tmp_path: Path) -> None: extract_dir = tmp_path / "extracted" extract_dir.mkdir() - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "mixed_test"}): + with self._mock_all_parsers(): result, skipped = parser.main_parser(archive_path, extract_dir) - # Only the valid execution should be in the result - assert len(result) == 1 - assert any("1.0-0" in key for key in result.keys()) - assert not any("2.0-0" in key for key in result.keys()) - def test_multiple_runs_under_same_casename(self, tmp_path: Path) -> None: - """Test multiple . dirs under the same casename. + assert len(result) == 1 + assert skipped == 1 - Simulates the performance_archive structure where a casename - directory contains multiple runs, each represented by a - subdirectory matching the . pattern. - """ - # Simulate: casename/1081156.251218-200923, casename/1081290.251218-211543 + def test_multiple_runs_under_same_casename(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" casename_dir = archive_base / "v3.LR.historical_0121" casename_dir.mkdir(parents=True) @@ -499,20 +445,17 @@ def test_multiple_runs_under_same_casename(self, tmp_path: Path) -> None: run2.mkdir() self._create_execution_metadata_files(run2, "002.002") - with self._mock_all_parsers(parse_e3sm_timing={"case_name": "v3_hist"}): + with self._mock_all_parsers(): result, skipped = parser.main_parser(casename_dir, tmp_path / "out") - # Both runs should be parsed successfully - assert len(result) == 2 - assert any("1081156" in key for key in result.keys()) - assert any("1081290" in key for key in result.keys()) + + assert len(result) == 2 + assert skipped == 0 def test_deterministic_sort_order(self, tmp_path: Path) -> None: - """Test that execution dirs are sorted for deterministic processing.""" archive_base = tmp_path / "archive_extract" casename_dir = archive_base / "case1" casename_dir.mkdir(parents=True) - # Create dirs in reverse order for name in ["3.0-0", 
"1.0-0", "2.0-0"]: d = casename_dir / name d.mkdir() @@ -534,5 +477,4 @@ def tracking_locate(execution_dir: str) -> Any: ): parser.main_parser(casename_dir, tmp_path / "out") - # Should be processed in sorted order assert call_order == sorted(call_order) diff --git a/backend/tests/features/ingestion/parsers/test_utils.py b/backend/tests/features/ingestion/parsers/test_utils.py new file mode 100644 index 00000000..9a7f2d47 --- /dev/null +++ b/backend/tests/features/ingestion/parsers/test_utils.py @@ -0,0 +1,24 @@ +import gzip + +from app.features.ingestion.parsers.utils import _get_open_func, _open_text + + +class TestParserUtils: + def test_open_text_reads_plain_file(self, tmp_path): + file_path = tmp_path / "plain.txt" + file_path.write_text("plain text") + + assert _open_text(file_path) == "plain text" + + def test_open_text_reads_gz_file(self, tmp_path): + file_path = tmp_path / "plain.txt.gz" + with gzip.open(file_path, "wt", encoding="utf-8") as f: + f.write("gz text") + + assert _open_text(file_path) == "gz text" + + def test_get_open_func_returns_gzip_open_for_gz(self): + assert _get_open_func("file.txt.gz") is gzip.open + + def test_get_open_func_returns_open_for_plain_file(self): + assert _get_open_func("file.txt") is open From 84774992a67bcf770746a90724895451dbe403cd Mon Sep 17 00:00:00 2001 From: Vo Date: Fri, 13 Mar 2026 17:15:28 -0700 Subject: [PATCH 02/10] Add NERSC archive ingestor coverage tests --- .../app/features/ingestion/parsers/parser.py | 11 +- .../ingestion/test_nersc_archive_ingestor.py | 679 ++++++++++++++++++ 2 files changed, 682 insertions(+), 8 deletions(-) diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index 51a9d178..db871ee4 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -423,7 +423,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM 
metadata.update(parse_run_artifacts(exec_dir)) - populated_fields: SimulationMetadata = { + simulation: SimulationMetadata = { "execution_id": metadata.get("execution_id"), "case_name": metadata.get("case_name"), "compset": metadata.get("compset"), @@ -439,16 +439,13 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM "run_start_date": metadata.get("run_start_date"), "run_end_date": metadata.get("run_end_date"), "compiler": metadata.get("compiler"), + "machine": metadata.get("machine"), + "hpc_username": metadata.get("user"), "git_repository_url": metadata.get("git_repository_url"), "git_branch": metadata.get("git_branch"), "git_tag": metadata.get("git_tag"), "git_commit_hash": metadata.get("git_commit_hash"), - "machine": metadata.get("machine"), - "hpc_username": metadata.get("user"), "status": metadata.get("status", SimulationStatus.UNKNOWN.value), - } - - placeholder_fields: SimulationMetadata = { # TODO: Skip this for MVP, not required. We can add it later if we find # a way to determine it from the parsed files. 
"parent_simulation_id": None, @@ -461,6 +458,4 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM "links": None, } - simulation: SimulationMetadata = {**populated_fields, **placeholder_fields} - return simulation diff --git a/backend/tests/features/ingestion/test_nersc_archive_ingestor.py b/backend/tests/features/ingestion/test_nersc_archive_ingestor.py index cb0772d1..0a9385c0 100644 --- a/backend/tests/features/ingestion/test_nersc_archive_ingestor.py +++ b/backend/tests/features/ingestion/test_nersc_archive_ingestor.py @@ -1,8 +1,17 @@ """Tests for the NERSC archive ingestion runner script.""" +import json +import logging +import runpy +import urllib.error +import urllib.request +from email.message import Message from pathlib import Path from typing import Any +import pytest + +from app.api.version import API_BASE from app.scripts.ingestion import nersc_archive_ingestor as ingestor_module from app.scripts.ingestion.nersc_archive_ingestor import ( CaseScanResult, @@ -11,12 +20,26 @@ IngestionRequestResponse, IngestorConfig, _build_case_scan_results, + _build_config_from_env, _build_ingestion_candidates, + _case_state_processed_ids, _discover_case_executions, _fresh_state, _ingest_case_with_retries, + _is_transient_status, + _load_state, + _log_execution_skip_detail, + _log_startup_configuration, + _log_summary_table, + _normalized_api_base_url, + _parse_bool, + _parse_optional_int, + _post_ingestion_request, _record_successful_case, + _render_log_value, _run_ingestor, + _save_state, + _validate_execution_dir, ) @@ -70,6 +93,40 @@ def fake_locator(execution_dir: str) -> dict[str, str]: assert stats["skipped_invalid"] == 1 +def test_discover_case_executions_logs_suppressed_skips( + tmp_path: Path, + monkeypatch, +) -> None: + archive_root = tmp_path / "archive" + for index in range(ingestor_module.MAX_SKIP_DETAIL_LOGS + 2): + (archive_root / "case_a" / f"{100 + index}.1-1").mkdir(parents=True) + + logged_events: list[tuple[str, 
dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + grouped = _discover_case_executions( + archive_root, + metadata_locator=lambda *_: (_ for _ in ()).throw(FileNotFoundError("missing")), + ) + + assert grouped == {} + suppression = [ + fields + for event, fields in logged_events + if event == "execution_skip_logs_suppressed" + ] + assert suppression == [ + { + "suppressed_count": 2, + "detail_log_limit": ingestor_module.MAX_SKIP_DETAIL_LOGS, + } + ] + + def test_build_ingestion_candidates_is_idempotent() -> None: scan_results = [ CaseScanResult( @@ -114,6 +171,104 @@ def test_build_ingestion_candidates_is_idempotent() -> None: assert third_candidates[0].new_execution_ids == ["101.1-1"] +def test_build_ingestion_candidates_handles_non_dict_case_state_and_limit() -> None: + scan_results = [ + CaseScanResult( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + fingerprint="fp-1", + ), + CaseScanResult( + case_path="/performance_archive/case_b", + execution_ids=["200.1-1"], + fingerprint="fp-2", + ), + ] + state = { + "cases": { + "/performance_archive/case_a": "invalid", + "/performance_archive/case_b": { + "processed_execution_ids": ["200.1-1"], + }, + } + } + + candidates = _build_ingestion_candidates( + scan_results, + state, + max_cases_per_run=1, + ) + + assert len(candidates) == 1 + assert candidates[0].case_path == "/performance_archive/case_a" + + +def test_build_ingestion_candidates_handles_non_dict_cases_root() -> None: + scan_results = [ + CaseScanResult( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + ] + + candidates = _build_ingestion_candidates( + scan_results, + state={"cases": "invalid"}, + max_cases_per_run=None, + ) + + assert len(candidates) == 1 + assert 
candidates[0].new_execution_ids == ["100.1-1"] + + +def test_validate_execution_dir_counts_incomplete_with_stats(tmp_path: Path) -> None: + case_dir = tmp_path / "case_a" + (case_dir / "100.1-1").mkdir(parents=True) + stats = ingestor_module._new_discovery_stats() + skip_log_state = {"logged": 0, "suppressed": 0} + + valid = _validate_execution_dir( + case_dir, + "100.1-1", + metadata_locator=lambda *_: (_ for _ in ()).throw(FileNotFoundError("missing")), + stats=stats, + skip_log_state=skip_log_state, + ) + + assert valid is False + assert stats["skipped_incomplete"] == 1 + + +def test_validate_execution_dir_counts_value_error_with_stats(tmp_path: Path) -> None: + case_dir = tmp_path / "case_a" + (case_dir / "100.1-1").mkdir(parents=True) + stats = ingestor_module._new_discovery_stats() + skip_log_state = {"logged": 0, "suppressed": 0} + + valid = _validate_execution_dir( + case_dir, + "100.1-1", + metadata_locator=lambda *_: (_ for _ in ()).throw(ValueError("invalid")), + stats=stats, + skip_log_state=skip_log_state, + ) + + assert valid is False + assert stats["skipped_invalid"] == 1 + + +def test_build_case_scan_results_skips_empty_execution_lists() -> None: + grouped = { + "/performance_archive/case_a": [], + "/performance_archive/case_b": ["200.1-1"], + } + + results = _build_case_scan_results(grouped) + + assert [result.case_path for result in results] == ["/performance_archive/case_b"] + + def test_ingest_case_with_retries_retries_transient_errors() -> None: candidate = IngestionCandidate( case_path="/performance_archive/case_a", @@ -259,6 +414,46 @@ def fake_post_request( assert str(case_dir.resolve()) in reloaded_state["cases"] +def test_handle_ingest_run_returns_failure_when_case_ingestion_fails( + tmp_path: Path, + monkeypatch, +) -> None: + archive_root = tmp_path / "archive" + case_dir = archive_root / "case_a" + (case_dir / "100.1-1").mkdir(parents=True) + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: 
dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + config = IngestorConfig( + api_base_url="http://backend:8000", + api_token="token", + archive_root=archive_root, + machine_name="perlmutter", + state_path=tmp_path / "state.json", + dry_run=False, + max_cases_per_run=None, + max_attempts=1, + request_timeout_seconds=30, + ) + + def fake_post_request(*args: Any, **kwargs: Any) -> IngestionRequestResponse: + raise IngestionRequestError("boom", status_code=503, transient=False) + + exit_code = _run_ingestor( + config, + metadata_locator=lambda *_: {}, + sleep_fn=lambda *_: None, + post_request_fn=fake_post_request, + ) + + assert exit_code == 1 + assert any(event == "case_ingestion_failed" for event, _ in logged_events) + + def test_build_case_scan_results_is_deterministic() -> None: grouped = { "/performance_archive/case_b": ["200.1-1"], @@ -490,3 +685,487 @@ def fake_ingest_post_request(*args: Any, **kwargs: Any) -> IngestionRequestRespo assert isinstance(payload["execution_dirs_accepted"], int) assert isinstance(payload["skipped_incomplete"], int) assert isinstance(payload["skipped_invalid"], int) + + +def test_build_config_from_env_parses_valid_values(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("SIMBOARD_API_BASE_URL", "http://example") + monkeypatch.setenv("SIMBOARD_API_TOKEN", "token") + monkeypatch.setenv("PERF_ARCHIVE_ROOT", str(tmp_path / "archive")) + monkeypatch.setenv("MACHINE_NAME", "pm") + monkeypatch.setenv("STATE_PATH", str(tmp_path / "state.json")) + monkeypatch.setenv("DRY_RUN", "true") + monkeypatch.setenv("MAX_CASES_PER_RUN", "5") + monkeypatch.setenv("MAX_ATTEMPTS", "4") + monkeypatch.setenv("REQUEST_TIMEOUT_SECONDS", "90") + + config = _build_config_from_env() + + assert config.api_base_url == "http://example" + assert config.api_token == "token" + assert config.archive_root == (tmp_path / 
"archive").resolve() + assert config.machine_name == "pm" + assert config.state_path == (tmp_path / "state.json").resolve() + assert config.dry_run is True + assert config.max_cases_per_run == 5 + assert config.max_attempts == 4 + assert config.request_timeout_seconds == 90 + + +@pytest.mark.parametrize( + ("env_name", "env_value", "message"), + [ + ("MAX_CASES_PER_RUN", "0", "MAX_CASES_PER_RUN must be greater than 0"), + ("MAX_ATTEMPTS", "0", "MAX_ATTEMPTS must be greater than 0"), + ( + "REQUEST_TIMEOUT_SECONDS", + "0", + "REQUEST_TIMEOUT_SECONDS must be greater than 0", + ), + ], +) +def test_build_config_from_env_rejects_invalid_positive_values( + monkeypatch, + env_name: str, + env_value: str, + message: str, +) -> None: + monkeypatch.setenv(env_name, env_value) + + with pytest.raises(ValueError, match=message): + _build_config_from_env() + + +def test_main_returns_configuration_error_when_config_build_fails(monkeypatch) -> None: + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr( + ingestor_module, + "_build_config_from_env", + lambda: (_ for _ in ()).throw(ValueError("bad config")), + ) + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + exit_code = ingestor_module.main() + + assert exit_code == 1 + assert logged_events == [("configuration_error", {"error": "bad config"})] + + +def test_main_logs_run_started_and_finished(monkeypatch, tmp_path: Path) -> None: + config = IngestorConfig( + api_base_url="http://backend:8000", + api_token="token", + archive_root=tmp_path, + machine_name="perlmutter", + state_path=tmp_path / "state.json", + dry_run=False, + max_cases_per_run=None, + max_attempts=1, + request_timeout_seconds=30, + ) + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + 
logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_build_config_from_env", lambda: config) + monkeypatch.setattr(ingestor_module, "_run_ingestor", lambda cfg: 0) + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + monkeypatch.setattr( + ingestor_module.time, "monotonic", lambda: 10.0 if not logged_events else 12.5 + ) + + exit_code = ingestor_module.main() + + assert exit_code == 0 + assert logged_events[0][0] == "run_started" + assert logged_events[-1][0] == "run_finished" + + +def test_module_main_guard_exits_via_system_exit_on_configuration_error( + monkeypatch, +) -> None: + script_path = ( + Path(__file__).resolve().parents[3] + / "app/scripts/ingestion/nersc_archive_ingestor.py" + ) + monkeypatch.setenv("MAX_ATTEMPTS", "0") + monkeypatch.setattr(logging.Logger, "info", lambda *args, **kwargs: None) + + with pytest.raises(SystemExit) as exc_info: + runpy.run_path(str(script_path), run_name="__main__") + + assert exc_info.value.code == 1 + + +@pytest.mark.parametrize( + ("value", "default", "expected"), + [ + (None, True, True), + ("yes", False, True), + ("off", True, False), + ("maybe", True, True), + ], +) +def test_parse_bool(value: str | None, default: bool, expected: bool) -> None: + assert _parse_bool(value, default=default) is expected + + +def test_parse_optional_int_handles_none_and_blank() -> None: + assert _parse_optional_int(None) is None + assert _parse_optional_int(" ") is None + assert _parse_optional_int("7") == 7 + + +def test_ingest_case_with_retries_uses_default_post_request_fn(monkeypatch) -> None: + candidate = IngestionCandidate( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + new_execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + captured: list[tuple[str, str, str, str, int]] = [] + + def fake_post( + endpoint_url: str, + api_token: str, + archive_path: str, + machine_name: str, + *, + timeout_seconds: int, + ) -> 
IngestionRequestResponse: + captured.append( + (endpoint_url, api_token, archive_path, machine_name, timeout_seconds) + ) + return {"status_code": 201, "body": {"created_count": 1}} + + monkeypatch.setattr(ingestor_module, "_post_ingestion_request", fake_post) + + result = _ingest_case_with_retries( + candidate, + endpoint_url="http://backend:8000/api/v1/ingestions/from-path", + api_token="token", + machine_name="pm", + max_attempts=1, + timeout_seconds=5, + sleep_fn=lambda *_: None, + ) + + assert result["ok"] is True + assert captured == [ + ( + "http://backend:8000/api/v1/ingestions/from-path", + "token", + "/performance_archive/case_a", + "pm", + 5, + ) + ] + + +def test_ingest_case_with_retries_normalizes_non_dict_body() -> None: + candidate = IngestionCandidate( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + new_execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + + def fake_post_request(*args: Any, **kwargs: Any) -> IngestionRequestResponse: + return {"status_code": 201, "body": "bad"} # type: ignore[typeddict-item] + + result = _ingest_case_with_retries( + candidate, + endpoint_url="http://backend:8000/api/v1/ingestions/from-path", + api_token="token", + machine_name="pm", + max_attempts=1, + timeout_seconds=5, + sleep_fn=lambda *_: None, + post_request_fn=fake_post_request, + ) + + assert result["ok"] is True + assert result["body"] == {} + + +def test_ingest_case_with_retries_returns_exhausted_retries_when_zero_attempts() -> ( + None +): + candidate = IngestionCandidate( + case_path="/performance_archive/case_a", + execution_ids=["100.1-1"], + new_execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + + def fake_post_request(*args: Any, **kwargs: Any) -> IngestionRequestResponse: + return {"status_code": 201, "body": {}} + + result = _ingest_case_with_retries( + candidate, + endpoint_url="http://backend:8000/api/v1/ingestions/from-path", + api_token="token", + machine_name="pm", + max_attempts=0, + timeout_seconds=5, + 
sleep_fn=lambda *_: None, + post_request_fn=fake_post_request, + ) + + assert result == { + "ok": False, + "attempts": 0, + "status_code": None, + "body": None, + "error": "Exhausted retries", + } + + +class _FakeHttpResponse: + def __init__(self, status: int, body: str) -> None: + self.status = status + self._body = body.encode("utf-8") + + def read(self) -> bytes: + return self._body + + def __enter__(self) -> "_FakeHttpResponse": + return self + + def __exit__(self, exc_type, exc, tb) -> None: + return None + + +class _FakeHttpError(urllib.error.HTTPError): + def __init__(self, url: str, code: int, msg: str, body: bytes) -> None: + super().__init__(url, code, msg, hdrs=Message(), fp=None) + self._body = body + + def read(self, amt: int = -1) -> bytes: + return self._body if amt == -1 else self._body[:amt] + + +def test_post_ingestion_request_success(monkeypatch) -> None: + captured_request: list[urllib.request.Request] = [] + + def fake_urlopen(request: urllib.request.Request, timeout: int): + captured_request.append(request) + assert timeout == 12 + return _FakeHttpResponse(201, json.dumps({"created_count": 1})) + + monkeypatch.setattr(ingestor_module.urllib.request, "urlopen", fake_urlopen) + + response = _post_ingestion_request( + "http://backend:8000/api/v1/ingestions/from-path", + "token", + "/archive/case_a", + "pm", + timeout_seconds=12, + ) + + assert response == {"status_code": 201, "body": {"created_count": 1}} + assert captured_request[0].headers["Authorization"] == "Bearer token" + + +def test_post_ingestion_request_handles_http_error(monkeypatch) -> None: + request = urllib.request.Request("http://example.com") + error = _FakeHttpError( + request.full_url, + 503, + "Service Unavailable", + b"retry later", + ) + + monkeypatch.setattr( + ingestor_module.urllib.request, + "urlopen", + lambda *args, **kwargs: (_ for _ in ()).throw(error), + ) + + with pytest.raises(IngestionRequestError) as exc_info: + _post_ingestion_request( + 
"http://backend:8000/api/v1/ingestions/from-path", + "token", + "/archive/case_a", + "pm", + timeout_seconds=12, + ) + + assert exc_info.value.status_code == 503 + assert exc_info.value.transient is True + + +def test_post_ingestion_request_handles_url_error(monkeypatch) -> None: + monkeypatch.setattr( + ingestor_module.urllib.request, + "urlopen", + lambda *args, **kwargs: (_ for _ in ()).throw( + urllib.error.URLError("network down") + ), + ) + + with pytest.raises(IngestionRequestError, match="URL error: network down"): + _post_ingestion_request( + "http://backend:8000/api/v1/ingestions/from-path", + "token", + "/archive/case_a", + "pm", + timeout_seconds=12, + ) + + +def test_post_ingestion_request_handles_timeout(monkeypatch) -> None: + monkeypatch.setattr( + ingestor_module.urllib.request, + "urlopen", + lambda *args, **kwargs: (_ for _ in ()).throw(TimeoutError()), + ) + + with pytest.raises(IngestionRequestError, match="Request timed out"): + _post_ingestion_request( + "http://backend:8000/api/v1/ingestions/from-path", + "token", + "/archive/case_a", + "pm", + timeout_seconds=12, + ) + + +def test_is_transient_status() -> None: + assert _is_transient_status(503) is True + assert _is_transient_status(400) is False + + +def test_load_state_handles_invalid_json_and_logs(monkeypatch, tmp_path: Path) -> None: + state_path = tmp_path / "state.json" + state_path.write_text("{invalid", encoding="utf-8") + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + state = _load_state(state_path) + + assert state["version"] == ingestor_module.STATE_VERSION + assert logged_events[0][0] == "state_load_failed" + + +def test_load_state_handles_non_dict_root_and_non_dict_cases(tmp_path: Path) -> None: + non_dict_path = tmp_path / "not_dict.json" + 
non_dict_path.write_text("[]", encoding="utf-8") + assert _load_state(non_dict_path)["cases"] == {} + + bad_cases_path = tmp_path / "bad_cases.json" + bad_cases_path.write_text( + json.dumps( + {"version": 9, "cases": [], "updated_at": "2024-01-01T00:00:00+00:00"} + ), + encoding="utf-8", + ) + loaded = _load_state(bad_cases_path) + assert loaded["cases"] == {} + assert loaded["version"] == 9 + + +def test_save_state_round_trips(tmp_path: Path) -> None: + state_path = tmp_path / "nested" / "state.json" + state = {"version": 1, "cases": {}} + + _save_state(state_path, state) + + assert state_path.exists() + saved = json.loads(state_path.read_text(encoding="utf-8")) + assert saved["version"] == 1 + assert "updated_at" in saved + + +def test_record_successful_case_replaces_non_dict_cases() -> None: + state: dict[str, Any] = {"cases": []} + candidate = IngestionCandidate( + case_path="/archive/case_a", + execution_ids=["100.1-1"], + new_execution_ids=["100.1-1"], + fingerprint="fp-1", + ) + + _record_successful_case(state, candidate) + + assert state["cases"]["/archive/case_a"]["fingerprint"] == "fp-1" + + +def test_case_state_processed_ids_ignores_non_list() -> None: + assert _case_state_processed_ids({"processed_execution_ids": "bad"}) == set() + + +def test_normalized_api_base_url_handles_existing_api_base() -> None: + base_url = f"http://backend:8000{API_BASE}" + assert _normalized_api_base_url(base_url) == base_url + assert _normalized_api_base_url("http://backend:8000/") == ( + f"http://backend:8000{API_BASE}" + ) + + +def test_render_log_value_formats_values() -> None: + assert _render_log_value("plain-value") == "plain-value" + assert _render_log_value("value with space") == '"value with space"' + assert _render_log_value({"b": 1, "a": 2}) == '{"a": 2, "b": 1}' + + +def test_log_execution_skip_detail_suppresses_after_limit(monkeypatch) -> None: + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | 
None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + skip_log_state = {"logged": ingestor_module.MAX_SKIP_DETAIL_LOGS, "suppressed": 0} + + _log_execution_skip_detail( + "execution_skipped_incomplete", + "/archive/case_a", + "100.1-1", + "missing", + skip_log_state, + ) + + assert logged_events == [] + assert skip_log_state["suppressed"] == 1 + + +def test_log_summary_table_and_startup_configuration( + monkeypatch, tmp_path: Path +) -> None: + logged_events: list[tuple[str, dict[str, Any]]] = [] + + def fake_log_event(event: str, fields: dict[str, Any] | None = None) -> None: + logged_events.append((event, {} if fields is None else fields)) + + monkeypatch.setattr(ingestor_module, "_log_event", fake_log_event) + + _log_summary_table("summary", [("a", 1), ("b", "two words")]) + + config = IngestorConfig( + api_base_url="http://backend:8000", + api_token="token", + archive_root=tmp_path, + machine_name="pm", + state_path=tmp_path / "state.json", + dry_run=True, + max_cases_per_run=5, + max_attempts=2, + request_timeout_seconds=60, + ) + _log_startup_configuration( + config, endpoint_url="http://backend:8000/api/v1/ingestions/from-path" + ) + + assert logged_events[0][0] == "summary_table" + assert logged_events[1][0] == "startup_configuration_begin" + assert logged_events[2][0] == "summary_table" + assert logged_events[3][0] == "startup_configuration_end" From 0c66fc1028b1e6e54cfada8581a2c697875d24b8 Mon Sep 17 00:00:00 2001 From: Tom Vo Date: Wed, 18 Mar 2026 13:47:44 -0700 Subject: [PATCH 03/10] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- backend/app/features/ingestion/parsers/case_docs.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/app/features/ingestion/parsers/case_docs.py 
b/backend/app/features/ingestion/parsers/case_docs.py index 238c67c7..6a16b52c 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -11,7 +11,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: - """Parse env_case.xml (plain or gzipped) to extract case_group. + """Parse env_case.xml (plain or gzipped) to extract case metadata. Parameters ---------- @@ -21,7 +21,16 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: Returns ------- dict - Dictionary with key 'case_group' (str or None) + Dictionary with case metadata (values are str or None), including: + + - ``case_name``: Case name (``CASE``) + - ``case_group``: Case group (``CASE_GROUP``) + - ``machine``: Machine name (``MACH``) + - ``user``: Real user (``REALUSER``) + - ``campaign``: Derived campaign identifier from the case name + - ``experiment_type``: Derived experiment type, constrained to + KNOWN_EXPERIMENT_TYPES when possible + - ``compset_alias``: Compset alias (``COMPSET``) """ env_case_path = Path(env_case_path) From fc357bdde7f66ffa5f5483f864c1366c66869584 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 13:50:05 -0700 Subject: [PATCH 04/10] Add Copilot review suggestions --- backend/app/features/ingestion/parsers/case_docs.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/app/features/ingestion/parsers/case_docs.py b/backend/app/features/ingestion/parsers/case_docs.py index 6a16b52c..8da7589a 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -11,7 +11,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: - """Parse env_case.xml (plain or gzipped) to extract case metadata. + """Parse env_case.xml (plain or gzipped). 
Parameters ---------- @@ -55,7 +55,7 @@ def parse_env_case(env_case_path: str | Path) -> dict[str, str | None]: def parse_env_build(env_build_path: str | Path) -> dict[str, str | None]: - """Parse env_build.xml (plain or gzipped) to extract compiler and mpilib. + """Parse env_build.xml (plain or gzipped). Parameters ---------- @@ -231,12 +231,15 @@ def _calculate_simulation_end_date( stop_n: str | None, stop_date: str | None, ) -> str | None: - if not simulation_start_date or not stop_option: + if not stop_option: return None if stop_option == "date": return _parse_stop_date(stop_date) + if not simulation_start_date: + return None + if stop_n is None: return None From 27b194dcc2e66f8ca9c1c5ed54312d86bf89e7fc Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 14:26:23 -0700 Subject: [PATCH 05/10] Refactor ingestion parser boundary --- backend/app/features/ingestion/ingest.py | 296 +++++++------ .../app/features/ingestion/parsers/parser.py | 121 +++--- .../app/features/ingestion/parsers/types.py | 30 ++ backend/app/features/simulation/schemas.py | 8 +- .../features/ingestion/parsers/test_parser.py | 129 ++++-- .../tests/features/ingestion/test_ingest.py | 393 ++++++++++++++++-- 6 files changed, 724 insertions(+), 253 deletions(-) create mode 100644 backend/app/features/ingestion/parsers/types.py diff --git a/backend/app/features/ingestion/ingest.py b/backend/app/features/ingestion/ingest.py index 39e8f1ae..3e5f65db 100644 --- a/backend/app/features/ingestion/ingest.py +++ b/backend/app/features/ingestion/ingest.py @@ -18,7 +18,6 @@ keyed by case.id, to avoid repeated DB queries. 
""" -import os from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path @@ -29,13 +28,15 @@ from sqlalchemy.orm import Session from app.core.logger import _setup_custom_logger -from app.features.ingestion.parsers.parser import SimulationMetadata, main_parser +from app.features.ingestion.parsers.parser import main_parser +from app.features.ingestion.parsers.types import ParsedSimulation from app.features.machine.models import Machine from app.features.simulation.enums import SimulationStatus, SimulationType from app.features.simulation.models import Case, Simulation from app.features.simulation.schemas import SimulationCreate logger = _setup_custom_logger(__name__) +ConfigProjection = dict[str, str | None] @dataclass @@ -114,11 +115,11 @@ def ingest_archive( Path(output_dir) if isinstance(output_dir, str) else output_dir ) - all_simulations, skipped_count = main_parser( + parsed_simulations, skipped_count = main_parser( archive_path_resolved, output_dir_resolved ) - if not all_simulations: + if not parsed_simulations: logger.warning(f"No simulations found in archive: {archive_path_resolved}") return IngestArchiveResult( @@ -131,14 +132,13 @@ def ingest_archive( simulations: list[SimulationCreate] = [] duplicate_count = 0 errors: list[dict[str, str]] = [] - canonical_cache: dict[str, SimulationMetadata] = {} - persisted_canonical_cache: dict[UUID, SimulationMetadata | None] = {} + canonical_cache: dict[str, ConfigProjection] = {} + persisted_canonical_cache: dict[UUID, ConfigProjection | None] = {} - for execution_dir, metadata in all_simulations.items(): + for parsed_simulation in parsed_simulations: try: simulation, is_duplicate = _process_simulation_for_ingest( - execution_dir=execution_dir, - metadata=metadata, + parsed_simulation=parsed_simulation, db=db, canonical_cache=canonical_cache, persisted_canonical_cache=persisted_canonical_cache, @@ -152,11 +152,15 @@ def ingest_archive( simulations.append(simulation) 
except (ValueError, LookupError, ValidationError) as e: - logger.error(f"Failed to process simulation from {execution_dir}: {e}") + logger.error( + "Failed to process simulation from %s: %s", + parsed_simulation.execution_dir, + e, + ) errors.append( { - "execution_dir": str(execution_dir), + "execution_dir": parsed_simulation.execution_dir, "error_type": type(e).__name__, "error": str(e), } @@ -175,25 +179,22 @@ def ingest_archive( def _process_simulation_for_ingest( - execution_dir: str, - metadata: SimulationMetadata, + parsed_simulation: ParsedSimulation, db: Session, - canonical_cache: dict[str, SimulationMetadata], - persisted_canonical_cache: dict[UUID, SimulationMetadata | None], + canonical_cache: dict[str, ConfigProjection], + persisted_canonical_cache: dict[UUID, ConfigProjection | None], ) -> tuple[SimulationCreate | None, bool]: """Process one parsed simulation entry. Parameters ---------- - execution_dir : str - Execution directory name (used to derive execution_id). - metadata : SimulationMetadata - Parsed metadata dictionary for the simulation. + parsed_simulation : ParsedSimulation + Parsed archive-derived metadata for the simulation. db : Session Active database session for lookups and case resolution. - canonical_cache : dict[str, SimulationMetadata] - In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, SimulationMetadata | None] + canonical_cache : dict[str, ConfigProjection] + In-memory cache of canonical config values per case_name for the current batch. + persisted_canonical_cache : dict[UUID, ConfigProjection | None] Cache of canonical metadata loaded from the database by case_id. Returns @@ -203,19 +204,19 @@ def _process_simulation_for_ingest( only for new records and ``is_duplicate`` is True when an existing ``execution_id`` was found. 
""" - execution_id = _derive_execution_id(execution_dir) - case_name = _require_case_name(metadata, execution_dir) - machine_id = _resolve_machine_id(metadata, db) - case = _resolve_case(metadata, case_name, db) - - if _is_duplicate_simulation(execution_id, execution_dir, db): - _seed_canonical_cache_from_duplicate(case_name, metadata, canonical_cache) + execution_id = parsed_simulation.execution_id + case_name = _require_case_name(parsed_simulation) + machine_id = _resolve_machine_id(parsed_simulation, db) + case = _resolve_case(parsed_simulation, case_name, db) + + if _is_duplicate_simulation(execution_id, parsed_simulation.execution_dir, db): + _seed_canonical_cache_from_duplicate( + case_name, parsed_simulation, canonical_cache + ) return None, True simulation = _build_simulation_create( - execution_dir=execution_dir, - metadata=metadata, - execution_id=execution_id, + parsed_simulation=parsed_simulation, machine_id=machine_id, case=case, canonical_cache=canonical_cache, @@ -226,22 +227,24 @@ def _process_simulation_for_ingest( return simulation, False -def _require_case_name(metadata: SimulationMetadata, execution_dir: str) -> str: +def _require_case_name(parsed_simulation: ParsedSimulation) -> str: """Return case_name from metadata or raise a descriptive error.""" - case_name = metadata.get("case_name") + case_name = parsed_simulation.case_name if not case_name: raise ValueError( - f"case_name is required but missing from '{execution_dir}'. " + f"case_name is required but missing from '{parsed_simulation.execution_dir}'. " "Cannot determine Case identity." 
) return case_name -def _resolve_case(metadata: SimulationMetadata, case_name: str, db: Session) -> Case: +def _resolve_case( + parsed_simulation: ParsedSimulation, case_name: str, db: Session +) -> Case: """Resolve or create the Case for the current metadata row.""" - case_group = metadata.get("case_group") + case_group = parsed_simulation.case_group result = _get_or_create_case(db, name=case_name, case_group=case_group) @@ -266,41 +269,37 @@ def _is_duplicate_simulation( def _seed_canonical_cache_from_duplicate( case_name: str, - metadata: SimulationMetadata, - canonical_cache: dict[str, SimulationMetadata], + parsed_simulation: ParsedSimulation, + canonical_cache: dict[str, ConfigProjection], ) -> None: """Seed per-case canonical cache using duplicate metadata when needed.""" if case_name not in canonical_cache: - canonical_cache[case_name] = metadata + canonical_cache[case_name] = _parsed_simulation_to_config_projection( + parsed_simulation + ) def _build_simulation_create( - execution_dir: str, - metadata: SimulationMetadata, - execution_id: str, + parsed_simulation: ParsedSimulation, machine_id: UUID, case: Case, - canonical_cache: dict[str, SimulationMetadata], - persisted_canonical_cache: dict[UUID, SimulationMetadata | None], + canonical_cache: dict[str, ConfigProjection], + persisted_canonical_cache: dict[UUID, ConfigProjection | None], db: Session, ) -> SimulationCreate: """Create a SimulationCreate using canonical baseline semantics. Parameters ---------- - execution_dir : str - Execution directory name (used for logging context). - metadata : SimulationMetadata - Parsed metadata dictionary for the simulation. - execution_id : str - Unique execution identifier derived from the execution directory. + parsed_simulation : ParsedSimulation + Parsed archive-derived metadata for the simulation. machine_id : UUID Resolved machine ID from the database. case : Case Resolved Case object for this simulation. 
- canonical_cache : dict[str, SimulationMetadata] + canonical_cache : dict[str, ConfigProjection] In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, SimulationMetadata | None] + persisted_canonical_cache : dict[UUID, ConfigProjection | None] Cache of canonical metadata loaded from the database by case_id. db : Session Active database session for lookups and case resolution. @@ -315,30 +314,44 @@ def _build_simulation_create( ) if canonical_metadata is None: - canonical_cache[case_name] = metadata + canonical_cache[case_name] = _parsed_simulation_to_config_projection( + parsed_simulation + ) - simulation = _map_metadata_to_schema( - metadata, machine_id, case.id, execution_id + simulation = _map_parsed_simulation_to_schema( + parsed_simulation, machine_id, case.id + ) + logger.info( + "Mapped canonical simulation from %s: %s", + parsed_simulation.execution_dir, + case_name, ) - logger.info(f"Mapped canonical simulation from {execution_dir}: {case_name}") return simulation - delta = _compute_config_delta(canonical_metadata, metadata) + delta = _compute_config_delta( + canonical_metadata, + _parsed_simulation_to_config_projection(parsed_simulation), + ) run_config_deltas = delta if delta else None - simulation = _map_metadata_to_schema( - metadata, machine_id, case.id, execution_id, run_config_deltas=run_config_deltas + simulation = _map_parsed_simulation_to_schema( + parsed_simulation, + machine_id, + case.id, + run_config_deltas=run_config_deltas, ) if delta: logger.info( - f"Non-canonical run in '{execution_dir}' has config differences from " - f"canonical: {list(delta.keys())}" + "Non-canonical run in '%s' has config differences from canonical: %s", + parsed_simulation.execution_dir, + list(delta.keys()), ) else: logger.info( - f"Non-canonical run in '{execution_dir}' has identical configuration to canonical." 
+ "Non-canonical run in '%s' has identical configuration to canonical.", + parsed_simulation.execution_dir, ) return simulation @@ -347,10 +360,10 @@ def _build_simulation_create( def _get_canonical_metadata_for_case( case: Case, case_name: str, - canonical_cache: dict[str, SimulationMetadata], - persisted_canonical_cache: dict[UUID, SimulationMetadata | None], + canonical_cache: dict[str, ConfigProjection], + persisted_canonical_cache: dict[UUID, ConfigProjection | None], db: Session, -) -> SimulationMetadata | None: +) -> ConfigProjection | None: """Resolve canonical metadata from persisted canonical or batch cache. This function is useful for ensuring that all simulations of the same case @@ -362,14 +375,14 @@ def _get_canonical_metadata_for_case( The Case object for which to retrieve canonical metadata. case_name : str The name of the case, used for in-memory cache lookup. - canonical_cache : dict[str, SimulationMetadata] + canonical_cache : dict[str, ConfigProjection] In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, SimulationMetadata | None] + persisted_canonical_cache : dict[UUID, ConfigProjection | None] Cache of canonical metadata loaded from the database by case_id. Returns ------- - SimulationMetadata | None + ConfigProjection | None The canonical metadata for the case, or None if no canonical run exists. """ if case.canonical_simulation_id is not None: @@ -383,7 +396,7 @@ def _get_canonical_metadata_for_case( ) if canonical_sim: - canonical_metadata = _sim_to_metadata(canonical_sim) + canonical_metadata = _simulation_to_config_projection(canonical_sim) persisted_canonical_cache[case.id] = canonical_metadata return canonical_metadata @@ -394,32 +407,6 @@ def _get_canonical_metadata_for_case( return canonical_cache.get(case_name) -def _derive_execution_id(execution_dir: str) -> str: - """Extract execution_id from the execution directory path. 
- - The execution_id is the basename of the execution directory - (e.g. ``1125772.260116-181605``). Absolute filesystem paths are - never stored. - - Parameters - ---------- - execution_dir : str - Execution directory name from the parser output (e.g. from timing files). - - Raises - ------ - ValueError - If the derived execution_id is empty. - """ - execution_id = os.path.basename(execution_dir) - if not execution_id: - raise ValueError( - f"Cannot derive execution_id from execution directory: '{execution_dir}'" - ) - - return execution_id - - def _get_or_create_case(db: Session, name: str, case_group: str | None = None) -> Case: """Get or create a Case record by case name. @@ -462,10 +449,30 @@ def _get_or_create_case(db: Session, name: str, case_group: str | None = None) - return case -def _sim_to_metadata(sim: Simulation) -> SimulationMetadata: - """Build a parser-style metadata dict from a persisted Simulation.""" +def _parsed_simulation_to_config_projection( + parsed_simulation: ParsedSimulation, +) -> ConfigProjection: + """Return config-comparison fields from a parsed simulation.""" + return { + "compset": parsed_simulation.compset, + "compset_alias": parsed_simulation.compset_alias, + "grid_name": parsed_simulation.grid_name, + "grid_resolution": parsed_simulation.grid_resolution, + "initialization_type": parsed_simulation.initialization_type, + "compiler": parsed_simulation.compiler, + "git_tag": parsed_simulation.git_tag, + "git_commit_hash": parsed_simulation.git_commit_hash, + "git_branch": parsed_simulation.git_branch, + "git_repository_url": _normalize_git_url(parsed_simulation.git_repository_url), + "campaign": parsed_simulation.campaign, + "experiment_type": parsed_simulation.experiment_type, + "simulation_type": SimulationType.UNKNOWN.value, + } + + +def _simulation_to_config_projection(sim: Simulation) -> ConfigProjection: + """Return config-comparison fields from a persisted Simulation.""" return { - "case_name": sim.case.name if sim.case else 
None, "compset": sim.compset, "compset_alias": sim.compset_alias, "grid_name": sim.grid_name, @@ -475,15 +482,31 @@ def _sim_to_metadata(sim: Simulation) -> SimulationMetadata: "git_tag": sim.git_tag, "git_commit_hash": sim.git_commit_hash, "git_branch": sim.git_branch, - "git_repository_url": sim.git_repository_url, + "git_repository_url": _stringify_config_value(sim.git_repository_url), "campaign": sim.campaign, - "experiment_type": sim.experiment_type, + "experiment_type": _stringify_config_value(sim.experiment_type), + "simulation_type": _stringify_config_value(sim.simulation_type), } +def _stringify_config_value(value: object) -> str | None: + """Convert enum-like config values to strings for delta comparison.""" + if value is None: + return None + + enum_value = getattr(value, "value", None) + if isinstance(enum_value, str): + return enum_value + + if isinstance(value, str): + return value + + return str(value) + + def _compute_config_delta( - canonical: SimulationMetadata, - other: SimulationMetadata, + canonical: ConfigProjection, + other: ConfigProjection, ) -> dict[str, dict[str, str | None]]: """Compare two run metadata dicts and return configuration differences. @@ -491,9 +514,9 @@ def _compute_config_delta( Parameters ---------- - canonical : SimulationMetadata + canonical : ConfigProjection The canonical run metadata to compare against. - other : SimulationMetadata + other : ConfigProjection The other run metadata to compare. Returns @@ -514,13 +537,13 @@ def _compute_config_delta( return delta -def _resolve_machine_id(metadata: SimulationMetadata, db: Session) -> UUID: +def _resolve_machine_id(metadata: ParsedSimulation, db: Session) -> UUID: """Resolve machine name to machine ID from the database. Parameters ---------- - metadata : SimulationMetadata - Parsed metadata dictionary for the simulation, expected to contain a + metadata : ParsedSimulation + Parsed metadata for the simulation, expected to contain a "machine" key with the machine name. 
db : Session Active database session for querying the Machine table. @@ -532,7 +555,7 @@ def _resolve_machine_id(metadata: SimulationMetadata, db: Session) -> UUID: LookupError If machine name cannot be found in database. """ - machine_name = metadata.get("machine") + machine_name = metadata.machine if not machine_name: raise ValueError("Machine name is required but not found in metadata") @@ -553,7 +576,7 @@ def _find_existing_simulation(db: Session, execution_id: str) -> Simulation | No db : Session Active database session for querying the Simulation table. execution_id : str - Unique execution identifier derived from the execution directory. + Unique execution identifier derived from the timing-file LID. Returns ------- @@ -615,25 +638,22 @@ def _normalize_git_url(url: str | None) -> str | None: return url -def _map_metadata_to_schema( - metadata: SimulationMetadata, +def _map_parsed_simulation_to_schema( + parsed_simulation: ParsedSimulation, machine_id: UUID, case_id: UUID, - execution_id: str, run_config_deltas: dict[str, dict[str, str | None]] | None = None, ) -> SimulationCreate: """Map parser metadata to SimulationCreate schema with type conversions. Parameters ---------- - metadata : SimulationMetadata - Dictionary of parsed simulation metadata with string values. + parsed_simulation : ParsedSimulation + Parsed archive-derived metadata with string values. machine_id : UUID Pre-extracted machine ID. case_id : UUID ID of the Case this simulation belongs to. - execution_id : str - Unique execution identifier derived from the archive directory. run_config_deltas : dict | None Configuration differences vs canonical baseline, or None. 
@@ -644,15 +664,17 @@ def _map_metadata_to_schema( """ # Parse datetime fields using the shared utility function # Note: simulation_start_date is already validated in _extract_simulation_key() - simulation_start_date = _parse_datetime_field(metadata.get("simulation_start_date")) - simulation_end_date = _parse_datetime_field(metadata.get("simulation_end_date")) + simulation_start_date = _parse_datetime_field( + parsed_simulation.simulation_start_date + ) + simulation_end_date = _parse_datetime_field(parsed_simulation.simulation_end_date) - run_start_date = _parse_datetime_field(metadata.get("run_start_date")) - run_end_date = _parse_datetime_field(metadata.get("run_end_date")) + run_start_date = _parse_datetime_field(parsed_simulation.run_start_date) + run_end_date = _parse_datetime_field(parsed_simulation.run_end_date) - git_repository_url = _normalize_git_url(metadata.get("git_repository_url")) - simulation_type = _normalize_simulation_type(metadata.get("simulation_type")) - status = _normalize_simulation_status(metadata.get("status")) + git_repository_url = _normalize_git_url(parsed_simulation.git_repository_url) + simulation_type = _normalize_simulation_type(None) + status = _normalize_simulation_status(parsed_simulation.status) # Map metadata to schema; Pydantic will validate required fields # Note: SimulationCreate uses CamelInBaseModel which expects camelCase field names @@ -660,35 +682,35 @@ def _map_metadata_to_schema( { # Required identification fields "caseId": case_id, - "executionId": execution_id, + "executionId": parsed_simulation.execution_id, # Required configuration fields - "compset": metadata.get("compset"), - "compsetAlias": metadata.get("compset_alias"), - "gridName": metadata.get("grid_name"), - "gridResolution": metadata.get("grid_resolution"), + "compset": parsed_simulation.compset, + "compsetAlias": parsed_simulation.compset_alias, + "gridName": parsed_simulation.grid_name, + "gridResolution": parsed_simulation.grid_resolution, # Required 
status fields with sensible defaults "simulationType": simulation_type, "status": status, - "initializationType": metadata.get("initialization_type"), + "initializationType": parsed_simulation.initialization_type, "machineId": machine_id, "simulationStartDate": simulation_start_date, "simulationEndDate": simulation_end_date, - "experimentType": metadata.get("experiment_type"), - "campaign": metadata.get("campaign"), + "experimentType": parsed_simulation.experiment_type, + "campaign": parsed_simulation.campaign, "runStartDate": run_start_date, "runEndDate": run_end_date, - "compiler": metadata.get("compiler"), + "compiler": parsed_simulation.compiler, "gitRepositoryUrl": git_repository_url, - "gitBranch": metadata.get("git_branch"), - "gitTag": metadata.get("git_tag"), - "gitCommitHash": metadata.get("git_commit_hash"), + "gitBranch": parsed_simulation.git_branch, + "gitTag": parsed_simulation.git_tag, + "gitCommitHash": parsed_simulation.git_commit_hash, # Note: created_by and last_updated_by are set to None since archive # metadata contains local usernames that cannot be reliably mapped to # database user UUIDs. The API endpoint will set these values based on # the authenticated user who uploaded the archive. 
"createdBy": None, "lastUpdatedBy": None, - "hpcUsername": metadata.get("hpc_username"), + "hpcUsername": parsed_simulation.hpc_username, # Canonical run semantics "runConfigDeltas": run_config_deltas, } diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index db871ee4..72a3463b 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -42,11 +42,10 @@ parse_git_status, ) from app.features.ingestion.parsers.readme_case import parse_readme_case -from app.features.simulation.enums import SimulationStatus, SimulationType +from app.features.ingestion.parsers.types import ParsedSimulation +from app.features.simulation.enums import SimulationStatus SimulationFiles = dict[str, str | None] -SimulationMetadata = dict[str, str | None] -AllSimulations = dict[str, SimulationMetadata] logger = _setup_custom_logger(__name__) @@ -114,7 +113,7 @@ class FileSpec(TypedDict, total=False): def main_parser( archive_path: str | Path, output_dir: str | Path -) -> tuple[AllSimulations, int]: +) -> tuple[list[ParsedSimulation], int]: """Main entrypoint for parser workflow. Parses case directories from a performance archive, handling incomplete or @@ -134,10 +133,10 @@ def main_parser( Returns ------- - tuple[AllSimulations, int] - Dictionary mapping execution directory paths to their parsed simulations - and the count of skipped incomplete runs. Only directories that contain - all required metadata files are included. + tuple[list[ParsedSimulation], int] + Parsed simulations in deterministic execution-directory order and the + count of skipped incomplete runs. Only directories that contain all + required metadata files and a timing-file LID are included. 
""" archive_path = str(archive_path) output_dir = str(output_dir) @@ -164,7 +163,7 @@ def main_parser( "directories matching pattern: .-" ) - results: AllSimulations = {} + results: list[ParsedSimulation] = [] skipped_count = 0 for case_dir, exec_dirs in case_to_executions_dirs.items(): @@ -177,14 +176,13 @@ def main_parser( for exec_dir in sorted_exec_dirs: try: metadata_files = _locate_metadata_files(exec_dir) + results.append(_parse_all_files(exec_dir, metadata_files)) except FileNotFoundError as exc: logger.warning(f"Skipping incomplete run in '{exec_dir}': {exc}") skipped_count += 1 continue - results[exec_dir] = _parse_all_files(exec_dir, metadata_files) - if skipped_count: logger.info( f"Skipped {skipped_count} incomplete run(s) missing required files." @@ -398,7 +396,7 @@ def _check_missing_files(files: SimulationFiles, exp_dir: str) -> None: ) -def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationMetadata: +def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimulation: """Pass discovered files to their respective parser functions. Parameters @@ -408,10 +406,10 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM Returns ------- - SimulationMetadata - Dictionary with parsed results from each file type. + ParsedSimulation + Typed archive-derived metadata for one execution directory. 
""" - metadata: SimulationMetadata = {} + metadata: dict[str, str | None] = {} for key, spec in FILE_SPECS.items(): path = files.get(key) @@ -423,39 +421,60 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> SimulationM metadata.update(parse_run_artifacts(exec_dir)) - simulation: SimulationMetadata = { - "execution_id": metadata.get("execution_id"), - "case_name": metadata.get("case_name"), - "compset": metadata.get("compset"), - "compset_alias": metadata.get("compset_alias"), - "grid_name": metadata.get("grid_name"), - "grid_resolution": metadata.get("grid_resolution"), - "campaign": metadata.get("campaign"), - "experiment_type": metadata.get("experiment_type"), - "initialization_type": metadata.get("initialization_type"), - "case_group": metadata.get("case_group"), - "simulation_start_date": metadata.get("simulation_start_date"), - "simulation_end_date": metadata.get("simulation_end_date"), - "run_start_date": metadata.get("run_start_date"), - "run_end_date": metadata.get("run_end_date"), - "compiler": metadata.get("compiler"), - "machine": metadata.get("machine"), - "hpc_username": metadata.get("user"), - "git_repository_url": metadata.get("git_repository_url"), - "git_branch": metadata.get("git_branch"), - "git_tag": metadata.get("git_tag"), - "git_commit_hash": metadata.get("git_commit_hash"), - "status": metadata.get("status", SimulationStatus.UNKNOWN.value), - # TODO: Skip this for MVP, not required. We can add it later if we find - # a way to determine it from the parsed files. - "parent_simulation_id": None, - # TODO: This is a required field, but we don't have a way to determine - # the simulation type from the parsed files yet. Default to UNKNOWN - # for now and manually update on the UI if needed. 
-        "simulation_type": SimulationType.UNKNOWN.value,
-        "extra": None,
-        "artifacts": None,
-        "links": None,
-    }
-
-    return simulation
+    execution_id = _require_execution_id(metadata.get("execution_id"), exec_dir)
+    _warn_on_execution_id_mismatch(exec_dir, execution_id)
+
+    return ParsedSimulation(
+        execution_dir=exec_dir,
+        execution_id=execution_id,
+        case_name=metadata.get("case_name"),
+        case_group=metadata.get("case_group"),
+        machine=metadata.get("machine"),
+        hpc_username=metadata.get("user"),
+        compset=metadata.get("compset"),
+        compset_alias=metadata.get("compset_alias"),
+        grid_name=metadata.get("grid_name"),
+        grid_resolution=metadata.get("grid_resolution"),
+        campaign=metadata.get("campaign"),
+        experiment_type=metadata.get("experiment_type"),
+        initialization_type=metadata.get("initialization_type"),
+        simulation_start_date=metadata.get("simulation_start_date"),
+        simulation_end_date=metadata.get("simulation_end_date"),
+        run_start_date=metadata.get("run_start_date"),
+        run_end_date=metadata.get("run_end_date"),
+        compiler=metadata.get("compiler"),
+        git_repository_url=metadata.get("git_repository_url"),
+        git_branch=metadata.get("git_branch"),
+        git_tag=metadata.get("git_tag"),
+        git_commit_hash=metadata.get("git_commit_hash"),
+        status=metadata.get("status", SimulationStatus.UNKNOWN.value),
+    )
+
+
+def _require_execution_id(execution_id: str | None, exec_dir: str) -> str:
+    """Return the stripped timing-file LID; raise FileNotFoundError if missing or blank."""
+    if execution_id is None:
+        raise FileNotFoundError(
+            f"Required timing-file LID missing for execution directory '{exec_dir}'"
+        )
+
+    normalized = execution_id.strip()
+    if not normalized:
+        raise FileNotFoundError(
+            f"Required timing-file LID missing for execution directory '{exec_dir}'"
+        )
+
+    return normalized
+
+
+def _warn_on_execution_id_mismatch(exec_dir: str, execution_id: str) -> None:
+    """Log when timing-file LID disagrees with the execution directory basename."""
+    exec_dir_basename = 
os.path.basename(exec_dir) + + if exec_dir_basename and exec_dir_basename != execution_id: + logger.warning( + "Timing-file LID '%s' does not match execution directory '%s'. " + "Using timing-file LID as execution_id.", + execution_id, + exec_dir_basename, + ) diff --git a/backend/app/features/ingestion/parsers/types.py b/backend/app/features/ingestion/parsers/types.py new file mode 100644 index 00000000..cc7d43b9 --- /dev/null +++ b/backend/app/features/ingestion/parsers/types.py @@ -0,0 +1,30 @@ +from dataclasses import dataclass + + +@dataclass(frozen=True) +class ParsedSimulation: + """Archive-derived metadata for one parsed execution run.""" + + execution_dir: str + execution_id: str + case_name: str | None + case_group: str | None + machine: str | None + hpc_username: str | None + compset: str | None + compset_alias: str | None + grid_name: str | None + grid_resolution: str | None + campaign: str | None + experiment_type: str | None + initialization_type: str | None + simulation_start_date: str | None + simulation_end_date: str | None + run_start_date: str | None + run_end_date: str | None + compiler: str | None + git_repository_url: str | None + git_branch: str | None + git_tag: str | None + git_commit_hash: str | None + status: str | None diff --git a/backend/app/features/simulation/schemas.py b/backend/app/features/simulation/schemas.py index 15641502..f4772cd7 100644 --- a/backend/app/features/simulation/schemas.py +++ b/backend/app/features/simulation/schemas.py @@ -110,7 +110,7 @@ class SimulationCreate(CamelInBaseModel): ..., description=( "Unique identifier for this execution, derived from the " - "archive directory name (e.g. 1125772.260116-181605)" + "timing-file LID (e.g. 
1125772.260116-181605)" ), ), ] @@ -293,8 +293,7 @@ class SimulationSummaryOut(CamelOutBaseModel): Field( ..., description=( - "Unique identifier for this execution, derived from the " - "archive directory name" + "Unique identifier for this execution, derived from the timing-file LID" ), ), ] @@ -390,8 +389,7 @@ class SimulationOut(CamelOutBaseModel): Field( ..., description=( - "Unique identifier for this execution, derived from the " - "archive directory name" + "Unique identifier for this execution, derived from the timing-file LID" ), ), ] diff --git a/backend/tests/features/ingestion/parsers/test_parser.py b/backend/tests/features/ingestion/parsers/test_parser.py index 9bb707e3..64371a5b 100644 --- a/backend/tests/features/ingestion/parsers/test_parser.py +++ b/backend/tests/features/ingestion/parsers/test_parser.py @@ -6,6 +6,7 @@ import tarfile import zipfile from contextlib import contextmanager +from copy import deepcopy from pathlib import Path from typing import Any, Generator from unittest.mock import patch @@ -13,6 +14,7 @@ import pytest from app.features.ingestion.parsers import parser +from app.features.ingestion.parsers.types import ParsedSimulation class TestMainParser: @@ -104,27 +106,54 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: } defaults.update(kwargs) - with ( - patch("app.features.ingestion.parsers.parser.parse_env_case") as m1, - patch("app.features.ingestion.parsers.parser.parse_env_build") as m2, - patch("app.features.ingestion.parsers.parser.parse_env_run") as m3, - patch("app.features.ingestion.parsers.parser.parse_readme_case") as m4, - patch("app.features.ingestion.parsers.parser.parse_e3sm_timing") as m5, - patch("app.features.ingestion.parsers.parser.parse_run_artifacts") as m6, - patch("app.features.ingestion.parsers.parser.parse_git_describe") as m7, - patch("app.features.ingestion.parsers.parser.parse_git_config") as m8, - patch("app.features.ingestion.parsers.parser.parse_git_status") as m9, - 
): - m1.return_value = defaults["parse_env_case"] - m2.return_value = defaults["parse_env_build"] - m3.return_value = defaults["parse_env_run"] - m4.return_value = defaults["parse_readme_case"] - m5.return_value = defaults["parse_e3sm_timing"] - m6.return_value = defaults["parse_run_artifacts"] - m7.return_value = defaults["parse_git_describe"] - m8.return_value = defaults["parse_git_config"] - m9.return_value = defaults["parse_git_status"] - yield + original_file_specs = deepcopy(parser.FILE_SPECS) + + try: + parser.FILE_SPECS["case_docs_env_case"]["parser"] = lambda _path: defaults[ + "parse_env_case" + ] + parser.FILE_SPECS["case_docs_env_build"]["parser"] = lambda _path: defaults[ + "parse_env_build" + ] + parser.FILE_SPECS["case_docs_env_run"]["parser"] = lambda _path: defaults[ + "parse_env_run" + ] + parser.FILE_SPECS["readme_case"]["parser"] = lambda _path: defaults[ + "parse_readme_case" + ] + parser.FILE_SPECS["e3sm_timing"]["parser"] = lambda _path: defaults[ + "parse_e3sm_timing" + ] + parser.FILE_SPECS["git_describe"]["parser"] = lambda _path: defaults[ + "parse_git_describe" + ] + parser.FILE_SPECS["git_config"]["parser"] = lambda _path: ( + defaults["parse_git_config"] + if isinstance(defaults["parse_git_config"], dict) + else ( + {"git_repository_url": defaults["parse_git_config"]} + if defaults["parse_git_config"] + else {} + ) + ) + parser.FILE_SPECS["git_status"]["parser"] = lambda _path: ( + defaults["parse_git_status"] + if isinstance(defaults["parse_git_status"], dict) + else ( + {"git_branch": defaults["parse_git_status"]} + if defaults["parse_git_status"] + else {} + ) + ) + + with patch( + "app.features.ingestion.parsers.parser.parse_run_artifacts", + return_value=defaults["parse_run_artifacts"], + ): + yield + finally: + parser.FILE_SPECS.clear() + parser.FILE_SPECS.update(original_file_specs) def test_file_specs_remove_case_status_and_add_env_run(self) -> None: assert "case_status" not in parser.FILE_SPECS @@ -150,7 +179,8 @@ def 
test_with_valid_zip_archive(self, tmp_path: Path) -> None: assert len(result) > 0 assert skipped == 0 - assert any("1.0-0" in key for key in result.keys()) + assert isinstance(result[0], ParsedSimulation) + assert any("1.0-0" in parsed.execution_dir for parsed in result) def test_with_tar_gz_archive(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" @@ -169,7 +199,7 @@ def test_with_tar_gz_archive(self, tmp_path: Path) -> None: assert len(result) > 0 assert skipped == 0 - assert any("2.5-10" in key for key in result.keys()) + assert any("2.5-10" in parsed.execution_dir for parsed in result) def test_with_multiple_executions(self, tmp_path: Path) -> None: archive_base = tmp_path / "archive_extract" @@ -227,7 +257,7 @@ def test_missing_required_files_skips_incomplete_run(self, tmp_path: Path) -> No result, skipped = parser.main_parser(archive_path, extract_dir) - assert result == {} + assert result == [] assert skipped == 1 def test_multiple_matching_timing_files_raises_error(self, tmp_path: Path) -> None: @@ -325,7 +355,7 @@ def test_missing_env_run_skips_incomplete_run(self, tmp_path: Path) -> None: result, skipped = parser.main_parser(archive_path, extract_dir) - assert result == {} + assert result == [] assert skipped == 1 def test_completed_status_is_merged(self, tmp_path: Path) -> None: @@ -337,7 +367,52 @@ def test_completed_status_is_merged(self, tmp_path: Path) -> None: result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") assert skipped == 0 - assert next(iter(result.values()))["status"] == "completed" + assert result[0].status == "completed" + + def test_missing_timing_lid_skips_incomplete_run(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(execution_dir, "001.001") + + with self._mock_all_parsers( + parse_e3sm_timing={ + "execution_id": None, + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": 
"2025-12-18T20:54:58", + } + ): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert result == [] + assert skipped == 1 + + def test_mismatched_timing_lid_logs_warning(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(execution_dir, "001.001") + + with ( + self._mock_all_parsers( + parse_e3sm_timing={ + "execution_id": "different-lid", + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": "2025-12-18T20:54:58", + } + ), + patch( + "app.features.ingestion.parsers.parser.logger.warning" + ) as mock_warning, + ): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert result[0].execution_id == "different-lid" + mock_warning.assert_any_call( + "Timing-file LID '%s' does not match execution directory '%s'. " + "Using timing-file LID as execution_id.", + "different-lid", + "1.0-0", + ) def test_missing_timing_skips_incomplete_run(self, tmp_path: Path) -> None: execution_dir = tmp_path / "1.0-0" @@ -350,7 +425,7 @@ def test_missing_timing_skips_incomplete_run(self, tmp_path: Path) -> None: result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") - assert result == {} + assert result == [] assert skipped == 1 def test_zip_path_traversal_rejected(self, tmp_path: Path) -> None: diff --git a/backend/tests/features/ingestion/test_ingest.py b/backend/tests/features/ingestion/test_ingest.py index ebc207e2..a84d94a2 100644 --- a/backend/tests/features/ingestion/test_ingest.py +++ b/backend/tests/features/ingestion/test_ingest.py @@ -3,17 +3,17 @@ from unittest.mock import MagicMock, patch from uuid import UUID, uuid4 -import pytest from dateutil import parser as real_dateutil_parser from sqlalchemy.orm import Session from app.features.ingestion.ingest import ( - _derive_execution_id, _get_canonical_metadata_for_case, _get_or_create_case, _normalize_git_url, 
_normalize_simulation_status, _normalize_simulation_type, + _parsed_simulation_to_config_projection, + _simulation_to_config_projection, ingest_archive, ) from app.features.ingestion.models import ( @@ -21,6 +21,7 @@ IngestionSourceType, IngestionStatus, ) +from app.features.ingestion.parsers.types import ParsedSimulation from app.features.machine.models import Machine from app.features.simulation.enums import SimulationStatus, SimulationType from app.features.simulation.models import Case, Simulation @@ -28,6 +29,53 @@ from app.features.user.models import User +def _parsed_simulations_from_mapping( + simulations_by_dir: dict[str, dict[str, str | None]], +) -> list[ParsedSimulation]: + parsed_simulations: list[ParsedSimulation] = [] + + for execution_dir, metadata in simulations_by_dir.items(): + parsed_simulations.append( + ParsedSimulation( + execution_dir=execution_dir, + execution_id=_require_execution_id(metadata, execution_dir), + case_name=metadata.get("case_name"), + case_group=metadata.get("case_group"), + machine=metadata.get("machine"), + hpc_username=metadata.get("hpc_username") or metadata.get("user"), + compset=metadata.get("compset"), + compset_alias=metadata.get("compset_alias"), + grid_name=metadata.get("grid_name"), + grid_resolution=metadata.get("grid_resolution"), + campaign=metadata.get("campaign"), + experiment_type=metadata.get("experiment_type"), + initialization_type=metadata.get("initialization_type"), + simulation_start_date=metadata.get("simulation_start_date"), + simulation_end_date=metadata.get("simulation_end_date"), + run_start_date=metadata.get("run_start_date"), + run_end_date=metadata.get("run_end_date"), + compiler=metadata.get("compiler"), + git_repository_url=metadata.get("git_repository_url"), + git_branch=metadata.get("git_branch"), + git_tag=metadata.get("git_tag"), + git_commit_hash=metadata.get("git_commit_hash"), + status=metadata.get("status"), + ) + ) + + return parsed_simulations + + +def 
_require_execution_id(metadata: dict[str, str | None], execution_dir: str) -> str: + execution_id = metadata.get("execution_id") + if not execution_id: + raise AssertionError( + f"Mocked ParsedSimulation for '{execution_dir}' must define execution_id" + ) + + return execution_id + + class TestIngestArchive: """Tests for the ingest_archive public API. @@ -61,6 +109,7 @@ def test_returns_list_of_simulation_create(self, db: Session) -> None: mock_simulations = { "/path/to/1081156.251218-200923": { + "execution_id": "1081156.251218-200923", "case_name": "case1", "compset": "FHIST", "compset_alias": "test_alias", @@ -87,7 +136,7 @@ def test_returns_list_of_simulation_create(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -108,6 +157,7 @@ def test_handles_multiple_simulations(self, db: Session) -> None: mock_simulations = { "/path/to/1081157.251218-200924": { + "execution_id": "1081157.251218-200924", "case_name": "case1", "compset": "FHIST", "compset_alias": "test_alias", @@ -131,6 +181,7 @@ def test_handles_multiple_simulations(self, db: Session) -> None: "last_updated_by": None, }, "/path/to/1081158.251218-200925": { + "execution_id": "1081158.251218-200925", "case_name": "case2", "compset": "FHIST", "compset_alias": "test_alias", @@ -157,7 +208,7 @@ def test_handles_multiple_simulations(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -169,7 +220,7 @@ def test_handles_multiple_simulations(self, db: Session) -> None: def test_returns_empty_list_for_empty_archive(self, db: Session) -> None: """Test that empty 
archive returns empty list.""" - with patch("app.features.ingestion.ingest.main_parser", return_value=({}, 0)): + with patch("app.features.ingestion.ingest.main_parser", return_value=([], 0)): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db ) @@ -183,6 +234,7 @@ def test_accepts_string_paths(self, db: Session) -> None: mock_simulations = { "/path/to/1081159.251218-200926": { + "execution_id": "1081159.251218-200926", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -209,7 +261,7 @@ def test_accepts_string_paths(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ) as mock_main_parser: ingest_result = ingest_archive("/tmp/archive.zip", "/tmp/out", db) @@ -224,6 +276,7 @@ def test_propagates_mapping_errors(self, db: Session) -> None: """Test that mapping errors are collected and ingestion continues.""" mock_simulations = { "/path/to/1081160.251218-200927": { + "execution_id": "1081160.251218-200927", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -250,7 +303,7 @@ def test_propagates_mapping_errors(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -282,6 +335,7 @@ def test_parses_various_datetime_formats_through_public_api( for idx, date_str in enumerate(test_cases): mock_simulations = { f"/path/to/108200{idx}.251218-200900": { + "execution_id": f"108200{idx}.251218-200900", "case_name": f"case1_{date_str}", "compset": "test", "compset_alias": "test_alias", @@ -308,7 +362,7 @@ def test_parses_various_datetime_formats_through_public_api( with patch( "app.features.ingestion.ingest.main_parser", - 
return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -324,6 +378,7 @@ def test_missing_required_fields_raise_validation_error(self, db: Session) -> No mock_simulations = { "/path/to/1081170.251218-200930": { + "execution_id": "1081170.251218-200930", "case_name": None, "compset": None, "compset_alias": "test_alias", @@ -350,7 +405,7 @@ def test_missing_required_fields_raise_validation_error(self, db: Session) -> No with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -374,6 +429,7 @@ def test_machine_lookup_and_validation_through_public_api( # Create valid simulation valid_mock = { "/path/to/1081171.251218-200931": { + "execution_id": "1081171.251218-200931", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -399,7 +455,8 @@ def test_machine_lookup_and_validation_through_public_api( } with patch( - "app.features.ingestion.ingest.main_parser", return_value=(valid_mock, 0) + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(valid_mock), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -411,6 +468,7 @@ def test_machine_lookup_and_validation_through_public_api( # Test with missing machine invalid_mock = { "/path/to/1081172.251218-200932": { + "execution_id": "1081172.251218-200932", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -436,7 +494,8 @@ def test_machine_lookup_and_validation_through_public_api( } with patch( - "app.features.ingestion.ingest.main_parser", return_value=(invalid_mock, 0) + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(invalid_mock), 
0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -478,6 +537,7 @@ def test_timezone_aware_datetime_parsing_through_public_api( mock_simulations = { "/path/to/1081173.251218-200933": { + "execution_id": "1081173.251218-200933", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -504,7 +564,7 @@ def test_timezone_aware_datetime_parsing_through_public_api( with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -528,6 +588,7 @@ def test_handles_optional_fields_through_public_api(self, db: Session) -> None: mock_simulations = { "/path/to/1081174.251218-200934": { + "execution_id": "1081174.251218-200934", "case_name": "case1", "compset": "FHIST", "compset_alias": "FHIST_f09_fe", @@ -555,7 +616,7 @@ def test_handles_optional_fields_through_public_api(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -635,6 +696,7 @@ def test_skips_duplicate_simulations(self, db: Session) -> None: # Try to ingest a simulation with the same execution_id mock_simulations = { "/path/to/1081175.251218-200935": { + "execution_id": "1081175.251218-200935", "case_name": "existing_case", "compset": "FHIST", "compset_alias": "test_alias", @@ -661,7 +723,7 @@ def test_skips_duplicate_simulations(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -718,6 +780,7 @@ def 
test_ingest_archive_counts(self, db: Session) -> None: mock_simulations = { "/path/to/1081176.251218-200936": { + "execution_id": "1081176.251218-200936", "case_name": "existing_case", "compset": "FHIST", "compset_alias": "test_alias", @@ -741,6 +804,7 @@ def test_ingest_archive_counts(self, db: Session) -> None: "last_updated_by": None, }, "/path/to/1081177.251218-200937": { + "execution_id": "1081177.251218-200937", "case_name": "new_case", "compset": "FHIST", "compset_alias": "test_alias", @@ -767,7 +831,7 @@ def test_ingest_archive_counts(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -780,7 +844,7 @@ def test_ingest_archive_counts(self, db: Session) -> None: def test_ingest_archive_empty_archive(self, db: Session) -> None: """Test summary counts when the archive contains no simulations.""" - with patch("app.features.ingestion.ingest.main_parser", return_value=({}, 0)): + with patch("app.features.ingestion.ingest.main_parser", return_value=([], 0)): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db ) @@ -801,6 +865,7 @@ def test_handles_invalid_datetime_gracefully(self, db: Session) -> None: # This will be parsed but not raise an error mock_simulations = { "/path/to/1081178.251218-200938": { + "execution_id": "1081178.251218-200938", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -827,7 +892,7 @@ def test_handles_invalid_datetime_gracefully(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -850,6 +915,7 @@ def 
test_parse_datetime_field_exception_handling(self, db: Session) -> None: # This ensures we exercise the except block in _parse_datetime_field mock_simulations = { "/path/to/1081179.251218-200939": { + "execution_id": "1081179.251218-200939", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -885,7 +951,7 @@ def mock_parse_wrapper(date_str, *args, **kwargs): with ( patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ), patch( "app.features.ingestion.ingest.dateutil_parser.parse", @@ -904,6 +970,7 @@ def test_missing_machine_name_in_metadata(self, db: Session) -> None: """Test error handling when machine name is missing from metadata.""" mock_simulations = { "/path/to/1081180.251218-200940": { + "execution_id": "1081180.251218-200940", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -930,7 +997,7 @@ def test_missing_machine_name_in_metadata(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -947,6 +1014,7 @@ def test_missing_simulation_start_date(self, db: Session) -> None: mock_simulations = { "/path/to/1081181.251218-200941": { + "execution_id": "1081181.251218-200941", "case_name": "case1", "compset": "test", "compset_alias": "test_alias", @@ -973,7 +1041,7 @@ def test_missing_simulation_start_date(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -1055,6 +1123,7 @@ def test_ssh_url_conversion_integrated_in_ingest(self, db: Session) -> None: # SSH URL in 
metadata mock_simulations = { "/path/to/1081182.251218-200942": { + "execution_id": "1081182.251218-200942", "case_name": "case1", "compset": "FHIST", "compset_alias": "test_alias", @@ -1081,7 +1150,7 @@ def test_ssh_url_conversion_integrated_in_ingest(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): ingest_result = ingest_archive( Path("/tmp/archive.zip"), Path("/tmp/out"), db @@ -1137,6 +1206,7 @@ def _create_machine(db: Session, name: str) -> Machine: @staticmethod def _make_metadata( + execution_id: str, case_name: str = "case1", machine: str = "test-machine", simulation_start_date: str = "2020-01-01", @@ -1144,6 +1214,7 @@ def _make_metadata( ) -> dict[str, str | None]: """Build a complete simulation metadata dict with sensible defaults.""" base: dict[str, str | None] = { + "execution_id": execution_id, "case_name": case_name, "compset": "FHIST", "compset_alias": "test_alias", @@ -1177,10 +1248,12 @@ def test_canonical_run_selected_from_multiple_runs(self, db: Session) -> None: # Two runs with the same case_name but different compilers mock_simulations = { "/path/to/1081183.251218-200943": self._make_metadata( + execution_id="1081183.251218-200943", simulation_start_date="2020-01-01", compiler="gcc-11", ), "/path/to/1081184.251218-200944": self._make_metadata( + execution_id="1081184.251218-200944", simulation_start_date="2020-06-01", compiler="gcc-12", ), @@ -1188,7 +1261,7 @@ def test_canonical_run_selected_from_multiple_runs(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1209,10 +1282,12 @@ def test_config_delta_stored_for_non_canonical_run(self, db: Session) -> None: mock_simulations = { 
"/path/to/1081185.251218-200945": self._make_metadata( + execution_id="1081185.251218-200945", simulation_start_date="2020-01-01", compiler="gcc-11", ), "/path/to/1081186.251218-200946": self._make_metadata( + execution_id="1081186.251218-200946", simulation_start_date="2020-06-01", compiler="gcc-12", ), @@ -1220,7 +1295,7 @@ def test_config_delta_stored_for_non_canonical_run(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1245,16 +1320,18 @@ def test_no_delta_when_configs_identical(self, db: Session) -> None: mock_simulations = { "/path/to/1081187.251218-200947": self._make_metadata( + execution_id="1081187.251218-200947", simulation_start_date="2020-01-01", ), "/path/to/1081188.251218-200948": self._make_metadata( + execution_id="1081188.251218-200948", simulation_start_date="2020-06-01", ), } with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1271,16 +1348,18 @@ def test_different_case_names_create_separate_simulations( mock_simulations = { "/path/to/1081189.251218-200949": self._make_metadata( + execution_id="1081189.251218-200949", case_name="case_alpha", ), "/path/to/1081190.251218-200950": self._make_metadata( + execution_id="1081190.251218-200950", case_name="case_beta", ), } with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1344,13 +1423,14 @@ def test_idempotent_reingestion(self, db: Session) -> None: # Now re-ingest with the same execution_id mock_simulations = { 
"/path/to/1081191.251218-200951": self._make_metadata( + execution_id="1081191.251218-200951", simulation_start_date="2020-01-01", ), } with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1414,10 +1494,12 @@ def test_incremental_ingestion_new_run_adds_delta(self, db: Session) -> None: # Ingest archive containing the existing run plus a new one mock_simulations = { "/path/to/1081192.251218-200952": self._make_metadata( + execution_id="1081192.251218-200952", simulation_start_date="2020-01-01", compiler="gcc-11", ), "/path/to/1081193.251218-200953": self._make_metadata( + execution_id="1081193.251218-200953", simulation_start_date="2020-06-01", compiler="gcc-12", ), @@ -1425,7 +1507,7 @@ def test_incremental_ingestion_new_run_adds_delta(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1440,15 +1522,165 @@ def test_incremental_ingestion_new_run_adds_delta(self, db: Session) -> None: assert new_sim.run_config_deltas["compiler"]["canonical"] == "gcc-11" assert new_sim.run_config_deltas["compiler"]["current"] == "gcc-12" + def test_incremental_ingestion_normalizes_git_url_for_delta( + self, db: Session + ) -> None: + """Equivalent SSH/HTTPS git URLs should not produce a config delta.""" + machine = self._create_machine(db, "test-machine") + + user = User( + email="test@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + 
case = Case(name="case1") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1081194.251218-200953", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.UNKNOWN, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + git_repository_url="https://github.com/E3SM-Project/E3SM.git", + ) + db.add(sim) + db.flush() + + assert sim.id is not None + case.canonical_simulation_id = sim.id + db.commit() + + mock_simulations = { + "/path/to/1081194.251218-200956": self._make_metadata( + execution_id="1081194.251218-200956", + simulation_start_date="2020-06-01", + git_repository_url="git@github.com:E3SM-Project/E3SM.git", + ), + } + + with patch( + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), + ): + result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) + + assert result.created_count == 1 + assert len(result.simulations) == 1 + assert result.simulations[0].run_config_deltas is None + + def test_incremental_ingestion_persisted_simulation_type_adds_delta( + self, db: Session + ) -> None: + """Persisted canonical simulation_type differences are reflected in deltas.""" + machine = self._create_machine(db, "test-machine") + + user = User( + email="test@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + case = Case(name="case1") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1081194.251218-200954", + compset="FHIST", + 
compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.PRODUCTION, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + ) + db.add(sim) + db.flush() + + assert sim.id is not None + case.canonical_simulation_id = sim.id + db.commit() + + mock_simulations = { + "/path/to/1081194.251218-200954": self._make_metadata( + execution_id="1081194.251218-200954", + simulation_start_date="2020-01-01", + ), + "/path/to/1081194.251218-200955": self._make_metadata( + execution_id="1081194.251218-200955", + simulation_start_date="2020-06-01", + ), + } + + with patch( + "app.features.ingestion.ingest.main_parser", + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), + ): + result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) + + assert result.duplicate_count == 1 + assert result.created_count == 1 + assert len(result.simulations) == 1 + assert result.simulations[0].run_config_deltas is not None + assert "simulation_type" in result.simulations[0].run_config_deltas + assert result.simulations[0].run_config_deltas["simulation_type"] == { + "canonical": SimulationType.PRODUCTION.value, + "current": SimulationType.UNKNOWN.value, + } + def test_same_case_name_groups_to_same_case(self, db: Session) -> None: """Runs with the same case_name belong to the same Case.""" self._create_machine(db, "test-machine") mock_simulations = { "/path/to/1081195.251218-200955": self._make_metadata( + execution_id="1081195.251218-200955", case_name="case1", ), "/path/to/1081196.251218-200956": self._make_metadata( + execution_id="1081196.251218-200956", case_name="case1", simulation_start_date="2020-06-01", ), @@ -1456,7 +1688,7 @@ def test_same_case_name_groups_to_same_case(self, db: Session) -> None: with patch( "app.features.ingestion.ingest.main_parser", - 
return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1474,16 +1706,18 @@ def test_different_case_name_creates_separate_cases(self, db: Session) -> None: mock_simulations = { "/path/to/1081197.251218-200957": self._make_metadata( + execution_id="1081197.251218-200957", case_name="case_X", ), "/path/to/1081198.251218-200958": self._make_metadata( + execution_id="1081198.251218-200958", case_name="case_Y", ), } with patch( "app.features.ingestion.ingest.main_parser", - return_value=(mock_simulations, 0), + return_value=(_parsed_simulations_from_mapping(mock_simulations), 0), ): result = ingest_archive(Path("/tmp/a.zip"), Path("/tmp/o"), db) @@ -1493,10 +1727,6 @@ def test_different_case_name_creates_separate_cases(self, db: Session) -> None: class TestIngestHelpers: - def test_derive_execution_id_raises_on_empty_path(self) -> None: - with pytest.raises(ValueError, match="Cannot derive execution_id"): - _derive_execution_id("") - def test_get_or_create_case_sets_missing_case_group(self, db: Session) -> None: case = Case(name="case_group_test", case_group=None) db.add(case) @@ -1626,3 +1856,100 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( db=db, ) assert second == first + + def test_parsed_projection_keys_match_config_delta_fields(self) -> None: + parsed = ParsedSimulation( + execution_dir="/path/to/1082001.260305-120001", + execution_id="1082001.260305-120001", + case_name="case1", + case_group=None, + machine="machine", + hpc_username=None, + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign="campaign", + experiment_type="historical", + initialization_type="test", + simulation_start_date="2020-01-01", + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler="gcc", + git_repository_url="https://example.com/repo.git", + 
git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + status="completed", + ) + + projection = _parsed_simulation_to_config_projection(parsed) + + assert set(projection) == Simulation.CONFIG_DELTA_FIELDS + assert projection["simulation_type"] == SimulationType.UNKNOWN.value + + def test_simulation_projection_keys_match_config_delta_fields( + self, db: Session + ) -> None: + machine = Machine( + name="projection-machine", + site="Test Site", + architecture="x86_64", + scheduler="SLURM", + gpu=False, + ) + db.add(machine) + + user = User( + email="projection@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + case = Case(name="projection_case") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1082002.260305-120002", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.TEST, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + campaign="campaign", + experiment_type="historical", + compiler="gcc", + git_repository_url="https://example.com/repo.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + ) + db.add(sim) + db.commit() + + projection = _simulation_to_config_projection(sim) + + assert set(projection) == Simulation.CONFIG_DELTA_FIELDS + assert projection["simulation_type"] == SimulationType.TEST.value From 977f1db48e77a46062399585788c0f7ea9295c81 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 14:41:04 -0700 Subject: [PATCH 06/10] Refactor ingest config snapshots --- 
backend/app/features/ingestion/ingest.py | 293 ++++++++---------- .../app/features/simulation/config_delta.py | 44 +++ backend/app/features/simulation/models.py | 19 +- .../tests/features/ingestion/test_ingest.py | 243 ++++++++++++++- 4 files changed, 408 insertions(+), 191 deletions(-) create mode 100644 backend/app/features/simulation/config_delta.py diff --git a/backend/app/features/ingestion/ingest.py b/backend/app/features/ingestion/ingest.py index 3e5f65db..31704064 100644 --- a/backend/app/features/ingestion/ingest.py +++ b/backend/app/features/ingestion/ingest.py @@ -31,12 +31,12 @@ from app.features.ingestion.parsers.parser import main_parser from app.features.ingestion.parsers.types import ParsedSimulation from app.features.machine.models import Machine +from app.features.simulation.config_delta import SimulationConfigSnapshot from app.features.simulation.enums import SimulationStatus, SimulationType from app.features.simulation.models import Case, Simulation from app.features.simulation.schemas import SimulationCreate logger = _setup_custom_logger(__name__) -ConfigProjection = dict[str, str | None] @dataclass @@ -66,6 +66,37 @@ class IngestArchiveResult: errors: list[dict[str, str]] = field(default_factory=list) +@dataclass(frozen=True) +class SimulationCreateDraft: + """Normalized internal payload validated into ``SimulationCreate``.""" + + case_id: UUID + execution_id: str + compset: str | None + compset_alias: str | None + grid_name: str | None + grid_resolution: str | None + simulation_type: SimulationType + status: SimulationStatus + campaign: str | None + experiment_type: str | None + initialization_type: str | None + machine_id: UUID + simulation_start_date: datetime | None + simulation_end_date: datetime | None + run_start_date: datetime | None + run_end_date: datetime | None + compiler: str | None + git_repository_url: str | None + git_branch: str | None + git_tag: str | None + git_commit_hash: str | None + created_by: UUID | None + 
last_updated_by: UUID | None + hpc_username: str | None + run_config_deltas: dict[str, dict[str, str | None]] | None = None + + def ingest_archive( archive_path: Path | str, output_dir: Path | str, @@ -132,8 +163,8 @@ def ingest_archive( simulations: list[SimulationCreate] = [] duplicate_count = 0 errors: list[dict[str, str]] = [] - canonical_cache: dict[str, ConfigProjection] = {} - persisted_canonical_cache: dict[UUID, ConfigProjection | None] = {} + canonical_cache: dict[str, SimulationConfigSnapshot] = {} + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None] = {} for parsed_simulation in parsed_simulations: try: @@ -181,8 +212,8 @@ def ingest_archive( def _process_simulation_for_ingest( parsed_simulation: ParsedSimulation, db: Session, - canonical_cache: dict[str, ConfigProjection], - persisted_canonical_cache: dict[UUID, ConfigProjection | None], + canonical_cache: dict[str, SimulationConfigSnapshot], + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None], ) -> tuple[SimulationCreate | None, bool]: """Process one parsed simulation entry. @@ -192,9 +223,9 @@ def _process_simulation_for_ingest( Parsed archive-derived metadata for the simulation. db : Session Active database session for lookups and case resolution. - canonical_cache : dict[str, ConfigProjection] + canonical_cache : dict[str, SimulationConfigSnapshot] In-memory cache of canonical config values per case_name for the current batch. - persisted_canonical_cache : dict[UUID, ConfigProjection | None] + persisted_canonical_cache : dict[UUID, SimulationConfigSnapshot | None] Cache of canonical metadata loaded from the database by case_id. 
Returns @@ -270,21 +301,19 @@ def _is_duplicate_simulation( def _seed_canonical_cache_from_duplicate( case_name: str, parsed_simulation: ParsedSimulation, - canonical_cache: dict[str, ConfigProjection], + canonical_cache: dict[str, SimulationConfigSnapshot], ) -> None: """Seed per-case canonical cache using duplicate metadata when needed.""" if case_name not in canonical_cache: - canonical_cache[case_name] = _parsed_simulation_to_config_projection( - parsed_simulation - ) + canonical_cache[case_name] = _build_config_snapshot(parsed_simulation) def _build_simulation_create( parsed_simulation: ParsedSimulation, machine_id: UUID, case: Case, - canonical_cache: dict[str, ConfigProjection], - persisted_canonical_cache: dict[UUID, ConfigProjection | None], + canonical_cache: dict[str, SimulationConfigSnapshot], + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None], db: Session, ) -> SimulationCreate: """Create a SimulationCreate using canonical baseline semantics. @@ -297,15 +326,15 @@ def _build_simulation_create( Resolved machine ID from the database. case : Case Resolved Case object for this simulation. - canonical_cache : dict[str, ConfigProjection] + canonical_cache : dict[str, SimulationConfigSnapshot] In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, ConfigProjection | None] + persisted_canonical_cache : dict[UUID, SimulationConfigSnapshot | None] Cache of canonical metadata loaded from the database by case_id. db : Session Active database session for lookups and case resolution. 
""" case_name = case.name - canonical_metadata = _get_canonical_metadata_for_case( + canonical_snapshot = _get_canonical_metadata_for_case( case=case, case_name=case_name, canonical_cache=canonical_cache, @@ -313,13 +342,15 @@ def _build_simulation_create( db=db, ) - if canonical_metadata is None: - canonical_cache[case_name] = _parsed_simulation_to_config_projection( - parsed_simulation - ) + if canonical_snapshot is None: + canonical_cache[case_name] = _build_config_snapshot(parsed_simulation) - simulation = _map_parsed_simulation_to_schema( - parsed_simulation, machine_id, case.id + simulation = _validate_simulation_create( + _build_simulation_create_draft( + parsed_simulation=parsed_simulation, + machine_id=machine_id, + case_id=case.id, + ) ) logger.info( "Mapped canonical simulation from %s: %s", @@ -329,18 +360,15 @@ def _build_simulation_create( return simulation - delta = _compute_config_delta( - canonical_metadata, - _parsed_simulation_to_config_projection(parsed_simulation), - ) + delta = canonical_snapshot.diff(_build_config_snapshot(parsed_simulation)) run_config_deltas = delta if delta else None - - simulation = _map_parsed_simulation_to_schema( - parsed_simulation, - machine_id, - case.id, + simulation_draft = _build_simulation_create_draft( + parsed_simulation=parsed_simulation, + machine_id=machine_id, + case_id=case.id, run_config_deltas=run_config_deltas, ) + simulation = _validate_simulation_create(simulation_draft) if delta: logger.info( @@ -360,10 +388,10 @@ def _build_simulation_create( def _get_canonical_metadata_for_case( case: Case, case_name: str, - canonical_cache: dict[str, ConfigProjection], - persisted_canonical_cache: dict[UUID, ConfigProjection | None], + canonical_cache: dict[str, SimulationConfigSnapshot], + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None], db: Session, -) -> ConfigProjection | None: +) -> SimulationConfigSnapshot | None: """Resolve canonical metadata from persisted canonical or batch cache. 
This function is useful for ensuring that all simulations of the same case @@ -375,15 +403,15 @@ def _get_canonical_metadata_for_case( The Case object for which to retrieve canonical metadata. case_name : str The name of the case, used for in-memory cache lookup. - canonical_cache : dict[str, ConfigProjection] + canonical_cache : dict[str, SimulationConfigSnapshot] In-memory cache of canonical metadata per case_name for the current batch. - persisted_canonical_cache : dict[UUID, ConfigProjection | None] + persisted_canonical_cache : dict[UUID, SimulationConfigSnapshot | None] Cache of canonical metadata loaded from the database by case_id. Returns ------- - ConfigProjection | None - The canonical metadata for the case, or None if no canonical run exists. + SimulationConfigSnapshot | None + The canonical config snapshot for the case, or None if no canonical run exists. """ if case.canonical_simulation_id is not None: if case.id in persisted_canonical_cache: @@ -396,10 +424,10 @@ def _get_canonical_metadata_for_case( ) if canonical_sim: - canonical_metadata = _simulation_to_config_projection(canonical_sim) - persisted_canonical_cache[case.id] = canonical_metadata + canonical_snapshot = _build_config_snapshot(canonical_sim) + persisted_canonical_cache[case.id] = canonical_snapshot - return canonical_metadata + return canonical_snapshot persisted_canonical_cache[case.id] = None return None @@ -449,44 +477,30 @@ def _get_or_create_case(db: Session, name: str, case_group: str | None = None) - return case -def _parsed_simulation_to_config_projection( - parsed_simulation: ParsedSimulation, -) -> ConfigProjection: - """Return config-comparison fields from a parsed simulation.""" - return { - "compset": parsed_simulation.compset, - "compset_alias": parsed_simulation.compset_alias, - "grid_name": parsed_simulation.grid_name, - "grid_resolution": parsed_simulation.grid_resolution, - "initialization_type": parsed_simulation.initialization_type, - "compiler": 
parsed_simulation.compiler, - "git_tag": parsed_simulation.git_tag, - "git_commit_hash": parsed_simulation.git_commit_hash, - "git_branch": parsed_simulation.git_branch, - "git_repository_url": _normalize_git_url(parsed_simulation.git_repository_url), - "campaign": parsed_simulation.campaign, - "experiment_type": parsed_simulation.experiment_type, - "simulation_type": SimulationType.UNKNOWN.value, - } - - -def _simulation_to_config_projection(sim: Simulation) -> ConfigProjection: - """Return config-comparison fields from a persisted Simulation.""" - return { - "compset": sim.compset, - "compset_alias": sim.compset_alias, - "grid_name": sim.grid_name, - "grid_resolution": sim.grid_resolution, - "initialization_type": sim.initialization_type, - "compiler": sim.compiler, - "git_tag": sim.git_tag, - "git_commit_hash": sim.git_commit_hash, - "git_branch": sim.git_branch, - "git_repository_url": _stringify_config_value(sim.git_repository_url), - "campaign": sim.campaign, - "experiment_type": _stringify_config_value(sim.experiment_type), - "simulation_type": _stringify_config_value(sim.simulation_type), - } +def _build_config_snapshot( + source: ParsedSimulation | Simulation, +) -> SimulationConfigSnapshot: + """Return a normalized config snapshot for canonical delta comparison.""" + snapshot_values: dict[str, str | None] = {} + + for field_name in SimulationConfigSnapshot.field_names(): + if field_name == "simulation_type" and isinstance(source, ParsedSimulation): + snapshot_values[field_name] = SimulationType.UNKNOWN.value + continue + + value = getattr(source, field_name) + + if isinstance(source, ParsedSimulation): + normalized_value = value + else: + normalized_value = _stringify_config_value(value) + + if field_name == "git_repository_url": + normalized_value = _normalize_git_url(normalized_value) + + snapshot_values[field_name] = normalized_value + + return SimulationConfigSnapshot(**snapshot_values) def _stringify_config_value(value: object) -> str | None: @@ 
-504,39 +518,6 @@ def _stringify_config_value(value: object) -> str | None: return str(value) -def _compute_config_delta( - canonical: ConfigProjection, - other: ConfigProjection, -) -> dict[str, dict[str, str | None]]: - """Compare two run metadata dicts and return configuration differences. - - Only the fields in :data:`_CONFIG_DELTA_FIELDS` are compared. - - Parameters - ---------- - canonical : ConfigProjection - The canonical run metadata to compare against. - other : ConfigProjection - The other run metadata to compare. - - Returns - ------- - dict[str, dict[str, str | None]] - Mapping of field name → ``{"canonical": ..., "current": ...}`` - for every field that differs. Empty dict when runs are - identical. - """ - delta: dict[str, dict[str, str | None]] = {} - - for key in Simulation.CONFIG_DELTA_FIELDS: - canonical_val = canonical.get(key) - other_val = other.get(key) - if canonical_val != other_val: - delta[key] = {"canonical": canonical_val, "current": other_val} - - return delta - - def _resolve_machine_id(metadata: ParsedSimulation, db: Session) -> UUID: """Resolve machine name to machine ID from the database. @@ -638,13 +619,13 @@ def _normalize_git_url(url: str | None) -> str | None: return url -def _map_parsed_simulation_to_schema( +def _build_simulation_create_draft( parsed_simulation: ParsedSimulation, machine_id: UUID, case_id: UUID, run_config_deltas: dict[str, dict[str, str | None]] | None = None, -) -> SimulationCreate: - """Map parser metadata to SimulationCreate schema with type conversions. +) -> SimulationCreateDraft: + """Build a normalized internal draft for ``SimulationCreate`` validation. Parameters ---------- @@ -659,11 +640,10 @@ def _map_parsed_simulation_to_schema( Returns ------- - SimulationCreate - Schema object ready for database insertion. + SimulationCreateDraft + Typed ingest draft ready for schema validation. 
""" - # Parse datetime fields using the shared utility function - # Note: simulation_start_date is already validated in _extract_simulation_key() + # Parse datetime fields using the shared utility function. simulation_start_date = _parse_datetime_field( parsed_simulation.simulation_start_date ) @@ -676,47 +656,44 @@ def _map_parsed_simulation_to_schema( simulation_type = _normalize_simulation_type(None) status = _normalize_simulation_status(parsed_simulation.status) - # Map metadata to schema; Pydantic will validate required fields - # Note: SimulationCreate uses CamelInBaseModel which expects camelCase field names - result = SimulationCreate.model_validate( - { - # Required identification fields - "caseId": case_id, - "executionId": parsed_simulation.execution_id, - # Required configuration fields - "compset": parsed_simulation.compset, - "compsetAlias": parsed_simulation.compset_alias, - "gridName": parsed_simulation.grid_name, - "gridResolution": parsed_simulation.grid_resolution, - # Required status fields with sensible defaults - "simulationType": simulation_type, - "status": status, - "initializationType": parsed_simulation.initialization_type, - "machineId": machine_id, - "simulationStartDate": simulation_start_date, - "simulationEndDate": simulation_end_date, - "experimentType": parsed_simulation.experiment_type, - "campaign": parsed_simulation.campaign, - "runStartDate": run_start_date, - "runEndDate": run_end_date, - "compiler": parsed_simulation.compiler, - "gitRepositoryUrl": git_repository_url, - "gitBranch": parsed_simulation.git_branch, - "gitTag": parsed_simulation.git_tag, - "gitCommitHash": parsed_simulation.git_commit_hash, - # Note: created_by and last_updated_by are set to None since archive - # metadata contains local usernames that cannot be reliably mapped to - # database user UUIDs. The API endpoint will set these values based on - # the authenticated user who uploaded the archive. 
- "createdBy": None, - "lastUpdatedBy": None, - "hpcUsername": parsed_simulation.hpc_username, - # Canonical run semantics - "runConfigDeltas": run_config_deltas, - } + simulation_draft = SimulationCreateDraft( + case_id=case_id, + execution_id=parsed_simulation.execution_id, + compset=parsed_simulation.compset, + compset_alias=parsed_simulation.compset_alias, + grid_name=parsed_simulation.grid_name, + grid_resolution=parsed_simulation.grid_resolution, + simulation_type=simulation_type, + status=status, + campaign=parsed_simulation.campaign, + experiment_type=parsed_simulation.experiment_type, + initialization_type=parsed_simulation.initialization_type, + machine_id=machine_id, + simulation_start_date=simulation_start_date, + simulation_end_date=simulation_end_date, + run_start_date=run_start_date, + run_end_date=run_end_date, + compiler=parsed_simulation.compiler, + git_repository_url=git_repository_url, + git_branch=parsed_simulation.git_branch, + git_tag=parsed_simulation.git_tag, + git_commit_hash=parsed_simulation.git_commit_hash, + created_by=None, + last_updated_by=None, + hpc_username=parsed_simulation.hpc_username, + run_config_deltas=run_config_deltas, ) - return result + return simulation_draft + + +def _validate_simulation_create(draft: SimulationCreateDraft) -> SimulationCreate: + """Validate a typed ingest draft into ``SimulationCreate``.""" + return SimulationCreate.model_validate( + draft, + by_name=True, + from_attributes=True, + ) def _normalize_simulation_type(value: str | None) -> SimulationType: diff --git a/backend/app/features/simulation/config_delta.py b/backend/app/features/simulation/config_delta.py new file mode 100644 index 00000000..7db24763 --- /dev/null +++ b/backend/app/features/simulation/config_delta.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass +from dataclasses import fields as dataclass_fields + + +@dataclass(frozen=True) +class SimulationConfigSnapshot: + """Normalized configuration fields used for canonical delta 
comparison.""" + + compset: str | None + compset_alias: str | None + grid_name: str | None + grid_resolution: str | None + initialization_type: str | None + compiler: str | None + git_tag: str | None + git_commit_hash: str | None + git_branch: str | None + git_repository_url: str | None + campaign: str | None + experiment_type: str | None + simulation_type: str | None + + @classmethod + def field_names(cls) -> frozenset[str]: + """Return the config field set tracked for canonical comparisons.""" + return frozenset(field.name for field in dataclass_fields(cls)) + + def diff( + self, other: "SimulationConfigSnapshot" + ) -> dict[str, dict[str, str | None]]: + """Return changed config fields relative to another snapshot.""" + delta: dict[str, dict[str, str | None]] = {} + + for field in dataclass_fields(self): + field_name = field.name + canonical_value = getattr(self, field_name) + current_value = getattr(other, field_name) + if canonical_value != current_value: + delta[field_name] = { + "canonical": canonical_value, + "current": current_value, + } + + return delta diff --git a/backend/app/features/simulation/models.py b/backend/app/features/simulation/models.py index b969ebea..2f73a503 100644 --- a/backend/app/features/simulation/models.py +++ b/backend/app/features/simulation/models.py @@ -14,6 +14,7 @@ from app.common.models.base import Base from app.common.models.mixins import IDMixin, TimestampMixin +from app.features.simulation.config_delta import SimulationConfigSnapshot from app.features.simulation.enums import ( ArtifactKind, ExternalLinkKind, @@ -177,22 +178,8 @@ class Simulation(Base, IDMixin, TimestampMixin): # Fields used to compute configuration deltas relative to canonical execution. # Excludes timeline, status, and provenance fields that are expected to vary. 
- CONFIG_DELTA_FIELDS: ClassVar[frozenset[str]] = frozenset( - { - "compset", - "compset_alias", - "grid_name", - "grid_resolution", - "initialization_type", - "compiler", - "git_tag", - "git_commit_hash", - "git_branch", - "git_repository_url", - "campaign", - "experiment_type", - "simulation_type", - } + CONFIG_DELTA_FIELDS: ClassVar[frozenset[str]] = ( + SimulationConfigSnapshot.field_names() ) diff --git a/backend/tests/features/ingestion/test_ingest.py b/backend/tests/features/ingestion/test_ingest.py index a84d94a2..0679c5b2 100644 --- a/backend/tests/features/ingestion/test_ingest.py +++ b/backend/tests/features/ingestion/test_ingest.py @@ -7,13 +7,15 @@ from sqlalchemy.orm import Session from app.features.ingestion.ingest import ( + SimulationCreateDraft, + _build_config_snapshot, + _build_simulation_create_draft, _get_canonical_metadata_for_case, _get_or_create_case, _normalize_git_url, _normalize_simulation_status, _normalize_simulation_type, - _parsed_simulation_to_config_projection, - _simulation_to_config_projection, + _validate_simulation_create, ingest_archive, ) from app.features.ingestion.models import ( @@ -23,6 +25,7 @@ ) from app.features.ingestion.parsers.types import ParsedSimulation from app.features.machine.models import Machine +from app.features.simulation.config_delta import SimulationConfigSnapshot from app.features.simulation.enums import SimulationStatus, SimulationType from app.features.simulation.models import Case, Simulation from app.features.simulation.schemas import SimulationCreate @@ -1761,8 +1764,8 @@ def test_get_canonical_metadata_caches_missing_persisted_canonical(self) -> None db = MagicMock(spec=Session) db.query.return_value.filter.return_value.first.return_value = None - canonical_cache: dict[str, dict[str, str | None]] = {} - persisted_canonical_cache: dict[UUID, dict[str, str | None] | None] = {} + canonical_cache: dict[str, SimulationConfigSnapshot] = {} + persisted_canonical_cache: dict[UUID, 
SimulationConfigSnapshot | None] = {} result = _get_canonical_metadata_for_case( case=case, @@ -1834,8 +1837,8 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( case.canonical_simulation_id = sim.id db.commit() - canonical_cache: dict[str, dict[str, str | None]] = {} - persisted_canonical_cache: dict[UUID, dict[str, str | None] | None] = {} + canonical_cache: dict[str, SimulationConfigSnapshot] = {} + persisted_canonical_cache: dict[UUID, SimulationConfigSnapshot | None] = {} first = _get_canonical_metadata_for_case( case=case, @@ -1845,7 +1848,7 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( db=db, ) assert first is not None - assert first["compiler"] == "gcc-11" + assert first.compiler == "gcc-11" with patch.object(db, "query", side_effect=AssertionError): second = _get_canonical_metadata_for_case( @@ -1857,7 +1860,10 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( ) assert second == first - def test_parsed_projection_keys_match_config_delta_fields(self) -> None: + def test_snapshot_field_names_match_config_delta_fields(self) -> None: + assert SimulationConfigSnapshot.field_names() == Simulation.CONFIG_DELTA_FIELDS + + def test_parsed_snapshot_defaults_simulation_type_to_unknown(self) -> None: parsed = ParsedSimulation( execution_dir="/path/to/1082001.260305-120001", execution_id="1082001.260305-120001", @@ -1884,14 +1890,11 @@ def test_parsed_projection_keys_match_config_delta_fields(self) -> None: status="completed", ) - projection = _parsed_simulation_to_config_projection(parsed) + snapshot = _build_config_snapshot(parsed) - assert set(projection) == Simulation.CONFIG_DELTA_FIELDS - assert projection["simulation_type"] == SimulationType.UNKNOWN.value + assert snapshot.simulation_type == SimulationType.UNKNOWN.value - def test_simulation_projection_keys_match_config_delta_fields( - self, db: Session - ) -> None: + def test_persisted_snapshot_matches_config_delta_fields(self, db: 
Session) -> None: machine = Machine( name="projection-machine", site="Test Site", @@ -1949,7 +1952,213 @@ def test_simulation_projection_keys_match_config_delta_fields( db.add(sim) db.commit() - projection = _simulation_to_config_projection(sim) + snapshot = _build_config_snapshot(sim) + + assert snapshot.simulation_type == SimulationType.TEST.value + assert set(snapshot.field_names()) == Simulation.CONFIG_DELTA_FIELDS + + def test_snapshot_normalizes_ssh_git_url_for_equality(self, db: Session) -> None: + parsed = ParsedSimulation( + execution_dir="/path/to/1082003.260305-120003", + execution_id="1082003.260305-120003", + case_name="case1", + case_group=None, + machine="machine", + hpc_username=None, + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign="campaign", + experiment_type="historical", + initialization_type="test", + simulation_start_date="2020-01-01", + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler="gcc", + git_repository_url="git@github.com:E3SM-Project/E3SM.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + status="completed", + ) + + machine = Machine( + name="snapshot-machine", + site="Test Site", + architecture="x86_64", + scheduler="SLURM", + gpu=False, + ) + db.add(machine) + + user = User( + email="snapshot@example.com", + is_active=True, + is_verified=True, + ) + db.add(user) + db.commit() + + ingestion = Ingestion( + source_type=IngestionSourceType.HPC_PATH, + source_reference="/archive", + status=IngestionStatus.SUCCESS, + machine_id=machine.id, + triggered_by=user.id, + ) + db.add(ingestion) + db.commit() + + case = Case(name="snapshot_case") + db.add(case) + db.flush() + + sim = Simulation( + case_id=case.id, + execution_id="1082004.260305-120004", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + machine_id=machine.id, + simulation_start_date=datetime(2020, 1, 1), + 
initialization_type="test", + status=SimulationStatus.CREATED, + simulation_type=SimulationType.UNKNOWN, + created_by=user.id, + last_updated_by=user.id, + ingestion_id=ingestion.id, + campaign="campaign", + experiment_type="historical", + compiler="gcc", + git_repository_url="https://github.com/E3SM-Project/E3SM.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + ) + db.add(sim) + db.commit() + + parsed_snapshot = _build_config_snapshot(parsed) + persisted_snapshot = _build_config_snapshot(sim) + + assert parsed_snapshot == persisted_snapshot + assert parsed_snapshot.diff(persisted_snapshot) == {} + + def test_snapshot_diff_returns_only_changed_fields(self) -> None: + canonical = SimulationConfigSnapshot( + compset="FHIST", + compset_alias="alias1", + grid_name="grid1", + grid_resolution="0.9x1.25", + initialization_type="initial", + compiler="gcc-11", + git_tag="v1.0.0", + git_commit_hash="abc123", + git_branch="main", + git_repository_url="https://example.com/repo.git", + campaign="campaign1", + experiment_type="historical", + simulation_type=SimulationType.UNKNOWN.value, + ) + current = SimulationConfigSnapshot( + compset="FHIST", + compset_alias="alias1", + grid_name="grid1", + grid_resolution="0.9x1.25", + initialization_type="initial", + compiler="gcc-12", + git_tag="v1.0.0", + git_commit_hash="abc123", + git_branch="feature", + git_repository_url="https://example.com/repo.git", + campaign="campaign1", + experiment_type="historical", + simulation_type=SimulationType.UNKNOWN.value, + ) + + delta = canonical.diff(current) + + assert delta == { + "compiler": {"canonical": "gcc-11", "current": "gcc-12"}, + "git_branch": {"canonical": "main", "current": "feature"}, + } + + def test_simulation_create_draft_validates_by_field_name(self) -> None: + draft = SimulationCreateDraft( + case_id=uuid4(), + execution_id="1082005.260305-120005", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + 
simulation_type=SimulationType.UNKNOWN, + status=SimulationStatus.CREATED, + campaign="campaign", + experiment_type="historical", + initialization_type="test", + machine_id=uuid4(), + simulation_start_date=datetime(2020, 1, 1), + simulation_end_date=None, + run_start_date=None, + run_end_date=None, + compiler="gcc", + git_repository_url="https://example.com/repo.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + created_by=None, + last_updated_by=None, + hpc_username="test-user", + run_config_deltas=None, + ) + + schema = _validate_simulation_create(draft) + + assert isinstance(schema, SimulationCreate) + assert schema.execution_id == draft.execution_id + assert schema.extra == {} + assert schema.artifacts == [] + assert schema.links == [] + assert schema.run_config_deltas is None + + def test_build_simulation_create_draft_normalizes_values(self) -> None: + parsed = ParsedSimulation( + execution_dir="/path/to/1082006.260305-120006", + execution_id="1082006.260305-120006", + case_name="case1", + case_group=None, + machine="machine", + hpc_username="test-user", + compset="FHIST", + compset_alias="test_alias", + grid_name="grid1", + grid_resolution="0.9x1.25", + campaign="campaign", + experiment_type="historical", + initialization_type="test", + simulation_start_date="2020-01-01", + simulation_end_date=None, + run_start_date="2020-01-02T03:04:05Z", + run_end_date=None, + compiler="gcc", + git_repository_url="git@github.com:E3SM-Project/E3SM.git", + git_branch="main", + git_tag="v1.0.0", + git_commit_hash="abc123", + status="completed", + ) + + draft = _build_simulation_create_draft( + parsed_simulation=parsed, + machine_id=uuid4(), + case_id=uuid4(), + ) - assert set(projection) == Simulation.CONFIG_DELTA_FIELDS - assert projection["simulation_type"] == SimulationType.TEST.value + assert draft.simulation_type == SimulationType.UNKNOWN + assert draft.status == SimulationStatus.COMPLETED + assert draft.git_repository_url == 
"https://github.com/E3SM-Project/E3SM.git" + assert draft.simulation_start_date is not None + assert draft.run_start_date is not None From fb245caec2f5cffdc08eaaaeb8efbbd422c5c436 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 14:48:10 -0700 Subject: [PATCH 07/10] Fix simulation provenance displays --- backend/app/features/simulation/schemas.py | 7 +++++ frontend/src/features/browse/BrowsePage.tsx | 22 ++++++++++++++++ .../components/BrowseFiltersSidePanel.tsx | 26 ++++++++++++++++--- .../components/SimulationDetailsView.tsx | 6 +++++ frontend/src/types/simulation.ts | 10 +++++++ 5 files changed, 67 insertions(+), 4 deletions(-) diff --git a/backend/app/features/simulation/schemas.py b/backend/app/features/simulation/schemas.py index f4772cd7..cf9be94f 100644 --- a/backend/app/features/simulation/schemas.py +++ b/backend/app/features/simulation/schemas.py @@ -535,6 +535,13 @@ class SimulationOut(CamelOutBaseModel): UserPreview | None, Field(description="Full user info of who last updated the simulation."), ] + hpc_username: Annotated[ + str | None, + Field( + None, + description="HPC username for provenance (trusted, informational only)", + ), + ] # Miscellaneous # ----------------- diff --git a/frontend/src/features/browse/BrowsePage.tsx b/frontend/src/features/browse/BrowsePage.tsx index f3286438..c46f1e14 100644 --- a/frontend/src/features/browse/BrowsePage.tsx +++ b/frontend/src/features/browse/BrowsePage.tsx @@ -44,6 +44,7 @@ export interface FilterState { // Metadata & Provenance gitTag: string[]; createdBy: string[]; + hpcUsername: string[]; } interface BrowsePageProps { @@ -75,6 +76,7 @@ const createEmptyFilters = (): FilterState => ({ // Metadata & Provenance gitTag: [], createdBy: [], + hpcUsername: [], }); const parseViewMode = (params: URLSearchParams): 'grid' | 'table' => @@ -190,6 +192,20 @@ export const BrowsePage = ({ return sortedMachines; }, [simulations]); + const creatorOptions = useMemo(() => { + const creators = new Map(); + + for 
(const simulation of simulations) { + if (!simulation.createdBy) continue; + + creators.set(simulation.createdBy, simulation.createdByUser?.email ?? simulation.createdBy); + } + + return Array.from(creators, ([value, label]) => ({ value, label })).sort((a, b) => + a.label.localeCompare(b.label, undefined, { sensitivity: 'base' }), + ); + }, [simulations]); + const filteredData = useMemo(() => { const arrayFilterGetters: Partial< Record< @@ -209,6 +225,7 @@ export const BrowsePage = ({ status: (rec) => rec.status ?? [], gitTag: (rec) => rec.gitTag ?? [], createdBy: (rec) => rec.createdBy ?? [], + hpcUsername: (rec) => rec.hpcUsername ?? [], }; return simulations.filter((record) => { @@ -441,6 +458,7 @@ export const BrowsePage = ({ availableFilters={availableFilters} onChange={setAppliedFilters} machineOptions={machineOptions} + creatorOptions={creatorOptions} caseOptions={caseOptions} selectedCaseName={selectedCaseName} onCaseNameChange={handleCaseNameChange} @@ -510,6 +528,8 @@ export const BrowsePage = ({ const display = key === 'machineId' ? (machineOptions.find((opt) => opt.value === value)?.label ?? value) + : key === 'createdBy' + ? (creatorOptions.find((opt) => opt.value === value)?.label ?? value) : value; return ( opt.value === values)?.label ?? values) + : key === 'createdBy' + ? (creatorOptions.find((opt) => opt.value === values)?.label ?? values) : values; return ( void; machineOptions: { value: string; label: string }[]; + creatorOptions: { value: string; label: string }[]; caseOptions: { value: string; label: string }[]; selectedCaseName: string; onCaseNameChange: (caseName: string) => void; @@ -20,6 +21,7 @@ export const BrowseFiltersSidePanel = ({ availableFilters, onChange, machineOptions, + creatorOptions, caseOptions, selectedCaseName, onCaseNameChange, @@ -222,15 +224,31 @@ export const BrowseFiltersSidePanel = ({ ({ - value: id, - label: id, - }))} + options={ + creatorOptions.length > 0 + ? 
creatorOptions + : (availableFilters.createdBy || []).map((id) => ({ + value: id, + label: id, + })) + } defaultValue={appliedFilters.createdBy || []} onValueChange={(next) => handleChange('createdBy', next as string[])} placeholder="Select creators" resetOnDefaultValueChange={true} /> + + + ({ + value: id, + label: id, + }))} + defaultValue={appliedFilters.hpcUsername || []} + onValueChange={(next) => handleChange('hpcUsername', next as string[])} + placeholder="Select HPC usernames" + resetOnDefaultValueChange={true} + /> ); diff --git a/frontend/src/features/simulations/components/SimulationDetailsView.tsx b/frontend/src/features/simulations/components/SimulationDetailsView.tsx index abdf0bdb..bc5bb571 100644 --- a/frontend/src/features/simulations/components/SimulationDetailsView.tsx +++ b/frontend/src/features/simulations/components/SimulationDetailsView.tsx @@ -366,6 +366,12 @@ export const SimulationDetailsView = ({

{simulation.gitCommitHash ?? '—'}

+
+ +

{simulation.hpcUsername ?? '—'}

+
diff --git a/frontend/src/types/simulation.ts b/frontend/src/types/simulation.ts index b0061096..c70a9a21 100644 --- a/frontend/src/types/simulation.ts +++ b/frontend/src/types/simulation.ts @@ -2,6 +2,13 @@ import type { ArtifactIn, ArtifactOut } from '@/types/artifact'; import type { ExternalLinkIn, ExternalLinkOut } from '@/types/link'; import type { Machine } from '@/types/machine'; +export interface SimulationUserPreview { + id: string; + email: string; + role: string; + full_name?: string | null; +} + /** * API response model for a Case with nested simulation summaries. */ @@ -79,6 +86,7 @@ export interface SimulationCreate { // ~~~~~~~~~~~~~~~~~~~~~~~ createdBy?: string | null; lastUpdatedBy?: string | null; + hpcUsername?: string | null; // Miscellaneous // ~~~~~~~~~~~~~~~~~ @@ -115,6 +123,8 @@ export interface SimulationOut extends SimulationCreate { // ~~~~~~~~~~~~~~~~~~~~~~~ createdAt: string; // Server-managed field updatedAt: string; // Server-managed field + createdByUser?: SimulationUserPreview | null; + lastUpdatedByUser?: SimulationUserPreview | null; // Relationships // ~~~~~~~~~~~~~~ From 540649e309fc8c2e2e03ad4094b961e994eafb41 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 15:04:59 -0700 Subject: [PATCH 08/10] Remove unused config delta field alias --- backend/app/features/simulation/models.py | 9 +-------- backend/tests/features/ingestion/test_ingest.py | 4 ---- backend/tests/features/simulation/test_models.py | 7 ------- 3 files changed, 1 insertion(+), 19 deletions(-) delete mode 100644 backend/tests/features/simulation/test_models.py diff --git a/backend/app/features/simulation/models.py b/backend/app/features/simulation/models.py index 2f73a503..abe10090 100644 --- a/backend/app/features/simulation/models.py +++ b/backend/app/features/simulation/models.py @@ -3,7 +3,7 @@ from __future__ import annotations from datetime import datetime -from typing import TYPE_CHECKING, Any, ClassVar, Optional +from typing import TYPE_CHECKING, 
Any, Optional from uuid import UUID from sqlalchemy import DateTime, ForeignKey, Integer, String, Text @@ -14,7 +14,6 @@ from app.common.models.base import Base from app.common.models.mixins import IDMixin, TimestampMixin -from app.features.simulation.config_delta import SimulationConfigSnapshot from app.features.simulation.enums import ( ArtifactKind, ExternalLinkKind, @@ -176,12 +175,6 @@ class Simulation(Base, IDMixin, TimestampMixin): back_populates="simulation", cascade="all, delete-orphan" ) - # Fields used to compute configuration deltas relative to canonical execution. - # Excludes timeline, status, and provenance fields that are expected to vary. - CONFIG_DELTA_FIELDS: ClassVar[frozenset[str]] = ( - SimulationConfigSnapshot.field_names() - ) - class Artifact(Base, IDMixin, TimestampMixin): __tablename__ = "artifacts" diff --git a/backend/tests/features/ingestion/test_ingest.py b/backend/tests/features/ingestion/test_ingest.py index 0679c5b2..e7dae0c1 100644 --- a/backend/tests/features/ingestion/test_ingest.py +++ b/backend/tests/features/ingestion/test_ingest.py @@ -1860,9 +1860,6 @@ def test_get_canonical_metadata_uses_persisted_cache_on_second_lookup( ) assert second == first - def test_snapshot_field_names_match_config_delta_fields(self) -> None: - assert SimulationConfigSnapshot.field_names() == Simulation.CONFIG_DELTA_FIELDS - def test_parsed_snapshot_defaults_simulation_type_to_unknown(self) -> None: parsed = ParsedSimulation( execution_dir="/path/to/1082001.260305-120001", @@ -1955,7 +1952,6 @@ def test_persisted_snapshot_matches_config_delta_fields(self, db: Session) -> No snapshot = _build_config_snapshot(sim) assert snapshot.simulation_type == SimulationType.TEST.value - assert set(snapshot.field_names()) == Simulation.CONFIG_DELTA_FIELDS def test_snapshot_normalizes_ssh_git_url_for_equality(self, db: Session) -> None: parsed = ParsedSimulation( diff --git a/backend/tests/features/simulation/test_models.py 
b/backend/tests/features/simulation/test_models.py deleted file mode 100644 index 77b46013..00000000 --- a/backend/tests/features/simulation/test_models.py +++ /dev/null @@ -1,7 +0,0 @@ -from app.features.simulation.models import Simulation - - -class TestSimulationModel: - def test_config_delta_fields_are_valid(self): - model_columns = set(Simulation.__table__.columns.keys()) - assert Simulation.CONFIG_DELTA_FIELDS <= model_columns From 491871c080336341709d0851f6e14f62286654d9 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 15:12:32 -0700 Subject: [PATCH 09/10] Restore archive status parsing from CaseStatus --- .../features/ingestion/parsers/case_docs.py | 24 ----- .../features/ingestion/parsers/case_status.py | 67 ++++++++++++++ .../app/features/ingestion/parsers/parser.py | 24 ++++- .../ingestion/parsers/test_case_docs.py | 21 ----- .../ingestion/parsers/test_case_status.py | 87 +++++++++++++++++++ .../features/ingestion/parsers/test_parser.py | 72 ++++++++++++--- 6 files changed, 235 insertions(+), 60 deletions(-) create mode 100644 backend/app/features/ingestion/parsers/case_status.py create mode 100644 backend/tests/features/ingestion/parsers/test_case_status.py diff --git a/backend/app/features/ingestion/parsers/case_docs.py b/backend/app/features/ingestion/parsers/case_docs.py index 8da7589a..9e5ec964 100644 --- a/backend/app/features/ingestion/parsers/case_docs.py +++ b/backend/app/features/ingestion/parsers/case_docs.py @@ -6,7 +6,6 @@ from dateutil.relativedelta import relativedelta from app.features.ingestion.parsers.utils import _open_text -from app.features.simulation.enums import SimulationStatus from app.features.simulation.schemas import KNOWN_EXPERIMENT_TYPES @@ -114,19 +113,6 @@ def parse_env_run(env_run_path: str | Path) -> dict[str, str | None]: } -def parse_run_artifacts(run_dir: str | Path) -> dict[str, str | None]: - """Inspect execution-root artifacts and derive status metadata.""" - run_dir = Path(run_dir) - - return { - "status": 
( - SimulationStatus.COMPLETED.value - if _has_timing_file(run_dir) - else SimulationStatus.UNKNOWN.value - ) - } - - def _extract_value_from_file(path: Path, entry_id: str) -> str | None: """Extract the value of a specific entry from an XML file. @@ -269,13 +255,3 @@ def _parse_stop_date(stop_date: str | None) -> str | None: return datetime.strptime(stop_date, "%Y%m%d").strftime("%Y-%m-%d") except ValueError: return None - - -def _has_timing_file(run_dir: Path) -> bool: - try: - return any( - child.is_file() and re.match(r"^e3sm_timing\..*", child.name) - for child in run_dir.iterdir() - ) - except OSError: - return False diff --git a/backend/app/features/ingestion/parsers/case_status.py b/backend/app/features/ingestion/parsers/case_status.py new file mode 100644 index 00000000..5efc4753 --- /dev/null +++ b/backend/app/features/ingestion/parsers/case_status.py @@ -0,0 +1,67 @@ +import re +from pathlib import Path + +from app.core.logger import _setup_custom_logger +from app.features.ingestion.parsers.utils import _open_text +from app.features.simulation.enums import SimulationStatus + +logger = _setup_custom_logger(__name__) + +TIMESTAMP_PATTERN = r"(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})" +CASE_RUN_START_PATTERN = re.compile( + rf"^{TIMESTAMP_PATTERN}:\s+case\.run\s+starting(?:\s+\S+)?\s*$" +) +CASE_RUN_TERMINAL_PATTERN = re.compile( + rf"^{TIMESTAMP_PATTERN}:\s+case\.run\s+(?P<state>success|error)\b" +) + + +def parse_case_status(file_path: str | Path) -> dict[str, str | None]: + """Parse the latest ``case.run`` attempt from ``CaseStatus``. + + ``CaseStatus`` can record multiple attempts for the same execution. The + latest ``case.run starting`` entry is treated as authoritative and the first + terminal entry after it determines the run status.
+ """ + file_path = Path(file_path) + result: dict[str, str | None] = { + "run_start_date": None, + "run_end_date": None, + "status": SimulationStatus.UNKNOWN.value, + } + + try: + lines = _open_text(file_path).splitlines() + except (OSError, UnicodeDecodeError) as exc: + logger.warning("Failed to read case status file %s (%s)", file_path, exc) + return result + + latest_start_index: int | None = None + latest_start_timestamp: str | None = None + + for index, line in enumerate(lines): + start_match = CASE_RUN_START_PATTERN.match(line.strip()) + if start_match: + latest_start_index = index + latest_start_timestamp = start_match.group("timestamp") + + if latest_start_index is None or latest_start_timestamp is None: + return result + + result["run_start_date"] = latest_start_timestamp + + for line in lines[latest_start_index + 1 :]: + terminal_match = CASE_RUN_TERMINAL_PATTERN.match(line.strip()) + if not terminal_match: + continue + + result["run_end_date"] = terminal_match.group("timestamp") + result["status"] = ( + SimulationStatus.COMPLETED.value + if terminal_match.group("state") == "success" + else SimulationStatus.FAILED.value + ) + return result + + result["status"] = SimulationStatus.RUNNING.value + return result diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index 72a3463b..e8240d68 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -33,8 +33,8 @@ parse_env_build, parse_env_case, parse_env_run, - parse_run_artifacts, ) +from app.features.ingestion.parsers.case_status import parse_case_status from app.features.ingestion.parsers.e3sm_timing import parse_e3sm_timing from app.features.ingestion.parsers.git_info import ( parse_git_config, @@ -84,6 +84,12 @@ class FileSpec(TypedDict, total=False): "parser": parse_readme_case, "required": True, }, + "case_status": { + "pattern": r"CaseStatus\..*\.gz", + "location": "root", + 
"parser": parse_case_status, + "required": False, + }, "e3sm_timing": { "pattern": r"e3sm_timing\..*", "location": "root", @@ -410,6 +416,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul Typed archive-derived metadata for one execution directory. """ metadata: dict[str, str | None] = {} + case_status_metadata: dict[str, str | None] | None = None for key, spec in FILE_SPECS.items(): path = files.get(key) @@ -417,9 +424,18 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul continue parser: Callable = spec["parser"] - metadata.update(parser(path)) + parsed_metadata = parser(path) + + if key == "case_status": + case_status_metadata = parsed_metadata + continue + + metadata.update(parsed_metadata) - metadata.update(parse_run_artifacts(exec_dir)) + # CaseStatus reflects the latest case.run attempt, so its status and run + # timestamps should override timing-file values when the artifact exists. + if case_status_metadata is not None: + metadata.update(case_status_metadata) execution_id = _require_execution_id(metadata.get("execution_id"), exec_dir) _warn_on_execution_id_mismatch(exec_dir, execution_id) @@ -447,7 +463,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul git_branch=metadata.get("git_branch"), git_tag=metadata.get("git_tag"), git_commit_hash=metadata.get("git_commit_hash"), - status=metadata.get("status", SimulationStatus.UNKNOWN.value), + status=metadata.get("status") or SimulationStatus.UNKNOWN.value, ) diff --git a/backend/tests/features/ingestion/parsers/test_case_docs.py b/backend/tests/features/ingestion/parsers/test_case_docs.py index ed68436b..5b692276 100644 --- a/backend/tests/features/ingestion/parsers/test_case_docs.py +++ b/backend/tests/features/ingestion/parsers/test_case_docs.py @@ -5,7 +5,6 @@ parse_env_build, parse_env_case, parse_env_run, - parse_run_artifacts, ) @@ -324,23 +323,3 @@ def test_invalid_stop_date_returns_none(self, 
tmp_path): result = parse_env_run(tmp_run) assert result["simulation_end_date"] is None - - -class TestParseRunArtifacts: - def test_completed_status_when_timing_exists(self, tmp_path): - (tmp_path / "e3sm_timing.test").write_text("timing data") - - result = parse_run_artifacts(tmp_path) - - assert result["status"] == "completed" - - def test_unknown_status_when_timing_missing(self, tmp_path): - result = parse_run_artifacts(tmp_path) - - assert result["status"] == "unknown" - - def test_unknown_status_when_directory_read_fails(self): - with patch.object(Path, "iterdir", side_effect=OSError("boom")): - result = parse_run_artifacts("/tmp/missing") - - assert result["status"] == "unknown" diff --git a/backend/tests/features/ingestion/parsers/test_case_status.py b/backend/tests/features/ingestion/parsers/test_case_status.py new file mode 100644 index 00000000..ea8d35af --- /dev/null +++ b/backend/tests/features/ingestion/parsers/test_case_status.py @@ -0,0 +1,87 @@ +from pathlib import Path +from unittest.mock import patch + +from app.features.ingestion.parsers.case_status import parse_case_status + + +class TestCaseStatusParser: + def test_returns_unknown_status_on_read_error(self) -> None: + with ( + patch( + "app.features.ingestion.parsers.case_status._open_text", + side_effect=OSError("boom"), + ), + patch( + "app.features.ingestion.parsers.case_status.logger.warning" + ) as mock_warning, + ): + result = parse_case_status(Path("/tmp/missing/CaseStatus.001.gz")) + + assert result == { + "run_start_date": None, + "run_end_date": None, + "status": "unknown", + } + mock_warning.assert_called_once() + + def test_extracts_completed_status_from_latest_attempt(self, tmp_path) -> None: + case_status = tmp_path / "CaseStatus.001" + case_status.write_text( + "\n".join( + [ + "2025-01-01 00:00:00: case.run starting 111", + "2025-01-01 01:00:00: case.run error", + "2025-01-01 02:00:00: case.run starting 222", + "2025-01-01 03:00:00: case.run success", + ] + ) + ) + + result = 
parse_case_status(case_status) + + assert result["run_start_date"] == "2025-01-01 02:00:00" + assert result["run_end_date"] == "2025-01-01 03:00:00" + assert result["status"] == "completed" + + def test_extracts_failed_status(self, tmp_path) -> None: + case_status = tmp_path / "CaseStatus.001" + case_status.write_text( + "\n".join( + [ + "2025-01-01 00:00:00: case.run starting 111", + "2025-01-01 01:00:00: case.run error", + ] + ) + ) + + result = parse_case_status(case_status) + + assert result["run_end_date"] == "2025-01-01 01:00:00" + assert result["status"] == "failed" + + def test_extracts_running_status_without_terminal_entry(self, tmp_path) -> None: + case_status = tmp_path / "CaseStatus.001" + case_status.write_text( + "\n".join( + [ + "2025-01-01 00:00:00: case.run starting 111", + "2025-01-01 00:10:00: unrelated log line", + ] + ) + ) + + result = parse_case_status(case_status) + + assert result["run_start_date"] == "2025-01-01 00:00:00" + assert result["run_end_date"] is None + assert result["status"] == "running" + + def test_returns_unknown_status_without_case_run_entries(self, tmp_path) -> None: + case_status = tmp_path / "CaseStatus.001" + case_status.write_text("2025-01-01 00:00:00: xmlchange success") + + result = parse_case_status(case_status) + + assert result["run_start_date"] is None + assert result["run_end_date"] is None + assert result["status"] == "unknown" diff --git a/backend/tests/features/ingestion/parsers/test_parser.py b/backend/tests/features/ingestion/parsers/test_parser.py index 64371a5b..e90dddf0 100644 --- a/backend/tests/features/ingestion/parsers/test_parser.py +++ b/backend/tests/features/ingestion/parsers/test_parser.py @@ -25,6 +25,7 @@ def _create_execution_metadata_files( *, include_env_run: bool = True, include_timing: bool = True, + case_status_content: str | None = None, ) -> None: """Create standard execution files for testing.""" version_base = version.split(".")[0] @@ -33,6 +34,10 @@ def 
_create_execution_metadata_files( timing_file = execution_dir / f"e3sm_timing.{version}" timing_file.write_text("timing data") + if case_status_content is not None: + with gzip.open(execution_dir / f"CaseStatus.{version_base}.gz", "wt") as f: + f.write(case_status_content) + casedocs = execution_dir / "CaseDocs" casedocs.mkdir(exist_ok=True) @@ -94,12 +99,12 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: "parse_env_build": {"compiler": "gnu"}, "parse_env_run": {"simulation_start_date": "2020-01-01"}, "parse_readme_case": {}, + "parse_case_status": {"status": "completed"}, "parse_e3sm_timing": { "execution_id": "1081156.251218-200923", "run_start_date": "2025-12-18T20:09:33", "run_end_date": "2025-12-18T20:54:58", }, - "parse_run_artifacts": {"status": "completed"}, "parse_git_describe": {}, "parse_git_config": None, "parse_git_status": None, @@ -121,6 +126,9 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: parser.FILE_SPECS["readme_case"]["parser"] = lambda _path: defaults[ "parse_readme_case" ] + parser.FILE_SPECS["case_status"]["parser"] = lambda _path: defaults[ + "parse_case_status" + ] parser.FILE_SPECS["e3sm_timing"]["parser"] = lambda _path: defaults[ "parse_e3sm_timing" ] @@ -146,19 +154,16 @@ def _mock_all_parsers(self, **kwargs: Any) -> Generator[None, None, None]: ) ) - with patch( - "app.features.ingestion.parsers.parser.parse_run_artifacts", - return_value=defaults["parse_run_artifacts"], - ): - yield + yield finally: parser.FILE_SPECS.clear() parser.FILE_SPECS.update(original_file_specs) - def test_file_specs_remove_case_status_and_add_env_run(self) -> None: - assert "case_status" not in parser.FILE_SPECS + def test_file_specs_include_case_status_and_env_run(self) -> None: + assert "case_status" in parser.FILE_SPECS assert "case_docs_env_run" in parser.FILE_SPECS assert "e3sm_timing" in parser.FILE_SPECS + assert parser.FILE_SPECS["case_status"]["required"] is False assert 
parser.FILE_SPECS["case_docs_env_run"]["required"] is True assert parser.FILE_SPECS["e3sm_timing"]["required"] is True @@ -358,16 +363,61 @@ def test_missing_env_run_skips_incomplete_run(self, tmp_path: Path) -> None: assert result == [] assert skipped == 1 - def test_completed_status_is_merged(self, tmp_path: Path) -> None: + def test_case_status_is_merged(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files( + execution_dir, + "001.001", + case_status_content="2025-01-01 00:00:00: case.run error", + ) + + with self._mock_all_parsers(parse_case_status={"status": "failed"}): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert result[0].status == "failed" + + def test_case_status_run_dates_override_timing_dates(self, tmp_path: Path) -> None: + execution_dir = tmp_path / "1.0-0" + execution_dir.mkdir(parents=True) + self._create_execution_metadata_files( + execution_dir, + "001.001", + case_status_content="2025-01-01 00:00:00: case.run error", + ) + + with self._mock_all_parsers( + parse_case_status={ + "run_start_date": "2025-01-01 00:00:00", + "run_end_date": None, + "status": "running", + }, + parse_e3sm_timing={ + "execution_id": "1081156.251218-200923", + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": "2025-12-18T20:54:58", + }, + ): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert result[0].status == "running" + assert result[0].run_start_date == "2025-01-01 00:00:00" + assert result[0].run_end_date is None + + def test_missing_case_status_defaults_status_to_unknown( + self, tmp_path: Path + ) -> None: execution_dir = tmp_path / "1.0-0" execution_dir.mkdir(parents=True) self._create_execution_metadata_files(execution_dir, "001.001") - with self._mock_all_parsers(parse_run_artifacts={"status": "completed"}): + with self._mock_all_parsers(): 
result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") assert skipped == 0 - assert result[0].status == "completed" + assert result[0].status == "unknown" def test_missing_timing_lid_skips_incomplete_run(self, tmp_path: Path) -> None: execution_dir = tmp_path / "1.0-0" From 4048e4e7b37f2782a1e2aab97270c38e37007c55 Mon Sep 17 00:00:00 2001 From: Vo Date: Wed, 18 Mar 2026 15:17:05 -0700 Subject: [PATCH 10/10] Preserve execution directory IDs on timing mismatch --- .../app/features/ingestion/parsers/parser.py | 23 ++++++-------- .../features/ingestion/parsers/test_parser.py | 31 +++++++++++++++++-- 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/backend/app/features/ingestion/parsers/parser.py b/backend/app/features/ingestion/parsers/parser.py index e8240d68..dc9542d4 100644 --- a/backend/app/features/ingestion/parsers/parser.py +++ b/backend/app/features/ingestion/parsers/parser.py @@ -437,8 +437,7 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul if case_status_metadata is not None: metadata.update(case_status_metadata) - execution_id = _require_execution_id(metadata.get("execution_id"), exec_dir) - _warn_on_execution_id_mismatch(exec_dir, execution_id) + execution_id = _resolve_execution_id(metadata.get("execution_id"), exec_dir) return ParsedSimulation( execution_dir=exec_dir, @@ -467,8 +466,8 @@ def _parse_all_files(exec_dir: str, files: dict[str, str | None]) -> ParsedSimul ) -def _require_execution_id(execution_id: str | None, exec_dir: str) -> str: - """Return timing-file LID or treat the run as incomplete.""" +def _resolve_execution_id(execution_id: str | None, exec_dir: str) -> str: + """Return a stable execution_id or treat the run as incomplete.""" if execution_id is None: raise FileNotFoundError( f"Required timing-file LID missing for execution directory '{exec_dir}'" @@ -480,17 +479,15 @@ def _require_execution_id(execution_id: str | None, exec_dir: str) -> str: f"Required timing-file 
LID missing for execution directory '{exec_dir}'" ) - return normalized - - -def _warn_on_execution_id_mismatch(exec_dir: str, execution_id: str) -> None: - """Log when timing-file LID disagrees with the execution directory basename.""" exec_dir_basename = os.path.basename(exec_dir) - - if exec_dir_basename and exec_dir_basename != execution_id: + if exec_dir_basename and exec_dir_basename != normalized: logger.warning( "Timing-file LID '%s' does not match execution directory '%s'. " - "Using timing-file LID as execution_id.", - execution_id, + "Using execution directory basename as execution_id.", + normalized, exec_dir_basename, ) + + return exec_dir_basename + + return normalized diff --git a/backend/tests/features/ingestion/parsers/test_parser.py b/backend/tests/features/ingestion/parsers/test_parser.py index e90dddf0..cf7bc009 100644 --- a/backend/tests/features/ingestion/parsers/test_parser.py +++ b/backend/tests/features/ingestion/parsers/test_parser.py @@ -436,7 +436,9 @@ def test_missing_timing_lid_skips_incomplete_run(self, tmp_path: Path) -> None: assert result == [] assert skipped == 1 - def test_mismatched_timing_lid_logs_warning(self, tmp_path: Path) -> None: + def test_mismatched_timing_lid_falls_back_to_directory_basename( + self, tmp_path: Path + ) -> None: execution_dir = tmp_path / "1.0-0" execution_dir.mkdir(parents=True) self._create_execution_metadata_files(execution_dir, "001.001") @@ -456,14 +458,37 @@ def test_mismatched_timing_lid_logs_warning(self, tmp_path: Path) -> None: result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") assert skipped == 0 - assert result[0].execution_id == "different-lid" + assert result[0].execution_id == "1.0-0" mock_warning.assert_any_call( "Timing-file LID '%s' does not match execution directory '%s'. 
" - "Using timing-file LID as execution_id.", + "Using execution directory basename as execution_id.", "different-lid", "1.0-0", ) + def test_mismatched_timing_lid_preserves_distinct_execution_directories( + self, tmp_path: Path + ) -> None: + first_execution_dir = tmp_path / "1.0-0" + first_execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(first_execution_dir, "001.001") + + second_execution_dir = tmp_path / "1.0-1" + second_execution_dir.mkdir(parents=True) + self._create_execution_metadata_files(second_execution_dir, "002.002") + + with self._mock_all_parsers( + parse_e3sm_timing={ + "execution_id": "stale-lid", + "run_start_date": "2025-12-18T20:09:33", + "run_end_date": "2025-12-18T20:54:58", + } + ): + result, skipped = parser.main_parser(tmp_path, tmp_path / "unused_output") + + assert skipped == 0 + assert [parsed.execution_id for parsed in result] == ["1.0-0", "1.0-1"] + def test_missing_timing_skips_incomplete_run(self, tmp_path: Path) -> None: execution_dir = tmp_path / "1.0-0" execution_dir.mkdir(parents=True)